├── Formula ├── hashchop.rb └── tangram.rb ├── LICENSE ├── README.md ├── tangram-0.1-1.rockspec ├── tangram.lua └── tangram ├── cmds.lua ├── db.lua ├── defaults.lua ├── init.lua ├── jumprope.lua ├── main.lua ├── test_db.lua └── test_jumprope.lua /Formula/hashchop.rb: -------------------------------------------------------------------------------- 1 | require 'formula' 2 | 3 | class Hashchop < Formula 4 | homepage 'https://github.com/silentbicycle/hashchop' 5 | url 'https://github.com/silentbicycle/hashchop/archive/master.tar.gz' 6 | sha1 '3452e20fb41e5a0f04a09b69e9978587030dfd75' 7 | version '0.8-0' 8 | 9 | depends_on 'lua' 10 | depends_on 'luarocks' 11 | 12 | def install 13 | system 'luarocks install hashchop-0.8-0.rockspec' 14 | end 15 | 16 | end 17 | -------------------------------------------------------------------------------- /Formula/tangram.rb: -------------------------------------------------------------------------------- 1 | require 'formula' 2 | 3 | class Tangram < Formula 4 | homepage 'https://github.com/silentbicycle/tangram' 5 | url 'https://github.com/silentbicycle/tangram/archive/v0.1-1.tar.gz' 6 | sha1 '34bd7022d0faad96145cd219872ed9d46b3598bb' 7 | version '0.1-1' 8 | 9 | depends_on 'lua' 10 | depends_on 'luarocks' 11 | depends_on 'hashchop' 12 | 13 | def install 14 | system 'luarocks install tangram-0.1-1.rockspec' 15 | end 16 | def test 17 | system 'tangram test' 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012-2013, Scott Vokes 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions 7 | are met: 8 | 9 | * Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 
11 | 12 | * Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in 14 | the documentation and/or other materials provided with the 15 | distribution. 16 | 17 | * Neither the name of Scott Vokes nor the names of other 18 | contributors may be used to endorse or promote products derived 19 | from this software without specific prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 22 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 23 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 24 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 25 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 26 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 27 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 29 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | tangram: a Jumprope-based content store. 2 | 3 | 4 | # Overview 5 | 6 | This is a standalone content store, somewhat like the .git directory 7 | that git uses for internal storage. However, while git is best suited to 8 | storing versioned collections of relatively small, diff-able files, 9 | tangram is best at storing large files. 10 | 11 | It is based on the Jumprope, a data structure I invented. 
The Jumprope 12 | is a kind of tree of arrays of data chunks, whose overall shape is 13 | derived from the data itself -- duplicated sections of files coalesce 14 | together to branches that are automatically shared, and identical files 15 | end up with the same overall identifier. 16 | 17 | (This is a central component of scatterbrain, a distributed filesystem 18 | I've been working on, but also useful on its own. Since Jumpropes use 19 | [content-addressable storage][CAS] and all data is immutable, it doesn't 20 | really matter where the data is located -- scatterbrain mirrors the data 21 | over a somewhat Dynamo-like distributed hash table, which periodically 22 | checks that all live content is mirrored in a sufficient number of 23 | nodes. I'm still working on the network logic, though, and it will be a 24 | separate project) 25 | 26 | [CAS]: http://en.wikipedia.org/wiki/Content-addressable_storage 27 | 28 | 29 | ## Example Use Cases 30 | 31 | * Storing many variants of genetic data 32 | * Storing design / multimedia assets 33 | * Backing up lots of incremental virtual machine snapshots 34 | 35 | 36 | ## Features 37 | 38 | * Automatic de-duplication of content 39 | * Automatic detection of identical files 40 | * A tagging / property system for saving and searching by file metadata 41 | * High throughput (e.g. HD video pipes to mplayer w/ out skips) 42 | 43 | 44 | # License 45 | 46 | This is released under a 3-clause BSD license. Be nice. 47 | 48 | 49 | # Current Status 50 | 51 | The system works, but the command-line interface and installation 52 | process are still evolving, and a bit rough around the edges. 53 | (Thanks, early adopters. Constructive feedback is appreciated.) 54 | 55 | I have tested it on Linux, OpenBSD, and OSX. 56 | 57 | I *haven't* tested it on Windows yet, but it shouldn't take major 58 | effort to port - there isn't anything OS-specific besides the process 59 | to create a native Lua extension and some default paths. 
60 | 61 | 62 | # Installation 63 | 64 | The installation process should eventually be replaced by 65 | `brew install tangram`, `apt-get install tangram`, `pkg_add tangram`, 66 | and the like, but it's still pretty manual. 67 | 68 | 69 | ## Dependencies 70 | 71 | All Lua dependencies are available via [LuaRocks](http://luarocks.org). 72 | 73 | * Lua (http://lua.org) 74 | * SQLite3 (http://sqlite.org) 75 | * A C compiler 76 | * libhashchop and its Lua wrapper (http://github.com/silentbicycle/hashchop/) 77 | * luafilesystem 78 | * slncrypto (for SHA1 hashing) 79 | * zlib and its Lua wrapper 80 | * SQLite3's Lua wrapper 81 | * lunatest (for testing) 82 | 83 | 84 | ## How to Install 85 | 86 | * Install [Lua](http://lua.org). 87 | * Install [SQLite3](http://sqlite.org), if you don't have like 88 | a dozen copies of it lying around already. 89 | * Install [LuaRocks](http://luarocks.org), the de facto standard packaging 90 | system for Lua. (If you don't want to use LuaRocks, install the other 91 | Lua dependencies yourself.) 92 | * Use LuaRocks to install the `slncrypto`, `lzlib`, `luafilesystem`, 93 | `lsqlite3`, and `lunatest` packages. 94 | Type e.g. `luarocks install slncrypto`. 95 | * Download [libhashchop](https://github.com/silentbicycle/hashchop), 96 | build it, and then build and install the Lua wrapper with `make lua` 97 | and `make lua-install`. Or, if you want to do it by hand, copy the 98 | dynamic library to wherever Lua puts its native extensions on your 99 | system. (To figure this out, you can fire up the Lua REPL and type 100 | `=package.cpath`. On Unix-like OSs, it's typically something like 101 | `/usr/local/lib/lua/5.1/`.) You may need to modify the paths in 102 | the makefile, if your OS puts Lua's headers/library somewhere odd. 103 | * Copy the `tangram` subdirectory into Lua's package path (typically 104 | "/usr/local/share/lua/5.1/", check `package.path`), so that the 105 | tangram.* packages can be loaded. 
106 | * Copy the tangram.lua script into your path somewhere. 107 | * Run `tangram.lua test`. 108 | 109 | ## Example usage 110 | 111 | $ tangram.lua init # create a content store w/ default settings 112 | $ tangram.lua add foo.bar # add a file to the store 113 | $ cmd | tangram.lua add - # add to the store from stdin 114 | $ tangram.lua list # list known files 115 | $ tangram.lua get 1 # get file with ID #1, print to stdout 116 | $ tangram.lua get 1 foo.baz # get file with ID #1, save to foo.baz 117 | 118 | 119 | # Options 120 | 121 | All commands take the following arguments (which should appear *before* 122 | the command name): 123 | 124 | * -d: dry run, don't write to disk 125 | * -v: verbose 126 | * -s PATH: use custom store path instead of default 127 | 128 | 129 | # Commands 130 | 131 | ## help: print help message 132 | 133 | Print help. 134 | 135 | ## version: print version 136 | 137 | Print the version. 138 | 139 | ## init: initialize data store 140 | 141 | Initialize a data store. 142 | 143 | tangram init [-b BITS] [-f BRANCH_FACTOR] 144 | 145 | Arguments: 146 | 147 | * -b BITS - Set number of bits for rolling hash bitmask (determines chunk size) 148 | * -f BF - Set branching factor (determines average Jumprope limb length) 149 | 150 | ## get: get a file 151 | 152 | Get file content from the store. 153 | 154 | tangram get [-f | -h] KEY [OUT_FILE] 155 | 156 | -f or -h specify that the key is a file ID (-f) or hash (-h), otherwise 157 | it will try to infer the right thing. If OUT_FILE is given, it will save 158 | the content to that file, otherwise it will print to stdout. 159 | 160 | ## add: add a file 161 | 162 | Add a file to the store. 163 | 164 | tangram add [-n NAME] [FILENAME or -] 165 | 166 | Arguments: 167 | 168 | * -n NAME - Store input file as NAME. 169 | 170 | ## info: get info 171 | 172 | Print metadata about a file. 
173 | 174 | tangram info ID 175 | 176 | TODO: the info command (without an ID) should print info about the store config 177 | 178 | ## list: list known files 179 | 180 | Print basic info about all stored files. 181 | 182 | ## prop: get/set property 183 | 184 | Get / set a property on a file. These properties don't have any internal 185 | meaning, but exist as a hook to track content metadata. 186 | 187 | tangram prop add ID KEY 188 | tangram prop add ID KEY VALUE 189 | tangram prop del ID 190 | tangram prop del ID KEY 191 | 192 | ## search: search by name or property 193 | 194 | Search by name or property. 195 | 196 | tangram search prop KEYNAME 197 | tangram search prop KEYNAME VALUENAME 198 | tangram search name PATTERN 199 | 200 | ## forget: stop tracking a file 201 | 202 | Stop tracking a file. To actually remove content from the store, use the 203 | GC command. 204 | 205 | tangram forget ID 206 | 207 | ## gc: remove inaccessible content from store 208 | 209 | When files are forgotten, their storage is not automatically reclaimed, 210 | since some of it may be shared by other files. This checks the liveness 211 | of data chunks in the store and deletes any that are no longer referenced. 212 | 213 | ## test: run tests 214 | 215 | Run unit tests. (Requires lunatest.) 216 | 217 | 218 | # Future Developments 219 | 220 | * Better documentation of the Jumprope data structure. Its reference 221 | implementation is included, and (IMHO) commented well, but there are 222 | some subtleties. In the mean time, [my StrangeLoop talk][talk] 223 | includes an attempt to convey my intuitions about how it works. 224 | 225 | [talk]: http://www.infoq.com/presentations/Data-Structures 226 | 227 | * Retrieving specific byte-ranges of content. The Jumprope library 228 | supports it, but it isn't part of the CLI yet. 229 | 230 | * While there is currently no interface for it, the Jumprope has the 231 | necessary metadata to accelerate diff-ing of very large files. 
232 | (It automatically identifies large subsets of the files that are 233 | known to be identical and can be skipped.) 234 | 235 | * There isn't any attempt to take advantage of the Jumprope's 236 | embarrassingly parallelizable retrieval. Scatterbrain uses async IO to 237 | spread reads over the network, and to maintain an arbitrarily large 238 | look-ahead buffer for the streaming data, but this doesn't bother 239 | with that: it would only complicate things and lead to more 240 | disk contention. There may be benefits to exploiting 241 | parallelism by different means, though. 242 | 243 | 244 | # Acknowledgements 245 | 246 | Thanks to everyone who has given me feedback along the way, particularly 247 | Mike English and Jessica Kerr. 248 | -------------------------------------------------------------------------------- /tangram-0.1-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "tangram" 2 | version = "0.1-1" 3 | source = { 4 | url = "git://github.com/silentbicycle/tangram.git", 5 | tag = "v0.1-1", 6 | file = "tangram-0.1-1.tar.gz", 7 | dir = "tangram", 8 | } 9 | description = { 10 | summary = "A Jumprope-based content store", 11 | detailed = [[ 12 | This is a standalone content store, somewhat like the .git directory 13 | that git uses for internal storage. However, while git is best suited to 14 | storing versioned collections of relatively small, diff-able files, 15 | tangram is best at storing large files. 16 | 17 | It is based on the Jumprope, a data structure I invented. The Jumprope 18 | is a kind of tree of arrays of data chunks, whose overall shape is 19 | derived from the data itself -- duplicated sections of files coalesce 20 | together to branches that are automatically shared, and identical files 21 | end up with the same overall identifier. 
22 | ]], 23 | license = "BSD", 24 | homepage = "github.com/silentbicycle/tangram/", 25 | maintainer = "Scott Vokes (vokes.s@gmail.com)", 26 | } 27 | dependencies = { 28 | "lua >= 5.1", 29 | "hashchop >= 0.8-0", 30 | "slncrypto >= 1.1-1", 31 | "lzlib >= 0.3-3", 32 | "luafilesystem >= 1.6.2-1", 33 | "lsqlite3 >= 0.8-1", 34 | "lunatest >= 0.9.1-1", 35 | "lrandom >= 20101118-1", 36 | } 37 | build = { 38 | type = "none", 39 | install = { 40 | bin = { ["tangram"] = "tangram.lua"}, 41 | lua = { 42 | ['tangram.cmds'] = "tangram/cmds.lua", 43 | ['tangram.db'] = "tangram/db.lua", 44 | ['tangram.defaults'] = "tangram/defaults.lua", 45 | ['tangram.init'] = "tangram/init.lua", 46 | ['tangram.jumprope'] = "tangram/jumprope.lua", 47 | ['tangram.main'] = "tangram/main.lua", 48 | ['tangram.test_db'] = "tangram/test_db.lua", 49 | ['tangram.test_jumprope'] = "tangram/test_jumprope.lua", 50 | }, 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /tangram.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env lua 2 | -- Copyright (c) 2012-2013, Scott Vokes 3 | -- 4 | -- All rights reserved. 5 | -- 6 | -- Redistribution and use in source and binary forms, with or without 7 | -- modification, are permitted provided that the following conditions 8 | -- are met: 9 | -- * Redistributions of source code must retain the above copyright 10 | -- notice, this list of conditions and the following disclaimer. 11 | -- * Redistributions in binary form must reproduce the above 12 | -- copyright notice, this list of conditions and the following 13 | -- disclaimer in the documentation and/or other materials 14 | -- provided with the distribution. 15 | -- * Neither the name of Scott Vokes nor the names of other 16 | -- contributors may be used to endorse or promote products 17 | -- derived from this software without specific prior written 18 | -- permission. 
19 | -- 20 | -- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | -- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | -- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 | -- FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 | -- COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 | -- INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 | -- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | -- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | -- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 | -- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 30 | -- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 | -- POSSIBILITY OF SUCH DAMAGE. 32 | 33 | require "tangram.init" 34 | 35 | -- Use luarocks, if available. 36 | pcall(require, "luarocks.install") 37 | 38 | tangram.main.main(arg) 39 | -------------------------------------------------------------------------------- /tangram/cmds.lua: -------------------------------------------------------------------------------- 1 | require "hashchop" 2 | require "lfs" 3 | 4 | -- This loads either slncrypto (preferred) or luacrypto, which both 5 | -- install as "crypto". 6 | require "crypto" 7 | 8 | require "tangram.db" 9 | local jumprope = require "tangram.jumprope" 10 | 11 | -- Use zlib for compression, if Lua wrapper is available. 12 | local compress, decompress 13 | local ok, zlib = pcall(require, "zlib") 14 | 15 | if ok and zlib then 16 | compress = zlib.compress 17 | decompress = zlib.decompress 18 | end 19 | 20 | module(..., package.seeall) 21 | 22 | --local function log(...) print(...) end 23 | local function log(...) end 24 | local function printf(...) 
-- string -> SHA1 hex digest function, taken from whichever module was
-- loaded as "crypto". Left nil when neither API is present.
local sha1
if crypto.sha1 ~= nil then         -- prefer slncrypto
    sha1 = function(s) return crypto.sha1(s):lower() end
elseif crypto.digest ~= nil then   -- luacrypto
    -- Hash the argument itself. This branch used to pass `data`, a name
    -- that is not defined in this scope, so crypto.digest received nil
    -- instead of the chunk being hashed.
    sha1 = function(s) return crypto.digest("sha1", s) end
end
-- Write CONTENT into the store under HASH (jumprope "put" callback).
-- Does nothing in dry-run mode or when the chunk file already exists.
-- Raises (via assert) if the bucket directory or the chunk file cannot
-- be created, or if the write itself fails.
local function put(hash, content)
    if cfg.dry_run then return end
    assert(hash, "no hash given")
    log("SAVE ", hash, " => ", content:len())
    local path, basedir = hash_fn(hash)
    if not file_exists(basedir) then
        assert(lfs.mkdir(basedir))
    end
    if file_exists(path) then return end

    -- Compress before creating the file: if compression raises, no empty
    -- file is left behind for the file_exists() check above to mistake
    -- for a stored chunk on the next call.
    if compress then content = compress(content) end

    -- "wb": the payload is arbitrary (possibly compressed) bytes, so it
    -- must not go through text-mode newline translation.
    local f = assert(io.open(path, "wb"))
    local ok, err = f:write(content)
    f:close()
    if not ok then
        -- Drop the partial file so it is never treated as valid content.
        os.remove(path)
        error("Unable to write " .. tostring(path) .. ": " .. tostring(err))
    end
end
-- Read the input handle F in READ_SIZE pieces, sink each piece into the
-- hashchopper HC, and sink every complete chunk HC yields into the
-- jumprope JR. After EOF, whatever HC:finish() returns is sunk as the
-- final chunk.
--
-- Returns the total number of bytes read from F.
-- Raises on any hashchopper status other than "ok" (sink) or
-- "underflow" (poll), and when JR:sink() returns a false value.
local function add_mainloop(f, hc, jr, read_size)
    local size = 0

    while true do
        local rd = f:read(read_size)
        if rd == nil then break end    -- EOF
        local res = hc:sink(rd)
        if res == "ok" then
            -- happy case: bump acc'd size and continue
            size = size + rd:len()
            log("SUNK: ", rd:len())
        elseif res == "overflow" then
            error("Chunk size too large for hashchopper")
        elseif res == "full" then
            error("Buffer full, needs more flushing")
        else
            error("Unexpected: " .. tostring(res))
        end

        -- Drain every chunk that is complete after this read.
        while true do
            local chunk, err = hc:poll()
            if chunk then
                log("POLL: ", chunk:len())
                -- Since the jumprope's callbacks are blocking,
                -- we can just bail out on error here.
                assert(jr:sink(chunk))
            elseif err == "underflow" then    -- no more chunks
                break
            elseif err == "overflow" then
                error("Too large to fit in buffer")
            else
                -- Report the poll status. `res` is always "ok" by the
                -- time this loop runs, so printing it hid the real cause.
                error("Unexpected: " .. tostring(err))
            end
        end
    end

    local rem, err = hc:finish()
    if rem then
        -- sink the remaining content
        log("REM: ", rem:len())
        assert(jr:sink(rem))
    elseif err == "overflow" then
        error("Too large to fit in buffer")
    end
    return size
end
-- Resolve the KEY argument of the 'get' command to a jumprope head hash.
--
-- ARG is the remaining argument list; the flag (if any) and the key are
-- popped off its front. '-f KEY' forces KEY to be a file ID, '-h KEY'
-- forces a hash prefix; otherwise an all-digit key is taken as a file ID
-- and any other hex string as a hash prefix.
--
-- Returns:
--   hash, nil      exactly one stored hash starts with the given prefix
--   hash, {}       the file ID was found
--   nil, {}        no stored hash starts with the prefix
--   nil, hashes    several stored hashes start with the prefix
-- Prints usage (and exits) when the key is missing or is not hex, and
-- prints "Bad file ID" and exits with status 1 for an unknown or
-- non-numeric file ID.
local function get_headhash_from_args(arg, cfg, db)
    local arg_type = "unknown"
    local v

    if arg[1] == '-f' then
        arg_type = "id"        -- file ID
        pop(arg)
        v = pop(arg)
    elseif arg[1] == '-h' then
        arg_type = "hash"      -- hash hex digest
        pop(arg)
        v = pop(arg)
    else                       -- does the arg look like a file ID or hash?
        v = pop(arg)
        if v == nil then print_usage("get") end
        v = tostring(v)
        if v:match("^[0-9]+$") then
            arg_type = "id"
        elseif v:match("^[0-9a-fA-F]+$") then
            arg_type = "hash"
        else
            print_usage("get")
        end
    end

    -- A bare '-f' or '-h' with no key: without this check '-h' would
    -- search for the empty prefix (matching every file) and '-f' would
    -- crash while formatting the error message.
    if v == nil then print_usage("get") end

    if arg_type == "hash" then
        local hashes, conflicts = {}, {}
        for hash in db:get_hash_completions(v) do
            hashes[#hashes+1] = hash.hash
        end
        if #hashes == 0 then
            return nil, conflicts
        elseif #hashes > 1 then
            return nil, hashes
        else
            return hashes[1], nil
        end
    elseif arg_type == "id" then
        local id = tonumber(v)
        local info = id and db:get_file_info(id)
        if info and info.hash then
            return info.hash, {}
        end
        -- %s rather than %d: the key may not be a number at all.
        printf("Bad file ID: %s\n", tostring(v))
        os.exit(1)
    end
end
-- 'list' command: print a header line followed by one line per file
-- known to the store's database (ID, first 10 characters of the head
-- hash, timestamp, size, name). ARG is not used.
function cmd_list(arg, cfg)
    local store = assert(tangram.db.open(db_path(cfg)))

    printf("%-4s %-10s %-19s %-10s %s\n",
           "ID", "hash", "time (UTC)", "size", "filename")

    for file in store:get_files() do
        local short_hash = file.hash:sub(1,10)
        printf('%-4d %s %s %-10d %s\n',
               file.id, short_hash, file.timestamp, file.size, file.name)
    end
end
-- 'gc' command: remove chunks from the store that no tracked file uses.
--
-- Mark phase: every file listed in the database is streamed through the
-- jumprope with a wrapped `get` callback that records each hash it
-- successfully reads. Sweep phase: every hash found on disk that was not
-- recorded is deleted.
--
-- With cfg.dry_run set, nothing is deleted; the number of chunks that
-- would be removed is reported instead.
function cmd_gc(arg, cfg)
    local db = assert(tangram.db.open(db_path(cfg)))
    local marks = {}

    local cbs = init_callbacks(cfg)

    -- Wrap get callback in something that marks live files
    -- (This does more disk IO than necessary.)
    local old_get_cb = cbs.get
    cbs.get = function(hash)
        local data = old_get_cb(hash)
        if data then marks[hash] = true end
        return data
    end
    local jrs = assert(jumprope.init{get=cbs.get, put=cbs.put,
                                     exists=cbs.exists, hash=sha1})

    for row in db:get_files() do
        local jr = jrs:open(row.hash)
        for _ in jr:stream() do
            -- just discard the data; reading it is what sets the marks
        end
    end

    -- Iterate over hashes in store, delete any unmarked chunks.
    -- Deleting is a write to disk, so it is skipped in dry-run mode,
    -- the same way the put callback skips its writes.
    local count = 0
    for f in cbs.iter_store() do
        if not marks[f] then
            if not cfg.dry_run then cbs.delete(f) end
            count = count + 1
        end
    end

    if cfg.dry_run then
        printf("Would collect %d chunks (dry run)\n", count)
    else
        printf("Collected %d chunks\n", count)
    end
end
-- 'search' command. The first argument selects the mode:
--   name PATTERN       print "ID NAME" for each row of db:search_name
--   prop KEY [VALUE]   print "ID NAME KEY VALUE" for each row of
--                      db:search_property
-- Any other mode prints the usage text for 'search'.
function cmd_search(arg, cfg)
    local store = assert(tangram.db.open(db_path(cfg)))
    local kind = pop(arg)

    if kind == "name" then                 -- search name PATTERN
        local pattern = assert(pop(arg), "Missing name search pattern")
        for hit in store:search_name(pattern) do
            printf("%d %s\n", hit.id, hit.name)
        end
        return
    end

    if kind ~= "prop" then
        print_usage("search")
        return
    end

    -- search prop KEY [VALUE]
    local key = assert(pop(arg), "Missing property search key")
    local value = pop(arg)
    for hit in store:search_property(key, value) do
        printf("%d %s %s %s\n", hit.id, hit.name, hit.key, hit.value)
    end
end
-- SQL that creates the store's tables. Every statement uses
-- IF NOT EXISTS, so running it against an existing database leaves
-- tables that are already there untouched.
--
-- The properties.fid foreign key points at files(id). It previously
-- named a table "file", which this schema never creates.
local schema = [[
-- known files
CREATE TABLE IF NOT EXISTS files (
    id INTEGER PRIMARY KEY,
    hash TEXT NOT NULL,            -- head hash
    name TEXT NOT NULL,            -- filename
    timestamp TIME NOT NULL,       -- creation datetime()
    size INTEGER NOT NULL,         -- file size
    CONSTRAINT duped_file UNIQUE (hash, name) ON CONFLICT IGNORE
);

-- key value store for arbitrary file metadata
CREATE TABLE IF NOT EXISTS properties (
    fid INTEGER NOT NULL,
    key TEXT NOT NULL,
    value TEXT NOT NULL,
    FOREIGN KEY (fid) REFERENCES files(id)
);

CREATE INDEX IF NOT EXISTS prop_index ON properties (fid, key);

-- configuration for server
CREATE TABLE IF NOT EXISTS config (
    version TEXT NOT NULL,         -- internal data format version
    rh_bits INTEGER NOT NULL,      -- bits for rolling hash
    branch_factor INTEGER NOT NULL -- branch factor for jumprope
);
]]
79 | if res == sqlite3.DONE then 80 | return self:last_insert_rowid() 81 | else 82 | return nil, self:errmsg() 83 | end 84 | end 85 | 86 | function DB:rm_file(id) 87 | local stmt = self:stmt("DELETE FROM files WHERE id == ?;") 88 | stmt:bind_values(id) 89 | local res, err = stmt:step() 90 | if res == sqlite3.DONE then 91 | return self:rm_property(id) 92 | else 93 | return nil, db:errmsg() 94 | end 95 | end 96 | 97 | -- Get an iterator for all files. 98 | function DB:get_files() 99 | local stmt = self:stmt("SELECT * FROM files;") 100 | return stmt:nrows() 101 | end 102 | 103 | -- Get array of hashes starting with HASH. 104 | function DB:get_hash_completions(hash) 105 | local stmt = self:stmt("SELECT hash FROM files WHERE hash LIKE ?;") 106 | stmt:bind_values((hash or "") .. "%") 107 | return stmt:nrows() 108 | end 109 | 110 | function DB:add_property(id, key, value) 111 | local stmt = self:stmt([[ 112 | INSERT INTO properties (fid, key, value) VALUES (?, ?, ?);]]) 113 | stmt:bind_values(id, key, value) 114 | local res, err = stmt:step() 115 | if res == sqlite3.DONE then 116 | return self:last_insert_rowid() 117 | else 118 | return nil, self:errmsg() 119 | end 120 | end 121 | 122 | -- Get info for a single file ID. 123 | function DB:get_file_info(id) 124 | local stmt = self:stmt("SELECT * FROM files WHERE id == ?;") 125 | stmt:bind_values(id) 126 | local info = {} 127 | for row in stmt:nrows() do 128 | return row 129 | end 130 | return nil, "not found" 131 | end 132 | 133 | -- Get a table of properties associated with a file ID. 134 | function DB:get_properties(id) 135 | local stmt = self:stmt("SELECT key, value FROM properties WHERE fid == ?;") 136 | stmt:bind_values(id) 137 | local props = {} 138 | for row in stmt:nrows() do 139 | props[row.key] = row.value 140 | end 141 | return props 142 | end 143 | 144 | function DB:rm_property(id, key) 145 | local stmt 146 | if key then 147 | stmt = self:stmt([[ 148 | DELETE FROM properties 149 | WHERE fid == ? 
AND key == ?;]]) 150 | stmt:bind_values(id, key) 151 | else 152 | stmt = self:stmt("DELETE FROM properties WHERE fid == ?;") 153 | stmt:bind_values(id) 154 | end 155 | 156 | local res, err = stmt:step() 157 | if res == sqlite3.DONE then 158 | return self:last_insert_rowid() 159 | else 160 | return nil, self:errmsg() 161 | end 162 | end 163 | 164 | function DB:search_name(name) 165 | local stmt = self:stmt("SELECT id, name FROM files WHERE name LIKE ?;") 166 | stmt:bind_values("%" .. name .. "%") 167 | return stmt:nrows() 168 | end 169 | 170 | function DB:search_hash(hash) 171 | local stmt = self:stmt("SELECT id FROM files WHERE hash LIKE ?;") 172 | stmt:bind_values(hash .. "%") 173 | return stmt:nrows() 174 | end 175 | 176 | -- Search by key and/or value. 177 | function DB:search_property(key, value) 178 | local stmt 179 | if key and value then 180 | stmt = self:stmt([[ 181 | SELECT f.id, f.name, p.key, p.value FROM files f, properties p 182 | WHERE p.key == ? AND p.value == ? AND f.id == p.fid;]]) 183 | stmt:bind_values(key, value) 184 | else 185 | stmt = self:stmt([[ 186 | SELECT f.id, f.name, p.key, p.value FROM files f, properties p 187 | WHERE p.key == ? AND f.id == p.fid;]]) 188 | stmt:bind_values(key) 189 | end 190 | return stmt:nrows() 191 | end 192 | 193 | function DB:get_config() 194 | local stmt = self:stmt("SELECT * FROM config;") 195 | for row in stmt:nrows() do 196 | return row 197 | end 198 | end 199 | 200 | function init_db(opts) 201 | opts = opts or {} 202 | opts.rh_bits = opts.rh_bits or defaults.rh_bits 203 | opts.branch_factor = opts.branch_factor or defaults.branch_factor 204 | 205 | local path = opts.path or ":memory:" 206 | local sql_db, err = sqlite3.open(path) 207 | if not sql_db then 208 | return nil, "Failed to create database at " .. 
path 209 | end 210 | local code = sql_db:exec(schema) 211 | if code ~= sqlite3.OK then 212 | return nil, sql_db:error_message() 213 | end 214 | 215 | local db = setmetatable({_db=sql_db, _cache={}}, DB) 216 | 217 | local stmt = db:stmt([[ 218 | INSERT INTO config (version, rh_bits, branch_factor) 219 | VALUES (?, ?, ?);]]) 220 | stmt:bind_values(defaults.version, opts.rh_bits, opts.branch_factor) 221 | 222 | local res, err = stmt:step() 223 | if res == sqlite3.DONE then 224 | return db 225 | else 226 | return nil, sql_db:errmsg() 227 | end 228 | end 229 | -------------------------------------------------------------------------------- /tangram/defaults.lua: -------------------------------------------------------------------------------- 1 | local HOME = assert(os.getenv("HOME")) 2 | 3 | DEFAULTS = { 4 | author = "Scott Vokes ", 5 | version = "0.01.02", 6 | 7 | -- base path for local content store 8 | base_path = HOME .. "/.tangram/", 9 | 10 | -- bitmask size for rolling hash 11 | rh_bits = 15, 12 | 13 | -- branching factor for jumprope 14 | branch_factor = 16, 15 | } 16 | -------------------------------------------------------------------------------- /tangram/init.lua: -------------------------------------------------------------------------------- 1 | require "tangram.defaults" 2 | require "tangram.cmds" 3 | require "tangram.db" 4 | require "tangram.jumprope" 5 | require "tangram.main" 6 | -------------------------------------------------------------------------------- /tangram/jumprope.lua: -------------------------------------------------------------------------------- 1 | -- Copyright (c) 2009-2013, Scott Vokes 2 | -- 3 | -- All rights reserved. 4 | -- 5 | -- Redistribution and use in source and binary forms, with or without 6 | -- modification, are permitted provided that the following conditions 7 | -- are met: 8 | -- * Redistributions of source code must retain the above copyright 9 | -- notice, this list of conditions and the following disclaimer. 
--     * Redistributions in binary form must reproduce the above
--       copyright notice, this list of conditions and the following
--       disclaimer in the documentation and/or other materials
--       provided with the distribution.
--     * Neither the name of Scott Vokes nor the names of other
--       contributors may be used to endorse or promote products
--       derived from this software without specific prior written
--       permission.
--
-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-- FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-- COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-- INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-- POSSIBILITY OF SUCH DAMAGE.

-- imports
-- (module(...) below replaces the environment without package.seeall, so
-- every global used in this file must be captured as a local here.)
local fmt, concat, pop = string.format, table.concat, table.remove
local assert, error, math, pcall, print, setmetatable, tonumber, tostring =
    assert, error, math, pcall, print, setmetatable, tonumber, tostring

module(...)

local DEFAULT_BRANCH_FACTOR = 16
DEBUG = false

-- invariants:
-- . content is only ever appended at level 0
-- . max level only ever increases 1 at a time
-- . 'trunk' is max-level spine of structure
-- . only trunk branches upward, and only at last node

-- TODO:
-- The 'L' (limb) / 'D' (data) markers should be unnecessary if
-- trunk nodes without upward branches are given an explicit end marker,
-- such as a "00000000000000000000 0\n" link.

-- All jumpropes should belong to a common set, store callbacks etc. there.
JumpropeSet = {}
JumpropeSet.__index = JumpropeSet

-- An individual jumprope handle, inside a JumpropeSet.
Jumprope = {}
Jumprope.__index = Jumprope

-- Sentinel
local UNKNOWN = {}

-- Print a formatted debug message when DEBUG is set.
-- Uses the captured `print` and `fmt` locals: the globals `print` and
-- `string` are not visible inside this module's environment.
local function log(...)
    if DEBUG then print(fmt(...)) end
end

-- Make a new jumprope set. Requires several callbacks:
-- GET: (hash -> data | nil, "error")
-- PUT: (hash, data -> true | nil, "error")
-- EXISTS: (hash -> true | false | nil, "error")
-- HASH: (data -> hash)
--
-- BRANCH_FACTOR: 1:N chance in branching.
-- HASH_LEN: length of a hash string; measured from HASH("foo") if omitted.
function init(t)
    local jrs = setmetatable({}, JumpropeSet)
    jrs.get = assert(t.get, "Must specify 'get' callback")
    jrs.exists = assert(t.exists, "Must specify 'exists' callback")
    jrs.put = assert(t.put, "Must specify 'put' callback")
    jrs.hash = assert(t.hash, "Must specify 'hash' callback")
    jrs.hash_len = t.hash_len or jrs.hash("foo"):len()
    jrs.branch_factor = t.branch_factor or DEFAULT_BRANCH_FACTOR
    -- bounds on the number of entries in a limb before it may / must break
    jrs.min_len = jrs.branch_factor / 4
    jrs.max_len = jrs.branch_factor * 4

    jrs._cache = setmetatable({}, {__mode="kv"})
    return jrs
end

-- Get the head hash of a jumprope, or UNKNOWN if not available.
function Jumprope:head()
    return self._headhash
end

-- Hash a data string.
function Jumprope:hash(data)
    return self._set.hash(data)
end

-- Is a hash evenly divisible by the branch factor?
function Jumprope:is_div(hash)
    local set = self._set
    return set:is_div(hash)
end

-- For a limb node string S, return an iterator of (hash, type, length) tuples.
-- Each line should have the format of e.g.
--     "da4b9237bacccdf19c0760cab7aec4a8359010b0 D 1\n".
-- The hash should be lowercase.
local function iter_hashes(s)
    assert(s, "no string")
    local line_pattern = "(%x+) ([LD]) (%d+)\n"
    return s:gmatch(line_pattern)
end

-- Get the total size of the jumprope's data (by summing the trunk's nodes).
-- Returns nil, "incomplete" while the head hash is still unknown. The
-- result is memoized in self._size; errors from the 'get' callback are
-- re-raised.
function Jumprope:size()
    if self._headhash == UNKNOWN then return nil, "incomplete" end

    if not self._size then
        -- fetch the trunk node and add up the length column of each entry
        local fetched, trunk = pcall(self._set.get, self._headhash)
        if not fetched then error(trunk, 0) end

        local total = 0
        for _, _, chunk_sz in iter_hashes(trunk) do
            total = total + chunk_sz
        end
        self._size = total
    end

    return self._size
end

-- Get the count of data nodes used in building the jumprope.
-- (mainly used for testing / benchmarking)
function Jumprope:count()
    local node_count = self._count
    return node_count
end

-- Append V to the array T.
local function push(t, v)
    local next_slot = #t + 1
    t[next_slot] = v
end


--------------
-- Creation --
--------------

-- Initialize a new jumprope structure (to be built from streamed data).
function JumpropeSet:new()
    return setmetatable({
        _set = self,
        _count = 0,             -- node count
        _headhash = UNKNOWN,    -- hash for head node
        _limb = {},             -- current limb
        _limb_size = 0,         -- data bytes within current limb
        _stack = {},            -- stack of limbs
        _level = 0,             -- current level
        _max_level = 0,         -- max level of trunk
    }, Jumprope)
end

-- Is the number formed by the trailing hex digits of HASH_STR (from
-- position hash_len - 2 onward) a multiple of the branch factor?
function JumpropeSet:is_div(hash_str)
    local tail = hash_str:sub(self.hash_len - 2)
    local remainder = tonumber(tail, 16) % self.branch_factor
    return remainder == 0
end

-- Start a fresh, empty current limb.
local function make_new_limb(self)
    self._limb, self._limb_size = {}, 0
end

-- Grow successive downward limbs until back at level 0.
local function descend_to_zero(self)
    for level = self._level, 1, -1 do
        log("growing downward to zero, @ %d", level)
        self._level = level - 1
        -- save the limb being left so it can be resumed later
        push(self._stack, {self._limb, self._limb_size})
        make_new_limb(self)
    end
end

-- Branch trunk up one level, saving current context, to be completed with
-- the hash of the rest of the jumprope.
local function branch_trunk_upward(self)
    local saved = {self._limb, self._limb_size}
    push(self._stack, saved)

    local level = self._level + 1
    self._max_level = self._max_level + 1
    self._level = level
    make_new_limb(self)
    log("branch_upward to level %d, %d / %d",
        level, level, self._max_level)

    descend_to_zero(self)
    return true
end

-- Append a "hash type length\n" line to the current limb.
-- Type is either "L" (metadata limb) or "D" (data).
-- DATA is the node's content, H its hash (computed from DATA if omitted),
-- and LIMB_LEN the byte count to record for an "L" entry. Bumps the node
-- count and adds the recorded length to the current limb's size.
local function append_hash(self, type, data, h, limb_len)
    local entries = self._limb
    if not h then h = self:hash(data) end

    local len
    if type == "L" and limb_len then
        len = limb_len
    else
        len = data:len()
    end
    assert(len, "no limb length provided")

    push(entries, fmt("%s %s %d\n", h, type, len))
    self._count = self._count + 1
    self._limb_size = self._limb_size + len
    log("append_hash %s, type %s, len %d", h, type, #entries)
end

-- Should the current addition also be a breaking point for the current limb?
-- True once the limb holds max_len entries, or holds at least min_len
-- entries and HASH is divisible by the branch factor.
local function should_break(self, limb, hash, bf)
    local set = self._set
    local count = #limb
    local divisible = self:is_div(hash, bf)

    local breaking = count >= set.max_len
    if not breaking then
        breaking = count >= set.min_len and divisible
    end

    log("%d, %s -> %s", count, tostring(divisible), tostring(breaking))
    return breaking
end

-- Terminate the current limb, popping back up one or more limb(s)
-- according to the hashes of the terminated limbs, then grow back
-- down to limb 0.
-- Returns true, or nil, "error" if storing a limb via the 'put' callback
-- fails (including a failure while terminating a parent limb).
local function terminate_branch(self)
    local cur_limb = concat(self._limb)
    local cur_limb_size = self._limb_size

    -- Store the finished limb under its own hash.
    local h = self:hash(cur_limb)
    local cfg = self._set
    local put = cfg.put
    local ok, err = pcall(put, h, cur_limb)
    if not ok then return nil, err end

    -- Resume the parent limb that was saved when this one was started.
    local pair = pop(self._stack)
    self._limb, self._limb_size = pair[1], pair[2]
    assert(self._limb_size)
    log("terminate_branch, level == %d / %d", self._level, self._max_level)
    assert(self._level < self._max_level)
    self._level = self._level + 1
    log("LIMB SIZE %d", self._limb_size)
    -- Link the finished limb into its parent, recording its data size.
    append_hash(self, "L", cur_limb, h, cur_limb_size)

    local is_trunk = self._level == self._max_level
    if should_break(self, self._limb, h, cfg.branch_factor) then
        log("-- breaking at %d, %s", self._level, tostring(is_trunk))
        if is_trunk then
            branch_trunk_upward(self)
        else
            -- Propagate a nested failure instead of dropping it; otherwise
            -- the level assertion below fires and hides the real error.
            ok, err = terminate_branch(self)
            if not ok then return nil, err end
        end
    else
        descend_to_zero(self)
    end
    assert(self._level == 0, "should end terminate_branch with level of 0")
    return true
end

-- Sink data into the jumprope, return true | nil, "error".
-- DATA is stored via the set's 'put' callback under its hash and then
-- appended to the level-0 limb as a "D" entry; if that entry is a break
-- point, the limb is closed (or the trunk grows by a level).
function Jumprope:sink(data)
    assert(data)
    local digest = self:hash(data)
    local set = self._set
    local stored, put_err = pcall(set.put, digest, data)
    if not stored then return nil, put_err end

    assert(self._level == 0, "Appending data at non-zero level")
    assert(self._limb)
    assert(self._limb_size)
    append_hash(self, "D", data, digest)

    log("sink %d / %d, %d",
        self._level, self._max_level, self._count)

    if not should_break(self, self._limb, digest, set.branch_factor) then
        return true
    end

    if self._level == self._max_level then
        -- trunk; push and increase trunk level
        return branch_trunk_upward(self)
    end
    -- branch; close branch and pop to previous
    return terminate_branch(self)
end

-- Close out the current limb.
-- Restores the parent limb from the stack, stores the closed limb via PUT
-- and links it into the parent as an "L" entry.
-- Returns true, or nil, "error" if PUT fails.
local function pop_limb(self, put)
    assert(#self._stack > 0)
    local closed = concat(self._limb)
    local parent = pop(self._stack)
    local closed_size = self._limb_size

    -- the parent limb becomes current and absorbs the closed limb's size
    self._limb = parent[1]
    self._limb_size = parent[2] + closed_size
    assert(self._limb_size)

    local digest = self:hash(closed)
    local stored, put_err = pcall(put, digest, closed)
    if not stored then return nil, put_err end

    self._count = self._count + 1
    push(self._limb, fmt("%s L %d\n", digest, closed_size))
    return true
end

-- EOF has been reached, close out the intermediate data structures
-- and return the head hash or nil, "error".
-- Also records the total data size (the sum of the lengths listed in the
-- root limb) in self._size. An empty jumprope is given a single
-- zero-length data node so that it still has a head.
function Jumprope:finish()
    local put = self._set.put

    -- Close every limb still on the stack, innermost first.
    while #self._stack > 0 do
        local ok, err = pop_limb(self, put)
        if not ok then return nil, err end
    end
    local root = concat(self._limb)

    -- `len` used to be assigned without `local`, leaking into the module
    -- table; the per-entry trunk table it filled was never read.
    local total_size = 0
    for _, _, len_str in iter_hashes(root) do
        total_size = total_size + tonumber(len_str)
    end

    -- It should have at least one node.
    if root == "" then
        local h = self:hash("")
        local ok, err = pcall(put, h, "")
        if not ok then return nil, err end
        self._count = 1
        root = fmt("%s D 0\n", h)
    end
    local head = self:hash(root)
    local ok, err = pcall(put, head, root)
    if not ok then return nil, err end

    -- Clear temporary data
    self._limb = nil
    self._stack = nil

    -- Save info about root of structure
    self._headhash = head
    self._size = total_size
    return head
end


---------------
-- Retrieval --
---------------

-- Create a handle to an existing jumprope with the head HEADHASH.
function JumpropeSet:open(headhash)
    assert(headhash, "no hash given")
    local jr = setmetatable({}, Jumprope)
    jr._headhash = headhash
    jr._set = self
    return jr
end

-- Do sanity checks, then get the portion of data[from:to] that falls
-- within from < s < to (zero-indexed).
-- CHUNK is data[offset:offset + chunk_sz].
-- (This is only exported for testing.)
-- OFFSET is the zero-based position of CHUNK within the whole data;
-- FROM and TO are zero-based offsets into the whole data as well.
-- Asserts that the chunk reaches FROM and starts before TO.
function within_span(chunk, offset, from, to, chunk_sz)
    -- of / ot: requested range relative to the start of this chunk
    local of, ot = from - offset, to - offset
    if of < 1 then of = 0 end           -- range starts at or before the chunk

    assert(offset + chunk_sz >= from, "offset + chunk_sz <= from")
    assert(offset < to, "offset >= to")

    local span = ot - of
    -- convert to the 1-based, inclusive indices string.sub expects
    local from, to = of + 1, of + span
    if to == math.huge then to = nil end    -- open-ended: to end of chunk
    return chunk:sub(from, to)
end

-- Get an iterator for the jumprope's data between the
-- byte offsets FROM < b < TO, which default to 0 and data:len().
-- Since the range ends may not coincide with a chunk boundary,
-- fetch and return subsets of chunks as necessary.
--
-- Unlike Lua, this is 0-indexed, i.e., ("blah"):stream(0,2) yields "bl".
--
-- Raises an error if the jumprope has no head hash yet, or if the 'get'
-- callback raises. The returned iterator yields data strings, and
-- nil (plus "done" at the natural end of the data) once exhausted.
function Jumprope:stream(from, to)
    from = from or 0
    to = to or math.huge
    if self._headhash == UNKNOWN then
        error("jumprope is not yet readable", 0)
    end
    local actual_get, cache = self._set.get, self._set._cache
    -- Lookup through the set's cache; nothing is written to the cache
    -- while the store below stays commented out.
    local get = function(hash)
        local v = cache[hash]
        if v then return v end
        v = actual_get(hash)
        -- FIXME: disable cache for now, it's
        -- not being collected properly.
        --cache[hash] = v
        return v
    end
    local ok, res = pcall(get, self._headhash)
    if not ok then error(res, 0) end

    -- offset: bytes of data passed so far
    -- stack: suspended iterators of parent limbs (set to nil once done)
    -- hash_iter: iterator over the entries of the limb being walked
    local offset, stack, hash_iter = 0, {}, iter_hashes(res)

    local iterator
    iterator = function()
        if not stack then return nil end    -- already DONE
        local hash, type, chunk_sz = hash_iter()
        local chunk

        if hash then                        -- got a chunk
            -- offset just past this entry's data
            local post = offset + chunk_sz

            -- print(string.format("* %s (%s), %d bytes, offset %d (%s - %s)",
            --           hash, type, chunk_sz, offset, from, to))
            if post < from then             -- skip chunk
                offset = offset + chunk_sz
                return iterator()
            elseif offset >= to then        -- done with iteration
                stack = nil
                return
            elseif type == "L" then         -- push stack and descend
                assert(offset < to or (offset <= from and post > from))
                push(stack, hash_iter)
                -- `ok` here is the upvalue declared above, reused
                ok, chunk = pcall(get, hash)
                if not ok then return error(chunk, 0) end
                hash_iter = iter_hashes(chunk)
                return iterator()
            elseif type == "D" then         -- yield some/all of data chunk
                ok, chunk = pcall(get, hash)
                if not ok then error(chunk, 0) end

                if offset > from and post < to then -- full yield
                    log("YIELDING CONTENT: %d", chunk:len())
                    offset = post
                    return chunk
                else                        -- partial yield
                    local part = within_span(chunk, offset, from, to, chunk_sz)
                    log("YIELDING PARTIAL CONTENT: %d", part:len())
                    offset = post
                    return part
                end
            else
                error("Bad type")
            end
        else
            if #stack == 0 then             -- EOF
                stack = nil
                return nil, "done"
            else                            -- pop limb stack and continue
                hash_iter = pop(stack)
                return iterator()
            end
        end
    end

    return iterator
end

--------------------------------------------------------------------------------
/tangram/main.lua:
-------------------------------------------------------------------------------- 1 | -- Copyright (c) 2012-2013, Scott Vokes 2 | -- 3 | -- All rights reserved. 4 | -- 5 | -- Redistribution and use in source and binary forms, with or without 6 | -- modification, are permitted provided that the following conditions 7 | -- are met: 8 | -- * Redistributions of source code must retain the above copyright 9 | -- notice, this list of conditions and the following disclaimer. 10 | -- * Redistributions in binary form must reproduce the above 11 | -- copyright notice, this list of conditions and the following 12 | -- disclaimer in the documentation and/or other materials 13 | -- provided with the distribution. 14 | -- * Neither the name of Scott Vokes nor the names of other 15 | -- contributors may be used to endorse or promote products 16 | -- derived from this software without specific prior written 17 | -- permission. 18 | -- 19 | -- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | -- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | -- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 22 | -- FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 23 | -- COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 24 | -- INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 25 | -- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 26 | -- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | -- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 | -- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 29 | -- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 | -- POSSIBILITY OF SUCH DAMAGE. 
module(..., package.seeall)

local usage

-- global switches
-- Each entry: l = description, f = handler(remaining_args, cfg).
local switches = {
    ['-v'] = {l="verbose", f=function(a,c) c.verbose = true end },
    ['-d'] = {l="dry_run", f=function(a,c) c.dry_run = true end },
    ['-s'] = {l="store path", f=function(a,c) c.base_path = table.remove(a, 1) end },
}

-- Sub-commands.
-- Each entry: l = description, f = handler(remaining_args, cfg),
-- o = optional summary of the command's own arguments.
cmds = {
    ['help'] = {l="print this message", f=function(a,c) usage() end },
    ['init'] = {l="initialize data store", f=tangram.cmds.cmd_init,
                o="-r RH_BITS -b BRANCH_FACTOR"},
    ['version'] = {l="print version",
                   f=function(a,c) print(DEFAULTS.version); os.exit(0) end },
    ['add'] = {l="add a file", f=tangram.cmds.cmd_add, o="PATH"},
    ['get'] = {l="get a file", f=tangram.cmds.cmd_get,
               o="-r RANGE NAME"},
    ['list'] = {l="list known files", f=tangram.cmds.cmd_list },
    ['test'] = {l="run tests", f=tangram.cmds.cmd_test },
    ['info'] = {l="get info", f=tangram.cmds.cmd_info, o="ID"},
    ['forget'] = {l="forget a file", f=tangram.cmds.cmd_forget, o="ID"},
    ['prop'] = {l="get/set property", f=tangram.cmds.cmd_prop},
    ['search'] = {l="search", f=tangram.cmds.cmd_search},
    ['gc'] = {l="collect garbage", f=tangram.cmds.cmd_gc },
}

-- Print the usage message (switches and commands) and exit with status 0.
function usage()
    local b = {}
    local A = function(...) b[#b+1] = string.format(...) end
    A("tangram: jumprope-based archiver by %s\n", DEFAULTS.author)
    A(" version %s\n", DEFAULTS.version)
    A("Usage: \n")
    A(" Arguments\n")
    for k,v in pairs(switches) do
        A(" %s: %s\n", k, v.l)
    end
    A(" Commands\n")
    for k,v in pairs(cmds) do
        A(" %s: %s\n", k, v.l)
    end
    io.write(table.concat(b))
    os.exit(0)
end

-- Consume leading switches and the command name from ARG (destructively)
-- and return the resulting config table. Whatever remains in ARG belongs
-- to the command. An unknown argument prints usage and exits.
local function proc_args(arg)
    local cfg = {}

    cfg.base_path = os.getenv("TANGRAM_PATH")

    while true do
        local a = table.remove(arg, 1)
        if not a then break end
        if cmds[a] then cfg.cmd = cmds[a]; break end
        local sf = switches[a]
        if not sf then print("Bad arg: ", a); usage() end
        sf.f(arg, cfg)
    end

    cfg.bits = cfg.bits or DEFAULTS.rh_bits
    cfg.base_path = cfg.base_path or DEFAULTS.base_path

    -- Ensure trailing "/" for base path.
    if cfg.base_path:sub(-1) ~= "/" then
        cfg.base_path = cfg.base_path .. "/"
    end

    return cfg
end

-- Entry point: run the command named in ARG, or print usage if ARG is empty.
function main(arg)
    if #arg <= 0 then
        usage()
    else
        local cfg = proc_args(arg)
        if cfg.cmd then
            cfg.cmd.f(arg, cfg)
        end
    end
end

--------------------------------------------------------------------------------
/tangram/test_db.lua:
--------------------------------------------------------------------------------
local db = tangram.db

module(..., package.seeall)

function test_db_creation()
    assert_true(db.init_db())
end

local exhash = "970318968feb640da723b8826861e41f0718a487"

-- Fresh in-memory database with default settings.
local function def_db()
    return assert(db.init_db())
end

function test_db_and_file_and_check()
    local db = def_db()
    local res, err = db:add_file(exhash, "bananas.txt", 23)
    assert_equal(1, res)
    local found
    for f in db:get_files() do
        if f.id == 1 then found = true end
    end
    assert_true(found)
end

function test_db_add_and_remove()
    local db = def_db()
    local res, err = db:add_file(exhash, "bananas.txt", 23)
    assert_equal(1, res)
    res, err = db:rm_file(1)
    assert_true(res)
    for f in db:get_files() do
        if f.id == 1 then fail("not deleted") end
    end
end

function test_db_hash_completions()
    local db = def_db()
    db:add_file(exhash, "bananas.txt", 23)
    local hashes = {}
    for h in db:get_hash_completions(exhash:sub(1,4)) do
        hashes[#hashes+1] = h.hash
    end
    assert_equal(exhash, hashes[1])
    -- one hash differing in its first four characters, one in its last four
    local hash_h = "ffff" .. exhash:sub(5, exhash:len())
    local hash_t = exhash:sub(1, exhash:len() - 4) .. "ffff"
    db:add_file(hash_h, "head.txt", 10)
    db:add_file(hash_t, "tail.txt", 20)

    hashes = {}
    for h in db:get_hash_completions(exhash:sub(1,4)) do
        hashes[#hashes+1] = h.hash
    end
    assert_equal(2, #hashes)
    table.sort(hashes)
    assert_equal(exhash, hashes[1])
    assert_equal(hash_t, hashes[2])
end

function test_add_property()
    local db = def_db()
    local id, err = db:add_file(exhash, "bananas.txt", 23)
    db:add_property(id, "version", "1")
    local props = db:get_properties(id)
    assert_equal("1", props.version)
end

function test_add_and_rm_property()
    local db = def_db()
    local id, err = db:add_file(exhash, "bananas.txt", 23)
    db:add_property(id, "version", "1")
    db:rm_property(id)
    local props = db:get_properties(id)
    assert_equal(nil, props.version)
end

function test_search_name()
    local db = def_db()
    local id, err = db:add_file(exhash, "bananas.txt", 23)

    for row in db:search_name("bananas") do
        if row.id == id then return end
    end
    fail("not found")
end

function test_search_hash()
    local db = def_db()
    local id, err = db:add_file(exhash, "bananas.txt", 23)

    for row in db:search_hash(exhash) do
        if row.id == id then return end
    end
    fail("not found")
end

function test_search_property()
    local db = def_db()
    local id, err = db:add_file(exhash, "bananas.txt", 23)
    db:add_property(id, "version", "1")

    for row in db:search_property("version") do
        if row.id == id then return end
    end
    fail("not found")
end

--------------------------------------------------------------------------------
/tangram/test_jumprope.lua:
--------------------------------------------------------------------------------
require "random"
local jumprope = tangram.jumprope

-- This loads either slncrypto (preferred) or luacrypto, which both
-- install as "crypto".
require "crypto"

local floor = math.floor

module(..., package.seeall)

-- Make a table of counts for each JumpropeSet; weak, so they can be GC'd.
-- The keys are the JumpropeSets and the values plain numbers, so it is the
-- keys that must be weak for an entry to ever be collected.
local counts = setmetatable({}, {__mode="k"})

local sha1                      -- string->sha1 function
if crypto.sha1 ~= nil then      -- prefer slncrypto
    sha1 = function(s) return crypto.sha1(s):lower() end
elseif crypto.digest ~= nil then -- luacrypto
    -- (this used to hash the undefined global `data` instead of `s`)
    sha1 = function(s) return crypto.digest("sha1", s) end
end

-- Build a JumpropeSet backed by an in-memory table, with branch factor BF
-- (default 64). Also returns a counter function: called with no argument
-- it returns the number of distinct hashes stored so far.
function in_mem_JumpropeSet(bf)
    bf = bf or 64
    local store = {}
    local count_fun
    local function get(hash)
        local v = store[hash]
        --print("GET", hash, v and v:len() or "nil")
        if v then return v else
            error("unknown hash: " .. hash)
        end
    end

    local function put(hash, data)
        assert(data, "no data")
        --print("PUT", hash, data:len())
        if not store[hash] then count_fun(true) end
        store[hash] = data
        return true
    end

    local function exists(hash)
        return store[hash] ~= nil
    end

    local jrs = jumprope.init {get=get, put=put, exists=exists, hash=sha1,
                               branch_factor=bf}
    count_fun = function(n)
        local cur = (counts[jrs] or 0)
        if n then counts[jrs] = cur + 1 end
        return cur
    end
    return jrs, count_fun
end

local concat = table.concat
local char = string.char

-- Build a string of SZ pseudo-random bytes from SEED (default 1).
function mk_random_string(sz, seed)
    seed = seed or 1
    local r = random.new()
    r:seed(seed)
    local buf = {}
    for i=1,sz do
        buf[i] = char(r:value(256) - 1)
    end
    return concat(buf)
end

function test_two_empty_JRs_should_have_the_same_head_hash()
    local js = in_mem_JumpropeSet()
    local j1, j2 = js:new(), js:new()
    -- Add the empty string to #1 and finish it.
    assert(j1:sink(""))
    assert(j1:finish())

    -- Just finish #2 with it empty.
    assert(j2:finish())

    assert_true(j1:head(), "head hash should exist")
    assert_equal(j1:head(), j2:head(), "head hashes should match")
end

function test_empty_JRs_should_have_one_node()
    local js = in_mem_JumpropeSet()
    local j = js:new()

    assert(j:sink(""))
    assert(j:finish())
    assert_equal(1, j:count())
end

function test_two_JRs_with_the_same_single_string_should_have_the_same_hash()
    local js = in_mem_JumpropeSet()
    local j1, j2 = js:new(), js:new()
    local s = "brevity is the soul of wit"

    assert(j1:sink(s))
    assert(j1:finish())

    assert(j2:sink(s))
    assert(j2:finish())

    assert_true(j1:head(), "should exist")
    assert_equal(j1:head(), j2:head(), "head hashes should match")
end

function test_two_JRs_with_the_same_set_of_strings_should_have_the_same_hash()
    local js = in_mem_JumpropeSet()
    local j1, j2 = js:new(), js:new()
    local s = "brevity is the soul of wit"

    for c in s:gmatch("(.)") do assert(j1:sink(c)) end
    assert(j1:finish())

    for c in s:gmatch("(.)") do assert(j2:sink(c)) end
    assert(j2:finish())

    assert_true(j1:head(), "should exist")
    assert_equal(j1:head(), j2:head(), "head hashes should match")
end

-- Iterate over S in consecutive pieces of CHUNK_SIZE bytes (the last
-- piece may be shorter).
function iter_str(s, chunk_size)
    local i, len = 1, s:len()
    return function ()
        if i > len then return nil end
        local chunk = s:sub(i, i + chunk_size - 1)
        i = i + chunk_size
        return chunk
    end
end

function test_test_two_JRs_with_the_same_large_string_should_have_the_same_hash()
    local js = in_mem_JumpropeSet()
    local j1, j2 = js:new(), js:new()

    -- 1 MB string of random binary data
    local s = mk_random_string(1024 * 1024, 23)

    -- add in 1kb chunks
    for chunk in iter_str(s, 1024) do
        j1:sink(chunk)
        j2:sink(chunk)
    end

    j1:finish()
    j2:finish()

    assert_equal(j1:head(), j2:head())
end

function test_two_JRs_with_the_same_string_should_add_few_new_nodes_when_changed()
    local js, count_fun = in_mem_JumpropeSet()
    local j1, j2 = js:new(), js:new()

    -- 1 MB string of random binary data
    local s = mk_random_string(1024 * 1024, 23)

    -- add in 1kb chunks
    for chunk in iter_str(s, 1024) do
        j1:sink(chunk)
    end
    local ok, err = j1:finish()
    assert(ok, err)

    local pre_count = count_fun()

    -- same data again, but with the 100th chunk replaced
    local i = 0
    for chunk in iter_str(s, 1024) do
        i = i + 1
        if i == 100 then
            j2:sink(("x"):rep(1024))
        else
            j2:sink(chunk)
        end
    end

    assert(j2:finish())

    local post_count = count_fun()

    assert_not_equal(j1:head(), j2:head(), "head hashes should not match")
    assert_lte(0.01 * pre_count, post_count - pre_count)
end

-- Test that finish -> pop_limb computes limb size correctly
function test_sink_100_one_byte_chunks_and_total_length()
    local js = in_mem_JumpropeSet()
    local j = js:new()
    local lim = 100

    -- add "0" .. "9" over and over
    for i=0,lim - 1 do
        local chunk = tostring(i % 10)
        j:sink(chunk)
    end
    assert(j:finish())

    assert_equal(lim, j:size(), "size should match")
end

-- Test that terminate_branch computes limb size correctly
function test_sink_1000_one_byte_chunks_and_total_length()
    local js = in_mem_JumpropeSet()
    local j = js:new()
    local lim = 1000

    -- add "0" .. "9" over and over
    for i=0,lim - 1 do
        local chunk = tostring(i % 10)
        j:sink(chunk)
    end
    assert(j:finish())

    assert_equal(lim, j:size(), "size should match")
end

function test_put_failures_should_be_passed_to_user()
    local count = 5
    local function nop() end
    -- 'put' callback that raises on its fifth call
    local function put(hash, data)
        count = count - 1
        if count == 0 then error("fail", 0) end
        return true
    end

    local jrs = jumprope.init({put=put, get=nop, exists=nop, hash=sha1})
    local jr = jrs:new()
    for i=1,5 do
        local ok, err = jr:sink("blah")
        if i == 5 then
            assert_nil(ok, "should fail")
            assert_equal("fail", err, "should get error message")
        else
            assert(ok)
        end
    end
end

function test_within_span()
    local s = "abcdefghijklmnopqrstuvwxyz"
    -- Check the span of S[offset+1 .. offset+6] lying within FROM..TO.
    local function ws(exp, offset, from, to)
        assert_equal(exp, jumprope.within_span(s:sub(offset+1, offset+1+5),
                                               offset, from, to, 5))
    end

    ws("a", 0, 0, 1)
    ws("b", 0, 1, 2)
    ws("b", 1, 1, 2)
    ws("cdef", 1, 2, 6)
    ws("yz", 24, 24, 26)
    ws("z", 25, 25, 26)
end

-- Compare strings, but in a way that makes off-by-ones obvious, rather than
-- printing "got (VERY LONG STRING), expected (OTHER VERY LONG STRING)".
-- REJOINED is the string produced by the code under test, EXPECTED the
-- reference value. Cheap checks on both ends and the length run first; the
-- full comparison runs last.
local function off_by_one_check(rejoined, expected)
    assert_equal(expected:sub(1, 2), rejoined:sub(1, 2), "first 2 chars should match")
    assert_equal(expected:sub(-2), rejoined:sub(-2), "last 2 chars should match")
    assert_equal(expected:len(), rejoined:len(), "sizes should match")
    assert_true(rejoined == expected, "should equal input")
end

-- Store S in a fresh in-memory jumprope in CHUNK_SZ-byte pieces, stream the
-- span (FROM, TO) back out and compare it with s:sub(from + 1, to); that is,
-- FROM is a 0-based start offset and TO is an exclusive 0-based end offset.
function check_it(s, chunk_sz, from, to)
    local js = in_mem_JumpropeSet()
    local j = js:new()

    -- add in chunk_sz pieces; stop at once if the store rejects a chunk
    for chunk in iter_str(s, chunk_sz) do
        -- print("< ", chunk)
        assert(j:sink(chunk))
    end
    assert(j:finish())
    assert_equal(s:len(), j:size(), "j:size() is incorrect")

    local buf = {}
    local iter = assert(j:stream(from, to))

    for chunk in iter do
        buf[#buf+1] = chunk
        -- print(">", chunk)
    end

    local expected = s:sub(from + 1, to)
    local rejoined = concat(buf)

    off_by_one_check(rejoined, expected)
end

function test_get_content_from_part_of_small_string()
    local s = "abcdefghijklmnopqrstuvwxyz"
    -- Spans deliberately run past the end of s for the larger start offsets.
    for chunk_sz=2,4 do
        for start=0,25 do
            for len=1,6 do
                --print("\n### CSL ", chunk_sz, start, len)
                check_it(s, chunk_sz, start, start + len)
            end
        end
    end
end

function test_a_jumprope_iterator_should_return_the_same_content_as_the_original_input()
    -- 1 MB string of random binary data
    local s = mk_random_string(1024 * 1024, 23)
    -- `to` is exclusive, so the full length is needed to cover the last byte.
    check_it(s, 63, 0, s:len())
end

function test_get_content_from_halfway_to_the_end()
    -- 1 MB string of random binary data
    local sz = 1024 * 1024
    local s = mk_random_string(sz, 23)
    local start = floor(sz/2)
    -- `to` is exclusive, so pass sz to really read through the final byte.
    check_it(s, 63, start, sz)
end

function test_get_the_first_half_of_the_content()
    -- 1 MB string of random binary data
    local sz = 1024 * 1024
    local s = mk_random_string(sz, 23)
    check_it(s, 63, 0, floor(sz/2))
end

function test_open_existing_jumprope()
    local sz = 1024 * 1024
    local s = mk_random_string(sz, 27)

    local chunk_sz = 999
    local from, to = 0, sz

    local js = in_mem_JumpropeSet()
    local j = js:new()

    -- add in chunk_sz pieces
    for chunk in iter_str(s, chunk_sz) do
        -- print("< ", chunk)
        assert(j:sink(chunk))
    end

    local head = j:finish()
    assert_true(head, "Didn't return headhash")

    -- Re-open the stored jumprope through the value returned by finish().
    local j2 = js:open(head)

    local buf = {}

    local iter = assert(j2:stream(from, to))

    for chunk in iter do
        buf[#buf+1] = chunk
    end

    -- check content
    local rejoined = concat(buf)
    off_by_one_check(rejoined, s)

    -- check size
    assert_equal(j:size(), j2:size(), "Size doesn't match")
end

--------------------------------------------------------------------------------