├── Formula
    ├── hashchop.rb
    └── tangram.rb
├── LICENSE
├── README.md
├── tangram-0.1-1.rockspec
├── tangram.lua
└── tangram
    ├── cmds.lua
    ├── db.lua
    ├── defaults.lua
    ├── init.lua
    ├── jumprope.lua
    ├── main.lua
    ├── test_db.lua
    └── test_jumprope.lua


/Formula/hashchop.rb:
--------------------------------------------------------------------------------
 1 | require 'formula'
 2 | 
 3 | class Hashchop < Formula
 4 |   homepage 'https://github.com/silentbicycle/hashchop'
 5 |   url 'https://github.com/silentbicycle/hashchop/archive/master.tar.gz'
 6 |   sha1 '3452e20fb41e5a0f04a09b69e9978587030dfd75'
 7 |   version '0.8-0'
 8 | 
 9 |   depends_on 'lua'
10 |   depends_on 'luarocks'
11 | 
12 |   def install
13 |     system 'luarocks install hashchop-0.8-0.rockspec'
14 |   end
15 |   
16 | end
17 | 


--------------------------------------------------------------------------------
/Formula/tangram.rb:
--------------------------------------------------------------------------------
 1 | require 'formula'
 2 | 
 3 | class Tangram < Formula
 4 |   homepage 'https://github.com/silentbicycle/tangram'
 5 |   url 'https://github.com/silentbicycle/tangram/archive/v0.1-1.tar.gz'
 6 |   sha1 '34bd7022d0faad96145cd219872ed9d46b3598bb'
 7 |   version '0.1-1'
 8 | 
 9 |   depends_on 'lua'
10 |   depends_on 'luarocks'
11 |   depends_on 'hashchop'
12 | 
13 |   def install
14 |     system 'luarocks install tangram-0.1-1.rockspec'
15 |   end
16 |   def test
17 |     system 'tangram test'
18 |   end
19 | end
20 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2012-2013, Scott Vokes <vokes.s@gmail.com>
 2 | 
 3 | All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions
 7 | are met:
 8 | 
 9 |     * Redistributions of source code must retain the above copyright
10 |       notice, this list of conditions and the following disclaimer.
11 | 
12 |     * Redistributions in binary form must reproduce the above copyright
13 |       notice, this list of conditions and the following disclaimer in
14 |       the documentation and/or other materials provided with the
15 |       distribution.
16 | 
17 |     * Neither the name of Scott Vokes nor the names of other
18 |       contributors may be used to endorse or promote products derived
19 |       from this software without specific prior written permission.
20 |  
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
22 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
25 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | tangram: a Jumprope-based content store.
  2 | 
  3 | 
  4 | # Overview
  5 | 
  6 | This is a standalone content store, somewhat like the .git directory
  7 | that git uses for internal storage. However, while git is best suited to
  8 | storing versioned collections of relatively small, diff-able files,
  9 | tangram is best at storing large files.
 10 | 
 11 | It is based on the Jumprope, a data structure I invented. The Jumprope
 12 | is a kind of tree of arrays of data chunks, whose overall shape is
 13 | derived from the data itself -- duplicated sections of files coalesce
 14 | together to branches that are automatically shared, and identical files
 15 | end up with the same overall identifier.
 16 | 
 17 | (This is a central component of scatterbrain, a distributed filesystem
 18 | I've been working on, but also useful on its own. Since Jumpropes use
 19 | [content-addressable storage][CAS] and all data is immutable, it doesn't
 20 | really matter where the data is located -- scatterbrain mirrors the data
 21 | over a somewhat Dynamo-like distributed hash table, which periodically
 22 | checks that all live content is mirrored in a sufficient number of
 23 | nodes. I'm still working on the network logic, though, and it will be a
 24 | separate project)
 25 | 
 26 | [CAS]: http://en.wikipedia.org/wiki/Content-addressable_storage
 27 | 
 28 | 
 29 | ## Example Use Cases
 30 | 
 31 |  * Storing many variants of genetic data
 32 |  * Storing design / multimedia assets
 33 |  * Backing up lots of incremental virtual machine snapshots
 34 | 
 35 | 
 36 | ## Features
 37 | 
 38 |  * Automatic de-duplication of content
 39 |  * Automatic detection of identical files
 40 |  * A tagging / property system for saving and searching by file metadata
 41 |  * High throughput (e.g. HD video pipes to mplayer w/ out skips)
 42 | 
 43 | 
 44 | # License
 45 | 
 46 | This is released under a 3-clause BSD license. Be nice.
 47 | 
 48 | 
 49 | # Current Status
 50 | 
 51 | The system works, but the command-line interface and installation
 52 | process are still evolving, and a bit rough around the edges.
 53 | (Thanks, early adopters. Constructive feedback is appreciated.)
 54 | 
 55 | I have tested it on Linux, OpenBSD, and OSX.
 56 | 
 57 | I *haven't* tested it on Windows yet, but it shouldn't take major
 58 | effort to port - there isn't anything OS-specific besides the process
 59 | to create a native Lua extension and some default paths.
 60 | 
 61 | 
 62 | # Installation
 63 | 
 64 | The installation process should eventually be replaced by
 65 | `brew install tangram`, `apt-get install tangram`, `pkg_add tangram`,
 66 | and the like, but it's still pretty manual.
 67 | 
 68 | 
 69 | ## Dependencies
 70 | 
 71 |  All Lua dependencies are available via [LuaRocks](http://luarocks.org).
 72 | 
 73 |  * Lua (http://lua.org)
 74 |  * SQLite3 (http://sqlite.org)
 75 |  * A C compiler
 76 |  * libhashchop and its Lua wrapper (http://github.com/silentbicycle/hashchop/)
 77 |  * luafilesystem
 78 |  * slncrypto (for SHA1 hashing)
 79 |  * zlib and its lua wrapper
 80 |  * SQLite3's lua wrapper
 81 |  * lunatest (for testing)
 82 | 
 83 | 
 84 | ## How to Install
 85 | 
 86 |  * Install [Lua](http://lua.org). 
 87 |  * Install [SQLite3](http://sqlite.org), if you don't have like
 88 |    a dozen copies of it lying around already.
 89 |  * Install [LuaRocks](http://luarocks.org), the de facto standard packaging
 90 |    system for Lua. (If you don't want to use LuaRocks, install the other
 91 |    Lua dependencies yourself.)
 92 |  * Use LuaRocks to install the `slncrypto`, `zlib`, `luafilesystem`,
 93 |    `lsqlite3`, and `lunatest` packages.
 94 |    Type e.g. `luarocks install slncrypto`.
 95 |  * Download [libhashchop](https://github.com/silentbicycle/hashchop),
 96 |    build it, and then build and install the lua wrapper with `make lua`
 97 |    and `make lua-install`. Or, if you want to do it by hand, copy the
 98 |    dynamic library to wherever Lua puts its native extensions on your
 99 |    system. (To figure this out, you can fire up the Lua REPL and type
100 |    `=package.cpath`. On Unix-like OSs, it's typically something like
101 |    `/usr/local/lib/lua/5.1/`.) You may need to modify the paths in
102 |    the makefile, if your OS puts Lua's headers/library somewhere odd.
103 |  * Copy the `tangram` subdirectory into Lua's package path (typically
104 |    "/usr/local/share/lua/5.1/", check `package.path`), so that the
105 |    tangram.* packages can be loaded.
106 |  * Copy the tangram.lua script into your path somewhere.
107 |  * Run `tangram.lua test`.
108 | 
109 | ## Example usage
110 | 
111 |     $ tangram.lua init            # create a content store w/ default settings
112 |     $ tangram.lua add foo.bar     # add a file to the store
113 |     $ cmd | tangram.lua add -     # add to the store from stdin
114 |     $ tangram.lua list            # list known files
115 |     $ tangram.lua get 1           # get file with ID #1, print to stdout
116 |     $ tangram.lua get 1 foo.baz   # get file with ID #1, save to foo.baz
117 | 
118 | 
119 | # Options
120 | 
121 | All commands take the following arguments (which should appear *before*
122 | the command name):
123 | 
124 |  * -d: dry run, don't write to disk
125 |  * -v: verbose
126 |  * -s PATH: use custom store path instead of default
127 | 
128 | 
129 | # Commands
130 | 
131 | ## help: print help message
132 | 
133 | Print help.
134 | 
135 | ## version: print version
136 | 
137 | Print the version.
138 | 
139 | ## init: initialize data store
140 | 
141 | Initialize a data store.
142 | 
143 |     tangram init [-b BITS] [-f BRANCH_FACTOR]
144 | 
145 | Arguments:
146 | 
147 |  * -b BITS - Set number of bits for rolling hash bitmask (determines chunk size)
148 |  * -f BF - Set branching factor (determines average Jumprope limb length)
149 | 
150 | ## get: get a file
151 | 
152 | Get file content from the store.
153 | 
154 |     tangram get [-f | -h] KEY [OUT_FILE]
155 |     
156 | -f or -h specify that the key is a file ID (-f) or hash (-h), otherwise
157 | it will try to infer the right thing. If OUT_FILE is given, it will save
158 | the content to that file, otherwise it will print to stdout.
159 | 
160 | ## add: add a file
161 | 
162 | Add a file to the store.
163 | 
164 |     tangram add [-n NAME] [FILENAME or -]
165 | 
166 | Arguments:
167 | 
168 |  * -n NAME - Store input file as NAME.
169 | 
170 | ## info: get info
171 | 
172 | Print metadata about a file.
173 | 
174 |     tangram info ID
175 |     
176 | TODO: the info command (without an ID) should print info about the store config
177 | 
178 | ## list: list known files
179 | 
180 | Print basic info about all stored files.
181 | 
182 | ## prop: get/set property
183 | 
184 | Get / set a property on a file. These properties don't have any internal
185 | meaning, but exist as a hook to track content metadata.
186 | 
187 |     tangram prop add ID KEY
188 |     tangram prop add ID KEY VALUE
189 |     tangram prop del ID
190 |     tangram prop del ID KEY
191 | 
192 | ## search: search by name or property
193 | 
194 | Search by name or property.
195 | 
196 |     tangram search prop KEYNAME
197 |     tangram search prop KEYNAME VALUENAME
198 |     tangram search name PATTERN
199 | 
200 | ## forget: stop tracking a file
201 | 
202 | Stop tracking a file. To actually remove content from the store, use the
203 | GC command.
204 | 
205 |     tangram forget ID
206 | 
207 | ## gc: remove inaccessible content from store
208 | 
209 | When files are forgotten, their storage is not automatically reclaimed,
210 | since some of it may be shared by other files. This checks the liveness
211 | of data chunks in the store and deletes any that are no longer referenced.
212 | 
213 | ## test: run tests
214 | 
215 | Run unit tests. (Requires lunatest.)
216 | 
217 | 
218 | # Future Developments
219 | 
220 |  * Better documentation of the Jumprope data structure. Its reference
221 |    implementation is included, and (IMHO) commented well, but there are
222 |    some subtleties. In the meantime, [my StrangeLoop talk][talk]
223 |    includes an attempt to convey my intuitions about how it works.
224 | 
225 | [talk]: http://www.infoq.com/presentations/Data-Structures
226 | 
227 |  * Retrieving specific byte-ranges of content. The Jumprope library
228 |    supports it, but it isn't part of the CLI yet.
229 | 
230 |  * While there is currently no interface for it, the Jumprope has the
231 |    necessary metadata to accelerate diff-ing of very large files.
232 |    (It automatically identifies large subsets of the files that are
233 |    known to be identical and can be skipped.)
234 |    
235 |  * There isn't any attempt to take advantage of the Jumprope's
236 |    embarrassingly parallelizable retrieval. Scatterbrain uses async IO to
237 |    spread reads over the network, and to maintain an arbitrarily large
238 |    look-ahead buffer for the streaming data, but this doesn't bother
239 |    with that: it would only complicate things and lead to more
240 |    disk contention. There may be advantages in taking advantage of
241 |    parallelism by different means, though.
242 | 
243 | 
244 | # Acknowledgements
245 | 
246 | Thanks to everyone who has given me feedback along the way, particularly
247 | Mike English and Jessica Kerr.
248 | 


--------------------------------------------------------------------------------
/tangram-0.1-1.rockspec:
--------------------------------------------------------------------------------
 1 | package = "tangram"
 2 | version = "0.1-1"
 3 | source = {
 4 |     url = "git://github.com/silentbicycle/tangram.git",  -- NOTE(review): confirm the host still serves the git:// protocol
 5 |     tag = "v0.1-1",
 6 |     file = "tangram-0.1-1.tar.gz",
 7 |     dir = "tangram",
 8 | }
 9 | description = {
10 |     summary    = "A Jumprope-based content store",
11 |     detailed   = [[
12 | This is a standalone content store, somewhat like the .git directory
13 | that git uses for internal storage. However, while git is best suited to
14 | storing versioned collections of relatively small, diff-able files,
15 | tangram is best at storing large files.
16 |             
17 | It is based on the Jumprope, a data structure I invented. The Jumprope
18 | is a kind of tree of arrays of data chunks, whose overall shape is
19 | derived from the data itself -- duplicated sections of files coalesce
20 | together to branches that are automatically shared, and identical files
21 | end up with the same overall identifier.
22 | ]],
23 | license    = "BSD",
24 | homepage   = "https://github.com/silentbicycle/tangram/",  -- full URL, with scheme
25 | maintainer = "Scott Vokes (vokes.s@gmail.com)",
26 | }
27 | dependencies = {
28 |     "lua >= 5.1",
29 |     "hashchop >= 0.8-0",
30 |     "slncrypto >= 1.1-1",
31 |     "lzlib >= 0.3-3",
32 |     "luafilesystem >= 1.6.2-1",
33 |     "lsqlite3 >= 0.8-1",
34 |     "lunatest >= 0.9.1-1",
35 |     "lrandom >= 20101118-1",
36 | }
37 | build = {
38 |     type = "none",   -- nothing to compile: the files below are only copied
39 |     install = {
40 |         bin = { ["tangram"] = "tangram.lua"},   -- installs the launcher script under the name "tangram"
41 |         lua = {
42 |             ['tangram.cmds'] = "tangram/cmds.lua",
43 |             ['tangram.db'] = "tangram/db.lua",
44 |             ['tangram.defaults'] = "tangram/defaults.lua",
45 |             ['tangram.init'] = "tangram/init.lua",
46 |             ['tangram.jumprope'] = "tangram/jumprope.lua",
47 |             ['tangram.main'] = "tangram/main.lua",
48 |             ['tangram.test_db'] = "tangram/test_db.lua",
49 |             ['tangram.test_jumprope'] = "tangram/test_jumprope.lua",
50 |         },
51 |     }
52 | }
53 | 


--------------------------------------------------------------------------------
/tangram.lua:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env lua
 2 | -- Copyright (c) 2012-2013, Scott Vokes <vokes.s@gmail.com>
 3 | --
 4 | -- All rights reserved.
 5 | -- 
 6 | -- Redistribution and use in source and binary forms, with or without
 7 | -- modification, are permitted provided that the following conditions
 8 | -- are met:
 9 | --     * Redistributions of source code must retain the above copyright
10 | --       notice, this list of conditions and the following disclaimer.
11 | --     * Redistributions in binary form must reproduce the above
12 | --       copyright notice, this list of conditions and the following
13 | --       disclaimer in the documentation and/or other materials
14 | --       provided with the distribution.
15 | --     * Neither the name of Scott Vokes nor the names of other
16 | --       contributors may be used to endorse or promote products
17 | --       derived from this software without specific prior written
18 | --       permission.
19 | -- 
20 | -- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | -- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | -- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23 | -- FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24 | -- COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25 | -- INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26 | -- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 | -- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28 | -- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 | -- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30 | -- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 | -- POSSIBILITY OF SUCH DAMAGE.
32 | 
33 | -- Enable the LuaRocks package loader first, if available, so the requires below can find rock-installed modules.
34 | pcall(require, "luarocks.loader")   -- pcall: LuaRocks is optional, a missing loader is not an error
35 | 
36 | require "tangram.init"
37 | 
38 | tangram.main.main(arg)              -- hand the command-line arguments to the CLI entry point
39 | 


--------------------------------------------------------------------------------
/tangram/cmds.lua:
--------------------------------------------------------------------------------
  1 | require "hashchop"
  2 | require "lfs"
  3 | 
  4 | -- This loads either slncrypto (preferred) or luacrypto, which both
  5 | -- install as "crypto".
  6 | require "crypto"
  7 | 
  8 | require "tangram.db"
  9 | local jumprope = require "tangram.jumprope"
 10 | 
 11 | -- Use zlib for compression, if Lua wrapper is available.
 12 | local compress, decompress                 -- both stay nil without zlib; the store callbacks check before calling
 13 | local ok, zlib = pcall(require, "zlib")    -- pcall: a missing wrapper must not abort loading this module
 14 | 
 15 | if ok and zlib then
 16 |     compress = zlib.compress
 17 |     decompress = zlib.decompress
 18 | end
 19 | 
 20 | module(..., package.seeall)
 21 | 
 22 | --local function log(...) print(...) end   -- debug variant: swap with the line below to trace to stdout
 23 | local function log(...) end                -- no-op logger: accepts any arguments and discards them
 24 | local function printf(...) local text = string.format(...); io.write(text) end   -- format, then write to the default output
 25 | 
 26 | local usage = {}                      -- command name -> array of usage lines, filled in next to each command
 27 | 
 28 | local function print_usage(cmdname)   -- print every usage line registered for cmdname...
 29 |     for _,row in ipairs(usage[cmdname]) do print(row) end
 30 |     os.exit(1)                        -- ...then terminate the process with status 1 (never returns)
 31 | end
 32 | 
 33 | local function file_exists(path)   -- true when lfs.attributes yields a non-nil result for path
 34 |     local attrs = lfs.attributes(path); return attrs ~= nil
 35 | end
 36 | 
 37 | local function mkdir_if_nonexistent(path)
 38 |     -- Create directory `path` unless something already exists there; raise if mkdir fails.
 39 |     if file_exists(path) then return end
 40 |     assert(lfs.mkdir(path))
 41 | end
 42 | 
 43 | local sha1                         -- string->sha1 function; stays nil if neither API below is present
 44 | if crypto.sha1 ~= nil then         -- prefer slncrypto
 45 |     sha1 = function(s) return crypto.sha1(s):lower() end
 46 | elseif crypto.digest ~= nil then   -- luacrypto
 47 |     sha1 = function(s) return crypto.digest("sha1", s) end   -- hash the argument s (was an undefined global `data`)
 48 | end
 49 | 
 50 | local function db_path(cfg)   -- database file path: "db.sql" appended to cfg.base_path
 51 |     local base = cfg.base_path; return base .. "db.sql"
 52 | end
 53 | 
 54 | local function pop(t) return table.remove(t, 1) end   -- remove the first element of t and return whatever table.remove returns
 55 | 
 56 | -- Create callbacks jumprope expects for disk I/O: returns {get, put, exists, delete, iter_store}.
 57 | local function init_callbacks(cfg)
 58 |     local store_path = cfg.base_path .. "store"
 59 | 
 60 |     local function hash_fn(hash)   -- hash -> full chunk path, its directory, and the file name part
 61 |         local base = store_path
 62 |         local head, rest = hash:match("(%w%w%w)(%w+)")   -- first 3 characters name the subdirectory
 63 |         local fullpath = table.concat{base, "/", head, "/", rest}
 64 |         local basedir = table.concat{base, "/", head}
 65 |         return fullpath, basedir, rest
 66 |     end
 67 | 
 68 |     local function store_coro()
 69 |         local base = store_path
 70 |         -- Walk every content store and yield every SHA1-hash-named file 
 71 |         for dir in lfs.dir(base) do
 72 |             if dir:match("^%x+") then
 73 |                 for tail in lfs.dir(string.format("%s/%s", base, dir)) do
 74 |                     if tail:match("^%x+") then
 75 |                         local h = table.concat{dir, tail}   -- directory + file name = the hash hash_fn split
 76 |                         coroutine.yield(h)
 77 |                     end
 78 |                 end
 79 |             end
 80 |         end
 81 |     end
 82 | 
 83 |     local iter_store = function() return coroutine.wrap(store_coro) end   -- fresh iterator per call
 84 | 
 85 |     local function get(hash)   -- read one chunk (decompressed if zlib is loaded); raises if it cannot be opened
 86 |         assert(hash, "no hash given")
 87 |         local path = hash_fn(hash)
 88 |         local f = assert(io.open(path, "rb"))   -- binary mode: stored bytes may be compressed data
 89 |         local data = f:read("*a")
 90 |         f:close()                               -- release the handle before decompress gets a chance to raise
 91 |         if decompress then data = decompress(data) end
 92 |         log("GET ", hash, data:len())
 93 |         return data
 94 |     end
 95 | 
 96 |     local function exists(hash)   -- is there already a chunk file for this hash?
 97 |         assert(hash, "no hash given")
 98 |         local path = hash_fn(hash)
 99 |         return file_exists(path)
100 |     end
101 | 
102 |     local function put(hash, content)   -- store one chunk; no-op on dry runs or when the chunk already exists
103 |         if cfg.dry_run then return end
104 |         assert(hash, "no hash given")
105 |         log("SAVE ", hash, " => ", content:len())
106 |         local path, basedir = hash_fn(hash)
107 |         if not file_exists(basedir) then
108 |             assert(lfs.mkdir(basedir))
109 |         end
110 |         if file_exists(path) then return end
111 |         if compress then content = compress(content) end   -- before open: a failure must not leave an empty chunk file
112 |         local f = assert(io.open(path, "wb"))
113 |         assert(f:write(content))                           -- surface write errors instead of ignoring them
114 |         f:close()
115 |     end
116 | 
117 |     local function delete(hash)   -- remove the chunk file for hash; raises when os.remove fails
118 |         local path = hash_fn(hash)
119 |         assert(os.remove(path), "Unable to delete: " .. tostring(path))
120 |     end
121 | 
122 |     return {get=get, put=put, exists=exists, delete=delete,
123 |             iter_store=iter_store}
124 | end
125 | 
126 | usage["init"] = {   -- lines printed verbatim by print_usage("init")
127 |     "Usage for 'init' command:",
128 |     "init [-b RH_BITS] [-f BRANCH_FACTOR]",
129 |     "  RH_BITS: Bits for rolling hash bitmask (average chunk size ~ 2^RH_BITS).",
130 |     "  BRANCH_FACTOR: Each Jumprope limb has a 1:BF chance of terminating.",
131 | }
132 | 
133 | function cmd_init(arg, cfg)   -- create the store directories and the db; options: -b RH_BITS, -f BRANCH_FACTOR
134 |     local path = cfg.base_path or DEFAULTS.base_path
135 |     local store_path = path .. "store"
136 | 
137 |     mkdir_if_nonexistent(path)
138 |     mkdir_if_nonexistent(store_path)
139 | 
140 |     local opts = {
141 |         path = db_path{base_path = path},   -- same resolved base as the directories above (cfg.base_path may be nil)
142 |         rh_bits = nil,                      -- set below only when -b is given
143 |         branch_factor = nil,                -- set below only when -f is given
144 |     }
145 | 
146 |     while true do               -- consume options from the front of arg until none are left
147 |         if arg[1] == '-b' then  -- rh bits
148 |             pop(arg)
149 |             local b = pop(arg)
150 |             b = assert(tonumber(b), "Invalid spec for RH bits")
151 |             opts.rh_bits = b
152 |         elseif arg[1] == '-f' then
153 |             pop(arg)
154 |             local f = pop(arg)
155 |             f = assert(tonumber(f), "Invalid spec for branch factor")
156 |             opts.branch_factor = f
157 |         elseif arg[1] then      -- anything else is unrecognized
158 |             print_usage("init")
159 |         else
160 |             break
161 |         end
162 |     end
163 | 
164 |     local _ = assert(tangram.db.init_db(opts))
165 |     printf("Initialized jumprope store in: %s\n", opts.path)
166 | end
167 | 
168 | local function add_mainloop(f, hc, jr, read_size)
169 |     local size = 0   -- running total of bytes sunk into the hashchopper; this is the return value
170 | 
171 |     -- Read f in read_size pieces and sink each piece into the hashchopper
172 |     -- hc; after every read, drain the complete chunks hc yields into the
173 |     -- jumprope jr. Any status other than the expected ones raises an error.
174 |     while true do
175 |         local rd = f:read(read_size)
176 |         if rd == nil then break end -- EOF
177 |         local res = hc:sink(rd)
178 |         if res == "ok" then
179 |             -- happy case: bump acc'd size and continue
180 |             size = size + rd:len()
181 |             log("SUNK: ", rd:len())
182 |         elseif res == "overflow" then
183 |             error("Chunk size too large for hashchopper")
184 |         elseif res == "full" then
185 |             error("Buffer full, needs more flushing")
186 |         else
187 |             error("Unexpected: " .. tostring(res))
188 |         end
189 | 
190 |         while true do
191 |             local chunk, err = hc:poll()
192 |             if chunk then
193 |                 log("POLL: ", chunk:len())
194 |                 -- Since the jumprope's callbacks are blocking,
195 |                 -- we can just bail out on error here.
196 |                 assert(jr:sink(chunk))
197 |             elseif err == "underflow" then     -- no more chunks
198 |                 break
199 |             elseif err == "overflow" then
200 |                 error("Too large to fit in buffer")
201 |             else
202 |                 error("Unexpected: " .. tostring(err))   -- report poll's own error, not the earlier sink status
203 |             end 
204 |         end
205 |     end
206 | 
207 |     local rem, err = hc:finish()
208 |     if rem then
209 |         -- sink the remaining content
210 |         log("REM: ", rem:len())
211 |         assert(jr:sink(rem))
212 |     elseif err == "overflow" then
213 |         error("Too large to fit in buffer")
214 |     end
215 |     return size
216 | end
217 | 
218 | usage["add"] = {   -- lines printed verbatim by print_usage("add")
219 |     "Usage for 'add' command:",
220 |     "add [-n SAVE-AS-NAME] [FILENAME or -]",
221 | }
222 | 
223 | function cmd_add(arg, cfg)   -- add one file (or stdin) to the store: [-n NAME] [FILENAME or -]
224 |     local fname = "-"        -- default input is stdin
225 |     local save_as = nil
226 | 
227 |     if arg[1] == '-n' then
228 |         pop(arg)
229 |         save_as = pop(arg)
230 |         if not save_as then print_usage("add") end
231 |     end
232 |     if arg[1] then fname = arg[1] end
233 |     local f = io.stdin
234 | 
235 |     if fname == "-" then
236 |         fname = "<stdin>"    -- name recorded when no -n NAME was given
237 |     else
238 |         f = assert(io.open(fname, "rb"))   -- binary mode: content must be read byte-for-byte
239 |     end
240 |     local db = assert(tangram.db.open(db_path(cfg)))
241 | 
242 |     local db_cfg = db:get_config()         -- chunking parameters come from the store's own config
243 |     cfg.bits = assert(db_cfg.rh_bits)
244 |     cfg.branch_factor = assert(db_cfg.branch_factor)
245 | 
246 |     local cbs = init_callbacks(cfg)
247 |     local hc = hashchop.new(cfg.bits)
248 |     local jrs = assert(jumprope.init{get=cbs.get, put=cbs.put,
249 |                                      exists=cbs.exists, hash=sha1})
250 |     local jr = jrs:new()
251 |     local size = add_mainloop(f, hc, jr, 2^cfg.bits)
252 |     if f ~= io.stdin then f:close() end    -- input fully consumed; release the handle we opened
253 |     -- Terminate jumprope and save file metadata
254 |     local headhash = assert(jr:finish())
255 | 
256 |     if cfg.dry_run then
257 |         printf("Not saving (dry run), head hash %s\n", headhash)
258 |         return
259 |     end
260 |     local id = assert(db:add_file(headhash, save_as or fname, size))
261 | 
262 |     if id == 0 then
263 |         printf("File is already stored, head hash %s\n", headhash)
264 |     else
265 |         printf("Added file %d, head hash %s\n", id, headhash)
266 |     end
267 | end
268 | 
269 | usage["get"] = {   -- lines printed verbatim by print_usage("get")
270 |     "Usage for 'get' command:",
271 |     "get [-f | -h] KEY [OUT_FILE]",
272 |     "  If no out file path is provided, it will print to stdout.",
273 |     "  If neither '-f' (file ID) nor '-h' (hash) is used, it will",
274 |     "  attempt to guess whether the key is a file ID or hash.",
275 |     -- "[-r FROM:TO] "
276 |     -- "  -r can be used to fetch only a specific byte-range of the file.",
277 | }
278 | 
279 | local function get_headhash_from_args(arg, cfg, db)
280 |     -- Consume the KEY (and optional -f / -h flag) from the front of arg and resolve it to a head hash.
281 |     local arg_type = "unknown"   -- becomes "id" or "hash" below
282 | 
283 |     local v
284 |     if arg[1] == '-f' then
285 |         arg_type = "id"         -- file ID
286 |         pop(arg)
287 |         v = pop(arg) or print_usage("get")   -- flag without a key: show usage and exit
288 |     elseif arg[1] == '-h' then
289 |         arg_type = "hash"       -- hash hex digest
290 |         pop(arg)
291 |         v = pop(arg) or print_usage("get")   -- flag without a key: show usage and exit
292 |     else            -- does the arg look like a file ID or hash?
293 |         v = pop(arg)
294 |         if v == nil then print_usage("get") end
295 |         v = tostring(v)
296 |         if v:match("^[0-9]+$") then
297 |             arg_type = "id"
298 |         elseif v:match("^[0-9a-fA-F]+$") then
299 |             arg_type = "hash"
300 |         else
301 |             print_usage("get")
302 |         end
303 |     end
304 | 
305 |     if arg_type == "hash" then
306 |         local hashes, conflicts = {}, {}
307 |         for hash in db:get_hash_completions(v) do
308 |             hashes[#hashes+1] = hash.hash
309 |         end
310 |         if #hashes == 0 then
311 |             return nil, conflicts        -- no match: nil plus an empty list
312 |         elseif #hashes > 1 then
313 |             return nil, hashes           -- ambiguous: nil plus every candidate
314 |         else
315 |             return hashes[1], nil        -- exactly one completion
316 |         end
317 |     elseif arg_type == "id" then
318 |         local id = tonumber(v)           -- nil when "-f" was followed by a non-number
319 |         local info = id and db:get_file_info(id)
320 |         if info and info.hash then
321 |             return info.hash, {}
322 |         else
323 |             printf("Bad file ID: %s\n", tostring(v))   -- %s: the key may not be numeric at all
324 |             os.exit(1)
325 |         end
326 |     end
327 | end
328 | 
329 | function cmd_get(arg, cfg)   -- stream one stored file to OUT_FILE, or to stdout when none is given
330 |     local db = assert(tangram.db.open(db_path(cfg)))
331 | 
332 |     -- Get a single headhash or nil and a "did you mean X,Y,Z..." list.
333 |     local headhash, conflicts = get_headhash_from_args(arg, cfg, db)
334 | 
335 |     if not headhash then
336 |         if #conflicts == 0 then
337 |             printf("No completion found for hash prefix\n")
338 |             os.exit(1)
339 |         end
340 | 
341 |         printf("Ambiguous jumprope spec:\n")
342 |         for _,h in ipairs(conflicts) do
343 |             printf("  %s\n", h)
344 |         end
345 |         os.exit(1)
346 |     end
347 | 
348 |     local f = io.stdout
349 |     if arg[1] then
350 |         f = assert(io.open(arg[1], "wb"))   -- binary mode: chunks are written back byte-for-byte
351 |     end
352 |     local cbs = init_callbacks(cfg)
353 |     local jrs = assert(jumprope.init{get=cbs.get, put=cbs.put,
354 |                                      exists=cbs.exists, hash=sha1})
355 |     local jr = jrs:open(headhash)
356 | 
357 |     for chunk in jr:stream() do
358 |         f:write(chunk)
359 |     end
360 |     if f == io.stdout then f:flush() else f:close() end   -- only close a handle this function opened
361 | end
362 | 
363 | function cmd_list(arg, cfg)
364 |     -- Print a header row, then one formatted row per file the db reports.
365 |     local store_db = assert(tangram.db.open(db_path(cfg)))
366 |     local header_fmt = "%-4s  %-10s  %-19s  %-10s  %s\n"
367 |     local row_fmt = '%-4d  %s  %s  %-10d  %s\n'
368 |     printf(header_fmt, "ID", "hash", "time (UTC)", "size", "filename")
369 |     for file in store_db:get_files() do
370 |         local short_hash = file.hash:sub(1,10)   -- only the first 10 characters are shown
371 |         printf(row_fmt, file.id, short_hash, file.timestamp, file.size, file.name)
372 |     end
373 | end
374 | 
375 | function cmd_test(arg, cfg)   -- run the db and jumprope test suites through lunatest
376 |     local ok = pcall(require, "lunatest")   -- probe the optional dependency without raising
377 |     if not ok then
378 |         print("test command requires lunatest.")
379 |         os.exit(1)
380 |     end
381 |     
382 |     require "tangram.test_db"
383 |     require "tangram.test_jumprope"
384 | 
385 |     lunatest.suite("tangram.test_db")       -- presumably require "lunatest" defines this global - confirm
386 |     lunatest.suite("tangram.test_jumprope")
387 |     lunatest.run()
388 | end
389 | 
390 | function cmd_info(arg, cfg)
391 |     -- Print the stored metadata fields for one file ID, followed by its properties.
392 |     local store_db = assert(tangram.db.open(db_path(cfg)))
393 |     local raw_id = assert(pop(arg), "Not a valid file ID")
394 |     local id = assert(tonumber(raw_id), "Not a valid file ID")
395 |     local info = assert(store_db:get_file_info(id))
396 |     local fields = {"id", "hash", "timestamp", "size", "name"}
397 |     for i = 1, #fields do
398 |         local value = info[fields[i]]
399 |         if value then printf("%s %s\n", fields[i], value) end   -- absent fields are skipped
400 |     end
401 |     for name, value in pairs(store_db:get_properties(id)) do
402 |         local sep = (value ~= "") and " - " or ""               -- separator only when there is a value
403 |         printf("  %s%s%s\n", name, sep, value)
404 |     end
405 | end
406 | 
407 | function cmd_forget(arg, cfg)   -- remove the record for one file ID via db:rm_file
408 |     local db = assert(tangram.db.open(db_path(cfg)))
409 |     local id = assert(tonumber(pop(arg) or nil), "Not a valid file ID")   -- `or nil`: presumably guarantees tonumber gets one value even if pop yields none
410 |     assert(db:rm_file(id))
411 | end
412 | 
413 | function cmd_gc(arg, cfg)   -- delete every stored chunk that no file in the db reaches
414 |     local db = assert(tangram.db.open(db_path(cfg)))
415 |     local marks = {}        -- hash -> true for each chunk fetched while streaming the known files
416 | 
417 |     local cbs = init_callbacks(cfg)
418 | 
419 |     -- Wrap get callback in something that marks live files
420 |     -- (This does more disk IO than necessary.)
421 |     local old_get_cb = cbs.get
422 |     cbs.get = function(hash)
423 |                   local data = old_get_cb(hash)
424 |                   if data then marks[hash] = true end
425 |                   return data
426 |               end
427 |     local jrs = assert(jumprope.init{get=cbs.get, put=cbs.put,
428 |                                      exists=cbs.exists, hash=sha1})
429 | 
430 |     for row in db:get_files() do          -- mark phase: stream every known file through the wrapped get
431 |         local jr = jrs:open(row.hash)
432 |         for chunk in jr:stream() do
433 |             -- just discard the data
434 |         end
435 |     end
436 |     
437 |     -- Iterate over hashes in store, delete any unmarked chunks
438 |     local count = 0
439 |     for f in cbs.iter_store() do
440 |         if not marks[f] then
441 |             cbs.delete(f)                 -- NOTE(review): cfg.dry_run is not consulted here - confirm gc should ignore -d
442 |             count = count + 1
443 |         end
444 |     end
445 |     printf("Collected %d chunks\n", count)
446 | end
447 | 
-- Help text for the 'prop' command, one entry per output line.
-- cmd_prop calls print_usage("prop") when the mode is not recognized;
-- presumably print_usage looks this table up by that key.
usage["prop"] = {
    "Usage for 'prop' command:",
    "  prop add ID KEY [VALUE] -- set file ID's property KEY to VALUE (or \"\").",
    "  prop del ID -- delete all properties for file ID.",
    "  prop del ID KEY -- delete property KEY for file ID.",
}
454 | 
-- Add or delete properties on a file.
--   prop add ID KEY [VALUE]  -- add property KEY (VALUE defaults to "")
--   prop del ID [KEY]        -- delete KEY, or every property when omitted
-- Any other mode prints the usage text for 'prop'.
-- Raises "Not a valid file ID" when ID is missing or not a number, and
-- "Missing property key" when 'add' is given no KEY.
function cmd_prop(arg, cfg)
    local db = assert(tangram.db.open(db_path(cfg)))
    local mode = pop(arg)
    if mode == "add" then    -- prop add ID KEY VAL
        -- `or nil` forces exactly one argument into tonumber(): if pop()
        -- yields no value, tonumber() would otherwise raise its own
        -- "value expected" error instead of the message below.
        -- cmd_forget guards its ID the same way.
        local id = assert(tonumber(pop(arg) or nil), "Not a valid file ID")
        local key = assert(pop(arg), "Missing property key")
        local value = pop(arg) or ""
        assert(db:add_property(id, key, value))
    elseif mode == "del" then  -- prop del ID KEY
        local id = assert(tonumber(pop(arg) or nil), "Not a valid file ID")
        local key = pop(arg)
        if key then
            assert(db:rm_property(id, key))
        else
            assert(db:rm_property(id))
        end
    else
        print_usage("prop")
    end
end
475 | 
-- Help text for the 'search' command, one entry per output line.
-- cmd_search calls print_usage("search") when the mode is not recognized;
-- presumably print_usage looks this table up by that key.
usage["search"] = {
    "Usage for 'search' command:",
    "  search name PATTERN -- search for files whose name matches PATTERN.",
    "  search prop KEY -- search for files who have property KEY.",
    "  search prop KEY VALUE -- search for files who have VALUE for property KEY.",
}
482 | 
-- Search the file database. ARG supplies the mode ("name" or "prop")
-- followed by that mode's arguments; any other mode prints the usage
-- text for 'search'.
function cmd_search(arg, cfg)
    local store = assert(tangram.db.open(db_path(cfg)))
    local mode = pop(arg)

    if mode == "name" then          -- search name PATTERN
        local pattern = assert(pop(arg), "Missing name search pattern")
        for hit in store:search_name(pattern) do
            printf("%d  %s\n", hit.id, hit.name)
        end
        return
    end

    if mode == "prop" then          -- search prop KEY [VALUE]
        local key = assert(pop(arg), "Missing property search key")
        local value = pop(arg)      -- optional; nil means "any value"
        for hit in store:search_property(key, value) do
            printf("%d  %s  %s  %s\n", hit.id, hit.name, hit.key, hit.value)
        end
        return
    end

    print_usage("search")
end
501 | 


--------------------------------------------------------------------------------
/tangram/db.lua:
--------------------------------------------------------------------------------
  1 | -- This is an odd one, because it actually returns a table called "sqlite3".
  2 | require "lsqlite3"
  3 | 
  4 | require "tangram.defaults"
  5 | 
  6 | local sqlite3 = sqlite3
  7 | local assert, print, setmetatable = assert, print, setmetatable
  8 | local defaults = DEFAULTS
  9 | 
 10 | module(...)
 11 | 
 12 | local schema = [[
 13 | -- known files
 14 | CREATE TABLE IF NOT EXISTS files (
 15 |     id INTEGER PRIMARY KEY,
 16 |     hash TEXT NOT NULL,      -- head hash
 17 |     name TEXT NOT NULL,      -- filename
 18 |     timestamp TIME NOT NULL, -- creation datetime()
 19 |     size INTEGER NOT NULL,   -- file size
 20 |     CONSTRAINT duped_file UNIQUE (hash, name) ON CONFLICT IGNORE
 21 | );
 22 | 
 23 | -- key value store for arbitrary file metadata
 24 | CREATE TABLE IF NOT EXISTS properties (
 25 |     fid INTEGER NOT NULL,
 26 |     key TEXT NOT NULL,
 27 |     value TEXT NOT NULL,
 28 |     FOREIGN KEY (fid) REFERENCES file(id)
 29 | );
 30 | 
 31 | CREATE INDEX IF NOT EXISTS prop_index ON properties (fid, key);
 32 | 
 33 | -- configuration for server
 34 | CREATE TABLE IF NOT EXISTS config (
 35 |     version TEXT NOT NULL,            -- internal data format version
 36 |     rh_bits INTEGER NOT NULL,         -- bits for rolling hash
 37 |     branch_factor INTEGER NOT NULL    -- branch factor for jumprope
 38 | );
 39 | ]]
 40 | 
-- file info DB
-- Method table for database handles. Handles are plain tables whose
-- metatable is DB, so method lookups fall through to it via __index.
DB = {}
DB.__index = DB
 44 | 
 45 | function open(path)
 46 |     path = path or ":memory:"
 47 |     local db, err = sqlite3.open(path)
 48 |     if not db then
 49 |         return nil, ("Unable to open tangram store DB at %s"):format(path)
 50 |     end
 51 |     local res = {_db=db, _cache={}}
 52 |     return setmetatable(res, DB)
 53 | end
 54 | 
 55 | function DB:stmt(sql)
 56 |     local db, cache = self._db, self._cache
 57 |     if cache[sql] then
 58 |         local stmt = cache[sql]
 59 |         stmt:reset()
 60 |         return stmt
 61 |     else
 62 |         local stmt, err = db:prepare(sql)
 63 |         if not stmt then
 64 |             assert(nil, db:error_message())
 65 |         end
 66 |         cache[sql] = stmt
 67 |         return stmt
 68 |     end
 69 | end
 70 | 
 71 | function DB:last_insert_rowid() return self._db:last_insert_rowid() end
 72 | function DB:errmsg() return self._db:errmsg() end
 73 | 
 74 | function DB:add_file(hash, name, size)
 75 |     local stmt = self:stmt([[
 76 | INSERT INTO files (hash, name, size, timestamp) VALUES (?, ?, ?, datetime());]])
 77 |     stmt:bind_values(hash, name, size)
 78 |     local res, err = stmt:step()
 79 |     if res == sqlite3.DONE then
 80 |         return self:last_insert_rowid()
 81 |     else
 82 |         return nil, self:errmsg()
 83 |     end
 84 | end
 85 | 
 86 | function DB:rm_file(id)
 87 |     local stmt = self:stmt("DELETE FROM files WHERE id == ?;")
 88 |     stmt:bind_values(id)
 89 |     local res, err = stmt:step()
 90 |     if res == sqlite3.DONE then
 91 |         return self:rm_property(id)
 92 |     else
 93 |         return nil, db:errmsg()
 94 |     end
 95 | end
 96 | 
 97 | -- Get an iterator for all files.
 98 | function DB:get_files()
 99 |     local stmt = self:stmt("SELECT * FROM files;")
100 |     return stmt:nrows()
101 | end
102 | 
-- Return an iterator over the rows of files whose hash starts with HASH.
-- Each row carries a single `hash` field. A nil HASH matches every file.
function DB:get_hash_completions(hash)
    local prefix = hash or ""
    local query = self:stmt("SELECT hash FROM files WHERE hash LIKE ?;")
    query:bind_values(prefix .. "%")
    return query:nrows()
end
109 | 
-- Attach property KEY with VALUE to file ID by inserting a new row into
-- the properties table.
-- Returns the connection's last insert row ID when the statement
-- completes, or nil and the database's error message otherwise.
function DB:add_property(id, key, value)
    local insert = self:stmt([[
INSERT INTO properties (fid, key, value) VALUES (?, ?, ?);]])
    insert:bind_values(id, key, value)
    local status = insert:step()
    if status ~= sqlite3.DONE then
        return nil, self:errmsg()
    end
    return self:last_insert_rowid()
end
121 | 
-- Get info for a single file ID.
-- Returns the first matching row of the files table (a table keyed by
-- column name), or nil, "not found" when no row has that ID.
function DB:get_file_info(id)
    local stmt = self:stmt("SELECT * FROM files WHERE id == ?;")
    stmt:bind_values(id)
    for row in stmt:nrows() do
        return row      -- only the first row is wanted
    end
    return nil, "not found"
end
132 | 
-- Collect the properties of file ID into a key -> value table.
-- If several rows share a key, the last one read wins. Returns an empty
-- table when the file has no properties.
function DB:get_properties(id)
    local query = self:stmt("SELECT key, value FROM properties WHERE fid == ?;")
    query:bind_values(id)

    local result = {}
    for entry in query:nrows() do
        result[entry.key] = entry.value
    end
    return result
end
143 | 
-- Delete property KEY from file ID, or every property of the file when
-- KEY is omitted.
-- Returns the connection's last insert row ID when the DELETE completes
-- (callers only test it for truthiness), or nil and the database's error
-- message otherwise.
function DB:rm_property(id, key)
    local delete
    if not key then
        delete = self:stmt("DELETE FROM properties WHERE fid == ?;")
        delete:bind_values(id)
    else
        delete = self:stmt([[
DELETE FROM properties
WHERE fid == ? AND key == ?;]])
        delete:bind_values(id, key)
    end

    if delete:step() == sqlite3.DONE then
        return self:last_insert_rowid()
    end
    return nil, self:errmsg()
end
163 | 
-- Return an iterator over (id, name) rows for files whose name contains
-- NAME. NAME is embedded in a LIKE pattern as-is, so any LIKE wildcards
-- it contains stay active.
function DB:search_name(name)
    local like_pattern = "%" .. name .. "%"
    local query = self:stmt("SELECT id, name FROM files WHERE name LIKE ?;")
    query:bind_values(like_pattern)
    return query:nrows()
end
169 | 
-- Return an iterator over rows (with an `id` field) for files whose
-- head hash starts with HASH.
function DB:search_hash(hash)
    local like_pattern = hash .. "%"
    local query = self:stmt("SELECT id FROM files WHERE hash LIKE ?;")
    query:bind_values(like_pattern)
    return query:nrows()
end
175 | 
-- Find files by property. With both KEY and VALUE, match files whose
-- property KEY equals VALUE; otherwise match on KEY alone.
-- Returns an iterator over rows with fields id, name, key and value.
function DB:search_property(key, value)
    local query
    if not (key and value) then
        query = self:stmt([[
SELECT f.id, f.name, p.key, p.value FROM files f, properties p
WHERE p.key == ? AND f.id == p.fid;]])
        query:bind_values(key)
    else
        query = self:stmt([[
SELECT f.id, f.name, p.key, p.value FROM files f, properties p
WHERE p.key == ? AND p.value == ? AND f.id == p.fid;]])
        query:bind_values(key, value)
    end
    return query:nrows()
end
192 | 
-- Return the first row of the config table (a table keyed by column
-- name), or nothing when the table is empty.
function DB:get_config()
    for config_row in self:stmt("SELECT * FROM config;"):nrows() do
        return config_row
    end
end
199 | 
-- Create (or open) the database at opts.path (default: in-memory), run
-- the schema, and insert a config row holding the format version,
-- rh_bits and branch_factor.
-- OPTS is optional. Missing rh_bits / branch_factor are filled in from
-- the defaults, and those values are written back into OPTS.
-- Returns a DB handle, or nil and an error message.
function init_db(opts)
    opts = opts or {}
    if not opts.rh_bits then
        opts.rh_bits = defaults.rh_bits
    end
    if not opts.branch_factor then
        opts.branch_factor = defaults.branch_factor
    end

    local path = opts.path or ":memory:"
    local conn = sqlite3.open(path)
    if not conn then
        return nil, "Failed to create database at " .. path
    end
    if conn:exec(schema) ~= sqlite3.OK then
        return nil, conn:error_message()
    end

    local db = setmetatable({_db=conn, _cache={}}, DB)

    local insert = db:stmt([[
INSERT INTO config (version, rh_bits, branch_factor)
VALUES (?, ?, ?);]])
    insert:bind_values(defaults.version, opts.rh_bits, opts.branch_factor)

    if insert:step() ~= sqlite3.DONE then
        return nil, conn:errmsg()
    end
    return db
end
229 | 


--------------------------------------------------------------------------------
/tangram/defaults.lua:
--------------------------------------------------------------------------------
 1 | local HOME = assert(os.getenv("HOME"))
 2 | 
 3 | DEFAULTS = {
 4 |     author = "Scott Vokes <vokes.s@gmail.com>",
 5 |     version = "0.01.02",
 6 | 
 7 |     -- base path for local content store
 8 |     base_path = HOME .. "/.tangram/",
 9 | 
10 |     -- bitmask size for rolling hash
11 |     rh_bits = 15,
12 | 
13 |     -- branching factor for jumprope
14 |     branch_factor = 16,
15 | }
16 | 


--------------------------------------------------------------------------------
/tangram/init.lua:
--------------------------------------------------------------------------------
1 | require "tangram.defaults"
2 | require "tangram.cmds"
3 | require "tangram.db"
4 | require "tangram.jumprope"
5 | require "tangram.main"
6 | 


--------------------------------------------------------------------------------
/tangram/jumprope.lua:
--------------------------------------------------------------------------------
  1 | -- Copyright (c) 2009-2013, Scott Vokes <vokes.s@gmail.com>
  2 | --
  3 | -- All rights reserved.
  4 | -- 
  5 | -- Redistribution and use in source and binary forms, with or without
  6 | -- modification, are permitted provided that the following conditions
  7 | -- are met:
  8 | --     * Redistributions of source code must retain the above copyright
  9 | --       notice, this list of conditions and the following disclaimer.
 10 | --     * Redistributions in binary form must reproduce the above
 11 | --       copyright notice, this list of conditions and the following
 12 | --       disclaimer in the documentation and/or other materials
 13 | --       provided with the distribution.
 14 | --     * Neither the name of Scott Vokes nor the names of other
 15 | --       contributors may be used to endorse or promote products
 16 | --       derived from this software without specific prior written
 17 | --       permission.
 18 | -- 
 19 | -- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 20 | -- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 21 | -- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 22 | -- FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 23 | -- COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 24 | -- INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 25 | -- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 26 | -- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 27 | -- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 28 | -- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 29 | -- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 30 | -- POSSIBILITY OF SUCH DAMAGE.
 31 | 
 32 | -- imports
local fmt, concat, pop = string.format, table.concat, table.remove
local assert, error, math, pcall, print, setmetatable, tonumber, tostring =
    assert, error, math, pcall, print, setmetatable, tonumber, tostring
 36 |     
 37 | module(...)
 38 |     
-- Branch factor used by init() when the caller does not supply one.
local DEFAULT_BRANCH_FACTOR = 16
-- Module-level switch: when true, log() emits debug output.
DEBUG = false
 41 | 
 42 | -- invariants:
 43 | -- . content is only ever appended at level 0
 44 | -- . max level only ever increases 1 at a time
 45 | -- . 'trunk' is max-level spine of structure
 46 | -- . only trunk branches upward, and only at last node
 47 | 
 48 | -- TODO:
 49 | -- The 'L' (limb) / 'D' (data) markers should be unnecessary if
 50 | -- trunk nodes without upward branches are given an explicit end marker,
 51 | -- such as a "00000000000000000000 0\n" link.
 52 | 
-- All jumpropes should belong to a common set, store callbacks etc. there.
-- Method table for sets; instances are created by init().
JumpropeSet = {}
JumpropeSet.__index = JumpropeSet

-- An individual jumprope handle, inside a JumpropeSet.
-- Method table for handles; instances are created by JumpropeSet:new()
-- and JumpropeSet:open().
Jumprope = {}
Jumprope.__index = Jumprope

-- Sentinel
-- Unique table stored in a handle's _headhash until finish() has
-- computed the real head hash; it is compared by identity.
local UNKNOWN = {}
 63 | 
 64 | local function log(...)
 65 |     if DEBUG then print(string.format(...)) end
 66 | end
 67 | 
 68 | -- Make a new jumprope set. Requires several callbacks:
 69 | -- GET: (hash -> data | nil, "error")
 70 | -- PUT: (hash, data -> true | nil, "error")
 71 | -- EXISTS: (hash -> true | false | nil, "error")
 72 | -- HASH: (data -> hash)
 73 | --
 74 | -- BRANCH_FACTOR: 1:N chance in branching.
 75 | function init(t)
 76 |     local jrs = setmetatable({}, JumpropeSet)
 77 |     jrs.get = assert(t.get, "Must specify 'get' callback")
 78 |     jrs.exists = assert(t.exists, "Must specify 'exists' callback")
 79 |     jrs.put = assert(t.put, "Must specify 'put' callback")
 80 |     jrs.hash = assert(t.hash, "Must specify 'hash' callback")
 81 |     jrs.hash_len = t.hash_len or jrs.hash("foo"):len()
 82 |     jrs.branch_factor = t.branch_factor or DEFAULT_BRANCH_FACTOR
 83 |     jrs.min_len = jrs.branch_factor / 4
 84 |     jrs.max_len = jrs.branch_factor * 4
 85 |     
 86 |     jrs._cache = setmetatable({}, {__mode="kv"})
 87 |     return jrs
 88 | end
 89 | 
 90 | -- Get the head hash of a jumprope, or UNKNOWN if not available.
 91 | function Jumprope:head()
 92 |     return self._headhash
 93 | end
 94 | 
 95 | -- Hash a data string.
 96 | function Jumprope:hash(data)
 97 |     return self._set.hash(data)
 98 | end
 99 | 
-- Ask the owning set whether HASH is evenly divisible by its branch
-- factor.
function Jumprope:is_div(hash)
    local set = self._set
    return set:is_div(hash)
end
104 | 
-- For a limb node string S, return an iterator of (hash, type, length)
-- tuples, all three as strings. Each line should have the format of e.g.
--     "da4b9237bacccdf19c0760cab7aec4a8359010b0 D 1\n".
-- The hash should be lowercase. Raises "no string" when S is nil.
local function iter_hashes(s)
    local line_pattern = "(%x+) ([LD]) (%d+)\n"
    assert(s, "no string")
    return s:gmatch(line_pattern)
end
113 | 
-- Return the total size of the jumprope's data, computed by summing the
-- lengths listed in the head (trunk) node, and cache it on the handle.
-- Returns nil, "incomplete" while the head hash is still unknown.
-- An error raised by the get callback is re-raised unchanged.
function Jumprope:size()
    local head = self._headhash
    if head == UNKNOWN then return nil, "incomplete" end

    local cached = self._size
    if cached then return cached end

    -- fetch the trunk node
    local ok, trunk = pcall(self._set.get, head)
    if not ok then error(trunk, 0) end

    local total = 0
    for _, _, chunk_len in iter_hashes(trunk) do
        total = total + chunk_len
    end

    self._size = total
    return total
end
134 | 
-- Return the number of nodes counted while building this jumprope.
-- (mainly used for testing / benchmarking)
function Jumprope:count()
    local node_count = self._count
    return node_count
end
140 | 
-- Append V to the end of the array T.
local function push(t, v)
    local next_slot = #t + 1
    t[next_slot] = v
end
142 | 
143 | 
144 | --------------
145 | -- Creation --
146 | --------------
147 | 
-- Start a new, empty jumprope in this set, to be filled by sink() and
-- completed by finish(). Its head hash is UNKNOWN until then.
function JumpropeSet:new()
    return setmetatable({
        _set = self,
        _count = 0,              -- node count
        _headhash = UNKNOWN,     -- hash for head node
        _limb = {},              -- current limb
        _limb_size = 0,          -- data bytes within current limb
        _stack = {},             -- stack of limbs
        _level = 0,              -- current level
        _max_level = 0,          -- max level of trunk
    }, Jumprope)
end
161 | 
-- Is the hash evenly divisible by the branch factor? Only the tail of
-- HASH_STR, from position hash_len - 2 onward, is read as a hexadecimal
-- number for the check.
function JumpropeSet:is_div(hash_str)
    local tail = hash_str:sub(self.hash_len - 2)
    local value = tonumber(tail, 16)
    return value % self.branch_factor == 0
end
166 | 
-- Replace the current limb with a fresh, empty one of size zero.
local function make_new_limb(self)
    self._limb, self._limb_size = {}, 0
end
171 | 
-- Step down one level at a time until level 0 is reached. At each step
-- the limb in progress (and its size) is pushed on the stack and a new,
-- empty limb is started.
local function descend_to_zero(self)
    local stack = self._stack
    while self._level > 0 do
        log("growing downward to zero, @ %d", self._level)
        self._level = self._level - 1
        push(stack, {self._limb, self._limb_size})
        self._limb, self._limb_size = {}, 0
    end
end
181 | 
-- Raise the trunk by one level: save the current limb on the stack,
-- bump both the current and the maximum level, start a new limb, then
-- grow back down to level 0. The saved limb is completed later with
-- the hash of the rest of the jumprope. Always returns true.
local function branch_trunk_upward(self)
    local saved = {self._limb, self._limb_size}
    push(self._stack, saved)

    local new_max = self._max_level + 1
    local new_level = self._level + 1
    self._max_level, self._level = new_max, new_level

    make_new_limb(self)
    log("branch_upward to level %d, %d / %d",
        new_level, new_level, new_max)

    descend_to_zero(self)
    return true
end
195 | 
-- Append a "hash type length\n" line to the current limb, and add
-- length to the limb's running size and one to the node count.
-- TYPE is either "L" (metadata limb) or "D" (data).
-- H is the hash to record; when omitted, DATA is hashed.
-- For an "L" entry the length is LIMB_LEN; otherwise it is DATA's length.
local function append_hash(self, type, data, h, limb_len)
    local limb = self._limb
    local hash = h or self:hash(data)

    local len
    if type == "L" and limb_len then
        len = limb_len
    else
        len = data:len()
    end
    assert(len, "no limb length provided")

    push(limb, fmt("%s %s %d\n", hash, type, len))
    self._count = self._count + 1
    --print("append_hash: adding ", len, " now ", self._limb_size + len)
    self._limb_size = self._limb_size + len
    log("append_hash %s, type %s, len %d", hash, type, #limb)
end
209 | 
-- Decide whether the entry just added should also end the current limb.
-- A limb is broken once it holds max_len entries, or once it holds at
-- least min_len entries and HASH is divisible by the branch factor.
local function should_break(self, limb, hash, bf)
    local set = self._set
    local entries = #limb
    local div = self:is_div(hash, bf)

    local brk
    if entries >= set.max_len then
        brk = true
    else
        brk = (entries >= set.min_len and div)
    end
    log("%d, %s -> %s", entries, tostring(div), tostring(brk))
    return brk
end
218 | 
-- Terminate the current limb, popping back up one or more limb(s)
-- according to the hashes of the terminated limbs, then grow back
-- down to limb 0.
--
-- Steps: store the finished limb under its hash, restore the parent
-- limb from the stack, move up one level, and record the finished limb
-- in the parent as an "L" entry. If that entry is itself a break point,
-- either branch the trunk upward (when now on the trunk) or recurse to
-- terminate the parent too; otherwise descend back to level 0.
--
-- Returns true, or nil and the error when the put callback raises.
-- NOTE(review): the results of the nested branch_trunk_upward() /
-- terminate_branch() calls are ignored, so a put failure inside the
-- recursion is not returned from here -- confirm this is intended.
local function terminate_branch(self)
    local cur_limb = concat(self._limb)
    local cur_limb_size = self._limb_size

    local h = self:hash(cur_limb)
    local cfg = self._set
    local put = cfg.put
    --print("Cls", cur_limb_size)
    local ok, err = pcall(put, h, cur_limb)
    if not ok then return nil, err end

    -- Resume the parent limb saved when this one was started.
    local pair = pop(self._stack)
    self._limb, self._limb_size = pair[1], pair[2]
    assert(self._limb_size)
    --print("Adding", cur_limb_size, " now ", self._limb_size + cur_limb_size)
    log("terminate_branch, level == %d / %d", self._level, self._max_level)
    assert(self._level < self._max_level)
    self._level = self._level + 1
    assert(self._limb_size)
    log("LIMB SIZE", self._limb_size)
    -- The entry's length is the data size under the finished limb.
    append_hash(self, "L", cur_limb, h, cur_limb_size) --self._limb_size)

    local is_trunk = self._level == self._max_level
    if should_break(self, self._limb, h, cfg.branch_factor) then
        log("-- breaking at %d, %s", self._level, tostring(is_trunk))
        if is_trunk then
            branch_trunk_upward(self)
        else
            terminate_branch(self)
        end
    else
        descend_to_zero(self)
    end
    assert(self._level == 0, "should end terminate_branch with level of 0")
    return true
end
258 | 
-- Append the chunk DATA to the jumprope: store it under its hash, list
-- it in the current level-0 limb, and break the limb if this entry is a
-- break point.
-- Returns true, or nil and the error when the put callback raises.
function Jumprope:sink(data)
    assert(data)
    local set = self._set
    local h = self:hash(data)
    local ok, err = pcall(set.put, h, data)
    if not ok then return nil, err end

    assert(self._level == 0, "Appending data at non-zero level")
    assert(self._limb)
    assert(self._limb_size)
    append_hash(self, "D", data, h)

    log("sink %d / %d, %d",
        self._level, self._max_level, self._count)

    if not should_break(self, self._limb, h, set.branch_factor) then
        return true
    end

    if self._level == self._max_level then
        -- trunk; push and increase trunk level
        return branch_trunk_upward(self)
    end
    -- branch; close branch and pop to previous
    return terminate_branch(self)
end
287 | 
-- Close out the current limb: make its parent (popped from the stack)
-- current again, fold the closed limb's size into the parent's, store
-- the closed limb under its hash via PUT, and list it in the parent as
-- an "L" entry.
-- Returns true, or nil and the error when PUT raises.
local function pop_limb(self, put)
    assert(#self._stack > 0)
    local closed = concat(self._limb)
    local closed_size = self._limb_size

    local parent = pop(self._stack)
    self._limb = parent[1]
    --print("pop: adding ", closed_size, " now ", parent[2] + closed_size)
    self._limb_size = parent[2] + closed_size
    assert(self._limb_size)

    local h = self:hash(closed)
    local ok, err = pcall(put, h, closed)
    if not ok then return nil, err end

    self._count = self._count + 1
    push(self._limb, fmt("%s L %d\n", h, closed_size))
    return true
end
305 | 
-- EOF has been reached: close out every limb still on the stack, store
-- the root (head) node, and record the head hash and total data size on
-- the handle.
-- A jumprope that received no data gets a single empty "D" node so that
-- the root is never empty.
-- Returns the head hash, or nil and the error when a put fails.
function Jumprope:finish()
    local put = self._set.put
    while #self._stack > 0 do
        local ok, err = pop_limb(self, put)
        if not ok then return nil, err end
    end
    local root = concat(self._limb)

    -- Total size is the sum of the lengths listed in the root node.
    -- (The length is kept in a local; it used to be assigned to an
    -- undeclared `len`, which leaked into the module table.)
    local total_size = 0
    for _, _, len_str in iter_hashes(root) do
        total_size = total_size + tonumber(len_str)
    end

    -- It should have at least one node.
    if root == "" then
        local h = self:hash("")
        local ok, err = pcall(put, h, "")
        if not ok then return nil, err end
        self._count = 1
        root = fmt("%s D 0\n", h)
    end
    local head = self:hash(root)
    local ok, err = pcall(put, head, root)
    if not ok then return nil, err end

    -- Clear temporary data
    self._limb = nil
    self._stack = nil

    -- Save info about root of structure
    self._headhash = head
    self._size = total_size
    return head
end
346 | 
347 | 
348 | ---------------
349 | -- Retrieval --
350 | ---------------
351 | 
-- Return a handle for the existing jumprope whose head node is stored
-- under HEADHASH. Nothing is fetched here; HEADHASH is only checked for
-- being non-nil.
function JumpropeSet:open(headhash)
    assert(headhash, "no hash given")
    return setmetatable({_headhash = headhash, _set = self}, Jumprope)
end
360 | 
-- Return the part of CHUNK that falls inside the zero-indexed,
-- half-open byte range FROM..TO of the whole data, where CHUNK holds
-- the CHUNK_SZ bytes starting at absolute offset OFFSET.
-- Asserts that the chunk does not end before FROM and does not start at
-- or after TO. TO may be math.huge, meaning "through the end of chunk".
-- (This is only exported for testing.)
function within_span(chunk, offset, from, to, chunk_sz)
    -- Range bounds relative to the start of this chunk.
    local rel_from = from - offset
    local rel_to = to - offset
    if rel_from < 1 then rel_from = 0 end

    assert(offset + chunk_sz >= from, "offset + chunk_sz <= from")
    assert(offset < to, "offset >= to")

    -- Convert to string.sub's 1-indexed, inclusive positions.
    local first = rel_from + 1
    local last = rel_from + (rel_to - rel_from)
    if last == math.huge then last = nil end
    return chunk:sub(first, last)
end
377 | 
-- Get an iterator for the jumprope's data between the
-- byte offsets FROM and TO. FROM defaults to 0 and TO to math.huge
-- (i.e. through the end of the data).
-- Since the range ends may not coincide with a chunk boundary,
-- fetch and return subsets of chunks as necessary.
--
-- Unlike Lua, this is 0-indexed, i.e., ("blah"):stream(0,2) yields "bl".
--
-- Each call of the returned function yields the next piece of data as a
-- string, and nil once the range or the data is exhausted. Raises an
-- error if the jumprope has no head hash yet, and re-raises any error
-- raised by the get callback.
function Jumprope:stream(from, to)
    from = from or 0
    to = to or math.huge
    if self._headhash == UNKNOWN then
        error("jumprope is not yet readable", 0)
    end
    local actual_get, cache = self._set.get, self._set._cache
    -- Read-through wrapper around the set's get callback. Nothing is
    -- ever stored in the cache at present (see FIXME), so in practice
    -- every lookup falls through to actual_get.
    local get = function(hash)
                    local v = cache[hash]
                    if v then return v end
                    v = actual_get(hash)
                    -- FIXME: disable cache for now, it's
                    -- not being collected properly.
                    --cache[hash] = v
                    return v
                end
    local ok, res = pcall(get, self._headhash)
    if not ok then error(res, 0) end

    -- offset:    absolute byte offset of the next entry to be read
    -- stack:     iterators of the enclosing limbs; nil once finished
    -- hash_iter: iterator over the entries of the limb being read
    local offset, stack, hash_iter = 0, {}, iter_hashes(res)

    local iterator
    iterator = function()
       if not stack then return nil end  -- already DONE
       local hash, type, chunk_sz = hash_iter()
       local chunk

       if hash then                      -- got a chunk
           -- post: absolute offset just past this entry's data
           local post = offset + chunk_sz

           -- print(string.format("* %s (%s), %d bytes, offset %d (%s - %s)",
           --                   hash, type, chunk_sz, offset, from, to))
           if post < from then            -- skip chunk
               offset = offset + chunk_sz
               return iterator()
           elseif offset >= to then       -- done with iteration
               stack = nil
               return
           elseif type == "L" then        -- push stack and descend
               -- offset is not advanced here; the limb's own entries
               -- advance it as they are visited.
               assert(offset < to or (offset <= from and post > from))
               push(stack, hash_iter)
               ok, chunk = pcall(get, hash)
               if not ok then return error(chunk, 0) end
               hash_iter = iter_hashes(chunk)
               return iterator()
           elseif type == "D" then        -- yield some/all of data chunk
               ok, chunk = pcall(get, hash)
               if not ok then error(chunk, 0) end

               if offset > from and post < to then     -- full yield
                   log("YIELDING CONTENT: %d", chunk:len())
                   offset = post
                   return chunk
               else                                      -- partial yield
                   -- Chunk touches a range boundary; within_span()
                   -- trims it (and returns it whole if nothing needs
                   -- trimming).
                   local part = within_span(chunk, offset, from, to, chunk_sz)
                   log("YIELDING PARTIAL CONTENT: %d", part:len())
                   offset = post
                   return part
               end
           else
               error("Bad type")
           end
       else
           -- The current limb is exhausted.
           if #stack == 0 then            -- EOF
               stack = nil
               return nil, "done"
           else                   -- pop limb stack and continue
               hash_iter = pop(stack)
               return iterator()
           end
       end
   end

   return iterator
end
459 | 


--------------------------------------------------------------------------------
/tangram/main.lua:
--------------------------------------------------------------------------------
  1 | -- Copyright (c) 2012-2013, Scott Vokes <vokes.s@gmail.com>
  2 | --
  3 | -- All rights reserved.
  4 | -- 
  5 | -- Redistribution and use in source and binary forms, with or without
  6 | -- modification, are permitted provided that the following conditions
  7 | -- are met:
  8 | --     * Redistributions of source code must retain the above copyright
  9 | --       notice, this list of conditions and the following disclaimer.
 10 | --     * Redistributions in binary form must reproduce the above
 11 | --       copyright notice, this list of conditions and the following
 12 | --       disclaimer in the documentation and/or other materials
 13 | --       provided with the distribution.
 14 | --     * Neither the name of Scott Vokes nor the names of other
 15 | --       contributors may be used to endorse or promote products
 16 | --       derived from this software without specific prior written
 17 | --       permission.
 18 | -- 
 19 | -- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 20 | -- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 21 | -- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 22 | -- FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 23 | -- COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 24 | -- INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 25 | -- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 26 | -- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 27 | -- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 28 | -- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 29 | -- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 30 | -- POSSIBILITY OF SUCH DAMAGE.
 31 | 
 32 | module(..., package.seeall)
 33 | 
 34 | local usage
 35 | 
 36 | -- Global switches, keyed by flag: l is the usage label, f(args, cfg) the handler.
 37 | local switches = {
 38 |     ['-v'] = {l="verbose", f=function(args, cfg) cfg.verbose = true end },
 39 |     ['-d'] = {l="dry_run", f=function(args, cfg) cfg.dry_run = true end },
 40 |     ['-s'] = {l="store path", f=function(args, cfg) cfg.base_path = table.remove(args, 1) end },
 41 | }
 42 | 
 43 | -- Command table, keyed by name: l is the usage label, f(args, cfg) the handler.
 44 | cmds = {
 45 |     help = {l="print this message", f=function(args, cfg) usage() end },
 46 |     init = {l="initialize data store", f=tangram.cmds.cmd_init,
 47 |             o="-r RH_BITS -b BRANCH_FACTOR"},
 48 |     version = {l="print version",
 49 |                f=function(args, cfg) print(DEFAULTS.version); os.exit(0) end },
 50 |     add = {l="add a file", f=tangram.cmds.cmd_add, o="PATH"},
 51 |     get = {l="get a file", f=tangram.cmds.cmd_get, o="-r RANGE NAME"},
 52 |     list = {l="list known files", f=tangram.cmds.cmd_list },
 53 |     test = {l="run tests", f=tangram.cmds.cmd_test },
 54 |     info = {l="get info", f=tangram.cmds.cmd_info, o="ID"},
 55 |     forget = {l="forget a file", f=tangram.cmds.cmd_forget, o="ID"},
 56 |     prop = {l="get/set property", f=tangram.cmds.cmd_prop},
 57 |     search = {l="search", f=tangram.cmds.cmd_search},
 58 |     gc = {l="collect garbage", f=tangram.cmds.cmd_gc },
 59 | }
 60 | 
 61 | -- Print the banner plus every global switch and command with its label,
 62 | -- then exit with status 0.
 63 | function usage()
 64 |     local out = {}
 65 |     local function emit(...) out[#out + 1] = string.format(...) end
 66 |     local function section(title, entries)
 67 |         emit(title)
 68 |         for name, spec in pairs(entries) do emit("    %s: %s\n", name, spec.l) end
 69 |     end
 70 |     emit("tangram: jumprope-based archiver by %s\n", DEFAULTS.author)
 71 |     emit("    version %s\n", DEFAULTS.version)
 72 |     emit("Usage: \n")
 73 |     section("  Arguments\n", switches)
 74 |     section("  Commands\n", cmds)
 75 |     io.write(table.concat(out))
 76 |     os.exit(0)
 77 | end
 78 | 
 79 | -- Consume leading global switches from ARG up to and including the
 80 | -- command name, and return the resulting config table. Whatever is left
 81 | -- in ARG afterwards stays there for the command handler.
 82 | local function proc_args(arg)
 83 |     local cfg = { base_path = os.getenv("TANGRAM_PATH") }
 84 | 
 85 |     local word = table.remove(arg, 1)
 86 |     while word do
 87 |         local command = cmds[word]
 88 |         if command then cfg.cmd = command; break end
 89 |         local switch = switches[word]
 90 |         if not switch then print("Bad arg: ", word); usage() end
 91 |         switch.f(arg, cfg)
 92 |         word = table.remove(arg, 1)
 93 |     end
 94 | 
 95 |     cfg.bits = cfg.bits or DEFAULTS.rh_bits
 96 |     cfg.base_path = cfg.base_path or DEFAULTS.base_path
 97 | 
 98 |     -- Ensure trailing "/" for base path.
 99 |     if cfg.base_path:sub(-1) ~= "/" then cfg.base_path = cfg.base_path .. "/" end
100 | 
101 |     return cfg
102 | end
103 | 
104 | -- Entry point: show usage when ARG is empty; otherwise parse it and run
105 | -- the selected command (if any) with the arguments that remain.
106 | function main(arg)
107 |     if #arg <= 0 then
108 |         usage()
109 |         return
110 |     end
111 |     local cfg = proc_args(arg)
112 |     if cfg.cmd then cfg.cmd.f(arg, cfg) end
113 | end
114 | 


--------------------------------------------------------------------------------
/tangram/test_db.lua:
--------------------------------------------------------------------------------
  1 | local db = tangram.db
  2 | 
  3 | module(..., package.seeall)
  4 | 
  5 | function test_db_creation()   -- smoke test: init_db() with no arguments must pass assert_true
  6 |     assert_true(db.init_db())
  7 | end
  8 | 
  9 | local exhash = "970318968feb640da723b8826861e41f0718a487"
 10 | 
 11 | local function def_db()   -- init_db() with no arguments; raises via assert if it returns nil/false
 12 |     return assert(db.init_db())
 13 | end
 14 | 
 15 | -- The first file added to a new DB should get id 1 and then be listed
 16 | -- by get_files().
 17 | function test_db_and_file_and_check()
 18 |     local store = def_db()
 19 |     local file_id = store:add_file(exhash, "bananas.txt", 23)
 20 |     assert_equal(1, file_id)
 21 |     local seen
 22 |     for entry in store:get_files() do if entry.id == 1 then seen = true end end
 23 |     assert_true(seen)
 24 | end
 25 | 
 26 | -- After rm_file(1) reports success, get_files() must no longer yield id 1.
 27 | function test_db_add_and_remove()
 28 |     local store = def_db()
 29 |     local file_id = store:add_file(exhash, "bananas.txt", 23)
 30 |     assert_equal(1, file_id)
 31 |     assert_true((store:rm_file(1)))
 32 |     for entry in store:get_files() do
 33 |         if entry.id == 1 then fail("not deleted") end
 34 |     end
 35 | end
 36 | 
 37 | -- get_hash_completions(prefix) should yield only hashes starting with prefix.
 38 | function test_db_hash_completions()
 39 |     local store = def_db()
 40 |     local function completions()   -- hashes completing exhash's first 4 chars
 41 |         local found = {}
 42 |         for row in store:get_hash_completions(exhash:sub(1,4)) do found[#found+1] = row.hash end
 43 |         return found
 44 |     end
 45 | 
 46 |     store:add_file(exhash, "bananas.txt", 23)
 47 |     assert_equal(exhash, completions()[1])
 48 | 
 49 |     local hash_h = "ffff" .. exhash:sub(5)       -- same tail, different head
 50 |     local hash_t = exhash:sub(1, -5) .. "ffff"   -- same head, different tail
 51 |     store:add_file(hash_h, "head.txt", 10)
 52 |     store:add_file(hash_t, "tail.txt", 20)
 53 |     local hashes = completions()
 54 |     assert_equal(2, #hashes)
 55 |     table.sort(hashes)
 56 |     assert_equal(exhash, hashes[1])
 57 |     assert_equal(hash_t, hashes[2])
 58 | end
 59 | 
 60 | -- A property stored with add_property() should come back from get_properties().
 61 | function test_add_property()
 62 |     local store = def_db()
 63 |     local file_id = store:add_file(exhash, "bananas.txt", 23)
 64 |     store:add_property(file_id, "version", "1")
 65 |     assert_equal("1", store:get_properties(file_id).version)
 66 | end
 67 | 
 68 | -- After rm_property(id), get_properties(id) should no longer report "version".
 69 | function test_add_and_rm_property()
 70 |     local store = def_db()
 71 |     local file_id = store:add_file(exhash, "bananas.txt", 23)
 72 |     store:add_property(file_id, "version", "1")
 73 |     store:rm_property(file_id)
 74 |     assert_equal(nil, store:get_properties(file_id).version)
 75 | end
 76 | 
 77 | -- search_name() with a fragment of the name should yield the added file.
 78 | function test_search_name()
 79 |     local store = def_db()
 80 |     local file_id = store:add_file(exhash, "bananas.txt", 23)
 81 |     for hit in store:search_name("bananas") do
 82 |         if hit.id == file_id then return end
 83 |     end
 84 |     fail("not found")
 85 | end
 86 | 
 87 | -- search_hash() with the full hash should yield the added file.
 88 | function test_search_hash()
 89 |     local store = def_db()
 90 |     local file_id = store:add_file(exhash, "bananas.txt", 23)
 91 |     for hit in store:search_hash(exhash) do
 92 |         if hit.id == file_id then return end
 93 |     end
 94 |     fail("not found")
 95 | end
 96 | 
 97 | -- search_property() with a property key should yield the file carrying it.
 98 | function test_search_property()
 99 |     local store = def_db()
100 |     local file_id = store:add_file(exhash, "bananas.txt", 23)
101 |     store:add_property(file_id, "version", "1")
102 |     for hit in store:search_property("version") do
103 |         if hit.id == file_id then return end
104 |     end
105 |     fail("not found")
106 | end
107 | 


--------------------------------------------------------------------------------
/tangram/test_jumprope.lua:
--------------------------------------------------------------------------------
  1 | require "random"
  2 | local jumprope = tangram.jumprope
  3 | 
  4 | -- This loads either slncrypto (preferred) or luacrypto, which both
  5 | -- install as "crypto".
  6 | require "crypto"
  7 | 
  8 | local floor = math.floor
  9 | 
 10 | module(..., package.seeall)
 11 | 
 12 | -- Per-JumpropeSet put counts; weak *keys* (numbers are never collected), so sets can be GC'd.
 13 | local counts = setmetatable({}, {__mode="k"})
 14 | 
 15 | local sha1                         -- string->sha1 function
 16 | if crypto.sha1 ~= nil then         -- prefer slncrypto
 17 |     sha1 = function(s) return crypto.sha1(s):lower() end
 18 | elseif crypto.digest ~= nil then   -- luacrypto: digest the argument itself
 19 |     sha1 = function(s) return crypto.digest("sha1", s) end
 20 | end
 21 | 
 22 | -- Build a JumpropeSet whose chunks are kept in a plain Lua table.
 23 | -- bf is the branch factor passed to jumprope.init (default 64).
 24 | -- Returns the set plus a counter function: it returns the count recorded
 25 | -- for this set before the call, and adds one when given a true argument
 26 | -- (which is what put() does for every hash it has not stored yet).
 27 | function in_mem_JumpropeSet(bf)
 28 |    local chunks = {}
 29 |    local count_fun         -- assigned below, once the set itself exists
 30 | 
 31 |    local function get(hash)
 32 |       local data = chunks[hash]
 33 |       if not data then error("unknown hash: " .. hash) end
 34 |       return data
 35 |    end
 36 | 
 37 |    local function put(hash, data)
 38 |       assert(data, "no data")
 39 |       if not chunks[hash] then count_fun(true) end
 40 |       chunks[hash] = data
 41 |       return true
 42 |    end
 43 | 
 44 |    local function exists(hash) return chunks[hash] ~= nil end
 45 | 
 46 |    local jrs = jumprope.init {get=get, put=put, exists=exists, hash=sha1,
 47 |                               branch_factor=bf or 64}
 48 |    count_fun = function(n)
 49 |       local before = counts[jrs] or 0
 50 |       if n then counts[jrs] = before + 1 end
 51 |       return before
 52 |    end
 53 |    return jrs, count_fun
 54 | end
 55 | 
 56 | local concat = table.concat
 57 | local char = string.char
 58 | 
 59 | -- Build a string of SZ characters drawn from a generator seeded with SEED (default 1).
 60 | function mk_random_string(sz, seed)
 61 |    local rng = random.new()
 62 |    rng:seed(seed or 1)
 63 |    local bytes = {}
 64 |    for i = 1, sz do
 65 |       bytes[i] = char(rng:value(256) - 1)
 66 |    end
 67 |    return concat(bytes)
 68 | end
 69 | 
 70 | -- Sinking only "" and sinking nothing at all should give equal head hashes.
 71 | function test_two_empty_JRs_should_have_the_same_head_hash()
 72 |    local set = in_mem_JumpropeSet()
 73 |    local with_empty, untouched = set:new(), set:new()
 74 |    -- First rope: sink the empty string, then finish.
 75 |    assert(with_empty:sink(""))
 76 |    assert(with_empty:finish())
 77 | 
 78 |    -- Second rope: finish immediately.
 79 |    assert(untouched:finish())
 80 | 
 81 |    assert_true(with_empty:head(), "head hash should exist")
 82 |    assert_equal(with_empty:head(), untouched:head(), "head hashes should match")
 83 | end
 83 | 
 84 | -- A rope that only received "" should still report a count of 1.
 85 | function test_empty_JRs_should_have_one_node()
 86 |    local set = in_mem_JumpropeSet()
 87 |    local rope = set:new()
 88 |    assert(rope:sink(""))
 89 |    assert(rope:finish())
 90 |    assert_equal(1, rope:count())
 91 | end
 92 | 
 93 | -- Two ropes fed the identical single string must agree on the head hash.
 94 | function test_two_JRs_with_the_same_single_string_should_have_the_same_hash()
 95 |    local set = in_mem_JumpropeSet()
 96 |    local first, second = set:new(), set:new()
 97 |    local text = "brevity is the soul of wit"
 98 | 
 99 |    for _, rope in ipairs({first, second}) do
100 |       assert(rope:sink(text))
101 |       assert(rope:finish())
102 |    end
103 | 
104 |    assert_true(first:head(), "should exist")
105 |    assert_equal(first:head(), second:head(), "head hashes should match")
106 | end
107 | 
108 | -- Feeding the same text one character at a time into two ropes must
109 | -- also produce matching head hashes.
110 | function test_two_JRs_with_the_same_set_of_strings_should_have_the_same_hash()
111 |    local set = in_mem_JumpropeSet()
112 |    local first, second = set:new(), set:new()
113 |    local text = "brevity is the soul of wit"
114 | 
115 |    for _, rope in ipairs({first, second}) do
116 |       for c in text:gmatch("(.)") do assert(rope:sink(c)) end
117 |       assert(rope:finish())
118 |    end
119 |    assert_true(first:head(), "should exist")
120 |    assert_equal(first:head(), second:head(), "head hashes should match")
121 | end
122 | 
123 | -- Iterate over S in consecutive slices of at most CHUNK_SIZE bytes.
124 | function iter_str(s, chunk_size)
125 |    local pos, total = 1, s:len()
126 |    return function()
127 |       if pos > total then return nil end
128 |       local first = pos; pos = pos + chunk_size
129 |       return s:sub(first, pos - 1)
130 |    end
131 | end
132 | 
133 | -- Two ropes fed the same 1 MB of random data should share a head hash.
134 | function test_test_two_JRs_with_the_same_large_string_should_have_the_same_hash()
135 |    local set = in_mem_JumpropeSet()
136 |    local first, second = set:new(), set:new()
137 | 
138 |    -- 1 MB string of random binary data
139 |    local data = mk_random_string(1024 * 1024, 23)
140 | 
141 |    -- feed both ropes the same 1kb chunks, in lockstep
142 |    for chunk in iter_str(data, 1024) do
143 |       first:sink(chunk)
144 |       second:sink(chunk)
145 |    end
146 |    first:finish()
147 |    second:finish()
148 | 
149 |    assert_equal(first:head(), second:head())
150 | end
151 | 
152 | -- Changing one 1kb chunk out of a 1 MB input should add few nodes to the
153 | -- shared store: the growth is checked against 1% of the baseline count
154 | -- (assuming lunatest's assert_lte(lim, val) means val <= lim).
155 | function test_two_JRs_with_the_same_string_should_add_few_new_nodes_when_changed()
156 |    local set, stored_count = in_mem_JumpropeSet()
157 |    local original, edited = set:new(), set:new()
158 | 
159 |    -- 1 MB string of random binary data
160 |    local data = mk_random_string(1024 * 1024, 23)
161 | 
162 |    -- baseline: the unmodified data, in 1kb chunks
163 |    for chunk in iter_str(data, 1024) do
164 |       original:sink(chunk)
165 |    end
166 |    local ok, err = original:finish()
167 |    assert(ok, err)
168 | 
169 |    local baseline = stored_count()
170 | 
171 |    -- same data again, except the 100th chunk becomes 1024 "x" bytes
172 |    local nth = 0
173 |    for chunk in iter_str(data, 1024) do
174 |       nth = nth + 1
175 |       edited:sink(nth == 100 and ("x"):rep(1024) or chunk)
176 |    end
177 | 
178 |    assert(edited:finish())
179 | 
180 |    local added = stored_count() - baseline
181 | 
182 |    assert_not_equal(original:head(), edited:head(), "head hashes should not match")
183 |    assert_lte(0.01 * baseline, added)
184 | end
185 | 
186 | -- Test that finish -> pop_limb computes limb size correctly
187 | function test_sink_100_one_byte_chunks_and_total_length()
188 |    local set = in_mem_JumpropeSet()
189 |    local rope = set:new()
190 |    local total = 100
191 | 
192 |    -- sink the digits "0" .. "9" round-robin, one byte per call
193 |    for n = 1, total do
194 |       local digit = tostring((n - 1) % 10)
195 |       rope:sink(digit)
196 |    end
197 |    assert(rope:finish())
198 | 
199 |    assert_equal(total, rope:size(), "size should match")
200 | end
201 | 
202 | -- Test that terminate_branch computes limb size correctly
203 | function test_sink_1000_one_byte_chunks_and_total_length()
204 |    local set = in_mem_JumpropeSet()
205 |    local rope = set:new()
206 |    local total = 1000
207 | 
208 |    -- sink the digits "0" .. "9" round-robin, one byte per call
209 |    for n = 1, total do
210 |       local digit = tostring((n - 1) % 10)
211 |       rope:sink(digit)
212 |    end
213 |    assert(rope:finish())
214 | 
215 |    assert_equal(total, rope:size(), "size should match")
216 | end
217 | 
218 | -- When the store's put() raises, sink() is expected to hand the failure
219 | -- back as (nil, message) instead of succeeding.
220 | function test_put_failures_should_be_passed_to_user()
221 |    local puts_left = 5
222 |    local function nop() end
223 |    local function put(hash, data)      -- the fifth call raises "fail"
224 |       puts_left = puts_left - 1
225 |       if puts_left == 0 then error("fail", 0) end
226 |       return true
227 |    end
228 | 
229 |    local rope = jumprope.init({put=put, get=nop, exists=nop, hash=sha1}):new()
230 | 
231 |    -- the first four sinks must succeed
232 |    for attempt = 1, 4 do
233 |       assert((rope:sink("blah")))
234 |    end
235 |    local ok, err = rope:sink("blah")   -- fifth sink
236 |    assert_nil(ok, "should fail")
237 |    assert_equal("fail", err, "should get error message")
238 | end
239 | 
240 | -- Check within_span against 6-byte windows of the alphabet; judging by
241 | -- the cases below, FROM/TO are 0-based offsets and TO is exclusive.
242 | function test_within_span()
243 |    local within_span = jumprope.within_span
244 |    local s = "abcdefghijklmnopqrstuvwxyz"
245 |    local function ws(exp, offset, from, to)
246 |       local window = s:sub(offset + 1, offset + 1 + 5)
247 |       assert_equal(exp, within_span(window, offset, from, to, 5))
248 |    end
249 |    ws("a", 0, 0, 1)
250 |    ws("b", 0, 1, 2)
251 |    ws("b", 1, 1, 2)
252 |    ws("cdef", 1, 2, 6)
253 |    ws("yz", 24, 24, 26)
254 |    ws("z", 25, 25, 26)
255 | 
256 | -- Compare two strings piecewise (head, tail, length, then full equality) so
257 | -- an off-by-one shows up as a short message instead of two huge strings.
258 | local function off_by_one_check(rejoined, expected)
259 |    assert_equal(expected:sub(1, 2), rejoined:sub(1, 2), "first 2 chars should match")
260 |    assert_equal(expected:sub(-2), rejoined:sub(-2), "last 2 chars should match")
261 |    assert_equal(#expected, #rejoined, "sizes should match")
262 |    assert_true(expected == rejoined, "should equal input")
263 | end
264 | 
265 | -- Round-trip helper: sink S into a fresh rope in CHUNK_SZ pieces, stream
266 | -- the range FROM..TO back, and compare it with s:sub(from + 1, to).
267 | function check_it(s, chunk_sz, from, to)
268 |    local set = in_mem_JumpropeSet()
269 |    local rope = set:new()
270 | 
271 |    -- add in chunk_sz pieces
272 |    for piece in iter_str(s, chunk_sz) do
273 |       rope:sink(piece)
274 |    end
275 |    rope:finish()
276 |    assert_equal(#s, rope:size(), "j:size() is incorrect")
277 | 
278 |    -- collect whatever the stream yields for the requested range
279 |    local parts = {}
280 |    local stream = assert(rope:stream(from, to))
281 |    for piece in stream do
282 |       parts[#parts + 1] = piece
283 |    end
284 | 
285 |    local expected = s:sub(from + 1, to)
286 |    local rejoined = concat(parts)
287 | 
288 |    off_by_one_check(rejoined, expected)
289 | end
290 | 
291 | -- Sweep chunk sizes 2-4, start offsets 0-25 and lengths 1-6 over the alphabet.
292 | function test_get_content_from_part_of_small_string()
293 |    local alphabet = "abcdefghijklmnopqrstuvwxyz"
294 |    for chunk_sz = 2, 4 do
295 |       for first = 0, 25 do
296 |          for span = 1, 6 do
297 |             check_it(alphabet, chunk_sz, first, first + span)
298 |          end
299 |       end
300 |    end
301 | end
302 | 
303 | function test_a_jumprope_iterator_should_return_the_same_content_as_the_original_input()
304 |    -- 1 MB of random binary data, sunk in 63-byte chunks; stream 0 .. len - 1
305 |    local data = mk_random_string(1024 * 1024, 23)
306 |    check_it(data, 63, 0, #data - 1)
307 | end
308 | 
309 | function test_get_content_from_halfway_to_the_end()
310 |    -- 1 MB of random binary data; stream from the midpoint onwards
311 |    local total = 1024 * 1024
312 |    local data = mk_random_string(total, 23)
313 |    local midpoint = floor(total / 2)
314 |    check_it(data, 63, midpoint, total - 1)
315 | end
316 | 
317 | function test_get_the_first_half_of_the_content()
318 |    -- 1 MB of random binary data; stream only up to the midpoint
319 |    local total = 1024 * 1024
320 |    local data = mk_random_string(total, 23)
321 |    check_it(data, 63, 0, floor(total / 2))
322 | end
323 | 
324 | -- A rope reopened from its head hash should stream back exactly the
325 | -- data that was sunk and report the same size as the original rope.
326 | function test_open_existing_jumprope()
327 |    local total = 1024 * 1024
328 |    local data = mk_random_string(total, 27)
329 | 
330 |    local chunk_sz = 999
331 |    local from, to = 0, total
332 | 
333 |    local set = in_mem_JumpropeSet()
334 |    local written = set:new()
335 | 
336 |    -- add in chunk_sz pieces
337 |    for piece in iter_str(data, chunk_sz) do
338 |       written:sink(piece)
339 |    end
340 | 
341 |    local head = written:finish()
342 |    assert_true(head, "Didn't return headhash")
343 | 
344 |    -- reopen by head hash and stream the requested range back
345 |    local reopened = set:open(head)
346 |    local parts = {}
347 |    local stream = assert(reopened:stream(from, to))
348 | 
349 |    for piece in stream do
350 |       parts[#parts + 1] = piece
351 |    end
352 | 
353 |    -- check content
354 |    local rejoined = table.concat(parts)
355 |    off_by_one_check(rejoined, data)
356 | 
357 |    -- check size
358 |    assert_equal(written:size(), reopened:size(), "Size doesn't match")
359 | end
360 | 


--------------------------------------------------------------------------------