├── .gitignore ├── platforms ├── .gitignore ├── darwin_386 │ └── .gitignore ├── darwin_amd64 │ └── .gitignore ├── installers │ └── osx │ │ ├── .gitignore │ │ ├── resources │ │ ├── welcome.html │ │ ├── conclusion.html │ │ └── license.html │ │ ├── Makefile │ │ └── distribution.xml ├── linux_386 │ ├── .gitignore │ └── Vagrantfile ├── linux_amd64 │ ├── .gitignore │ └── Vagrantfile ├── README.md ├── Makefile ├── tar.README.md └── archive.py ├── data └── data.go ├── data_version.go ├── data_info.go ├── Makefile ├── regexp.go ├── data_list.go ├── dev ├── changelog.md ├── cli.md ├── formats.md ├── roadmap.md └── designdoc.md ├── serialize.go ├── data_handle.go ├── datafile.go ├── s3store.go ├── field_user_input.go ├── data_index.go ├── commands.go ├── data_publish.go ├── data_ref.go ├── data_get.go ├── data_config.go ├── util.go ├── data_user.go ├── data_manifest.go ├── data_pack.go ├── README.md └── data_blob.go /.gitignore: -------------------------------------------------------------------------------- 1 | datasets/ 2 | data/data 3 | -------------------------------------------------------------------------------- /platforms/.gitignore: -------------------------------------------------------------------------------- 1 | archives/ 2 | -------------------------------------------------------------------------------- /platforms/darwin_386/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | data-v*-*/ 3 | -------------------------------------------------------------------------------- /platforms/darwin_amd64/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | data-v*-*/ 3 | -------------------------------------------------------------------------------- /platforms/installers/osx/.gitignore: -------------------------------------------------------------------------------- 1 | *.pkg 2 | root/ 3 | -------------------------------------------------------------------------------- /platforms/linux_386/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | data-v*-*/ 3 | .vagrant/ 4 | -------------------------------------------------------------------------------- /platforms/linux_amd64/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | data-v*-*/ 3 | .vagrant/ 4 | -------------------------------------------------------------------------------- /data/data.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "github.com/jbenet/data" 6 | "os" 7 | ) 8 | 9 | // This package (data/data) builds the `data` commandline tool. 10 | // Everything is in the proper data library package. This extra 11 | // package is necessary because packages must yield _either_ a 12 | // library or executable. `data` needed to be both, hence this. 13 | 14 | func main() { 15 | err := data.Cmd_data.Dispatch(os.Args[1:]) 16 | if err != nil { 17 | if len(err.Error()) > 0 { 18 | fmt.Fprintf(os.Stderr, "%v\n", err) 19 | } 20 | os.Exit(1) 21 | } 22 | return 23 | } 24 | -------------------------------------------------------------------------------- /platforms/installers/osx/resources/welcome.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 |

data

11 |

dataset package manager

12 | Publish and download datasets as easily as: 13 |
14 |
15 |
16 | > data publish foo/bar
17 | > data get foo/bar
18 | 
19 |
20 | data is a simple commandline tool with a supporting package index on the web. Use data get to find, download, and manage your datasets. When you're ready to publish data to the world, just run data publish. 21 | 22 |
23 |
24 | Press the "Continue" button below. 25 | 26 | 27 | -------------------------------------------------------------------------------- /platforms/README.md: -------------------------------------------------------------------------------- 1 | # building data 2 | 3 | At present (Go 1.2), cross-compiling go does not work with cgo. It seems 4 | (not actually sure, as not very familiar with all the deps) data uses cgo 5 | extensively. While there seems to be a gcc work-around, it would be useful 6 | to test the tool in all the platforms. Thus, for now, all supported archs 7 | will have a vm in this directory. The process, then, is: 8 | 9 | 1. setup + launch the vm 10 | 1. compile + test data in vm 11 | 1. place release binary in `/platforms//data` 12 | 1. `make -tar` + `make dist` to package bins up 13 | 14 | ## TODO 15 | 16 | 17 | Add VMs: 18 | 19 | - Windows: http://www.modern.ie/en-us/virtualization-tools#downloads 20 | - Darwin 21 | - BSD 22 | -------------------------------------------------------------------------------- /data_version.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "github.com/gonuts/flag" 5 | "github.com/jbenet/commander" 6 | ) 7 | 8 | const Version = "0.1.1" 9 | 10 | var cmd_data_version = &commander.Command{ 11 | UsageLine: "version", 12 | Short: "Show data version information.", 13 | Long: `data version - Show data version information. 14 | 15 | Returns the current version of data and exits. 16 | `, 17 | Run: versionCmd, 18 | Flag: *flag.NewFlagSet("data-user-auth", flag.ExitOnError), 19 | } 20 | 21 | func init() { 22 | cmd_data_version.Flag.Bool("number", false, "show only the number") 23 | } 24 | 25 | func versionCmd(c *commander.Command, _ []string) error { 26 | number := c.Flag.Lookup("number").Value.Get().(bool) 27 | if !number { 28 | pOut("data version ") 29 | } 30 | pOut("%s\n", Version) 31 | return nil 32 | } 33 | -------------------------------------------------------------------------------- /data_info.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "fmt" 5 | "github.com/jbenet/commander" 6 | ) 7 | 8 | var cmd_data_info = &commander.Command{ 9 | UsageLine: "info []", 10 | Short: "Show dataset information.", 11 | Long: `data info - Show dataset information. 12 | 13 | Returns the Datafile corresponding to (or in current 14 | directory) and exits. 15 | `, 16 | Run: infoCmd, 17 | } 18 | 19 | func infoCmd(c *commander.Command, args []string) error { 20 | if len(args) < 1 { 21 | return datasetInfo(DatafileName) 22 | } 23 | 24 | return datasetInfo(DatafilePath(args[0])) 25 | } 26 | 27 | func datasetInfo(path string) error { 28 | df, err := NewDatafile(path) 29 | if err != nil { 30 | dErr("Error: %s\n", err) 31 | return fmt.Errorf("Invalid dataset path: %s", path) 32 | } 33 | 34 | buf, err := df.Marshal() 35 | if err != nil { 36 | return err 37 | } 38 | 39 | pOut("%s\n", buf) 40 | return nil 41 | } 42 | -------------------------------------------------------------------------------- /platforms/installers/osx/resources/conclusion.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 |

Installation complete!

11 | data is now installed on your system. To see a list of available commands, open a terminal and enter: 12 |
13 |
14 |
15 | > data
16 | 
17 |
18 | You can now install any datadex dataset from the commandline. For example, to get the MNIST dataset, run: 19 |
20 |
21 |
22 | > data get jbenet/mnist
23 | 
24 |
25 | Find datasets and more information at http://datadex.io, or in these documents: 26 |
27 |
28 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /platforms/installers/osx/Makefile: -------------------------------------------------------------------------------- 1 | ifneq ($(shell uname),Darwin) 2 | $(error "Making osx installers is only supported in osx.") 3 | endif 4 | 5 | all: pkg 6 | 7 | # for now, amd64. lipo both together later. 8 | BIN=../../darwin_amd64/data 9 | VERSION=$(shell $(BIN) version --number) 10 | PKGNAME=data-v$(VERSION)-OSX-Installer 11 | 12 | # setup the package contents 13 | root: root/usr/bin/data 14 | 15 | root/usr/bin/data: $(BIN) 16 | -mkdir -p $(dir $@) 17 | cp $< $@ 18 | 19 | # build the packages 20 | data.pkg: root root/usr/bin/data 21 | pkgbuild \ 22 | --root root \ 23 | --identifier io.datadex.data \ 24 | --version $(VERSION) \ 25 | --ownership recommended \ 26 | data.pkg 27 | 28 | $(PKGNAME).pkg: data.pkg distribution.xml 29 | productbuild \ 30 | --distribution distribution.xml \ 31 | --resources resources \ 32 | --package-path data.pkg \ 33 | --version $(VERSION) \ 34 | $(PKGNAME).pkg 35 | 36 | pkg: $(PKGNAME).pkg 37 | 38 | clean: 39 | rm -rf -- root/ 40 | rm -f -- *.pkg 41 | -------------------------------------------------------------------------------- /platforms/Makefile: -------------------------------------------------------------------------------- 1 | 2 | VERSION=$(shell data version --number) 3 | 4 | PLATFORMS= \ 5 | darwin_amd64 \ 6 | linux_386 \ 7 | linux_amd64 \ 8 | # darwin_386 \ 9 | # windows_386 \ 10 | # windows_amd64 \ 11 | 12 | BINS=$(addsuffix /data,$(PLATFORMS)) 13 | 14 | ARCHIVES=$(patsubst %,archives/data-v$(VERSION)-%.tar.gz,$(PLATFORMS)) 15 | 16 | OTHER= \ 17 | installers/osx/data-v$(VERSION)-OSX-Installer.pkg 18 | 19 | all: $(ARCHIVES) $(OTHER) 20 | 21 | %s: %s/data 22 | 23 | linux_%/data: 24 | -rm $@ 25 | cd $(dir $@) && \ 26 | vagrant up && \ 27 | vagrant ssh -c "source ~/.bashrc; cd data; make deps; make;" && \ 28 | vagrant suspend 29 | 30 | darwin_%/data: 31 | cd ../ && $(MAKE) 32 | 33 | windows_%/data: 34 | $(error not implemented) 35 | 36 | archives/data-v$(VERSION)-%.tar.gz: %/data 37 | ./archive.py $( help 41 | 42 | To see a reference of all data commands run: 43 | 44 | data commands help | less 45 | -------------------------------------------------------------------------------- /regexp.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "regexp" 5 | ) 6 | 7 | var UserRegexp *regexp.Regexp 8 | var IdentRegexp *regexp.Regexp 9 | var PathRegexp *regexp.Regexp 10 | var EmailRegexp *regexp.Regexp 11 | var HandleRegexp *regexp.Regexp 12 | var NonIdentRegexp *regexp.Regexp 13 | 14 | func init() { 15 | identRE := "[A-Za-z0-9-_.]+" 16 | pathRE := "((" + identRE + ")/(" + identRE + "))" 17 | handleRE := pathRE + "(\\." + identRE + ")?(@" + identRE + ")?" 
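// handleRE matches "author/dataset" handles with optional ".format" and "@version" suffixes.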
18 | emailRE := `(?i)[A-Z0-9._%+-]+@(?:[A-Z0-9-]+\.)+[A-Z]{2,6}` 19 | nonIdentRE := "[^A-Za-z0-9-_.]+" 20 | 21 | UserRegexp = compileRegexp("^" + identRE + "$") 22 | IdentRegexp = compileRegexp("^" + identRE + "$") 23 | PathRegexp = compileRegexp("^" + pathRE + "$") 24 | EmailRegexp = compileRegexp("^" + emailRE + "$") 25 | HandleRegexp = compileRegexp("^" + handleRE + "$") 26 | NonIdentRegexp = compileRegexp(nonIdentRE) 27 | } 28 | 29 | func compileRegexp(s string) *regexp.Regexp { 30 | r, err := regexp.Compile(s) 31 | if err != nil { 32 | pOut("%s", err) 33 | pOut("%v", r) 34 | panic("Regex does not compile: " + s) 35 | } 36 | return r 37 | } 38 | -------------------------------------------------------------------------------- /platforms/linux_386/Vagrantfile: -------------------------------------------------------------------------------- 1 | # github.com/jbenet/platform-vms/i386/linux_ubuntu/go 2 | 3 | Vagrant.configure("2") do |config| 4 | config.vm.box = 'precise32' 5 | config.vm.box_url = 'http://files.vagrantup.com/precise32.box' 6 | 7 | # synced files 8 | config.vm.synced_folder "../../", "/home/vagrant/go/src/github.com/jbenet/data" 9 | 10 | # increase VM memory to 512 MB 11 | config.vm.provider "virtualbox" do |v| 12 | v.customize ["modifyvm", :id, "--memory", "512"] 13 | end 14 | 15 | # run provisioning scripts 16 | config.vm.provision :shell, :inline => <<-eos 17 | 18 | # install tools 19 | apt-get install -y make 20 | apt-get install -y git bzr mercurial # for go get 21 | 22 | # install go 23 | echo "installing go..." 24 | cd /tmp 25 | wget -q https://go.googlecode.com/files/go1.2.linux-386.tar.gz 26 | tar xf go1.2.linux-386.tar.gz 27 | mv go /usr/local/go 28 | chown -R vagrant /home/vagrant/go 29 | ln -s go/src/github.com/jbenet/data /home/vagrant/data 30 | 31 | # setup go workspace 32 | echo "export GOROOT=/usr/local/go" >> /home/vagrant/.bash_profile 33 | echo "export GOPATH=/home/vagrant/go" >> /home/vagrant/.bash_profile 34 | echo "export PATH=\\$PATH:\\$GOROOT/bin:\\$GOPATH/bin" >> /home/vagrant/.bash_profile 35 | 36 | eos 37 | 38 | end 39 | -------------------------------------------------------------------------------- /platforms/linux_amd64/Vagrantfile: -------------------------------------------------------------------------------- 1 | # github.com/jbenet/platform-vms/amd64/linux_ubuntu/go 2 | 3 | Vagrant.configure("2") do |config| 4 | config.vm.box = 'precise64' 5 | config.vm.box_url = 'http://files.vagrantup.com/precise64.box' 6 | 7 | # synced files 8 | config.vm.synced_folder "../../", "/home/vagrant/go/src/github.com/jbenet/data" 9 | 10 | # increase VM memory to 512 MB 11 | config.vm.provider "virtualbox" do |v| 12 | v.customize ["modifyvm", :id, "--memory", "512"] 13 | end 14 | 15 | # run provisioning scripts 16 | config.vm.provision :shell, :inline => <<-eos 17 | 18 | # install tools 19 | apt-get install -y make 20 | apt-get install -y git bzr mercurial # for go get 21 | 22 | # install go 23 | echo "installing go..." 
24 | cd /tmp 25 | wget -q https://go.googlecode.com/files/go1.2.linux-amd64.tar.gz 26 | tar xf go1.2.linux-amd64.tar.gz 27 | mv go /usr/local/go 28 | chown -R vagrant /home/vagrant/go 29 | ln -s go/src/github.com/jbenet/data /home/vagrant/data 30 | 31 | # setup go workspace 32 | echo "export GOROOT=/usr/local/go" >> /home/vagrant/.bash_profile 33 | echo "export GOPATH=/home/vagrant/go" >> /home/vagrant/.bash_profile 34 | echo "export PATH=\\$PATH:\\$GOROOT/bin:\\$GOPATH/bin" >> /home/vagrant/.bash_profile 35 | 36 | eos 37 | 38 | end 39 | -------------------------------------------------------------------------------- /platforms/installers/osx/resources/license.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 |

The MIT License

11 |

Copyright (c) 2014 Juan Batiz-Benet

12 |

Permission is hereby granted, free of charge, to any person obtaining a copy 13 | of this software and associated documentation files (the "Software"), to deal 14 | in the Software without restriction, including without limitation the rights 15 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 16 | copies of the Software, and to permit persons to whom the Software is 17 | furnished to do so, subject to the following conditions:

18 |

The above copyright notice and this permission notice shall be included in 19 | all copies or substantial portions of the Software.

20 |

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 | THE SOFTWARE.

27 | 28 | 29 | -------------------------------------------------------------------------------- /platforms/installers/osx/distribution.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | data -- dataset package manager 4 | io.datadex 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | data.pkg 16 | 18 | 19 | 20 | 21 | 22 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /data_list.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "github.com/jbenet/commander" 5 | "io/ioutil" 6 | "path" 7 | ) 8 | 9 | var cmd_data_list = &commander.Command{ 10 | UsageLine: "list ", 11 | Short: "List installed datasets.", 12 | Long: `data list - List insalled datasets. 13 | 14 | Returns all the datasets installed in the dataset working directory, 15 | end exits. 16 | `, 17 | Run: listCmd, 18 | } 19 | 20 | func listCmd(*commander.Command, []string) error { 21 | return listDatasets(DatasetDir) 22 | } 23 | 24 | func listDatasets(dir string) error { 25 | authors, err := ioutil.ReadDir(dir) 26 | 27 | if err != nil { 28 | pErr("data: error reading dataset directory \"%s\"\n", dir) 29 | return err 30 | } 31 | 32 | // for each author dir 33 | for _, a := range authors { 34 | // skip hidden files 35 | if a.Name()[0] == '.' { 36 | continue 37 | } 38 | 39 | author := path.Join(dir, a.Name()) 40 | datasets, err := ioutil.ReadDir(author) 41 | if err != nil { 42 | continue 43 | } 44 | 45 | // for each dataset dir 46 | for _, d := range datasets { 47 | // skip hidden files 48 | if d.Name()[0] == '.' { 49 | continue 50 | } 51 | 52 | dataset := path.Join(a.Name(), d.Name()) 53 | datafile, err := NewDatafile(DatafilePath(dataset)) 54 | if err != nil { 55 | pErr("Error: %s\n", err) 56 | continue 57 | } 58 | 59 | pOut("%s\n", datafile.Dataset) 60 | } 61 | } 62 | 63 | return nil 64 | } 65 | -------------------------------------------------------------------------------- /dev/changelog.md: -------------------------------------------------------------------------------- 1 | # data changelog 2 | 3 | ## v0.1.1 2014-02-05 4 | 5 | - data help: groups commands 6 | - publish guide messages 7 | - default dataset id to cwd basename 8 | - changed Manifest -> .data/Manifest filename 9 | - data get: install path is handle 10 | - data get: no littering if not found 11 | - data blob: creates dir(path) 12 | - data config flexibility 13 | - semver support 14 | 15 | 16 | ## v0.1.0 2014-01-21 17 | 18 | First preview (alpha) 19 | 20 | - release builds 21 | - data commands (for reference) 22 | - data pack make -- Datafile defaults 23 | - datadex api suffix 24 | - data blob put -- verify hash 25 | - data blob {hash, check} 26 | - datadex interop 27 | - data config: env var, --edit 28 | - s3 token based auth for uploading 29 | - s3 anonymous downloading 30 | 31 | ## v0.0.5 2014-01-09 32 | 33 | Publishing + downloading packages. 34 | 35 | - data pack publish 36 | - data publish 37 | - data get (using pack) 38 | - data user {add, auth, pass, info, url} 39 | - data config 40 | 41 | ## v0.0.4 2014-01-03 42 | 43 | Manifest manipulation and packaging. 44 | 45 | - data manifest {add, rm, hash, check} 46 | - data pack {make, manifest, upload, download, check} 47 | 48 | ## v0.0.3 2013-12-13 49 | 50 | Uploading datasets. 51 | 52 | - data manifest (list + hash files) 53 | - data blob (blobs to storage service) 54 | 55 | 56 | ## v0.0.2 2013-11-24 57 | 58 | Downloading datasets. 
59 | 60 | - data get (downloads + installs a dataset) 61 | 62 | ## v0.0.1 2013-11-22 63 | 64 | Initial version. 65 | 66 | - command dispatch 67 | - datafile format (yml + structure) 68 | - datafile parsing (loading/dumping) 69 | - data version 70 | - data help (just usage for now) 71 | - data list (show installed datasets) 72 | - data info (loads/dumps dataset's Datafile) 73 | -------------------------------------------------------------------------------- /platforms/archive.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import re 5 | 6 | # constants 7 | readme_file = 'tar.README.md' 8 | semver_regx = r'^[0-9]+\.[0-9]+\.[0-9]+$' # lacks pre-releases/builds 9 | valid_archs = [ 10 | 'darwin_amd64', 11 | 'darwin_386', 12 | 'linux_amd64', 13 | 'linux_386', 14 | 'windows_amd64', 15 | 'windows_386', 16 | ] 17 | 18 | 19 | def check(cond, msg): 20 | if not cond: 21 | print 'Error:', msg 22 | exit(-1) 23 | 24 | def write_readme(output, arch, version): 25 | with open(output, 'w') as out: 26 | with open('../%s' % readme_file) as inp: 27 | txt = inp.read() 28 | txt = txt % {'arch': arch, 'version': version} 29 | out.write(txt) 30 | 31 | 32 | def make_archive(arch, vers): 33 | if arch not in valid_archs: 34 | print "Error: arch '%s' not supported" % arch 35 | return -1 36 | 37 | if not re.match(semver_regx, vers): 38 | print "Error: version '%s' is not like X.X.X" % vers 39 | return -1 40 | 41 | if not os.path.exists('%s/data' % arch): 42 | print "Error: binary '%s/data' not found" % arch 43 | return -1 44 | 45 | # move into arch dir 46 | os.chdir(arch) 47 | 48 | # setup directory 49 | dir = 'data-v%s-%s' % (vers, arch) 50 | os.system('mkdir -p %s' % dir) 51 | 52 | # write files 53 | os.system('cp data %s/data' % dir) 54 | write_readme('%s/README.md' % dir, arch, vers) 55 | 56 | # tar 57 | tar = '%s.tar.gz' % dir 58 | os.system('tar czf %s %s' % (tar, dir)) 59 | 60 | # move into place 61 | os.chdir('..') 62 | os.system('mkdir -p archives') 63 | os.system('mv %s/%s archives/%s' % (arch, tar, tar)) 64 | os.system('rm -rf %s/%s' % (arch, dir)) 65 | 66 | print 'packaged archives/%s' % tar 67 | return dir 68 | 69 | 70 | def main(): 71 | import sys 72 | if '-h' in sys.argv or len(sys.argv) < 3: 73 | print 'Usage: %s ' % sys.argv[0] 74 | print 'Prepares the release archive for a given architecture.' 
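# -h prints usage and exits 0; missing <arch> <version> args are an error (exit -1)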
75 | exit(0 if '-h' in sys.argv else -1) 76 | 77 | arch = sys.argv[1] 78 | vers = sys.argv[2] 79 | 80 | archs = valid_archs if arch == 'all' else [arch] 81 | 82 | for arch in archs: 83 | make_archive(arch, vers) 84 | 85 | 86 | if __name__ == '__main__': 87 | main() 88 | -------------------------------------------------------------------------------- /serialize.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "io/ioutil" 8 | "launchpad.net/goyaml" 9 | "os" 10 | "path" 11 | ) 12 | 13 | type SerializedFile struct { 14 | Path string "-" 15 | Format interface{} "-" 16 | } 17 | 18 | func (f *SerializedFile) Marshal() ([]byte, error) { 19 | dOut("Marshalling %s\n", f.Path) 20 | return goyaml.Marshal(f.Format) 21 | } 22 | 23 | func (f *SerializedFile) Unmarshal(buf []byte) error { 24 | err := goyaml.Unmarshal(buf, f.Format) 25 | if err != nil { 26 | return err 27 | } 28 | 29 | dOut("Unmarshalling %s\n", f.Path) 30 | return nil 31 | } 32 | 33 | func (f *SerializedFile) Write(w io.Writer) error { 34 | buf, err := f.Marshal() 35 | if err != nil { 36 | return err 37 | } 38 | 39 | _, err = w.Write(buf) 40 | return err 41 | } 42 | 43 | func (f *SerializedFile) Read(r io.Reader) error { 44 | buf, err := ioutil.ReadAll(r) 45 | if err != nil { 46 | return err 47 | } 48 | 49 | return f.Unmarshal(buf) 50 | } 51 | 52 | func (f *SerializedFile) WriteFile() error { 53 | if len(f.Path) < 1 { 54 | return fmt.Errorf("SerializedFile: No path provided for writing.") 55 | } 56 | 57 | buf, err := f.Marshal() 58 | if err != nil { 59 | return err 60 | } 61 | 62 | err = os.MkdirAll(path.Dir(f.Path), 0777) 63 | if err != nil { 64 | return err 65 | } 66 | 67 | return ioutil.WriteFile(f.Path, buf, 0666) 68 | } 69 | 70 | func (f *SerializedFile) ReadFile() error { 71 | if len(f.Path) < 1 { 72 | return fmt.Errorf("SerializedFile: No path provided for reading.") 73 | } 74 | 75 | buf, err := ioutil.ReadFile(f.Path) 76 | if err != nil { 77 | return err 78 | } 79 | 80 | return f.Unmarshal(buf) 81 | } 82 | 83 | func (f *SerializedFile) ReadBlob(ref string) error { 84 | i, err := NewMainDataIndex() 85 | if err != nil { 86 | return err 87 | } 88 | 89 | r, err := i.BlobStore.Get(BlobKey(ref)) 90 | if err != nil { 91 | return err 92 | } 93 | 94 | err = f.Read(r) 95 | if err != nil { 96 | return err 97 | } 98 | 99 | return nil 100 | } 101 | 102 | func Marshal(in interface{}) (io.Reader, error) { 103 | buf, err := goyaml.Marshal(in) 104 | if err != nil { 105 | return nil, err 106 | } 107 | 108 | // pOut("\n") 109 | // pOut("%s\n", buf) 110 | // pOut("\n") 111 | return bytes.NewReader(buf), nil 112 | } 113 | 114 | func Unmarshal(in io.Reader, out interface{}) error { 115 | buf, err := ioutil.ReadAll(in) 116 | if err != nil { 117 | return err 118 | } 119 | 120 | // pOut("\n") 121 | // pOut("%s\n", buf) 122 | // pOut("\n") 123 | return goyaml.Unmarshal(buf, out) 124 | } 125 | 126 | // Userful for converting between representations 127 | func MarshalUnmarshal(in interface{}, out interface{}) error { 128 | // struct -> yaml -> map for easy access 129 | rdr, err := Marshal(in) 130 | if err != nil { 131 | return err 132 | } 133 | 134 | return Unmarshal(rdr, out) 135 | } 136 | -------------------------------------------------------------------------------- /data_handle.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "fmt" 5 | "path" 6 | "strings" 7 | ) 8 | 9 | // 
/[.][@] 10 | 11 | type Handle struct { 12 | Author string 13 | Name string 14 | Format string 15 | Version string 16 | } 17 | 18 | // There are problems with goyaml setters/getters. 19 | // Unmarshaling fails. 20 | // 21 | // func (d Handle) GetYAML() (string, interface{}) { 22 | // pOut("GetYAML\n") 23 | // return "", d.string 24 | // } 25 | // 26 | // func (d Handle) SetYAML(tag string, value interface{}) bool { 27 | // s, ok := value.(string) 28 | // d.string = s 29 | // pOut("SetYAML %s %s\n", d.string, &d) 30 | // return ok 31 | // } 32 | 33 | func NewHandle(s string) *Handle { 34 | d := new(Handle) 35 | d.SetDataset(s) 36 | return d 37 | } 38 | 39 | func (d *Handle) Dataset() string { 40 | s := d.Path() 41 | 42 | if len(d.Format) > 0 { 43 | s = fmt.Sprintf("%s.%s", s, d.Format) 44 | } 45 | 46 | if len(d.Version) > 0 { 47 | s = fmt.Sprintf("%s@%s", s, d.Version) 48 | } 49 | 50 | return s 51 | } 52 | 53 | func (d *Handle) Path() string { 54 | return path.Join(d.Author, d.Name) 55 | } 56 | 57 | func (d *Handle) InstallPath() string { 58 | return path.Join(DatasetDir, d.Dataset()) 59 | } 60 | 61 | // order: rsplit @, split /, rsplit . 62 | func (d *Handle) SetDataset(s string) { 63 | // no / is invalid 64 | if strings.Index(s, "/") == 0 { 65 | return 66 | } 67 | 68 | nam_idx := strings.Index(s, "/") 69 | if nam_idx < 0 { 70 | nam_idx = 0 71 | } 72 | 73 | ver_idx := strings.LastIndex(s, "@") 74 | if ver_idx < 0 { 75 | ver_idx = len(s) // no version in handle. 76 | } 77 | 78 | // this precludes names that have periods... use different delimiter? 79 | fmt_idx := strings.LastIndex(s[nam_idx+1:ver_idx], ".") 80 | if fmt_idx < 0 { 81 | fmt_idx = ver_idx // no format in handle. 82 | } else { 83 | fmt_idx += nam_idx + 1 84 | } 85 | 86 | // parts 87 | d.Author = slice(s, 0, nam_idx) 88 | d.Name = slice(s, nam_idx+1, fmt_idx) 89 | d.Format = slice(s, fmt_idx+1, ver_idx) 90 | d.Version = slice(s, ver_idx+1, len(s)) 91 | } 92 | 93 | func (d *Handle) GoString() string { 94 | return d.Dataset() 95 | } 96 | 97 | func (d *Handle) Valid() bool { 98 | return IsDatasetHandle(d.Dataset()) 99 | } 100 | 101 | // utils 102 | 103 | func slice(s string, from int, to int) string { 104 | from = maxInt(from, 0) 105 | to = minInt(to, len(s)) 106 | return s[minInt(from, to):to] 107 | } 108 | 109 | // https://groups.google.com/forum/#!topic/golang-nuts/dbyqx_LGUxM is silly. 110 | func minInt(x, y int) (r int) { 111 | if x < y { 112 | return x 113 | } 114 | return y 115 | } 116 | 117 | func maxInt(x, y int) (r int) { 118 | if x > y { 119 | return x 120 | } 121 | return y 122 | } 123 | 124 | func handleError(handle string, problem string) error { 125 | return fmt.Errorf("Invalid handle (%s): %s", problem, handle) 126 | } 127 | 128 | func IsDatasetHandle(str string) bool { 129 | return HandleRegexp.MatchString(str) 130 | } 131 | -------------------------------------------------------------------------------- /datafile.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "path" 5 | ) 6 | 7 | /* 8 | # Datafile format 9 | # A YAML (inc json) doc with the following keys: 10 | 11 | # required 12 | handle: /[.][@] 13 | title: Dataset Title 14 | 15 | # optional functionality 16 | dependencies: [] 17 | formats: { : } 18 | 19 | # optional information 20 | description: Text describing dataset. 21 | repository: 22 | website: 23 | license: 24 | contributors: ["Author Name [] [(url)]>", ...] 
25 | sources: [] 26 | */ 27 | 28 | // Serializable into YAML 29 | type datafileContents struct { 30 | Dataset string 31 | Tagline string 32 | 33 | Mirrors []string ",omitempty" 34 | Dependencies []string ",omitempty" 35 | Formats map[string]string ",omitempty" 36 | 37 | Description string ",omitempty" 38 | Repository string ",omitempty" 39 | Website string ",omitempty" 40 | License string ",omitempty" 41 | Authors []string ",omitempty" 42 | Contributors []string ",omitempty" 43 | Sources []string ",omitempty" 44 | } 45 | 46 | type Datafile struct { 47 | SerializedFile "-" 48 | datafileContents ",inline" 49 | } 50 | 51 | const DatasetDir = "datasets" 52 | const DatafileName = "Datafile" 53 | 54 | func DatafilePath(dataset string) string { 55 | return path.Join(DatasetDir, dataset, DatafileName) 56 | } 57 | 58 | func NewDatafile(path string) (*Datafile, error) { 59 | df := &Datafile{SerializedFile: SerializedFile{Path: path}} 60 | df.SerializedFile.Format = df 61 | 62 | if len(path) > 0 { 63 | err := df.ReadFile() 64 | if err != nil { 65 | return df, err 66 | } 67 | } 68 | return df, nil 69 | } 70 | 71 | func NewDefaultDatafile() (*Datafile, error) { 72 | return NewDatafile(DatafileName) 73 | } 74 | 75 | func NewDatafileWithRef(ref string) (*Datafile, error) { 76 | f, _ := NewDatafile("") 77 | err := f.ReadBlob(ref) 78 | if err != nil { 79 | return nil, err 80 | } 81 | return f, nil 82 | } 83 | 84 | func (d *Datafile) Handle() *Handle { 85 | return NewHandle(d.Dataset) 86 | } 87 | 88 | func (d *Datafile) Valid() bool { 89 | return d.Handle().Valid() 90 | } 91 | 92 | // datafile manipulation utils 93 | 94 | // Return array of all Datafiles 95 | func NewDatafiles(filenames []string) ([]*Datafile, error) { 96 | files := []*Datafile{} 97 | for _, p := range filenames { 98 | f, err := NewDatafile(p) 99 | if err != nil { 100 | return nil, err 101 | } 102 | 103 | files = append(files, f) 104 | } 105 | return files, nil 106 | } 107 | 108 | // group Datafiles { path : [Datafile, ], } 109 | type DatafileGroupMap map[string][]*Datafile 110 | 111 | func GroupedDatafiles(files []*Datafile) *DatafileGroupMap { 112 | grouped := DatafileGroupMap{} 113 | 114 | for _, f := range files { 115 | group := f.Handle().Path() 116 | grouped[group] = append(grouped[group], f) 117 | } 118 | 119 | return &grouped 120 | } 121 | -------------------------------------------------------------------------------- /dev/cli.md: -------------------------------------------------------------------------------- 1 | ``` 2 | data 3 | 4 | version Show data version information. 5 | config Manage data configuration. 6 | info Show dataset information. 7 | list List installed datasets. 8 | get Download and install dataset. 9 | publish Guided dataset publishing. 10 | 11 | user Manage users and credentials. 12 | add Register new user with index. 13 | auth Authenticate user account. 14 | pass Change user password. 15 | info Show (or edit) public user information. 16 | url Output user profile url. 17 | 18 | manifest Generate and manipulate dataset manifest. 19 | add Adds to manifest (does not hash). 20 | rm Removes from manifest. 21 | hash Hashes and adds checksum to manifest. 22 | check Verifies checksum matches manifest. 23 | 24 | pack Dataset packaging, upload, and download. 25 | make Create or update package description. 26 | manifest Show current package manifest. 27 | upload Upload package to remote storage. 28 | download Download package from remote storage. 29 | checksum Verify all file checksums match. 
30 | 31 | blob Manage blobs in the blobstore. 32 | put Upload blob named by to blobstore. 33 | get Download blob named by from blobstore. 34 | url Output Url for blob named by . 35 | check Verify blob contents named by match . 36 | show Output blob contents named by . 37 | ``` 38 | 39 | git backed (use git internally to manage repository changes) 40 | 41 | ``` 42 | data 43 | 44 | version Show data version information. 45 | config Manage data configuration. 46 | info Show dataset information. 47 | list List installed datasets. 48 | get Download and install dataset. 49 | publish Guided dataset publishing. 50 | 51 | user Manage users and credentials. 52 | add Register new user with index. 53 | auth Authenticate user account. 54 | pass Change user password. 55 | info Show (or edit) public user information. 56 | url Output user profile url. 57 | 58 | pack Dataset packaging, upload, and download. 59 | add Add contents to package staging. 60 | rm Removes contents from package staging. 61 | status Show the working tree status. 62 | commit Record changes to package repository. 63 | upload Upload package to remote storage and post to index. 64 | download Download package from remote storage. 65 | checksum Verify all file checksums match. 66 | 67 | blob Manage blobs in the blobstore (unaware of pack) 68 | put Upload blob named from to blobstore. 69 | get Download blob named from blobstore to . 70 | check Verify blob contents in match . 71 | url Output Url for blob named by . 72 | show Output blob contents named by . 73 | ``` 74 | -------------------------------------------------------------------------------- /s3store.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "fmt" 5 | "github.com/jbenet/s3" 6 | "github.com/jbenet/s3/s3util" 7 | "io" 8 | "strings" 9 | ) 10 | 11 | type S3Store struct { 12 | bucket string 13 | domain string 14 | config *s3util.Config 15 | 16 | // used for auth credentials 17 | dataIndex *DataIndex 18 | } 19 | 20 | // format from `aws sts` cmd 21 | type AwsCredentials struct { 22 | SecretAccessKey string 23 | SessionToken string 24 | AccessKeyId string 25 | } 26 | 27 | func NewS3Store(bucket string, index *DataIndex) (*S3Store, error) { 28 | 29 | if len(bucket) < 1 { 30 | return nil, fmt.Errorf("Invalid (empty) S3 Bucket name.") 31 | } 32 | 33 | if index == nil { 34 | return nil, fmt.Errorf("Invalid (nil) DataIndex.") 35 | } 36 | 37 | s := &S3Store{ 38 | bucket: bucket, 39 | domain: "s3.amazonaws.com", 40 | dataIndex: index, 41 | } 42 | 43 | s.config = &s3util.Config{ 44 | Service: s3.DefaultService, 45 | Keys: new(s3.Keys), 46 | } 47 | 48 | return s, nil 49 | } 50 | 51 | func (s *S3Store) SetAwsCredentials(c *AwsCredentials) { 52 | s.config.AccessKey = c.AccessKeyId 53 | s.config.SecretKey = c.SecretAccessKey 54 | s.config.SecurityToken = c.SessionToken 55 | 56 | // pOut("Got Aws Credentials:\n") 57 | // pOut(" AccessKey: %s\n", s.config.AccessKey) 58 | // pOut(" SecretKey: %s\n", s.config.SecretKey) 59 | // pOut(" SessToken: %s\n\n", s.config.SecurityToken) 60 | } 61 | 62 | func (s *S3Store) AwsCredentials() *AwsCredentials { 63 | if s.config == nil || len(s.config.AccessKey) == 0 { 64 | return nil 65 | } 66 | 67 | return &AwsCredentials{ 68 | AccessKeyId: s.config.AccessKey, 69 | SecretAccessKey: s.config.SecretKey, 70 | SessionToken: s.config.SecurityToken, 71 | } 72 | } 73 | 74 | func (s *S3Store) Url(key string) string { 75 | if !strings.HasPrefix(key, "/") { 76 | key = "/" + key 77 | } 78 | return 
fmt.Sprintf("http://%s.%s%s", s.bucket, s.domain, key) 79 | } 80 | 81 | func (s *S3Store) Has(key string) (bool, error) { 82 | url := s.Url(key) 83 | rc, err := s3util.Open(url, s.config) 84 | 85 | if err == nil { 86 | rc.Close() 87 | return true, nil 88 | } 89 | 90 | if strings.Contains(err.Error(), "unwanted http status 404:") { 91 | return false, nil 92 | } 93 | 94 | return false, err 95 | } 96 | 97 | func (s *S3Store) Put(key string, value io.Reader) error { 98 | err := s.ensureUserAwsCredentials() 99 | if err != nil { 100 | return fmt.Errorf("aws credentials error: %v", err) 101 | } 102 | 103 | url := s.Url(key) 104 | w, err := s3util.Create(url, nil, s.config) 105 | if err != nil { 106 | return err 107 | } 108 | 109 | _, err = io.Copy(w, value) 110 | if err != nil { 111 | return err 112 | } 113 | 114 | err = w.Close() 115 | if err != nil { 116 | return err 117 | } 118 | 119 | return nil 120 | } 121 | 122 | func (s *S3Store) Get(key string) (io.ReadCloser, error) { 123 | url := s.Url(key) 124 | return s3util.Open(url, s.config) 125 | } 126 | 127 | func (s *S3Store) getUserAwsCredentials() error { 128 | u := configUser() 129 | if !isNamedUser(u) { 130 | return fmt.Errorf("must be signed in to request aws credentials") 131 | } 132 | 133 | ui := s.dataIndex.NewUserIndex(u) 134 | c, err := ui.AwsCred() 135 | if err != nil { 136 | return err 137 | } 138 | 139 | s.SetAwsCredentials(c) 140 | return nil 141 | } 142 | 143 | func (s *S3Store) ensureUserAwsCredentials() error { 144 | // if we already have credentials, do nothing. 145 | if s.AwsCredentials() != nil { 146 | return nil 147 | } 148 | 149 | return s.getUserAwsCredentials() 150 | } 151 | -------------------------------------------------------------------------------- /field_user_input.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "os" 5 | "regexp" 6 | "strings" 7 | ) 8 | 9 | type InputField struct { 10 | Prompt string 11 | Value *string 12 | Pattern *regexp.Regexp 13 | Help string 14 | } 15 | 16 | func ensureDatafileInPath(path string) error { 17 | _, err := os.Stat(path) 18 | if err == nil { 19 | return nil 20 | } 21 | 22 | // if it doesn't exist, create it. 23 | f, err := os.Create(path) 24 | defer f.Close() 25 | 26 | return nil 27 | } 28 | 29 | func fillOutDatafileInPath(path string) error { 30 | 31 | err := ensureDatafileInPath(path) 32 | if err != nil { 33 | return err 34 | } 35 | 36 | df, err := NewDatafile(path) 37 | if err != nil { 38 | return err 39 | } 40 | 41 | return fillOutDatafile(df) 42 | } 43 | 44 | func fillOutDatafile(df *Datafile) error { 45 | pOut("Writing Datafile fields...\n") 46 | pOut("'Field description [current value]'\n") 47 | 48 | h := df.Handle() 49 | fields := []InputField{ 50 | InputField{ 51 | "owner id (required)", 52 | &h.Author, 53 | UserRegexp, 54 | "Must be a valid username. Can only contain [a-z0-9-_.].", 55 | }, 56 | InputField{ 57 | "dataset id (required)", 58 | &h.Name, 59 | IdentRegexp, 60 | "Must be a valid dataset id. Can only contain [a-z0-9-_.].", 61 | }, 62 | InputField{ 63 | "dataset version (required)", 64 | &h.Version, 65 | IdentRegexp, 66 | "Must be a valid version. Can only contain [a-z0-9-_.].", 67 | }, 68 | InputField{"tagline description (required)", &df.Tagline, nil, 69 | `A tagline is required to describe your package to others. 
70 | Good taglines are like titles: short, descriptive phrases.`}, 71 | InputField{"long description (optional)", &df.Description, nil, ""}, 72 | InputField{"license name (optional)", &df.License, nil, ""}, 73 | } 74 | 75 | for _, field := range fields { 76 | err := fillOutField(field) 77 | if err != nil { 78 | return err 79 | } 80 | 81 | df.Dataset = h.Dataset() 82 | if df.Valid() { 83 | err = df.WriteFile() 84 | if err != nil { 85 | return err 86 | } 87 | } 88 | } 89 | 90 | return nil 91 | } 92 | 93 | func fillOutField(f InputField) error { 94 | 95 | // validator function 96 | valid := func(val string) bool { 97 | if strings.Contains(f.Prompt, "required") && len(val) < 1 { 98 | return false 99 | } 100 | 101 | if f.Pattern != nil && !f.Pattern.MatchString(val) { 102 | return false 103 | } 104 | 105 | return true 106 | } 107 | 108 | for { 109 | pOut("Enter %s [%s]: ", f.Prompt, *f.Value) 110 | line, err := readInput() 111 | if err != nil { 112 | return err 113 | } 114 | 115 | // if not required, and entered nothing, get out. 116 | if len(line) == 0 && valid(*f.Value) { 117 | break 118 | } 119 | 120 | // if valid input 121 | if valid(line) { 122 | *f.Value = line 123 | break 124 | } 125 | 126 | if len(f.Help) > 0 { 127 | pOut(" Error: %s\n", f.Help) 128 | } else { 129 | pOut(" Error: Invalid input.\n") 130 | } 131 | } 132 | 133 | dOut("entered: %s\n", *f.Value) 134 | return nil 135 | } 136 | 137 | func fillOutUserProfile(p *UserProfile) error { 138 | pOut("Editing user profile. [Current value].\n") 139 | 140 | fields := []InputField{ 141 | InputField{"Full Name", &p.Name, nil, ""}, 142 | // "Email (required)": &p.Email, 143 | InputField{"Website Url", &p.Website, nil, ""}, 144 | InputField{"Github username", &p.Github, nil, ""}, 145 | InputField{"Twitter username", &p.Twitter, nil, ""}, 146 | } 147 | 148 | for _, f := range fields { 149 | err := fillOutField(f) 150 | if err != nil { 151 | return err 152 | } 153 | } 154 | 155 | return nil 156 | } 157 | -------------------------------------------------------------------------------- /data_index.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "io/ioutil" 7 | "net/http" 8 | "strings" 9 | ) 10 | 11 | type DataIndex struct { 12 | Name string 13 | Http *HttpClient 14 | 15 | // For now, use S3Store directly. clean up interface later. 16 | // BlobStore blobStore 17 | BlobStore *S3Store 18 | } 19 | 20 | var mainDataIndex *DataIndex 21 | 22 | const mainIndexName = "datadex" 23 | 24 | // why not use `func init()`? some commands don't need an index 25 | // is annoying to error out on an S3 key when S3 isn't needed. 26 | func NewMainDataIndex() (*DataIndex, error) { 27 | if mainDataIndex != nil { 28 | return mainDataIndex, nil 29 | } 30 | 31 | i := &DataIndex{Name: mainIndexName} 32 | err := error(nil) 33 | 34 | i.Http, err = NewHttpClient(i.Name) 35 | if err != nil { 36 | return nil, err 37 | } 38 | 39 | i.BlobStore, err = NewS3Store("datadex.archives", i) 40 | if err != nil { 41 | return nil, err 42 | } 43 | 44 | mainDataIndex = i 45 | return mainDataIndex, nil 46 | } 47 | 48 | const HttpHeaderUser = "X-Data-User" 49 | const HttpHeaderToken = "X-Data-Token" 50 | const HttpHeaderContentType = "Content-Type" 51 | const HttpHeaderContentTypeYaml = "application/yaml" 52 | const ApiUrlSuffix = "/api/v1" 53 | 54 | // Controls authenticated http accesses. 
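// It normalizes the configured index url (prefixing "http://" and appending
// the "/api/v1" suffix when missing), and sends the X-Data-User and
// X-Data-Token headers with every request.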
55 | type HttpClient struct { 56 | BaseUrl string 57 | Url string 58 | User string 59 | AuthToken string 60 | } 61 | 62 | func NewHttpClient(index string) (*HttpClient, error) { 63 | i, err := configGetIndex(index) 64 | if err != nil { 65 | return nil, err 66 | } 67 | 68 | h := &HttpClient{ 69 | BaseUrl: strings.ToLower(i["url"]), 70 | User: i["user"], 71 | AuthToken: i["token"], 72 | } 73 | 74 | // ensure url has protocol prefix 75 | if !strings.HasPrefix(h.BaseUrl, "http://") && 76 | !strings.HasPrefix(h.BaseUrl, "https://") { 77 | h.BaseUrl = "http://" + h.BaseUrl 78 | } 79 | h.Url = h.BaseUrl 80 | 81 | // ensure url has api suffix 82 | if !strings.HasSuffix(strings.ToLower(h.Url), ApiUrlSuffix) { 83 | h.Url = h.Url + ApiUrlSuffix 84 | } 85 | 86 | return h, nil 87 | } 88 | 89 | func (h HttpClient) SubUrl(path string) string { 90 | return h.Url + "/" + path 91 | } 92 | 93 | func (h *HttpClient) Get(path string) (*http.Response, error) { 94 | dOut("http index get %s\n", h.SubUrl(path)) 95 | 96 | req, err := http.NewRequest("GET", h.SubUrl(path), nil) 97 | if err != nil { 98 | return nil, err 99 | } 100 | 101 | req.Header.Add(HttpHeaderToken, h.AuthToken) 102 | req.Header.Add(HttpHeaderUser, h.User) 103 | return h.DoRequest(req) 104 | } 105 | 106 | func (h *HttpClient) Post(path string, body interface{}) (*http.Response, error) { 107 | dOut("http index post %s\n", h.SubUrl(path)) 108 | 109 | rdr := io.Reader(nil) 110 | var err error 111 | if body != nil { 112 | rdr, err = Marshal(body) 113 | if err != nil { 114 | return nil, err 115 | } 116 | } 117 | 118 | req, err := http.NewRequest("POST", h.SubUrl(path), rdr) 119 | if err != nil { 120 | return nil, err 121 | } 122 | 123 | req.Header.Add(HttpHeaderContentType, HttpHeaderContentTypeYaml) 124 | req.Header.Add(HttpHeaderToken, h.AuthToken) 125 | req.Header.Add(HttpHeaderUser, h.User) 126 | return h.DoRequest(req) 127 | } 128 | 129 | func (h *HttpClient) DoRequest(req *http.Request) (*http.Response, error) { 130 | resp, err := http.DefaultClient.Do(req) 131 | if err != nil { 132 | return nil, err 133 | } 134 | 135 | c := resp.StatusCode 136 | if 200 <= c && c < 400 { 137 | return resp, nil 138 | } 139 | 140 | e, _ := ioutil.ReadAll(resp.Body) 141 | resp.Body.Close() 142 | 143 | s := strings.TrimSpace(string(e[:])) 144 | return nil, fmt.Errorf("HTTP error status code: %d (%s)", c, s) 145 | } 146 | -------------------------------------------------------------------------------- /commands.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "github.com/gonuts/flag" 5 | "github.com/jbenet/commander" 6 | "strings" 7 | "time" 8 | ) 9 | 10 | var Cmd_data = &commander.Command{ 11 | UsageLine: "data [] []", 12 | Short: "dataset package manager", 13 | Long: `data - dataset package manager 14 | 15 | Basic commands: 16 | 17 | get Download and install dataset. 18 | list List installed datasets. 19 | info Show dataset information. 20 | publish Guided dataset publishing. 21 | 22 | Tool commands: 23 | 24 | version Show data version information. 25 | config Manage data configuration. 26 | user Manage users and credentials. 27 | commands List all available commands. 28 | 29 | Advanced Commands: 30 | 31 | blob Manage blobs in the blobstore. 32 | manifest Generate and manipulate dataset manifest. 33 | pack Dataset packaging, upload, and download. 34 | 35 | Use "data help " for more information about a command. 
36 | `, 37 | Run: dataCmd, 38 | Subcommands: []*commander.Command{ 39 | cmd_data_version, 40 | cmd_data_config, 41 | cmd_data_info, 42 | cmd_data_list, 43 | cmd_data_get, 44 | cmd_data_manifest, 45 | cmd_data_pack, 46 | cmd_data_blob, 47 | cmd_data_publish, 48 | cmd_data_user, 49 | cmd_data_commands, 50 | }, 51 | Flag: *flag.NewFlagSet("data", flag.ExitOnError), 52 | } 53 | 54 | func dataCmd(c *commander.Command, args []string) error { 55 | pOut(c.Long) 56 | return nil 57 | } 58 | 59 | var cmd_root *commander.Command 60 | 61 | func init() { 62 | // this funky alias is to resolve cyclical decl references. 63 | cmd_root = Cmd_data 64 | } 65 | 66 | var cmd_data_commands = &commander.Command{ 67 | UsageLine: "commands", 68 | Short: "List all available commands.", 69 | Long: `data commands - List all available commands. 70 | 71 | Lists all available commands (and sub-commands) and exits. 72 | `, 73 | Run: commandsCmd, 74 | Subcommands: []*commander.Command{ 75 | cmd_data_commands_help, 76 | }, 77 | } 78 | 79 | var cmd_data_commands_help = &commander.Command{ 80 | UsageLine: "help", 81 | Short: "List all available commands' help pages.", 82 | Long: `data commands help - List all available commands's help pages. 83 | 84 | Shows the pages of all available commands (and sub-commands) and exits. 85 | Outputs a markdown document, also viewable at http://datadex.io/doc/ref 86 | `, 87 | Run: commandsHelpCmd, 88 | } 89 | 90 | func commandsCmd(c *commander.Command, args []string) error { 91 | var listCmds func(c *commander.Command) 92 | listCmds = func(c *commander.Command) { 93 | pOut("%s\n", c.FullSpacedName()) 94 | for _, sc := range c.Subcommands { 95 | listCmds(sc) 96 | } 97 | } 98 | 99 | listCmds(c.Parent) 100 | return nil 101 | } 102 | 103 | func commandsHelpCmd(c *commander.Command, args []string) error { 104 | pOut(referenceHeaderMsg) 105 | pOut("Generated on %s.\n\n", time.Now().UTC().Format("2006-01-02")) 106 | 107 | var printCmds func(*commander.Command, int) 108 | printCmds = func(c *commander.Command, level int) { 109 | pOut("%s ", strings.Repeat("#", level)) 110 | pOut("%s\n\n", c.FullSpacedName()) 111 | pOut("```\n") 112 | pOut("%s\n", c.Long) 113 | pOut("```\n\n") 114 | 115 | for _, sc := range c.Subcommands { 116 | printCmds(sc, level+1) 117 | } 118 | } 119 | 120 | printCmds(c.Parent.Parent, 1) 121 | return nil 122 | } 123 | 124 | const referenceHeaderMsg = ` 125 | # data command reference 126 | 127 | This document lists every data command (including subcommands), along with 128 | its help page. It can be viewed by running 'data commands help', and 129 | at http://datadex.io/doc/ref 130 | 131 | ` 132 | -------------------------------------------------------------------------------- /dev/formats.md: -------------------------------------------------------------------------------- 1 | # data formats 2 | 3 | 4 | One of the important design goals is format-fluidity: ability to store datasets in various formats and transfer between them. Suppose a graph of formats, datasets should be able to traverse strongly connected components. So, if a dataset is published in XML, I should be able to request it in json.[1] This is easy for homogeneous datasets, but gets complicated when one dataset includes files in multiple formats, or it has metadata separated out. 5 | 6 | This is complicated further when thinking about how datasets get authored/published to the index, and retrieved thereafter. In brief, the idea is to follow github pattern: `/`, which reduces namespace problems. 
This includes versions (tags/branches): `/@`. Note: this handle will be used in projects' Datafiles, to specify dependencies (datasets composed of other datasets[2]), etc. 7 | 8 | 9 | Some possibilities: 10 | 11 | 12 | 1. Let formats be branches like any other. `/@`. Since version and format are now in the same namespace, would see things like: `foo/bar@1.0-json`, `foo/bar@1.2-xml`. This complicates maintenance: both new versions or new formats require a "row" of "commits" along the formats or versions, respectively. 13 | 14 | 2. Let formats be dimensions (see [3]). `/#format:`. Would see things like: `foo/bar@format:json`, `foo/bar@format:xml` There would be dimensional 'defaults' (as HEAD is default tag) that could be specified in the package description file. 15 | 16 | 3. Let formats be specified separately. `/.`. e.g. `foo/bar.json`, `foo/bar.xml`. This seems neat and nice. 17 | 18 | 4. Punt. let authors choose their formats in the dataset. Would see things like: `foo/bar-json`, `foo/xmlbar`. Would not have format-fluidity :(. Naming wont be held to standard if users control it... 19 | 20 | 21 | So far, I like 2 and 3 the best. 2 implies building [3] below, or at least a subset of the functionality. Building [3] would also make it easier to convert between formats. Just unclear how likely data across domains would be generalizable to this DIR. Would genomics/proteomics data fit this? 22 | 23 | 24 | 25 | [1] implementation detail to choose where to be in the `index stores one fmt and tool converts locally <--> index stores every format` spectrum. Most likely in between: index stores every format but constructs them lazily) 26 | 27 | [2] think of docker images. datasets can be expressed as instructions that construct it (some files from foo/dataset1 + some from bar/dataset2). This implies that a selecting sub-portions of a dataset could be a really useful mechanic.[3] 28 | 29 | ### selecting 30 | 31 | [3] imagine selecting [n-m] rows of a given dataset. Unclear yet how this should work exactly, but i've ideas along a dataset intermediate representation (DIR), where data is expressed as points in a multi-dimensional space, and a dataset is expressed as a subspace, or intervals across some dimensions. This would work well even for tables, allowing one to select slices of a dataset with something like: /#[:[:]]` e.g. 32 | 33 | lecun/norb#class # points that have a class 34 | lecun/norb#class:car # points that have class `car` 35 | lecun/norb#set:training # points in the training set 36 | lecun/norb#y:0:10 # points where `0 <= y <= 10` 37 | 38 | (and of course, can specify multiple comma-delimited dimensions) 39 | 40 | Or: 41 | 42 | lecun/norb#class # points that have a class 43 | lecun/norb#class[car] # points that have class `car` 44 | lecun/norb#set[training] # points in the training set 45 | lecun/norb#y[0, 10] # points where `0 <= y <= 10` 46 | lecun/norb#y]0, 10[ # points where `0 < y < 10` 47 | 48 | This seems like a really powerful thing to enable. Unclear how to do it well at present. Lots and lots of edge cases. This can come in later versions but must not close doors to it now. (Another note: i realize this basically is a dumber query string `?param=val`, problem with using a query string is these handles may have to be embedded in URLs :/ though i guess hashes are out in that case...) 
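To make the slicing idea concrete, here is a rough go sketch of parsing such handles (the `sliceSpec` type and `parseSliceHandle` helper are made up for illustration; nothing like this exists in data yet):

    // sliceSpec holds one "<dim>[:<start>[:<end>]]" selector, e.g. "y:0:10".
    // Hypothetical type, sketched only to illustrate the syntax above.
    type sliceSpec struct {
        Dim        string
        Start, End string // empty means unbounded
    }

    // parseSliceHandle splits "lecun/norb#set:training,y:0:10" into the
    // dataset handle and its comma-delimited dimension selectors.
    // (uses the standard "strings" package)
    func parseSliceHandle(s string) (dataset string, specs []sliceSpec) {
        idx := strings.Index(s, "#")
        if idx < 0 {
            return s, nil // no slice part; plain dataset handle
        }
        dataset = s[:idx]
        for _, sel := range strings.Split(s[idx+1:], ",") {
            parts := strings.SplitN(sel, ":", 3)
            spec := sliceSpec{Dim: parts[0]}
            if len(parts) > 1 {
                spec.Start = parts[1]
            }
            if len(parts) > 2 {
                spec.End = parts[2]
            }
            specs = append(specs, spec)
        }
        return dataset, specs
    }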
49 | 50 | -------------------------------------------------------------------------------- /data_publish.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "fmt" 5 | "github.com/gonuts/flag" 6 | "github.com/jbenet/commander" 7 | ) 8 | 9 | var cmd_data_publish = &commander.Command{ 10 | UsageLine: "publish", 11 | Short: "Guided dataset publishing.", 12 | Long: `data publish - Guided dataset publishing. 13 | 14 | This command guides the user through the necessary steps to 15 | create a data package (Datafile and Manifest), uploads it, 16 | and publishes it to the dataset index. 17 | 18 | See 'data pack'. 19 | `, 20 | Run: publishCmd, 21 | Flag: *flag.NewFlagSet("data-pack-publish", flag.ExitOnError), 22 | } 23 | 24 | func init() { 25 | cmd_data_publish.Flag.Bool("clean", true, 26 | "rebuild manifest (data pack make --clean)") 27 | cmd_data_publish.Flag.Bool("force", false, 28 | "force publish (data pack publish --force)") 29 | } 30 | 31 | func publishCmd(c *commander.Command, args []string) error { 32 | u := configUser() 33 | if !isNamedUser(u) { 34 | return fmt.Errorf(NotLoggedInErr) 35 | } 36 | 37 | pOut("==> Guided Data Package Publishing.\n") 38 | pOut(PublishMsgWelcome) 39 | 40 | pOut("\n==> Step 1/3: Creating the package.\n") 41 | pOut(PublishMsgDatafile) 42 | err := packMakeCmd(c, []string{}) 43 | if err != nil { 44 | return err 45 | } 46 | 47 | pOut("\n==> Step 2/3: Uploading the package contents.\n") 48 | pOut(PublishMsgUpload) 49 | err = packUploadCmd(c, []string{}) 50 | if err != nil { 51 | return err 52 | } 53 | 54 | pOut("\n==> Step 3/3: Publishing the package to the index.\n") 55 | pOut(PublishMsgPublish) 56 | return packPublishCmd(c, []string{}) 57 | } 58 | 59 | const NotLoggedInErr = `You are not logged in. First, either: 60 | 61 | - Run 'data user add' to create a new user account. 62 | - Run 'data user auth' to log in to an existing user account. 63 | 64 | 65 | Why does publishing require a registered user account (and email)? The index 66 | service needs to distinguish users to perform many of its tasks. For example: 67 | 68 | - Verify who can or cannot publish datasets, or modify already published ones. 69 | (i.e. the creator + collaborators should be able to, others should not). 70 | - Profiles credit people for the datasets they have published. 71 | - Malicious users can be removed, and their email addresses blacklisted to 72 | prevent further abuse. 73 | ` 74 | 75 | const PublishMsgWelcome = ` 76 | Welcome to Data Package Publishing. You should read these short 77 | messages carefully, as they contain important information about 78 | how data works, and how your data package will be published. 79 | 80 | First, a 'data package' is a collection of files, containing: 81 | - various files with your data, in any format. 82 | - 'Datafile', a file with descriptive information about the package. 83 | - 'Manifest', a file listing the other files in the package and their checksums. 84 | 85 | This tool will automatically: 86 | 1. Create the package 87 | - Generate a 'Datafile', with information you will provide. 88 | - Generate a 'Manifest', with all the files in the current directory. 89 | 2. Upload the package contents 90 | 3. Publish the package to the index 91 | 92 | (Note: to specify which files are part of the package, and other advanced 93 | features, use the 'data pack' command directly. See 'data pack help'.) 
94 | 95 | ` 96 | 97 | const PublishMsgDatafile = ` 98 | First, let's write the package's Datafile, which contains important 99 | information about the package. The 'owner id' is the username of the 100 | package's owner (usually your username). The 'dataset id' is the identifier 101 | which defines this dataset. Good 'dataset ids' are like names: short, unique, 102 | and memorable. For example: "mnist" or "cifar". Choose it carefully. 103 | 104 | ` 105 | 106 | const PublishMsgUpload = ` 107 | Now, data will upload the contents of the package (this directory) to the index 108 | sotrage service. This may take a while, if the files are large (over 100MB). 109 | 110 | ` 111 | 112 | const PublishMsgPublish = ` 113 | Finally, data will publish the package to the index, where others can find 114 | and download your package. The index is available through data, and on the web. 115 | 116 | ` 117 | -------------------------------------------------------------------------------- /dev/roadmap.md: -------------------------------------------------------------------------------- 1 | # data roadmap 2 | 3 | This document briefly outlines desired features to implement. 4 | 5 | 6 | ## command dispatch 7 | 8 | Need to implement the skeleton of the project: command parsing/dispatch. 9 | 10 | ## data list 11 | 12 | data list 13 | 14 | List the datasets in the current project 15 | 16 | ## data config 17 | 18 | data config user.name = 'jbenet' 19 | data config --global user.name = 'jbenet' 20 | 21 | Allow the configuration of `data`, using (`git` like) config files. 22 | Consider using a `~/.dataconfig` global config file. 23 | Consider using a `data/config` (or `.dataconfig`) local config file. 24 | 25 | ## data update 26 | 27 | data update 28 | 29 | Download and install newer version. 30 | Also, check whether data is up-to-date on every run (inc option to silence). 31 | 32 | ## data get 33 | 34 | data get / 35 | data get http://datadex.io// 36 | 37 | Download and install packages from the dataset index (datadex, configurable). 38 | No arguments looks into the directory's `Datafile` (configurable) 39 | Allow installation of packages using `/` ref-naming. 40 | Allow installation of packages using `https?://...//` urls. 41 | Use a `--save` flag to store into a `Datafile`. 42 | Installed datasets go into the `data/` directory (configurable) of the project. 43 | Should download compressed files, and use array of mirrors. 44 | 45 | 46 | ## data manifest 47 | 48 | data manifest 49 | 50 | Generate the data manifest file (`.data/manifest`? `Manifest`?), a list of 51 | 52 | 53 | 54 | Hash function? `sha1` for now. Discuss whether to use `sha256`. 55 | This manifest file is all that is needed to reconstruct the dataset. 56 | 57 | (Manifest files can support a poor-man's version control (changesets of 58 | filename/filehashes). Basically, a much simpler `git-annex`. Potentially use 59 | git to store Datafile + manifest: 60 | - making data refs = git refs 61 | - repository abstraction great for storing all package versions) 62 | 63 | Subcommands: 64 | 65 | data manifest add [ | -a, --all] 66 | data manifest rm [ | -m, --missing | -a, --all] 67 | data manifest hash [ | -r, --rehash] 68 | data manifest check [ | -a, --all] 69 | 70 | ## data blob 71 | 72 | data blob 73 | 74 | Manipulate blobs in the (remote) blobstore. 
75 | 76 | Subcommands: 77 | 78 | data blob get [ | --all] # download blob from blobstore 79 | data blob put [ | --all] # upload blob to blobstore 80 | 81 | 82 | ## data pack 83 | 84 | data upload 85 | 86 | Upload package archive to the chosen storage service. This ensures the final 87 | archive URL is listed as a mirror in the Datafile. 88 | 89 | Use a datadex-specific s3 bucket: 90 | 91 | data upload 92 | data upload datadex 93 | # PUTs to http://datadex.archives.s3.amazonaws.com////-.tar.gz 94 | 95 | Things to do: 96 | 97 | - ensure there is a version 98 | - ensure there is a title 99 | - prompt for a description 100 | - prompt for a license 101 | 102 | 103 | ## data publish 104 | 105 | data publish 106 | 107 | Upload and register this package to the dataset index (datadex, configurable). 108 | Registered packages require extra definitions in their `Datafile`. 109 | 110 | Things to do: 111 | 112 | - ensure uploaded 113 | - post datafile to datadex/publish 114 | 115 | 116 | 117 | ## data format 118 | 119 | data format / 120 | data put /. 121 | ref: /. 122 | 123 | Convert a dataset from one format to another. 124 | Allow datasets to have multiple formats. 125 | Formats should be convertible -- `f : f(dataset.fmt1) -> dataset.fmt2` 126 | Formats should be defined/enabled per-dataset (in their Datafile). 127 | 128 | ## data tag 129 | 130 | data tag 131 | data get /@ 132 | data put /@[:] 133 | ref: /@ 134 | 135 | List the available (named) tags. 136 | Allow referencing of datasets using specific tags. 137 | Unnamed tags are version hashes. 138 | Named tags are aliases to version hashes. 139 | Put tags to create aliases. 140 | 141 | ## data slice 142 | 143 | ref: /# 144 | 145 | See [`dev/formats`](formats.md). 146 | -------------------------------------------------------------------------------- /data_ref.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "fmt" 5 | "io/ioutil" 6 | ) 7 | 8 | const RefLatest = "latest" 9 | 10 | // serializable into YAML 11 | type DatasetRefs struct { 12 | 13 | // All published refs are listed here. { ref-hash : iso-timestamp } 14 | Published map[string]string 15 | 16 | // Automatic named pointers to published references. { version : ref-hash } 17 | // Generated from dataset handle versions. 18 | Versions map[string]string 19 | } 20 | 21 | func (r DatasetRefs) LastUpdated() string { 22 | pl := sortMapByValue(r.Published) 23 | if len(pl) > 0 { 24 | return pl[len(pl)-1].Value 25 | } 26 | return "" 27 | } 28 | 29 | func (r DatasetRefs) LatestPublished() string { 30 | s := r.SortedPublished() 31 | if len(s) == 0 { 32 | return "" 33 | } 34 | return s[len(s)-1] 35 | } 36 | 37 | func (r DatasetRefs) SortedPublished() []string { 38 | vs := []string{} 39 | pl := sortMapByValue(r.Published) 40 | for _, p := range pl { 41 | vs = append(vs, p.Key) 42 | } 43 | return vs 44 | } 45 | 46 | // Resolves a ref. If not found, returns "" 47 | func (r DatasetRefs) ResolveRef(ref string) string { 48 | 49 | // default to latest (like HEAD) 50 | if len(ref) == 0 { 51 | ref = RefLatest 52 | } 53 | 54 | // latest -> timestamp sorted 55 | if ref == RefLatest { 56 | return r.LatestPublished() 57 | } 58 | 59 | // look it up in versions table 60 | if ref2, found := r.Versions[ref]; found { 61 | return ref2 62 | } 63 | 64 | // Guess we have no link, check it's a published ref. 
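	// Resolution order, recapped with hypothetical refs:
	//   ResolveRef("")       -> same as ResolveRef("latest")
	//   ResolveRef("latest") -> newest hash in Published (timestamp order)
	//   ResolveRef("1.0")    -> Versions["1.0"], if present
	//   ResolveRef(hash)     -> hash, if it is a published ref
	//   anything else        -> "" (not found)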
65 | if _, found := r.Published[ref]; found { 66 | return ref 67 | } 68 | 69 | // Ref not found 70 | return "" 71 | } 72 | 73 | // Return the named version for ref, or ref if not found. 74 | func (r DatasetRefs) ResolveVersion(ref string) string { 75 | 76 | // Resolve ref first. 77 | ref = r.ResolveRef(ref) 78 | 79 | // Find version for ref. 80 | for v, r := range r.Versions { 81 | if r == ref { 82 | return v 83 | } 84 | } 85 | return ref 86 | } 87 | 88 | type HttpRefIndex struct { 89 | Http *HttpClient 90 | Dataset string 91 | Refs *DatasetRefs 92 | } 93 | 94 | func (h *HttpRefIndex) FetchRefs(refresh bool) error { 95 | // already fetched? 96 | if h.Refs != nil && !refresh { 97 | return nil 98 | } 99 | 100 | resp, err := h.Http.Get("") 101 | if err != nil { 102 | return err 103 | } 104 | defer resp.Body.Close() 105 | 106 | refs := &DatasetRefs{} 107 | err = Unmarshal(resp.Body, refs) 108 | if err != nil { 109 | return err 110 | } 111 | 112 | // set at the end, once we're sure no errors happened 113 | h.Refs = refs 114 | return nil 115 | } 116 | 117 | func (h *HttpRefIndex) Has(ref string) (bool, error) { 118 | return httpExists(h.Http.SubUrl(ref)) 119 | } 120 | 121 | func (h *HttpRefIndex) Get(ref string) (string, error) { 122 | resp, err := h.Http.Get(ref) 123 | if err != nil { 124 | return "", err 125 | } 126 | defer resp.Body.Close() 127 | 128 | buf, err := ioutil.ReadAll(resp.Body) 129 | if err != nil { 130 | return "", err 131 | } 132 | 133 | return string(buf[:]), nil 134 | } 135 | 136 | func (h *HttpRefIndex) Put(ref string) error { 137 | resp, err := h.Http.Post(ref, nil) 138 | if err != nil { 139 | return err 140 | } 141 | 142 | resp.Body.Close() 143 | return nil 144 | } 145 | 146 | func (h *HttpRefIndex) VersionRef(version string) (string, error) { 147 | err := h.FetchRefs(false) 148 | if err != nil { 149 | return "", err 150 | } 151 | 152 | ref := h.Refs.ResolveRef(version) 153 | if ref == "" { 154 | return ref, fmt.Errorf("No ref for version: %s", version) 155 | } 156 | return ref, nil 157 | } 158 | 159 | func (h *HttpRefIndex) RefVersion(ref string) (string, error) { 160 | err := h.FetchRefs(false) 161 | if err != nil { 162 | return "", err 163 | } 164 | 165 | ver := h.Refs.ResolveVersion(ref) 166 | if ver == "" { 167 | return ver, fmt.Errorf("No version for ref: %s", ref) 168 | } 169 | return ver, nil 170 | } 171 | 172 | func (h *HttpRefIndex) RefTimestamp(ref string) (string, error) { 173 | err := h.FetchRefs(false) 174 | if err != nil { 175 | return "", err 176 | } 177 | 178 | time, _ := h.Refs.Published[ref] 179 | return time, nil 180 | } 181 | 182 | func (h *HttpRefIndex) SortedPublished() []string { 183 | return h.Refs.SortedPublished() 184 | } 185 | 186 | // DataIndex extension to generate a RefIndex 187 | func (d *DataIndex) RefIndex(dataset string) *HttpRefIndex { 188 | ri := &HttpRefIndex{ 189 | Http: &HttpClient{ 190 | BaseUrl: d.Http.BaseUrl, 191 | Url: d.Http.Url + "/" + dataset + "/" + "refs", 192 | User: d.Http.User, 193 | AuthToken: d.Http.AuthToken, 194 | }, 195 | Dataset: dataset, 196 | } 197 | return ri 198 | } 199 | -------------------------------------------------------------------------------- /data_get.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "fmt" 5 | "github.com/jbenet/commander" 6 | "os" 7 | "path" 8 | "strings" 9 | ) 10 | 11 | var cmd_data_get = &commander.Command{ 12 | UsageLine: "get []", 13 | Short: "Download and install dataset.", 14 | Long: `data get - Download 
and install dataset. 15 | 16 | Downloads the dataset specified, and installs its files into the 17 | current dataset working directory. The dataset argument is a Handle 18 | of the form: 19 | 20 | <author>/<dataset>[@<version>]. 21 | 22 | For example: 23 | 24 | jbenet/foo 25 | jbenet/foo@latest 26 | jbenet/foo@1.0 27 | 28 | Loosely, data-get's process is: 29 | 30 | - Locate dataset Datafile and Manifest (via provided argument). 31 | - Download Datafile and Manifest, to local Repository. 32 | - Download Blobs, listed in Manifest, to local Repository. 33 | - Reconstruct Files, listed in Manifest. 34 | - Install Files, into working directory. 35 | 36 | `, 37 | Run: getCmd, 38 | } 39 | 40 | func getCmd(c *commander.Command, args []string) error { 41 | var datasets []string 42 | 43 | if len(args) > 0 { 44 | // if args, get those datasets. 45 | datasets = args 46 | } else { 47 | // if no args, use Datafile dependencies 48 | df, _ := NewDefaultDatafile() 49 | for _, dep := range df.Dependencies { 50 | if NewHandle(dep).Valid() { 51 | datasets = append(datasets, dep) 52 | } 53 | } 54 | } 55 | 56 | if len(datasets) == 0 { 57 | return fmt.Errorf("%v: no datasets specified.\nEither enter a <dataset> "+ 58 | "argument, or add dependencies in a Datafile.", c.FullName()) 59 | } 60 | 61 | installed_datasets := []string{} 62 | for _, ds := range datasets { 63 | ds, err := GetDataset(ds) 64 | if err != nil { 65 | return err 66 | } 67 | installed_datasets = append(installed_datasets, ds) 68 | } 69 | 70 | if len(datasets) == 0 { 71 | return nil 72 | } 73 | 74 | // If many, Installation Summary 75 | pErr("---------\n") 76 | for _, ds := range installed_datasets { 77 | err := installedDatasetMessage(ds) 78 | if err != nil { 79 | pErr("%v\n", err) 80 | } 81 | } 82 | return nil 83 | } 84 | 85 | func GetDataset(dataset string) (string, error) { 86 | dataset = strings.ToLower(dataset) 87 | 88 | // add lookup in datadex here. 89 | h := NewHandle(dataset) 90 | if h.Valid() { 91 | // handle version can get resolved 92 | err := GetDatasetFromIndex(h) 93 | return h.Dataset(), err 94 | } 95 | 96 | return "", fmt.Errorf("Unclear how to handle dataset identifier: %s", dataset) 97 | } 98 | 99 | func GetDatasetFromIndex(h *Handle) error { 100 | di, err := NewMainDataIndex() 101 | if err != nil { 102 | return err 103 | } 104 | 105 | pErr("Downloading %s from %s (%s).\n", h.Dataset(), di.Name, di.Http.Url) 106 | 107 | // Get manifest ref 108 | mref, err := di.handleRef(h) 109 | if err != nil { 110 | return err 111 | } 112 | 113 | // Prepare local directories 114 | dir := h.InstallPath() 115 | if err := os.RemoveAll(dir); err != nil { 116 | return err 117 | } 118 | 119 | if err := os.MkdirAll(dir, 0777); err != nil { 120 | return err 121 | } 122 | 123 | cwd, err := os.Getwd() 124 | if err != nil { 125 | return err 126 | } 127 | 128 | if err := os.Chdir(dir); err != nil { 129 | return err 130 | } 131 | 132 | // move back out 133 | defer os.Chdir(cwd) 134 | 135 | // download manifest 136 | if err := downloadManifest(di, mref); err != nil { 137 | return err 138 | } 139 | 140 | // download pack 141 | p, err := NewPack() 142 | if err != nil { 143 | return err 144 | } 145 | 146 | if err := p.Download(); err != nil { 147 | return err 148 | } 149 | 150 | pErr("\n") 151 | return nil 152 | } 153 | 154 | func (d *DataIndex) handleRef(h *Handle) (string, error) { 155 | ri := d.RefIndex(h.Path()) 156 | 157 | // Fetch refs first.
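	// FetchRefs(false) reuses the cached ref listing if one was already
	// fetched for this index; passing true would force a refresh.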
158 | err := ri.FetchRefs(false) 159 | if err != nil { 160 | if strings.Contains(err.Error(), "404 page not found") { 161 | return "", fmt.Errorf("Error: %v not found.", h.Dataset()) 162 | } 163 | return "", fmt.Errorf("Error finding manifest for %v. %s", h.Dataset(), err) 164 | } 165 | 166 | // Resolve named version first. 167 | h.Version, err = ri.RefVersion(h.Version) 168 | if err != nil { 169 | return "", fmt.Errorf("Error finding version %v. %s", h.Dataset(), err) 170 | } 171 | 172 | // Resolve ref. 173 | ref, err := ri.VersionRef(h.Version) 174 | if err != nil { 175 | return "", fmt.Errorf("Error finding manifest for %v. %s", h.Dataset(), err) 176 | } 177 | return ref, nil 178 | } 179 | 180 | func downloadManifest(d *DataIndex, ref string) error { 181 | return d.getBlob(ref, ManifestFileName) 182 | } 183 | 184 | func installedDatasetMessage(dataset string) error { 185 | h := NewHandle(dataset) 186 | fpath := DatafilePath(h.Dataset()) 187 | df, err := NewDatafile(fpath) 188 | if err != nil { 189 | return err 190 | } 191 | 192 | pOut("Installed %s at %s\n", df.Dataset, path.Dir(fpath)) 193 | return nil 194 | } 195 | -------------------------------------------------------------------------------- /dev/designdoc.md: -------------------------------------------------------------------------------- 1 | ## WARNING - WIP 2 | 3 | data is in very early development. 4 | This document is too. Track ideas here, and in the [roadmap](roadmap.md). 5 | 6 | # data designdoc 7 | 8 | data - a package manager for datasets 9 | datahub - a centralized dataset hosting service 10 | 11 | ## Abstract 12 | 13 | data : datasets :: git : source code 14 | data : datahub :: git : github 15 | 16 | 17 | ## Introduction 18 | 19 | ### Prerequisites 20 | 21 | This document assumes strong familiarity with the following software 22 | engineering concepts and systems: 23 | 24 | - data formats 25 | - datasets 26 | - version control: `git, hg` 27 | - central source code repositories: `github, google code` 28 | - package managers: `aptitude, pip, npm, brew` 29 | - package indices: `ubuntu packages, pypi, npm registry, docker index` 30 | - containers: `LXC, docker` 31 | 32 | Dataset management is a mess. There are millions of datasets strewn across the 33 | internet, encoded in thousands of formats. [more gripes here] 34 | 35 | 36 | ## Design Goals 37 | 38 | data must be 39 | 40 | - **format agnostic**: no special treatment of specific formats. ideally, data 41 | itself does not understand formats. 42 | - **domain agnostic**: no special treatment of specific application domains 43 | and 44 | their biases (e.g. machine learning vs genomics vs neuroscience). 45 | - **platform agnostic**: no special treatment of specific platforms/stacks 46 | (*nix, windows, etc). 47 | 48 | 49 | - **decentralized**: no requirement on one central package index. There will 50 | be one (a default), but data should be capable of pointing to other indices. 51 | - **intuitive UX**: to facilitate adoption in a massively competitive/ 52 | entrenched landscape, it is key to craft a highly intuitive user experience, 53 | with as gradual a learning curve as possible. 54 | - **simple to use**: simplicity is key. `data get norb/simple-norb` 55 | 56 | - **modular**: to support development and feature exploration, data should be 57 | modular, isolating functionality. Learn from git. 58 | - **infrastructure**: data is a general infrastructure tool. it aims to solve 59 | core, wide problems. special cases can be handled by sub-tools / 60 | applications on top.
61 | - **command line**: data is a unix-style tool. 62 | 63 | 64 | ## datadex - data index 65 | 66 | (The name datadex is worse than datahub, but datahub seems to be taken by 67 | a related project. Perhaps collaborate? TODO: talk to datahub people.) 68 | 69 | The important power data brings to the table is publishing and downloading 70 | datasets from a repository, public or private. This is achieved by the use 71 | of `datadex`, `data`'s sister tool and a simple website. The plan is to run 72 | one main, global `datadex` (much like most successful package managers out 73 | there) but allow users of `data` to point to whatever `datadex` (repository) 74 | they wish to use. 75 | 76 | The datadex is where data finds datasets when you run: 77 | 78 | data get jbenet/foobar 79 | 80 | Dataset foobar is looked up at the default data index: 81 | http://datadex.io/jbenet/foobar. Users should be able to point to an 82 | entirely different datadex, or even list a secondary one. This is useful in 83 | case the main datadex is {down, unmaintained, controlled by evil baboons}, 84 | and in case a user wishes to run her own private datadex for private datasets. 85 | 86 | See more at https://github.com/jbenet/datadex/blob/master/dev/roadmap.md 87 | 88 | 89 | ## data handles 90 | 91 | building on the roads paved by git and github, data introduces a standard way 92 | to reference every unique dataset version. This is accomplished with 93 | *data handles*: unique, impure, url-friendly identifiers. 94 | 95 | data handle structure: 96 | 97 | <author>/<dataset>[.<format>][@<ref>] 98 | 99 | Where: 100 | 101 | - <author> is the datadex username of the author/packager e.g. `feynman` 102 | - <dataset> is a unique shortname for the dataset e.g. `spinning-plates` 103 | - <format> is an optional format. details TBD, see [`dev/formats`](formats.md); 104 | defaults to `default` e.g. `json` 105 | - <ref> is an optional reference (hash, version, tag, etc.); 106 | defaults to `latest` e.g. `1.0` 107 | 108 | Examples: 109 | 110 | jbenet/cifar-10 111 | jbenet/cifar-10.matlab 112 | jbenet/cifar-10@latest 113 | jbenet/cifar-10@1.0 114 | jbenet/cifar-10.matlab@0.8.2-rc1 115 | 116 | ### URL handling 117 | 118 | data handles are meant to be embedded in URLs, as in the datadex: 119 | 120 | http://datadex.io/jbenet/cifar-10@1.0 121 | 122 | (yes @ symbols get encoded) 123 | 124 | 125 | ## data hashes and refs 126 | 127 | data borrows more git concepts: object hashes and references. 128 | 129 | **data hashes**: In git, objects are identified by their hashes (sha1); one can 130 | retrieve an object with `git show <hash>`. In data, unique datasets -- 131 | including different versions of the same dataset -- are identified by the hash 132 | value of their dataset archive (i.e. `hash(tar(dataset directory))`). All 133 | published versions of a dataset are hosted in the datadex, e.g.: 134 | 135 | # these are all different datasets: 136 | http://datadex.io/jbenet/cifar-10@49be4be15ec96b72323698a710b650ca5a46f9e6 137 | http://datadex.io/jbenet/cifar-10@e9db19b48ced2631513d2a165e0386686e8a0c8a 138 | http://datadex.io/jbenet/cifar-10@5b13d6abb15dccabb6aaf8573d5a01cd0d74c86d 139 | 140 | These are three different versions of the same named dataset. They may differ 141 | only slightly, or be completely different. 142 | 143 | **data references**: It is very useful to reference object versions (their 144 | hashes) via human-friendly and even logical names, like `2.0`. These names are 145 | simply references (pointers, symlinks) to hashes.
data is designed to 146 | understand (and de-reference) named references wherever it would normally 147 | expect a hash. Moreover, the term `ref` is used throughout to mean `reference, 148 | hash, version, tag, etc`. e.g. 149 | 150 | # while these could all point to the same dataset: 151 | http://datadex.io/jbenet/cifar-10 // defaults to @latest 152 | http://datadex.io/jbenet/cifar-10@latest 153 | http://datadex.io/jbenet/cifar-10@1.0 154 | http://datadex.io/jbenet/cifar-10@e9db19b48ced2631513d2a165e0386686e8a0c8a 155 | 156 | 157 | **default ref**: it is worth noting that often the "default reference" will be 158 | used when a reference is expected but not provided. The "default ref" is 159 | `latest`, and it points to the latest published version. 160 | 161 | **tags**: tags are user-specified references, e.g. version numbers like `1.0`. 162 | 163 | ## data manifest 164 | 165 | data uses a manifest of `{<filepath>: <hash>}` in order to: 166 | 167 | - account for what files are part of a dataset 168 | - detect data corruption (check hashes match) 169 | - provide minimal version control (manifest changesets) 170 | 171 | data functions somewhat like `git-annex`: 172 | 173 | - stores (version-controls) the path and object hash in the "repository" 174 | - fetches the large blobs from a storage service 175 | 176 | The blobs from all the datasets are stored in the same object store. (Blobs from 177 | different datasets are not segregated into separate bundles). This greatly 178 | reduces storage needs, de-duplicating common blobs across datasets. This is 179 | particularly useful for versions of the same dataset, as not all files change 180 | between versions. 181 | 182 | This design reduces storage both remotely (the datadex service de-duplicates 183 | across all indexed datasets) and locally (users' computers keep one blob cache 184 | for all installed datasets). 185 | 186 | Note: why version control ourselves rather than using git? git isn't great for 187 | huge files yet. It could begin supporting a bup-like rolling checksum, and thus 188 | better version large files. Also, there is another project in the works that 189 | data could leverage. The manifest versioning solution is good enough until one 190 | of those strategies pans out. 191 | -------------------------------------------------------------------------------- /data_config.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | // "code.google.com/p/gcfg" 5 | "fmt" 6 | "github.com/gonuts/flag" 7 | "github.com/jbenet/commander" 8 | "io" 9 | "os" 10 | "os/exec" 11 | "os/user" 12 | "strings" 13 | ) 14 | 15 | // WARNING: the config format will be ini eventually. Go parsers 16 | // don't currently allow writing (modifying) of files. 17 | // Thus, for now, using yaml. Expect this to change. 18 | 19 | var cmd_data_config = &commander.Command{ 20 | UsageLine: "config <key> [<value>]", 21 | Short: "Manage data configuration.", 22 | Long: `data config - Manage data configuration. 23 | 24 | Usage: 25 | 26 | data config <key> [<value>] 27 | 28 | Get or set configuration option values. 29 | If <value> is not provided, print <key>'s value, and exit. 30 | If <value> is provided, set <key> to <value>, and exit. 31 | 32 | # sets foo.bar = baz 33 | > data config foo.bar baz 34 | 35 | # gets foo.bar 36 | > data config foo.bar 37 | baz 38 | 39 | Config options are stored in the user's configuration file (~/.dataconfig). 40 | This file is formatted in YAML, and uses the goyaml parser.
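For example, a freshly installed ~/.dataconfig contains:

    index:
      datadex:
        url: http://datadex.io
        user: ""
        token: ""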
(In the future, 41 | it may be formatted like .gitconfig (INI style), using the gcfg parser.) 42 | 43 | `, 44 | Run: configCmd, 45 | Flag: *flag.NewFlagSet("data-config", flag.ExitOnError), 46 | } 47 | 48 | func init() { 49 | cmd_data_config.Flag.Bool("show", false, "show config file") 50 | cmd_data_config.Flag.Bool("edit", false, "edit config file in $EDITOR") 51 | } 52 | 53 | func configCmd(c *commander.Command, args []string) error { 54 | if c.Flag.Lookup("show").Value.Get().(bool) { 55 | return printConfig(&Config) 56 | } 57 | 58 | if c.Flag.Lookup("edit").Value.Get().(bool) { 59 | return configEditor() 60 | } 61 | 62 | if len(args) == 0 { 63 | return fmt.Errorf("%s: requires argument.", c.Name()) 64 | } 65 | 66 | if len(args) == 1 { 67 | value := ConfigGet(args[0]) 68 | if value == nil { 69 | return fmt.Errorf("") // empty string prints out nothing. 70 | } 71 | 72 | m, err := Marshal(value) 73 | if err != nil { 74 | return err 75 | } 76 | io.Copy(os.Stdout, m) 77 | return nil 78 | } 79 | 80 | return ConfigSet(args[0], args[1]) 81 | } 82 | 83 | func printConfig(c *ConfigFormat) error { 84 | f, _ := NewConfigfile("") 85 | f.Config = *c 86 | return f.Write(os.Stdout) 87 | } 88 | 89 | func configEditor() error { 90 | ed := os.Getenv("EDITOR") 91 | if len(ed) < 1 { 92 | pErr("No $EDITOR defined. Defaulting to `nano`.") 93 | ed = "nano" 94 | } 95 | 96 | ed, args := execCmdArgs(ed, []string{globalConfigFile}) 97 | cmd := exec.Command(ed, args...) 98 | cmd.Stdin, cmd.Stdout, cmd.Stderr = os.Stdin, os.Stdout, os.Stderr 99 | return cmd.Run() 100 | } 101 | 102 | func ConfigGetString(key string, default_ string) string { 103 | val := ConfigGet(key) 104 | if val == nil { 105 | return default_ 106 | } 107 | return fmt.Sprintf("%s", val) 108 | } 109 | 110 | func ConfigGet(key string) interface{} { 111 | // struct -> map for dynamic walking 112 | m := map[interface{}]interface{}{} 113 | err := MarshalUnmarshal(Config, &m) 114 | if err != nil { 115 | pErr("data config: error serializing: %s", err) 116 | return nil 117 | } 118 | 119 | var cursor interface{} 120 | var exists bool 121 | cursor = m 122 | for _, part := range strings.Split(key, ".") { 123 | cursor, exists = cursor.(map[interface{}]interface{})[part] 124 | if !exists { 125 | return nil 126 | } 127 | } 128 | 129 | return cursor 130 | } 131 | 132 | func ConfigSet(key string, value string) error { 133 | // struct -> map for dynamic walking 134 | m := map[interface{}]interface{}{} 135 | if err := MarshalUnmarshal(Config, &m); err != nil { 136 | return fmt.Errorf("error serializing config: %s", err) 137 | } 138 | 139 | var cursor interface{} 140 | var exists bool 141 | cursor = m 142 | 143 | parts := strings.Split(key, ".") 144 | for n, part := range parts { 145 | mcursor := cursor.(map[interface{}]interface{}) 146 | // last part, set here. 147 | if n == (len(parts) - 1) { 148 | mcursor[part] = value 149 | break 150 | } 151 | 152 | cursor, exists = mcursor[part] 153 | if !exists { // create map if not here. 154 | mcursor[part] = map[interface{}]interface{}{} 155 | cursor = mcursor[part] 156 | } 157 | } 158 | 159 | // write back. 
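	// The edit above happened on the generic map copy; serialize it back
	// into the live Config so the in-memory value and the file agree.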
160 | if err := MarshalUnmarshal(&m, Config); err != nil { 161 | return fmt.Errorf("error serializing config: %s", err) 162 | } 163 | 164 | return WriteConfigFile(globalConfigFile, &Config) 165 | } 166 | 167 | var globalConfigFile = "~/.dataconfig" 168 | 169 | // type ConfigFormat struct { 170 | // Index map[string]*struct { 171 | // Url string 172 | // User string 173 | // Token string 174 | // Disabled bool ",omitempty" 175 | // } 176 | // } 177 | 178 | type ConfigFormat map[string]interface{} 179 | 180 | var Config = ConfigFormat{} 181 | 182 | // var DefaultConfigText = `[index "datadex.io:8080"] 183 | // user = 184 | // token = 185 | // ` 186 | var DefaultConfigText = `index: 187 | datadex: 188 | url: http://datadex.io 189 | user: "" 190 | token: "" 191 | ` 192 | 193 | // Load config file on startup 194 | func init() { 195 | 196 | // alt config file path 197 | if cf := os.Getenv("DATA_CONFIG"); len(cf) > 0 { 198 | globalConfigFile = cf 199 | pErr("Using config file path: %s\n", globalConfigFile) 200 | } 201 | 202 | // expand ~/ 203 | usr, err := user.Current() 204 | if err != nil { 205 | panic("error: user context. " + err.Error()) 206 | } 207 | dir := usr.HomeDir + "/" 208 | globalConfigFile = strings.Replace(globalConfigFile, "~/", dir, 1) 209 | 210 | // install config if it doesn't exist 211 | if _, err := os.Stat(globalConfigFile); os.IsNotExist(err) { 212 | err := WriteConfigFileText(globalConfigFile, DefaultConfigText) 213 | if err != nil { 214 | panic("error: failed to write config " + globalConfigFile + 215 | ". " + err.Error()) 216 | } 217 | pErr("Wrote new config file: %s\n", globalConfigFile) 218 | } 219 | 220 | // load config 221 | err = ReadConfigFile(globalConfigFile, &Config) 222 | if err != nil { 223 | panic("error: failed to load config " + globalConfigFile + 224 | ". " + err.Error()) 225 | } 226 | } 227 | 228 | func WriteConfigFileText(filename string, text string) error { 229 | file, err := os.Create(filename) 230 | if err != nil { 231 | return err 232 | } 233 | defer file.Close() 234 | _, err = file.Write([]byte(text)) 235 | return err 236 | } 237 | 238 | func WriteConfigFile(filename string, fmt *ConfigFormat) error { 239 | // return gcfg.WriteFile(fmt, filename) 240 | 241 | f, _ := NewConfigfile(filename) 242 | f.Config = *fmt 243 | return f.WriteFile() 244 | } 245 | 246 | func ReadConfigFile(filename string, fmt *ConfigFormat) error { 247 | // return gcfg.ReadFileInto(fmt, filename) 248 | 249 | f, err := NewConfigfile(filename) 250 | if err != nil { 251 | return err 252 | } 253 | 254 | *fmt = f.Config 255 | return nil 256 | } 257 | 258 | // for use with YAML-based config 259 | type Configfile struct { 260 | SerializedFile "-" 261 | Config ConfigFormat "" 262 | } 263 | 264 | func NewConfigfile(path string) (*Configfile, error) { 265 | f := &Configfile{SerializedFile: SerializedFile{Path: path}} 266 | f.Config = ConfigFormat{} 267 | f.SerializedFile.Format = &f.Config 268 | 269 | if len(path) > 0 { 270 | err := f.ReadFile() 271 | if err != nil { 272 | return f, err 273 | } 274 | } 275 | return f, nil 276 | } 277 | 278 | // nice helpers 279 | const AnonymousUser = "anonymous" 280 | 281 | func configUser() string { 282 | return ConfigGetString(fmt.Sprintf("index.%s.user", mainIndexName), "") 283 | } 284 | 285 | func configGetIndex(name string) (map[string]string, error) { 286 | idx_raw := ConfigGet("index."
+ name) 287 | idx, ok := idx_raw.(map[interface{}]interface{}) 288 | if idx_raw == nil || !ok { 289 | return nil, fmt.Errorf("Config error: invalid index.%s", name) 290 | } 291 | sidx := map[string]string{} 292 | for k, v := range idx { 293 | sidx[k.(string)] = fmt.Sprintf("%s", v) 294 | } 295 | return sidx, nil 296 | } 297 | 298 | func isNamedUser(user string) bool { 299 | return len(user) > 0 && user != AnonymousUser 300 | } 301 | -------------------------------------------------------------------------------- /util.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "bufio" 5 | "crypto/sha1" 6 | "fmt" 7 | "github.com/aeden/go-semver" 8 | "github.com/dotcloud/docker/pkg/term" 9 | "github.com/xeonx/timeago" 10 | "io" 11 | "io/ioutil" 12 | "net/http" 13 | "os" 14 | "os/exec" 15 | "path" 16 | "sort" 17 | "strings" 18 | "time" 19 | "unicode" 20 | ) 21 | 22 | var Debug bool 23 | var NotImplementedError = fmt.Errorf("Error: not implemented yet.") 24 | 25 | // Shorthand printing functions. 26 | func pErr(format string, a ...interface{}) { 27 | fmt.Fprintf(os.Stderr, format, a...) 28 | } 29 | 30 | func pOut(format string, a ...interface{}) { 31 | fmt.Fprintf(os.Stdout, format, a...) 32 | } 33 | 34 | func dErr(format string, a ...interface{}) { 35 | if Debug { 36 | pErr(format, a...) 37 | } 38 | } 39 | 40 | func dOut(format string, a ...interface{}) { 41 | if Debug { 42 | pOut(format, a...) 43 | } 44 | } 45 | 46 | // human-readable time ago 47 | func TimeAgo(s string) string { 48 | t, _ := time.Parse("2006-01-02 15:04:05.999999999 -0700 MST", s) 49 | return timeago.English.Format(t) 50 | } 51 | 52 | // Version comparison function 53 | func VersionLess(i, j string) bool { 54 | // attempt to use semver 55 | vi, erri := semver.New(i) 56 | vj, errj := semver.New(j) 57 | if erri == nil && errj == nil { 58 | return vi.LessThan(vj) 59 | } 60 | 61 | // Nope. Compare lexicographically. Gross. 62 | // (There should perhaps be a looser, semver-like comparison, 63 | // which attempts to compare any number of dot-separated ints. 
64 | // Semver expects three exactly, but perhaps we want to compare 65 | // "1.8" < "1.10" 66 | return i < j 67 | } 68 | 69 | // Checks whether string is a hash (sha1) 70 | func IsHash(hash string) bool { 71 | if len(hash) != 40 { 72 | return false 73 | } 74 | 75 | for _, r := range hash { 76 | if !unicode.Is(unicode.ASCII_Hex_Digit, r) { 77 | return false 78 | } 79 | } 80 | 81 | return true 82 | } 83 | 84 | func shortHash(hash string) string { 85 | return hash[:7] 86 | } 87 | 88 | func readerHash(r io.Reader) (string, error) { 89 | bf := bufio.NewReader(r) 90 | h := sha1.New() 91 | _, err := bf.WriteTo(h) 92 | if err != nil { 93 | return "", err 94 | } 95 | 96 | hex := fmt.Sprintf("%x", h.Sum(nil)) 97 | return hex, nil 98 | } 99 | 100 | func StringHash(s string) (string, error) { 101 | r := strings.NewReader(s) 102 | h := sha1.New() 103 | _, err := r.WriteTo(h) 104 | if err != nil { 105 | return "", err 106 | } 107 | 108 | hex := fmt.Sprintf("%x", h.Sum(nil)) 109 | return hex, nil 110 | } 111 | 112 | func hashFile(path string) (string, error) { 113 | f, err := os.Open(path) 114 | if err != nil { 115 | return "", err 116 | } 117 | defer f.Close() 118 | 119 | return readerHash(f) 120 | } 121 | 122 | func catFile(path string) error { 123 | f, err := os.Open(path) 124 | if err != nil { 125 | return err 126 | } 127 | defer f.Close() 128 | 129 | br := bufio.NewReader(f) 130 | _, err = io.Copy(os.Stdout, br) 131 | return err 132 | } 133 | 134 | func copyFile(src string, dst string) error { 135 | cmd := exec.Command("cp", src, dst) 136 | return cmd.Run() 137 | } 138 | 139 | // clean up ident string 140 | func identString(ident string) string { 141 | return NonIdentRegexp.ReplaceAllString(ident, "") 142 | } 143 | 144 | // remove duplicates in a string slice 145 | func set(slice []string) []string { 146 | dedup := []string{} 147 | elems := map[string]bool{} 148 | for _, elem := range slice { 149 | _, seen := elems[elem] 150 | if !seen { 151 | dedup = append(dedup, elem) 152 | elems[elem] = true 153 | } 154 | } 155 | return dedup 156 | } 157 | 158 | func validHashes(hashes []string) (valid []string, err error) { 159 | hashes = set(hashes) 160 | 161 | // append only valid hashes 162 | for _, hash := range hashes { 163 | if IsHash(hash) { 164 | valid = append(valid, hash) 165 | } else { 166 | err = fmt.Errorf("invalid : %v", hash) 167 | } 168 | } 169 | 170 | return 171 | } 172 | 173 | // Url utils 174 | 175 | const ArchiveSuffix = ".tar.gz" 176 | 177 | func IsArchiveUrl(str string) bool { 178 | return isUrl(str) && strings.HasSuffix(str, ArchiveSuffix) 179 | } 180 | 181 | func isUrl(str string) bool { 182 | return strings.HasPrefix(str, "http://") || strings.HasPrefix(str, "https://") 183 | } 184 | 185 | func httpExists(url string) (bool, error) { 186 | resp, err := http.Get(url) 187 | if err != nil { 188 | return false, err 189 | } 190 | defer resp.Body.Close() 191 | 192 | c := resp.StatusCode 193 | switch { 194 | case 200 <= c && c < 400: 195 | return true, nil 196 | case 400 <= c && c < 500: 197 | return false, nil 198 | default: 199 | return false, fmt.Errorf("Network or server error retrieving: %s", url) 200 | } 201 | } 202 | 203 | func httpGet(url string) (*http.Response, error) { 204 | dOut("http get %s\n", url) 205 | resp, err := http.Get(url) 206 | if err != nil { 207 | return nil, err 208 | } 209 | 210 | c := resp.StatusCode 211 | if 200 <= c && c < 400 { 212 | return resp, nil 213 | } 214 | 215 | e, _ := ioutil.ReadAll(resp.Body) 216 | resp.Body.Close() 217 | 218 | s := 
strings.TrimSpace(string(e[:])) 219 | return nil, fmt.Errorf("HTTP error status code: %d (%s)", c, s) 220 | } 221 | 222 | func httpPost(url string, bt string, b io.Reader) (*http.Response, error) { 223 | dOut("http post %s\n", url) 224 | resp, err := http.Post(url, bt, b) 225 | if err != nil { 226 | return nil, err 227 | } 228 | 229 | c := resp.StatusCode 230 | if 200 <= c && c < 400 { 231 | return resp, nil 232 | } 233 | 234 | e, _ := ioutil.ReadAll(resp.Body) 235 | resp.Body.Close() 236 | 237 | s := strings.TrimSpace(string(e[:])) 238 | return nil, fmt.Errorf("HTTP error status code: %d (%s)", c, s) 239 | } 240 | 241 | func httpReadAll(url string) ([]byte, error) { 242 | resp, err := httpGet(url) 243 | if err != nil { 244 | return nil, err 245 | } 246 | defer resp.Body.Close() 247 | 248 | contents, err := ioutil.ReadAll(resp.Body) 249 | if err != nil { 250 | return nil, err 251 | } 252 | 253 | return contents, nil 254 | } 255 | 256 | func httpWriteToFile(url string, filename string) error { 257 | resp, err := httpGet(url) 258 | if err != nil { 259 | return err 260 | } 261 | defer resp.Body.Close() 262 | 263 | file, err := createFile(filename) 264 | if err != nil { 265 | return err 266 | } 267 | defer file.Close() 268 | 269 | _, err = io.Copy(file, resp.Body) 270 | return err 271 | } 272 | 273 | func createFile(filename string) (*os.File, error) { 274 | err := os.MkdirAll(path.Dir(filename), 0777) 275 | if err != nil { 276 | return nil, err 277 | } 278 | 279 | return os.Create(filename) 280 | } 281 | 282 | // Extraction 283 | func extractArchive(filename string) error { 284 | file, err := os.Open(filename) 285 | if err != nil { 286 | return err 287 | } 288 | defer file.Close() 289 | 290 | dst := strings.TrimSuffix(filename, ArchiveSuffix) 291 | err = os.MkdirAll(dst, 0777) 292 | if err != nil { 293 | return err 294 | } 295 | 296 | dst = path.Base(dst) 297 | src := path.Base(filename) 298 | cmd := exec.Command("tar", "xzf", src, "--strip-components", "1", "-C", dst) 299 | cmd.Dir = path.Dir(filename) 300 | out, err := cmd.CombinedOutput() 301 | if err != nil { 302 | outs := string(out) 303 | if strings.Contains(outs, "Error opening archive:") { 304 | return fmt.Errorf(outs) 305 | } 306 | 307 | return err 308 | } 309 | 310 | return nil 311 | } 312 | 313 | // Input 314 | func readInput() (string, error) { 315 | reader := bufio.NewReader(os.Stdin) 316 | line, _, err := reader.ReadLine() 317 | if err != nil { 318 | return "", err 319 | } 320 | return string(line), nil 321 | } 322 | 323 | func readInputSilent() (string, error) { 324 | fd := os.Stdin.Fd() 325 | s, _ := term.SaveState(fd) 326 | term.DisableEcho(fd, s) 327 | 328 | input, err := readInput() 329 | term.RestoreTerminal(fd, s) 330 | 331 | pOut("\n") 332 | return input, err 333 | } 334 | 335 | // Exec helper 336 | func execCmdArgs(path string, args []string) (string, []string) { 337 | if args == nil { 338 | args = []string{} 339 | } 340 | 341 | parts := strings.Split(path, " ") 342 | if len(parts) > 1 { 343 | path = parts[0] 344 | args = append(parts[1:], args...) 345 | } 346 | 347 | return path, args 348 | } 349 | 350 | // Map sorting -- lifted from 351 | // https://groups.google.com/d/msg/golang-nuts/FT7cjmcL7gw/Gj4_aEsE_IsJ 352 | 353 | // A data structure to hold a key/value pair. 354 | type pair struct { 355 | Key string 356 | Value string 357 | } 358 | 359 | // A slice of Pairs that implements sort.Interface to sort by Value. 
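// Sorting by value (not key) lets callers order a refs map by its ISO-8601
// timestamps; see DatasetRefs.LastUpdated in data_ref.go.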
360 | type pairList []pair 361 | 362 | func (p pairList) Swap(i, j int) { p[i], p[j] = p[j], p[i] } 363 | func (p pairList) Len() int { return len(p) } 364 | func (p pairList) Less(i, j int) bool { return p[i].Value < p[j].Value } 365 | 366 | // A function to turn a map into a PairList, then sort and return it. 367 | func sortMapByValue(m map[string]string) pairList { 368 | p := make(pairList, len(m)) 369 | i := 0 370 | for k, v := range m { 371 | p[i] = pair{k, v} 372 | i++ 373 | } 374 | sort.Sort(p) 375 | return p 376 | } 377 | -------------------------------------------------------------------------------- /data_user.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "fmt" 5 | "github.com/gonuts/flag" 6 | "github.com/jbenet/commander" 7 | "io" 8 | "io/ioutil" 9 | "os" 10 | "strings" 11 | ) 12 | 13 | var cmd_data_user = &commander.Command{ 14 | UsageLine: "user ", 15 | Short: "Manage users and credentials.", 16 | Long: `data user - Manage users and credentials. 17 | 18 | Usage: 19 | 20 | data user 21 | 22 | Commands: 23 | 24 | add [] Register new user with index. 25 | auth [] Authenticate user account. 26 | pass [] Change user password. 27 | info [] Show (or edit) public user information. 28 | url [] Output user profile url. 29 | 30 | If no argument is provided, data will ask for the username. 31 | 32 | User accounts are needed in order to publish dataset packages to the 33 | dataset index. Packages are listed under their owner's username: 34 | '/'. 35 | 36 | `, 37 | Subcommands: []*commander.Command{ 38 | cmd_data_user_add, 39 | cmd_data_user_auth, 40 | cmd_data_user_pass, 41 | cmd_data_user_info, 42 | cmd_data_user_url, 43 | }, 44 | } 45 | 46 | var cmd_data_user_add = &commander.Command{ 47 | UsageLine: "add []", 48 | Short: "Register new user with index.", 49 | Long: `data user add - Register new user with index. 50 | 51 | Guided process to register a new user account with dataset index. 52 | 53 | See data user. 54 | `, 55 | Run: userAddCmd, 56 | } 57 | 58 | var cmd_data_user_auth = &commander.Command{ 59 | UsageLine: "auth []", 60 | Short: "Authenticate user account.", 61 | Long: `data user auth - Authenticate user account. 62 | 63 | Authenticate (login) user account to index. An auth token is retrieved 64 | and stored in the local config file. 65 | 66 | See data user. 67 | `, 68 | Run: userAuthCmd, 69 | Flag: *flag.NewFlagSet("data-user-auth", flag.ExitOnError), 70 | } 71 | 72 | var cmd_data_user_pass = &commander.Command{ 73 | UsageLine: "pass []", 74 | Short: "Change user password.", 75 | Long: `data user pass - Change user password. 76 | 77 | Guided process to change user account password with dataset index. 78 | 79 | See data user. 80 | `, 81 | Run: userPassCmd, 82 | } 83 | 84 | var cmd_data_user_info = &commander.Command{ 85 | UsageLine: "info []", 86 | Short: "Show (or edit) public user information.", 87 | Long: `data user info - Show (or edit) public user information. 88 | 89 | Output or edit the profile information of a user. Note that profiles 90 | are publicly viewable. User profiles include: 91 | 92 | Full Name 93 | Email Address 94 | Github Username 95 | Twitter Username 96 | Website Url 97 | Packages List 98 | 99 | See data user. 
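For example, a profile prints as YAML (values illustrative):

  name: Richard Feynman
  email: feynman@example.com
  github: feynman
  website: http://example.com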
100 | `, 101 | Run: userInfoCmd, 102 | Flag: *flag.NewFlagSet("data-user-info", flag.ExitOnError), 103 | } 104 | 105 | var cmd_data_user_url = &commander.Command{ 106 | UsageLine: "url []", 107 | Short: "Output user profile url.", 108 | Long: `data user url - Output user profile url. 109 | 110 | Output the dataset index url for the profile of user named by . 111 | 112 | See data user. 113 | `, 114 | Run: userUrlCmd, 115 | } 116 | 117 | func init() { 118 | cmd_data_user_info.Flag.Bool("edit", false, "edit user info") 119 | cmd_data_user_auth.Flag.Bool("clear", false, "clear authentication") 120 | } 121 | 122 | func userCmdUserIndex(args []string) (*UserIndex, error) { 123 | var user string 124 | var err error 125 | 126 | if len(args) > 0 && len(args[0]) > 0 { 127 | user = args[0] 128 | } 129 | 130 | for !UserRegexp.MatchString(user) { 131 | pOut("Username: ") 132 | user, err = readInput() 133 | if err != nil { 134 | return nil, err 135 | } 136 | } 137 | 138 | di, err := NewMainDataIndex() 139 | if err != nil { 140 | return nil, err 141 | } 142 | 143 | ui := di.NewUserIndex(user) 144 | return ui, nil 145 | } 146 | 147 | func userAddCmd(c *commander.Command, args []string) error { 148 | ui, err := userCmdUserIndex(args) 149 | if err != nil { 150 | return err 151 | } 152 | 153 | pass, err := inputNewPassword() 154 | if err != nil { 155 | return err 156 | } 157 | 158 | email, err := inputNewEmail() 159 | if err != nil { 160 | return err 161 | } 162 | 163 | err = ui.Add(pass, email) 164 | if err != nil { 165 | return err 166 | } 167 | 168 | pOut("Registered %s.\n", ui.User) 169 | err = ui.Auth(pass) 170 | if err != nil { 171 | return err 172 | } 173 | 174 | pOut("Authenticated as %s.\n", ui.User) 175 | return nil 176 | } 177 | 178 | func userAuthCmd(c *commander.Command, args []string) error { 179 | // clear flag? sign out 180 | if c.Flag.Lookup("clear").Value.Get().(bool) { 181 | if err := ConfigSet("index.datadex.user", ""); err != nil { 182 | return err 183 | } 184 | if err := ConfigSet("index.datadex.token", ""); err != nil { 185 | return err 186 | } 187 | pOut("Signed out.\n") 188 | return nil 189 | } 190 | 191 | ui, err := userCmdUserIndex(args) 192 | if err != nil { 193 | return err 194 | } 195 | 196 | pOut("Password: ") 197 | pass, err := readInputSilent() 198 | if err != nil { 199 | return err 200 | } 201 | 202 | err = ui.Auth(pass) 203 | if err != nil { 204 | return err 205 | } 206 | 207 | pOut("Authenticated as %s.\n", ui.User) 208 | return nil 209 | } 210 | 211 | func userPassCmd(c *commander.Command, args []string) error { 212 | ui, err := userCmdUserIndex(args) 213 | if err != nil { 214 | return err 215 | } 216 | 217 | pOut("Current Password: ") 218 | curp, err := readInputSilent() 219 | if err != nil { 220 | return err 221 | } 222 | 223 | pOut("New ") 224 | newp, err := inputNewPassword() 225 | if err != nil { 226 | return err 227 | } 228 | 229 | err = ui.Pass(curp, newp) 230 | if err != nil { 231 | return err 232 | } 233 | 234 | pOut("Password changed. 
You will receive an email notification.\n") 235 | return nil 236 | } 237 | 238 | func userInfoCmd(c *commander.Command, args []string) error { 239 | // default to user on config 240 | cu := configUser() 241 | if len(args) == 0 && isNamedUser(cu) { 242 | args = append(args, cu) 243 | } 244 | 245 | ui, err := userCmdUserIndex(args) 246 | if err != nil { 247 | return err 248 | } 249 | 250 | p, err := ui.GetInfo() 251 | if err != nil { 252 | return err 253 | } 254 | 255 | // not editing 256 | if !c.Flag.Lookup("edit").Value.Get().(bool) { 257 | rdr, err := Marshal(p) 258 | if err != nil { 259 | return err 260 | } 261 | 262 | _, err = io.Copy(os.Stdout, rdr) 263 | return err 264 | } 265 | 266 | if cu != ui.User { 267 | return fmt.Errorf("Authenticated as %s."+ 268 | " Reauthenticate with 'data user auth'", cu) 269 | } 270 | 271 | // editing own profile. 272 | err = fillOutUserProfile(p) 273 | if err != nil { 274 | return err 275 | } 276 | 277 | err = ui.PostInfo(p) 278 | if err != nil { 279 | return err 280 | } 281 | 282 | pOut("Profile saved.\n") 283 | return nil 284 | } 285 | 286 | func userUrlCmd(c *commander.Command, args []string) error { 287 | // default to user on config 288 | if len(args) == 0 { 289 | cu := configUser() 290 | if isNamedUser(cu) { 291 | args = append(args, cu) 292 | } 293 | } 294 | 295 | ui, err := userCmdUserIndex(args) 296 | if err != nil { 297 | return err 298 | } 299 | 300 | pOut("%s\n", strings.Replace(ui.Http.Url, "/user", "", 1)) 301 | return nil 302 | } 303 | 304 | const PasswordMinLength = 6 305 | 306 | func inputNewPassword() (string, error) { 307 | var pass string 308 | for len(pass) < PasswordMinLength { 309 | pOut("Password (%d char min): ", PasswordMinLength) 310 | var err error 311 | pass, err = readInputSilent() 312 | if err != nil { 313 | return "", err 314 | } 315 | } 316 | return pass, nil 317 | } 318 | 319 | func inputNewEmail() (string, error) { 320 | var email string 321 | 322 | for !EmailRegexp.MatchString(email) { 323 | pOut("Email (for security): ") 324 | var err error 325 | 326 | email, err = readInput() 327 | if err != nil { 328 | return "", err 329 | } 330 | } 331 | return email, nil 332 | } 333 | 334 | // serializable into YAML 335 | type UserProfile struct { 336 | Name string 337 | Email string 338 | Github string ",omitempty" 339 | Twitter string ",omitempty" 340 | Website string ",omitempty" 341 | Packages []string ",omitempty" 342 | } 343 | 344 | type UserIndex struct { 345 | Http *HttpClient 346 | User string 347 | Refs *DatasetRefs 348 | } 349 | 350 | func (i UserIndex) Passhash(pass string) (string, error) { 351 | // additional hashing of the password before sending. 352 | // this resulting `passhash` is really the user's password. 
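	// (e.g. StringHash("hunter2" + "feynman") -- a sha1 hex digest --
	// is what actually travels to the server; values illustrative.)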
353 | // this is so that passwords are never seen by the server as plaintext 354 | return StringHash(pass + i.User) 355 | } 356 | 357 | func (i *UserIndex) GetInfo() (*UserProfile, error) { 358 | resp, err := i.Http.Get("info") 359 | if err != nil { 360 | return nil, err 361 | } 362 | defer resp.Body.Close() 363 | 364 | profile := &UserProfile{} 365 | err = Unmarshal(resp.Body, profile) 366 | if err != nil { 367 | return nil, err 368 | } 369 | 370 | return profile, nil 371 | } 372 | 373 | func (i *UserIndex) PostInfo(p *UserProfile) error { 374 | _, err := i.Http.Post("info", p) 375 | return err 376 | } 377 | 378 | func (i *UserIndex) Auth(pass string) error { 379 | ph, err := i.Passhash(pass) 380 | if err != nil { 381 | return err 382 | } 383 | 384 | resp, err := i.Http.Post("auth", ph) 385 | if err != nil { 386 | return err 387 | } 388 | 389 | buf, err := ioutil.ReadAll(resp.Body) 390 | if err != nil { 391 | return fmt.Errorf("Error reading token. %s", err) 392 | } 393 | 394 | token := string(buf[:]) 395 | if !IsHash(token) { 396 | return fmt.Errorf("Invalid token received %s", token) 397 | } 398 | 399 | if err := ConfigSet("index.datadex.user", i.User); err != nil { 400 | return fmt.Errorf("Error setting user. %s", err) 401 | } 402 | 403 | if err := ConfigSet("index.datadex.token", token); err != nil { 404 | return fmt.Errorf("Error setting token. %s", err) 405 | } 406 | 407 | return nil 408 | } 409 | 410 | func (i *UserIndex) Pass(cp string, np string) error { 411 | cph, err := i.Passhash(cp) 412 | if err != nil { 413 | return err 414 | } 415 | 416 | nph, err := i.Passhash(np) 417 | if err != nil { 418 | return err 419 | } 420 | 421 | _, err = i.Http.Post("pass", &NewPassMsg{cph, nph}) 422 | return err 423 | } 424 | 425 | func (i *UserIndex) Add(pass string, email string) error { 426 | ph, err := i.Passhash(pass) 427 | if err != nil { 428 | return err 429 | } 430 | 431 | _, err = i.Http.Post("add", &NewUserMsg{ph, email}) 432 | if err != nil { 433 | if strings.Contains(err.Error(), "user exists") { 434 | m := "Error: username '%s' already in use. Try another." 
435 | return fmt.Errorf(m, i.User) 436 | } 437 | } 438 | return err 439 | } 440 | 441 | func (i *UserIndex) AwsCred() (*AwsCredentials, error) { 442 | resp, err := i.Http.Get("awscred") 443 | if err != nil { 444 | return nil, err 445 | } 446 | defer resp.Body.Close() 447 | 448 | creds := &AwsCredentials{} 449 | err = Unmarshal(resp.Body, creds) 450 | if err != nil { 451 | return nil, err 452 | } 453 | 454 | return creds, nil 455 | } 456 | 457 | // DataIndex extension to generate a UserIndex 458 | func (d *DataIndex) NewUserIndex(user string) *UserIndex { 459 | return &UserIndex{ 460 | Http: &HttpClient{ 461 | BaseUrl: d.Http.BaseUrl, 462 | Url: d.Http.Url + "/" + user + "/" + "user", 463 | User: d.Http.User, 464 | AuthToken: d.Http.AuthToken, 465 | }, 466 | User: user, 467 | } 468 | } 469 | 470 | type NewUserMsg struct { 471 | Pass string 472 | Email string 473 | } 474 | 475 | type NewPassMsg struct { 476 | Current string 477 | New string 478 | } 479 | -------------------------------------------------------------------------------- /data_manifest.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "github.com/gonuts/flag" 7 | "github.com/jbenet/commander" 8 | "os" 9 | "path/filepath" 10 | "strings" 11 | ) 12 | 13 | const ManifestFileName = ".data/Manifest" 14 | const noHash = "" 15 | 16 | var cmd_data_manifest = &commander.Command{ 17 | UsageLine: "manifest [[ add | remove | hash | check ] ]", 18 | Short: "Generate and manipulate dataset manifest.", 19 | Long: `data manifest - Generate and manipulate dataset manifest. 20 | 21 | Generates and manipulates this dataset's manifest. The manifest 22 | is a mapping of { : }, and describes all files 23 | that compose a dataset. This mapping is generated by adding and 24 | hashing (checksum) files. 25 | 26 | Running data-manifest without arguments will generate (or patch) 27 | the manifest. Note that already hashed files will not be re-hashed 28 | unless forced to. Some files may be massive, and hashing every run 29 | would be prohibitively expensive. 30 | 31 | Commands: 32 | 33 | add Adds to manifest (does not hash). 34 | rm Removes from manifest. 35 | hash Hashes and adds checksum to manifest. 36 | check Verifies checksum matches manifest. 37 | 38 | (use the --all flag to do it to all available files) 39 | 40 | Loosely, data-manifest's process is: 41 | 42 | - List all files in the working directory. 43 | - Add files to the manifest (effectively tracking them). 44 | - Hash tracked files, adding checksums to the manifest. 45 | `, 46 | Run: manifestCmd, 47 | Subcommands: []*commander.Command{ 48 | cmd_data_manifest_add, 49 | cmd_data_manifest_rm, 50 | cmd_data_manifest_hash, 51 | cmd_data_manifest_check, 52 | }, 53 | } 54 | 55 | var cmd_data_manifest_add = &commander.Command{ 56 | UsageLine: "add ", 57 | Short: "Adds to manifest (does not hash).", 58 | Long: `data manifest add - Adds to manifest (does not hash). 59 | 60 | Adding files to the manifest ensures they are tracked. This command 61 | adds the given to the manifest, saves it, and exits. It does 62 | not automatically hash the file (run 'data manifest hash'). 63 | 64 | See 'data manifest'. 65 | 66 | Arguments: 67 | 68 | path of the file to add. 
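For example (illustrative path):

    > data manifest add images/train.csv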
69 | 70 | `, 71 | Run: manifestAddCmd, 72 | Flag: *flag.NewFlagSet("data-manifest-add", flag.ExitOnError), 73 | } 74 | 75 | var cmd_data_manifest_rm = &commander.Command{ 76 | UsageLine: "rm ", 77 | Short: "Removes from manifest.", 78 | Long: `data manifest rm - Removes from manifest. 79 | 80 | Removing files from the manifest stops tracking them. This command 81 | removes the given (and hash) from the manifest, and exits. 82 | 83 | See 'data manifest'. 84 | 85 | Arguments: 86 | 87 | path of the file to remove. 88 | 89 | `, 90 | Run: manifestRmCmd, 91 | Flag: *flag.NewFlagSet("data-manifest-rm", flag.ExitOnError), 92 | } 93 | 94 | var cmd_data_manifest_hash = &commander.Command{ 95 | UsageLine: "hash ", 96 | Short: "Hashes and adds checksum to manifest.", 97 | Long: `data manifest hash - Hashes and adds checksum to manifest. 98 | 99 | Hashing files in the manifest calculates the file checksums. This command 100 | hashes the given , adds it to the manifest, and exits. 101 | 102 | See 'data manifest'. 103 | 104 | Arguments: 105 | 106 | path of the file to hash. 107 | 108 | `, 109 | Run: manifestHashCmd, 110 | Flag: *flag.NewFlagSet("data-manifest-hash", flag.ExitOnError), 111 | } 112 | 113 | var cmd_data_manifest_check = &commander.Command{ 114 | UsageLine: "check ", 115 | Short: "Verifies checksum matches manifest.", 116 | Long: `data manifest check - Verifies checksum matches manifest. 117 | 118 | The manifest lists the files and their checksums. This command 119 | hashes the given , and prints whether its checksum matches the 120 | stored checksum. 121 | 122 | See 'data manifest'. 123 | 124 | Arguments: 125 | 126 | path of the file to check. 127 | 128 | `, 129 | Run: manifestCheckCmd, 130 | Flag: *flag.NewFlagSet("data-manifest-check", flag.ExitOnError), 131 | } 132 | 133 | func init() { 134 | cmd_data_manifest_add.Flag.Bool("all", false, "add all available files") 135 | cmd_data_manifest_rm.Flag.Bool("all", false, "remove all tracked files") 136 | cmd_data_manifest_hash.Flag.Bool("all", false, "hash all tracked files") 137 | cmd_data_manifest_check.Flag.Bool("all", false, "check all tracked files") 138 | } 139 | 140 | func manifestCmd(c *commander.Command, args []string) error { 141 | mf := NewDefaultManifest() 142 | return mf.Generate() 143 | } 144 | 145 | func manifestCmdPaths(c *commander.Command, args []string) ([]string, error) { 146 | mf := NewDefaultManifest() 147 | paths := args 148 | 149 | // Use all files available if --all is passed in. 150 | all := c.Flag.Lookup("all").Value.Get().(bool) 151 | if all { 152 | paths = []string{} 153 | for path, _ := range mf.Files { 154 | paths = append(paths, path) 155 | } 156 | } 157 | 158 | if len(paths) < 1 { 159 | return nil, fmt.Errorf("%v: no files specified.", c.FullName()) 160 | } 161 | 162 | return paths, nil 163 | } 164 | 165 | func manifestAddCmd(c *commander.Command, args []string) error { 166 | mf := NewDefaultManifest() 167 | paths := args 168 | 169 | // Use all files available if --all is passed in. 
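	// Note: unlike manifestCmdPaths, which expands --all to paths already
	// tracked in the manifest, add's --all walks the working directory to
	// pick up new files.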
170 | all := c.Flag.Lookup("all").Value.Get().(bool) 171 | if all { 172 | paths = listAllFiles(".") 173 | } 174 | 175 | if len(paths) < 1 { 176 | return fmt.Errorf("%v: no files specified.", c.FullName()) 177 | } 178 | 179 | // add files to manifest file 180 | for _, f := range paths { 181 | err := mf.Add(f) 182 | if err != nil { 183 | return err 184 | } 185 | } 186 | 187 | return nil 188 | } 189 | 190 | func manifestRmCmd(c *commander.Command, args []string) error { 191 | mf := NewDefaultManifest() 192 | 193 | paths, err := manifestCmdPaths(c, args) 194 | if err != nil { 195 | return err 196 | } 197 | 198 | // remove files from manifest file 199 | for _, f := range paths { 200 | err := mf.Remove(f) 201 | if err != nil { 202 | return err 203 | } 204 | } 205 | 206 | return nil 207 | } 208 | 209 | func manifestHashCmd(c *commander.Command, args []string) error { 210 | mf := NewDefaultManifest() 211 | 212 | paths, err := manifestCmdPaths(c, args) 213 | if err != nil { 214 | return err 215 | } 216 | 217 | // hash files in manifest file 218 | for _, f := range paths { 219 | err := mf.Hash(f) 220 | if err != nil { 221 | return err 222 | } 223 | } 224 | 225 | return nil 226 | } 227 | 228 | func manifestCheckCmd(c *commander.Command, args []string) error { 229 | mf := NewDefaultManifest() 230 | 231 | paths, err := manifestCmdPaths(c, args) 232 | if err != nil { 233 | return err 234 | } 235 | 236 | // check files in manifest file 237 | failed := 0 238 | for _, f := range paths { 239 | pass, err := mf.Check(f) 240 | if err != nil { 241 | // return err 242 | } 243 | 244 | if !pass { 245 | failed++ 246 | } 247 | } 248 | 249 | if failed > 0 { 250 | return fmt.Errorf("data manifest check: %d/%d checksums failed.", 251 | failed, len(paths)) 252 | } 253 | 254 | return nil 255 | } 256 | 257 | type Manifest struct { 258 | SerializedFile "-" 259 | Files blobPaths "" 260 | } 261 | 262 | func NewManifest(path string) *Manifest { 263 | mf := &Manifest{SerializedFile: SerializedFile{Path: path}} 264 | 265 | // initialize map 266 | mf.Files = blobPaths{} 267 | mf.SerializedFile.Format = mf.Files 268 | 269 | // attempt to load 270 | if len(path) > 0 { 271 | mf.ReadFile() 272 | } 273 | return mf 274 | } 275 | 276 | func NewDefaultManifest() *Manifest { 277 | return NewManifest(ManifestFileName) 278 | } 279 | 280 | func NewManifestWithRef(ref string) (*Manifest, error) { 281 | f := NewManifest("") 282 | err := f.ReadBlob(ref) 283 | if err != nil { 284 | return nil, err 285 | } 286 | return f, nil 287 | } 288 | 289 | func (mf *Manifest) Generate() error { 290 | pErr("Generating Manifest file...\n") 291 | 292 | // add new files to manifest file 293 | // (for now add everything. `data manifest {add,rm}` in future) 294 | for _, f := range listAllFiles(".") { 295 | err := mf.Add(f) 296 | if err != nil { 297 | return err 298 | } 299 | } 300 | 301 | // warn about manifest-listed files missing from directory 302 | // (basically, missing things. User removes individually, or `rm --missing`) 303 | 304 | // Once all files are listed, hash all the files, storing the hashes. 305 | for f, h := range mf.Files { 306 | if IsHash(h) && h != noHash { 307 | continue 308 | } 309 | 310 | err := mf.Hash(f) 311 | if err != nil { 312 | return err 313 | } 314 | } 315 | 316 | if len(mf.Files) == 0 { 317 | err := mf.WriteFile() 318 | if err != nil { 319 | return err 320 | } 321 | 322 | pErr("Warning: no files in directory.
	} else {
		pErr("%d files in Manifest.\n", len(mf.Files))
	}

	return nil
}

func (mf *Manifest) Clear() error {
	for f := range mf.Files {
		delete(mf.Files, f)
	}
	return mf.WriteFile()
}

func (mf *Manifest) Add(path string) error {
	// check; don't override (could have hash value)
	_, exists := (mf.Files)[path]
	if exists {
		return nil
	}

	(mf.Files)[path] = noHash

	// Write out file (store incrementally)
	err := mf.WriteFile()
	if err != nil {
		return err
	}

	pErr("data manifest: added %s\n", path)
	return nil
}

func (mf *Manifest) Remove(path string) error {
	// check; don't remove nonexistent path
	_, exists := (mf.Files)[path]
	if !exists {
		return nil
	}

	delete(mf.Files, path)

	// Write out file (store incrementally)
	err := mf.WriteFile()
	if err != nil {
		return err
	}

	pErr("data manifest: removed %s\n", path)
	return nil
}

func (mf *Manifest) Hash(path string) error {
	h, err := hashFile(path)
	if err != nil {
		return err
	}

	(mf.Files)[path] = h

	// Write out file (store incrementally)
	err = mf.WriteFile()
	if err != nil {
		return err
	}

	pErr("data manifest: hashed %.7s %s\n", h, path)
	return nil
}

func (mf *Manifest) Check(path string) (bool, error) {
	oldHash, found := (mf.Files)[path]
	if !found {
		return false, fmt.Errorf("data manifest: file not in manifest %s", path)
	}

	mfmt := "data manifest: check %.7s %s %s"

	newHash, err := hashFile(path)
	if err != nil {
		switch err.(type) {
		case *os.PathError:
			// non-existent files count as not hashing correctly.
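			// (A missing file surfaces as *os.PathError from the underlying
			// open call, so it is reported as a FAIL rather than an error.)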
			pErr(mfmt, oldHash, path, "FAIL - not found\n")
			return false, nil
		default:
			return false, err
		}
	}

	if newHash != oldHash {
		pErr(mfmt, oldHash, path, "FAIL\n")
		return false, nil
	}

	dOut(mfmt, oldHash, path, "PASS\n")
	return true, nil
}

func (mf *Manifest) PathsForHash(hash string) []string {
	l := []string{}
	for path, h := range mf.Files {
		if h == hash {
			l = append(l, path)
		}
	}
	return l
}

func (mf *Manifest) HashForPath(path string) string {
	hash, exists := (mf.Files)[path]
	if exists {
		return hash
	}
	return ""
}

func (mf *Manifest) HashForPathCaseInsensitive(path string) string {
	path = strings.ToLower(path)
	for opath, h := range mf.Files {
		opath = strings.ToLower(opath)
		if opath == path {
			return h
		}
	}
	return ""
}

func (mf *Manifest) AllPaths() []string {
	l := []string{}
	for p := range mf.Files {
		l = append(l, p)
	}
	return l
}

func (mf *Manifest) AllHashes() []string {
	l := []string{}
	for _, h := range mf.Files {
		l = append(l, h)
	}
	return l
}

func (mf *Manifest) Complete() bool {
	// must have at least one file (Datafile)
	if len(mf.Files) < 1 {
		return false
	}

	// all hashes must be computed
	for _, h := range mf.Files {
		if !IsHash(h) || h == noHash {
			return false
		}
	}

	return true
}

func listAllFiles(path string) []string {

	files := []string{}
	walkFn := func(path string, info os.FileInfo, err error) error {

		if info.IsDir() {

			// entirely skip hidden dirs (but not the "." walk root)
			if len(info.Name()) > 1 && strings.HasPrefix(info.Name(), ".") {
				dOut("data manifest: skipping %s/\n", info.Name())
				return filepath.SkipDir
			}

			// skip datasets/
			if path == DatasetDir {
				dOut("data manifest: skipping %s/\n", info.Name())
				return filepath.SkipDir
			}

			// don't store dirs
			return nil
		}

		// skip manifest file
		if path == ManifestFileName {
			dOut("data manifest: skipping %s\n", info.Name())
			return nil
		}

		// skip hidden files
		if strings.HasPrefix(info.Name(), ".") {
			dOut("data manifest: skipping %s\n", info.Name())
			return nil
		}

		files = append(files, path)
		return nil
	}

	filepath.Walk(path, walkFn)
	return files
}

func (mf *Manifest) ManifestHash() (string, error) {
	buf, err := mf.Marshal()
	if err != nil {
		return "", err
	}

	r := bytes.NewReader(buf)
	return readerHash(r)
}
-------------------------------------------------------------------------------- /data_pack.go: --------------------------------------------------------------------------------
package data

import (
	"fmt"
	"github.com/gonuts/flag"
	"github.com/jbenet/commander"
	"os"
	"path"
	"strings"
)

var cmd_data_pack = &commander.Command{
	UsageLine: "pack <command>",
	Short:     "Dataset packaging, upload, and download.",
	Long: `data pack - Dataset packaging, upload, and download.

Commands:

    pack make        Create or update package description.
    pack manifest    Show current package manifest.
    pack upload      Upload package to remote storage.
    pack download    Download package from remote storage.
    pack publish     Publish package to dataset index.
    pack check       Verify all file checksums match.


What is a data package?

A data package represents a single dataset, a unit of information.
data makes it easy to find, download, create, publish, and maintain
these datasets/packages.

Dataset packages are simply file directories with two extra files:
- Datafile, containing dataset description and metadata
- Manifest, containing dataset file paths and checksums
(See 'data help datafile' and 'data help manifest'.)

data pack make

'Packing' is the process of generating the package's Datafile and
Manifest. The Manifest is built automatically, but the Datafile
requires user input, to specify name, author, description, etc.

data pack manifest

Shows the current package manifest. This may be out of date with the
current directory contents.

data pack upload

Packages, once 'packed' (Datafile + Manifest created), can be uploaded
to a remote storage service (by default, the datadex). This means
uploading all the package's files (blobs) not already present in the
storage service. This is determined using a checksum.

data pack download

Similarly, packages can be downloaded or reconstructed in any directory
from the Datafile and Manifest. Running 'data pack download' ensures
all files listed in the Manifest are downloaded to the directory.

data pack publish

Packages can be published to the dataset index. Running 'data pack
publish' posts the current manifest reference (hash) to the index.
The package should already be uploaded (to the storage service).
Publishing requires index credentials (see 'data user').

data pack check

Packages can be verified entirely by calling the 'data pack check'
command. It re-hashes every file and ensures the checksums match.
`,

	Subcommands: []*commander.Command{
		cmd_data_pack_make,
		cmd_data_pack_manifest,
		cmd_data_pack_upload,
		cmd_data_pack_download,
		cmd_data_pack_publish,
		cmd_data_pack_check,
	},
}

var cmd_data_pack_make = &commander.Command{
	UsageLine: "make",
	Short:     "Create or update package description.",
	Long: `data pack make - Create or update package description.

Makes the package's description files:
- Datafile, containing dataset description and metadata (prompts)
- Manifest, containing dataset file paths and checksums (generated)

See 'data pack'.
`,
	Run:  packMakeCmd,
	Flag: *flag.NewFlagSet("data-pack-make", flag.ExitOnError),
}

var cmd_data_pack_manifest = &commander.Command{
	UsageLine: "manifest",
	Short:     "Show current package manifest.",
	Long: `data pack manifest - Show current package manifest.

Shows the package's manifest file and exits.
If no manifest file exists, exit with an error.

See 'data pack'.
`,
	Run: packManifestCmd,
}

var cmd_data_pack_upload = &commander.Command{
	UsageLine: "upload",
	Short:     "Upload package contents to remote storage.",
	Long: `data pack upload - Upload package contents to remote storage.

Uploads the package's files (blobs) to a remote storage service (datadex).
Blobs are named by their hash (checksum), so data can deduplicate.
This means data can easily tell whether the service already has each
file, avoiding redundant uploads, saving bandwidth, and leveraging
the data uploaded along with other datasets.

See 'data pack'.
`,
	Run: packUploadCmd,
}

var cmd_data_pack_download = &commander.Command{
	UsageLine: "download",
	Short:     "Download package contents from remote storage.",
	Long: `data pack download - Download package contents from remote storage.

Downloads the package's files (blobs) from a remote storage service (datadex).
Blobs are named by their hash (checksum), so data can deduplicate and
ensure integrity. This means data can avoid redundant downloads, saving
bandwidth and time, and can verify the correctness of files with
their checksum, preventing corruption.

See 'data pack'.
`,
	Run: packDownloadCmd,
}

var cmd_data_pack_publish = &commander.Command{
	UsageLine: "publish",
	Short:     "Publish package reference to dataset index.",
	Long: `data pack publish - Publish package reference to dataset index.

Publishes the package's manifest reference (hash) to the dataset index.
The package manifest (and all blobs) should already be uploaded. If any
blob has not been uploaded, publish will exit with an error.

Note: publishing requires data index credentials; see 'data user'.

See 'data pack'.
`,
	Run:  packPublishCmd,
	Flag: *flag.NewFlagSet("data-pack-publish", flag.ExitOnError),
}

var cmd_data_pack_check = &commander.Command{
	UsageLine: "check",
	Short:     "Verify all file checksums match.",
	Long: `data pack check - Verify all file checksums match.

Verifies that all the package's file (blob) checksums match the hashes
stored in the Manifest. This is the way to check package-wide integrity.
If any checksums FAIL, it is suggested that the files be re-downloaded
(using 'data pack download' or 'data blob get <hash>').

See 'data pack'.
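
Example:

    > data pack check
    data pack: 4 checksums pass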
`,
	Run: packCheckCmd,
}

func init() {
	cmd_data_pack_make.Flag.Bool("clean", false, "make pack from scratch")
	cmd_data_pack_publish.Flag.Bool("force", false, "overwrite published version")
}

func packMakeCmd(c *commander.Command, args []string) error {
	p, err := NewPack()
	if err != nil {
		return err
	}

	return p.Make(c.Flag.Lookup("clean").Value.Get().(bool))
}

func packManifestCmd(c *commander.Command, args []string) error {
	p, err := NewPack()
	if err != nil {
		return err
	}

	buf, err := p.manifest.Marshal()
	if err != nil {
		return err
	}

	pOut("%s", buf)
	return nil
}

func packUploadCmd(c *commander.Command, args []string) error {
	p, err := NewPack()
	if err != nil {
		return err
	}
	return p.Upload()
}

func packDownloadCmd(c *commander.Command, args []string) error {
	p, err := NewPack()
	if err != nil {
		return err
	}
	return p.Download()
}

func packPublishCmd(c *commander.Command, args []string) error {
	p, err := NewPack()
	if err != nil {
		return err
	}

	force := c.Flag.Lookup("force").Value.Get().(bool)
	err = p.Publish(force)
	if err != nil {
		if strings.Contains(err.Error(), "forbidden") {
			u := configUser()
			d := p.datafile.Handle().Path()
			o := p.datafile.Handle().Author
			return fmt.Errorf(PublishingForbiddenMsg, u, d, o, err.Error())
		}
		return err
	}

	return nil
}

func packCheckCmd(c *commander.Command, args []string) error {
	p, err := NewPack()
	if err != nil {
		return err
	}

	if !p.manifest.Complete() {
		pErr("Warning: manifest incomplete. Checksums may be incorrect.\n")
	}

	failures := 0

	for _, file := range p.manifest.AllPaths() {
		pass, err := p.manifest.Check(file)
		if err != nil {
			return err
		}

		if !pass {
			failures++
		}
	}

	count := len(p.manifest.Files)
	if failures > 0 {
		return fmt.Errorf("data pack: %v/%v checksums failed!", failures, count)
	}

	pOut("data pack: %v checksums pass\n", count)
	return nil
}

type Pack struct {
	manifest *Manifest
	datafile *Datafile
	index    *DataIndex
}

func NewPack() (p *Pack, err error) {
	p = &Pack{}
	p.manifest = NewDefaultManifest()

	p.datafile, _ = NewDefaultDatafile()
	// ignore error loading datafile

	p.index, err = NewMainDataIndex()
	if err != nil {
		return nil, err
	}

	return p, nil
}

func (p *Pack) BlobPaths() (blobPaths, error) {
	mfh, err := p.manifest.ManifestHash()
	if err != nil {
		return blobPaths{}, err
	}

	blobs := validBlobHashes(p.manifest.Files)
	blobs[p.manifest.Path] = mfh
	return blobs, nil
}

func (p *Pack) Make(clean bool) error {
	if clean {
		err := p.manifest.Clear()
		if err != nil {
			return err
		}
	}

	// fill out datafile defaults.
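	// (When the Datafile has no handle yet, the code below defaults it to
	// "<user>/<dirname>@1.0", from the configured user and the directory name.)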
	if len(p.datafile.Dataset) == 0 {
		cwd, _ := os.Getwd()
		cwd = path.Base(cwd)
		name := identString(cwd)
		p.datafile.Dataset = configUser() + "/" + name + "@1.0"
	}

	// ensure the dataset has required information
	err := fillOutDatafile(p.datafile)
	if err != nil {
		return err
	}

	// fill out default website
	if len(p.datafile.Website) == 0 {
		h := p.datafile.Handle()
		p.datafile.Website = "http://datadex.io/" + h.Dataset()
		p.datafile.WriteFile() // ignore error. best effort.
	}

	// generate manifest
	err = p.manifest.Generate()
	if err != nil {
		return err
	}

	return nil
}

// Checks the blobstore to determine which blobs in the pack have not been uploaded.
func (p *Pack) blobsToUpload() ([]string, error) {
	missing := []string{}

	blobs, err := p.BlobPaths()
	if err != nil {
		return []string{}, err
	}

	for _, hash := range blobs {
		exists, err := p.index.hasBlob(hash)
		if err != nil {
			return []string{}, err
		}

		if !exists {
			dOut("blobstore missing %s\n", hash)
			missing = append(missing, hash)
		}
	}
	return missing, nil
}

// Uploads the pack's blobs to the blobstore.
func (p *Pack) Upload() error {
	if !p.manifest.Complete() {
		return fmt.Errorf(ManifestIncompleteMsg)
	}

	blobs, err := p.BlobPaths()
	if err != nil {
		return err
	}

	return putBlobs(blobs)
}

// Downloads the pack's blobs from the blobstore.
func (p *Pack) Download() error {
	if !p.manifest.Complete() {
		return fmt.Errorf(`Manifest incomplete. Get a new manifest copy.`)
	}

	blobs, err := p.BlobPaths()
	if err != nil {
		return err
	}

	return getBlobs(blobs)
}

// Publishes the pack's manifest reference to the index.
func (p *Pack) Publish(force bool) error {

	// ensure datafile has required info
	if !p.datafile.Valid() {
		return fmt.Errorf(`Datafile invalid. Try running 'data pack make'`)
	}

	// ensure manifest is complete
	if !p.manifest.Complete() {
		return fmt.Errorf(`Manifest incomplete. Before publishing, either:
- Generate new package manifest with 'data pack make' (uses all files).
- Finish manifest with 'data manifest' (add and hash specific files).
`)
	}

	// ensure all blobs have been uploaded
	missing, err := p.blobsToUpload()
	if err != nil {
		return err
	}
	if len(missing) > 0 {
		return fmt.Errorf("%d objects must be uploaded first."+
			" Run 'data pack upload'.", len(missing))
	}

	mfh, err := p.manifest.ManifestHash()
	if err != nil {
		return err
	}

	// Check dataset version isn't already taken.
	h := p.datafile.Handle()
	ri := p.index.RefIndex(h.Path())
	ref, err := ri.VersionRef(h.Version)
	if err != nil {
		switch {
		// http errors fail.
		case strings.Contains(err.Error(), "connection refused"):
			return fmt.Errorf(NetErrMsg, p.index.Http.Url)

		// ok if no ref for version.
		case strings.Contains(err.Error(), "No ref for version"):

		// ok if not found.
		case strings.Contains(err.Error(), "HTTP error status code: 404"):

		default:
			return err
		}
	}

	if ref != "" {
		pOut("Found published version %s (%.7s).\n", h.Version, ref)
		if ref == mfh {
			pOut(PublishedVersionSameMsg, h.Version, ref)
			return nil
		}

		if !force {
			return fmt.Errorf(PublishedVersionDiffersMsg, h.Version, ref, h.Dataset())
		}

		pOut("Using --force. Overwriting %s (%.7s -> %.7s).\n", h.Version, ref, mfh)
	}

	// ok, seems good to go.
	err = ri.Put(mfh)
	if err != nil {
		return err
	}

	pOut("data pack: published %s (%.7s).\n", h.Dataset(), mfh)
	pOut("Webpage at %s/%s\n", p.index.Http.BaseUrl, h.Dataset())
	return nil
}

const PublishedVersionDiffersMsg = `Version %s (%.7s) already published, but contents differ.
If you're trying to publish a new version, increment the version
number in Datafile, and then try again:

    Dataset: %s <--- change this number

If you're trying to _overwrite_ the published version with this one,
you may do so with the '--force' flag. However, this is not advised.
Make sure you are aware of all side-effects; you might break compatibility
for everyone else using this dataset. You have been warned.`

const PublishedVersionSameMsg = `Version %s (%.7s) already published.
It has the same contents you're trying to publish, so it seems
your work here is done :)
`

const PublishingForbiddenMsg = `You (%s) lack permissions to publish to %s.
Either fork your own copy of the dataset (see 'data fork'),
or ask the owner (%s) for collaboration privileges.
(%s)`

const NetErrMsg = `Connection to the index refused.
Are you connected to the internet?
Is the dataset index down? Check %s`

const ManifestIncompleteMsg = `Manifest incomplete. Before uploading, either:
- Generate new package manifest with 'data pack make' (uses all files).
- Finish manifest with 'data manifest' (add and hash specific files).`
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
# data - package manager for datasets

Imagine installing datasets like this:

    data get jbenet/norb

It's about time we used all we've learned making package managers to fix the
awful data management problem. Read the [designdoc](dev/designdoc.md) and
the [roadmap](dev/roadmap.md).

#### Table of Contents

- [Install](#install)
- [Usage](#usage)
- [Datafile](#datafile)
- [Development](#development)
- [About](#about)
- [Examples](#examples)

## Install

Two ways to install:
- [from pre-built binary distributions](http://datadex.io/doc/install) (the easy way)
- [from source](http://datadex.io/doc/source-install) (the hard way)

## Usage

Please see the [command reference](http://datadex.io/doc/ref).
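
The tool also documents itself: `data help <command>` prints detailed usage
for any command (sample listings appear in [Examples](#examples) below), e.g.:

```
> data help get
```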

### Downloading datasets

Downloading datasets is trivial:
```
> data get jbenet/mnist
Installed jbenet/mnist@1.0 at datasets/jbenet/mnist@1.0
```

### Add datasets to projects

Or, if you want to add datasets to a project, create a Datafile like this one:
```
> cat Datafile
dependencies:
- jbenet/mnist@1.0
- jbenet/cifar-10
- jbenet/cifar-100
```

Then, run `data get` to install the dependencies (it works like `npm install`):
```
> data get
Installed jbenet/mnist@1.0 at datasets/jbenet/mnist@1.0
Installed jbenet/cifar-10@1.0 at datasets/jbenet/cifar-10@1.0
Installed jbenet/cifar-100@1.0 at datasets/jbenet/cifar-100@1.0
```

You can even commit the Datafile to version control, so your collaborators or users can easily get the data:
```
> git clone github.com/jbenet/ml-vision-comparisons
> cd ml-vision-comparisons
> data get
Installed jbenet/mnist@1.0 at datasets/jbenet/mnist@1.0
Installed jbenet/cifar-10@1.0 at datasets/jbenet/cifar-10@1.0
Installed jbenet/cifar-100@1.0 at datasets/jbenet/cifar-100@1.0
```

### Publishing datasets

Publishing datasets is simple:

1. make a directory with all the files you want to publish.
2. `cd` into it, and run `data publish` within the directory.
3. `data` will guide you through creating a Datafile.
4. Then, `data` will upload and publish the package.

```
> data publish

Published jbenet/mnist@1.0 (b5f84c2).
```

Note that uploading can take a long while, as we'll upload all the files to S3, ensuring others can always get them.


## Datafile

data tracks the definition of dataset packages, and their dependencies, in a
`Datafile` (in the style of `Makefile`, `Vagrantfile`, `Procfile`, and
friends). Both published dataset packages and regular projects use it.
In a way, your project defines a dataset made up of other datasets, like
`package.json` in `npm`.

```
# Datafile format
# A YAML (inc json) doc with the following keys:

# required
handle: <author>/<name>[.<format>][@<version>]
title: Dataset Title

# optional functionality
dependencies: [<dataset handles>]
formats: {<format>: <url>}

# optional information
description: Text describing dataset.
repository: <url>
website: <url>
license: <license name>
contributors: ["Author Name [<email>] [(url)]", ...]
sources: [<urls>]
```
May be outdated. See [datafile.go](datafile.go).

### why yaml?

YAML is much more readable than json. One of `data`'s [design goals
](https://github.com/jbenet/data/blob/master/dev/designdoc.md#design-goals)
is an Intuitive UX. Since the target users are scientists in various domains,
any extra syntax, parse errors, and other annoyances would erode
I've always found this 122 | 123 | ``` 124 | dataset: feynman/spinning-plate-measurements 125 | title: Measurements of Plate Rotation 126 | contributors: 127 | - Richard Feynman 128 | website: http://caltech.edu/~feynman/not-girls/plate-stuff/trial3 129 | ``` 130 | 131 | much more friendly and approachable than this 132 | 133 | ``` 134 | { 135 | "dataset": "feynman/spinning-plate-measurements", 136 | "title": "Measurements of Plate Rotation", 137 | "contributors": [ 138 | "Richard Feynman " 139 | ], 140 | "website": "http://caltech.edu/~feynman/not-girls/plate-stuff/trial3" 141 | } 142 | ``` 143 | 144 | It's already hard enough to get anyone to do anything. Don't add more hoops to 145 | jump through than necessary. Each step will cause significant dropoff in 146 | conversion funnels. (Remember, [Apple pays Amazon for 1-click buy](https://www.apple.com/pr/library/2000/09/18Apple-Licenses-Amazon-com-1-Click-Patent-and-Trademark.html)...) 147 | 148 | And, since YAML is a superset of json, you can do whatever you want. 149 | 150 | 151 | ## Development 152 | 153 | Setup: 154 | 155 | 1. [install go](http://golang.org/doc/install) 156 | 2. Run 157 | 158 | ``` 159 | git clone https://github.com/jbenet/data 160 | cd data 161 | make deps 162 | make install 163 | ``` 164 | 165 | You'll want to run [datadex](https://github.com/jbenet/datadex) too. 166 | 167 | ## About 168 | 169 | This project started because data management is a massive problem in science. 170 | It should be **trivial** to (a) find, (b) download, (c) track, (d) manage, 171 | (e) re-format, (f) publish, (g) cite, and (h) collaborate on datasets. Data 172 | management is a problem in other domains (engineering, civics, etc), and `data` 173 | seeks to be general enough to be used with any kind of dataset, but the target 174 | use case is saving scientists' time. 175 | 176 | Many people agree we direly need the 177 | "[GitHub for Science](http://static.benet.ai/t/github-for-science.md)"; 178 | scientific collaboration problems are large and numerous. 179 | It is not entirely clear how, and in which order, to tackle these 180 | challenges, or even how to drive adoption of solutions across fields. I think 181 | simple and powerful tools can solve large problems neatly. Perhaps the best 182 | way to tackle scientific collaboration is by decoupling interconnected 183 | problems, and building simple tools to solve them. Over time, reliable 184 | infrastructure can be built with these. git, github, and arxiv are great 185 | examples to follow. 186 | 187 | `data` is an attempt to solve the fairly self-contained issue of downloading, 188 | publishing, and managing datasets. Let's take what computer scientists have 189 | learned about version control and distributed collaboration on source code, 190 | and apply it to the data management problem. Let's build new data tools and 191 | infrastructure with the software engineering and systems design principles 192 | that made git, apt, npm, and github successful. 193 | 194 | ### Acknowledgements 195 | 196 | `data` is released under the MIT License. 197 | 198 | Authored by [@jbenet](https://github.com/jbenet). Feel free to contact me 199 | at , but please post 200 | [issues](https://github.com/jbenet/data/issues) on github first. 

Special thanks to
[@colah](https://github.com/colah) (original idea and
[data.py](https://github.com/colah/data)),
[@damodei](https://github.com/damodei), and
[@davidad](https://github.com/davidad),
who provided valuable thoughts + discussion on this problem.

## Examples

```
data - dataset package manager

Basic commands:

    get         Download and install dataset.
    list        List installed datasets.
    info        Show dataset information.
    publish     Guided dataset publishing.

Tool commands:

    version     Show data version information.
    config      Manage data configuration.
    user        Manage users and credentials.
    commands    List all available commands.

Advanced Commands:

    blob        Manage blobs in the blobstore.
    manifest    Generate and manipulate dataset manifest.
    pack        Dataset packaging, upload, and download.

Use "data help <command>" for more information about a command.
```

### data get

```
# author/dataset
> data get jbenet/foo
Downloading jbenet/foo from datadex.
get blob b53ce99 Manifest
get blob 2183ea8 Datafile
get blob 63443e4 data.csv
copy blob 63443e4 data.txt
copy blob 63443e4 data.xsl
get blob b53ce99 Manifest

Installed jbenet/foo@1.0 at datasets/jbenet/foo
```

### data list

```
> data list
jbenet/foo@1.0
```

### data info

```
> data info jbenet/foo
dataset: jbenet/foo@1.0
title: Foo Dataset
description: The first dataset to use data.
license: MIT

# shows the Datafile
> cat datasets/foo/bar/Datafile
dataset: foo/bar@1.1
```

### data publish

```
> data publish
==> Guided Data Package Publishing.

==> Step 1/3: Creating the package.
Verifying Datafile fields...
Generating manifest...
data manifest: added Datafile
data manifest: added data.csv
data manifest: added data.txt
data manifest: added data.xsl
data manifest: hashed 2183ea8 Datafile
data manifest: hashed 63443e4 data.csv
data manifest: hashed 63443e4 data.txt
data manifest: hashed 63443e4 data.xsl

==> Step 2/3: Uploading the package contents.
put blob 2183ea8 Datafile - uploading
put blob 63443e4 data.csv - exists
put blob b53ce99 Manifest - uploading

==> Step 3/3: Publishing the package to the index.
data pack: published jbenet/foo@1.0 (b53ce99).
```

Et voila! You can now use `data get jbenet/foo` to retrieve it!

### data config

```
> data config index.datadex.url http://localhost:8080
> data config index.datadex.url
http://localhost:8080
```

### data user

```
> data user
data user - Manage users and credentials.

Commands:

    add     Register new user with index.
    auth    Authenticate user account.
    pass    Change user password.
    info    Show (or edit) public user information.
    url     Output user profile url.

Use "user help <command>" for more information about a command.

> data user add
Username: juan
Password (6 char min):
Email (for security): juan@benet.ai
juan registered.

> data user auth
Username: juan
Password:
Authenticated as juan.

> data user info
name: ""
email: juan@benet.ai

> data user info jbenet
name: Juan
email: juan@benet.ai
github: jbenet
twitter: '@jbenet'
website: benet.ai

> data user info --edit
Editing user profile. [Current value].
Full Name: [] Juan Batiz-Benet
Website Url: []
Github username: []
Twitter username: []
Profile saved.

> data user info
name: Juan Batiz-Benet
email: juan@benet.ai

> data user pass
Username: juan
Current Password:
New Password (6 char min):
Password changed. You will receive an email notification.

> data user url
http://datadex.io:8080/juan
```

### data manifest (plumbing)

```
> data manifest add filename
data manifest: added filename

> data manifest hash filename
data manifest: hashed 61a66fd filename

> cat .data-manifest
filename: 61a66fda64e397a82d9f0c8b7b3f7ba6bca79b12

> data manifest rm filename
data manifest: removed filename
```

### data blob (plumbing)
```
data blob - Manage blobs in the blobstore.

Commands:

    put     Upload blobs to a remote blobstore.
    get     Download blobs from a remote blobstore.
    url     Output Url for blob named by <hash>.

Use "blob help <command>" for more information about a command.
```

```
> cat Manifest
Datafile: 0d0c669b4c2b05402d9cc87298f3d7ce372a4c80
data.csv: 63443e4d74c3a170499fa9cfde5ae2224060b09e
data.txt: 63443e4d74c3a170499fa9cfde5ae2224060b09e
data.xsl: 63443e4d74c3a170499fa9cfde5ae2224060b09e

> data blob put --all
put blob 0d0c669 Datafile
put blob 63443e4 data.csv

> data blob get 63443e4d74c3a170499fa9cfde5ae2224060b09e
get blob 63443e4 data.csv
copy blob 63443e4 data.txt
copy blob 63443e4 data.xsl

> data blob url --all
http://datadex.archives.s3.amazonaws.com/blob/0d0c669b4c2b05402d9cc87298f3d7ce372a4c80
http://datadex.archives.s3.amazonaws.com/blob/63443e4d74c3a170499fa9cfde5ae2224060b09e
```

### data pack (plumbing)

This is probably the most informative command to look at.

```
data pack - Dataset packaging, upload, and download.

Commands:

    make        Create or update package description.
    manifest    Show current package manifest.
    upload      Upload package contents to remote storage.
    download    Download package contents from remote storage.
    publish     Publish package reference to dataset index.
    check       Verify all file checksums match.

Use "pack help <command>" for more information about a command.
```


```
> ls
data.csv data.txt data.xsl

> cat data.*
BAR BAR BAR
BAR BAR BAR
BAR BAR BAR

> data pack make # interactive
Verifying Datafile fields...
Enter author name (required): foo
Enter dataset id (required): bar
Enter dataset version (required): 1.1
Enter dataset title (optional): Barrr
Enter description (optional): A bar dataset.
Enter license name (optional): MIT
Generating manifest...
data manifest: hashed 0d0c669 Datafile
data manifest: hashed 63443e4 data.csv
data manifest: hashed 63443e4 data.txt
data manifest: hashed 63443e4 data.xsl

> ls
Datafile Manifest data.csv data.txt data.xsl

> data pack manifest
Datafile: 0d0c669b4c2b05402d9cc87298f3d7ce372a4c80
data.csv: 63443e4d74c3a170499fa9cfde5ae2224060b09e
data.txt: 63443e4d74c3a170499fa9cfde5ae2224060b09e
data.xsl: 63443e4d74c3a170499fa9cfde5ae2224060b09e

> data pack upload
put blob 0d0c669 Datafile
put blob 63443e4 data.csv
put blob 8a2e6f6 Manifest

> rm data.*

> ls
Datafile Manifest

> data pack download
get blob 63443e4 data.csv
copy blob 63443e4 data.txt
copy blob 63443e4 data.xsl

> ls
Datafile Manifest data.csv data.txt data.xsl

> data pack check
data pack: 4 checksums pass

> echo "FOO FOO FOO" > data.csv

> data pack check
data manifest: check 63443e4 data.csv FAIL
data pack: 1/4 checksums failed!

> data pack download
copy blob 63443e4 data.csv

> data pack check
data pack: 4 checksums pass

> data pack publish
data pack: published foo/bar@1.1 (8a2e6f6).
```
-------------------------------------------------------------------------------- /data_blob.go: --------------------------------------------------------------------------------
package data

import (
	"bufio"
	"fmt"
	"github.com/gonuts/flag"
	"github.com/jbenet/commander"
	"io"
	"os"
	"path"
)

var cmd_data_blob = &commander.Command{
	UsageLine: "blob <command>",
	Short:     "Manage blobs in the blobstore.",
	Long: `data blob - Manage blobs in the blobstore.

Commands:

    put <hash> [<file>]     Upload blob named by <hash> to blobstore.
    get <hash> [<file>]     Download blob named by <hash> from blobstore.
    check <hash> [<file>]   Verify blob <file> matches <hash>.
    url <hash>              Output Url for blob named by <hash>.
    show <hash>             Output blob contents for hash.
    hash <file>             Output hash for blob contents.

Arguments:

    The <hash> argument is the blob's checksum, and its id.
    The <file> argument is the blob's target file.
    If <file> is omitted, stdin/stdout are used.


What is a blob?

Datasets are made up of files, which are made up of blobs.
(For now, 1 file is 1 blob. Chunking to be implemented)
Blobs are basically blocks of data, which are checksummed
(for integrity, de-duplication, and addressing) using a crypto-
graphic hash function (sha1, for now). If git comes to mind,
that's exactly right.

Local Blobstores

data stores blobs in blobstores. Every local dataset has a
blobstore (local caching with links TBI). Like in git, the blobs
are stored safely in the blobstore (different directory) and can
be used to reconstruct any corrupted/deleted/modified dataset files.

Remote Blobstores

data uses remote blobstores to distribute datasets across users.
The datadex service includes a blobstore (currently an S3 bucket).
By default, the global datadex blobstore is where things are
uploaded to and retrieved from.

Since blobs are uniquely identified by their hash, maintaining one
global blobstore helps reduce data redundancy. However, users can
run their own datadex service. (The index and blobstore are tied
together to ensure consistency.
Please do not publish datasets to
an index if blobs aren't in that index)

data can use any remote blobstore you wish. (For now, you have to
recompile, but in the future, you will be able to.) Just change the
datadex configuration variable, or pass in "-s <url>" per command.

(data-blob is part of the plumbing, lower-level tools.
Use it directly if you know what you're doing.)
`,

	Flag: *flag.NewFlagSet("data-blob", flag.ExitOnError),

	Subcommands: []*commander.Command{
		cmd_data_blob_put,
		cmd_data_blob_get,
		cmd_data_blob_url,
		cmd_data_blob_show,
		cmd_data_blob_hash,
		cmd_data_blob_check,
	},
}

var cmd_data_blob_put = &commander.Command{
	UsageLine: "put <hash> [<file>]",
	Short:     "Upload blobs to a remote blobstore.",
	Long: `data blob put - Upload blobs to a remote blobstore.

Upload the blob contents named by <hash> to a remote blobstore.
Blob contents are stored locally, to be used to reconstruct files.
In the future, the blobstore will be able to be changed. For now,
the default blobstore/datadex is used.

See data blob.

Arguments:

    <hash>   name (cryptographic hash, checksum) of the blob.
    <file>   path of the blob contents to upload.

`,
	Run:  blobPutCmd,
	Flag: *flag.NewFlagSet("data-blob-put", flag.ExitOnError),
}

var cmd_data_blob_get = &commander.Command{
	UsageLine: "get <hash> [<file>]",
	Short:     "Download blobs from a remote blobstore.",
	Long: `data blob get - Download blobs from a remote blobstore.

Download the blob contents named by <hash> from a remote blobstore.
Blob contents are stored locally, to be used to reconstruct files.
In the future, the blobstore will be able to be changed. For now,
the default blobstore/datadex is used.

See data blob.

Arguments:

    <hash>   name (cryptographic hash, checksum) of the blob.
    <file>   path to put the blob contents in.

`,
	Run:  blobGetCmd,
	Flag: *flag.NewFlagSet("data-blob-get", flag.ExitOnError),
}

var cmd_data_blob_url = &commander.Command{
	UsageLine: "url <hash>",
	Short:     "Output Url for blob named by <hash>.",
	Long: `data blob url - Output Url for blob named by <hash>.

Output the remote storage url for the blob contents named by <hash>.
In the future, the blobstore will be able to be changed. For now,
the default blobstore/datadex is used.

See data blob.

Arguments:

    <hash>   name (cryptographic hash, checksum) of the blob.

`,
	Run:  blobUrlCmd,
	Flag: *flag.NewFlagSet("data-blob-url", flag.ExitOnError),
}

var cmd_data_blob_show = &commander.Command{
	UsageLine: "show <hash>",
	Short:     "Output blob contents for hash.",
	Long: `data blob show - Output blob contents for hash.

Output the blob contents stored in the blobstore for <hash>.
If the blob is available locally, that copy is used (after
hashing to verify correctness). Otherwise, it is downloaded
from the blobstore.

See data blob.

Arguments:

    <hash>   name (cryptographic hash, checksum) of the blob.

`,
	Run: blobShowCmd,
}

var cmd_data_blob_hash = &commander.Command{
	UsageLine: "hash <file>",
	Short:     "Output hash for blob contents.",
	Long: `data blob hash - Output hash for blob contents.

Output the hash of the blob contents stored in <file>.

See data blob.

Arguments:

    <file>   path of the blob contents.

`,
	Run: blobHashCmd,
}

var cmd_data_blob_check = &commander.Command{
	UsageLine: "check <hash> <file>",
	Short:     "Verify blob <file> matches <hash>.",
	Long: `data blob check - Verify blob <file> matches <hash>.

Verify the hash of the blob contents stored in <file> matches
<hash>.

See data blob.

Arguments:

    <hash>   name (cryptographic hash, checksum) of the blob.
    <file>   path of the blob contents.

`,
	Run: blobCheckCmd,
}

func init() {
	cmd_data_blob.Flag.Bool("all", false, "all available blobs")
	cmd_data_blob_get.Flag.Bool("all", false, "get all available blobs")
	cmd_data_blob_put.Flag.Bool("all", false, "put all available blobs")
	cmd_data_blob_url.Flag.Bool("all", false, "urls for all available blobs")
	cmd_data_blob_check.Flag.Bool("all", false, "check all available blobs")
}

type blobStore interface {
	Has(key string) (bool, error)
	Put(key string, value io.Reader) error
	Get(key string) (io.ReadCloser, error)
	Url(key string) string
}

// map { path : hash } (backward because of dup hashes)
type blobPaths map[string]string

// Handles arguments and dispatches subcommand.
func blobCmd(c *commander.Command, args []string) (blobPaths, error) {

	blobs := blobPaths{}

	// Use all blobs in the manifest if --all is passed in.
	all := c.Flag.Lookup("all").Value.Get().(bool)
	if all {
		mf := NewDefaultManifest()
		blobs = validBlobHashes(mf.Files)
		if len(blobs) < 1 {
			return nil, fmt.Errorf("%v: no blobs tracked in manifest.", c.FullName())
		}
	} else {
		switch len(args) {
		case 2:
			blobs[args[1]] = args[0]
		case 1:
			blobs[""] = args[0]
		case 0:
			return nil,
				fmt.Errorf("%v: requires <hash> argument (or --all)", c.FullName())
		}
	}

	return blobs, nil
}

func blobGetCmd(c *commander.Command, args []string) error {
	blobs, err := blobCmd(c, args)
	if err != nil {
		return err
	}
	return getBlobs(blobs)
}

func blobPutCmd(c *commander.Command, args []string) error {
	blobs, err := blobCmd(c, args)
	if err != nil {
		return err
	}
	return putBlobs(blobs)
}

func blobUrlCmd(c *commander.Command, args []string) error {
	blobs, err := blobCmd(c, args)
	if err != nil {
		return err
	}
	return urlBlobs(blobs)
}

func blobShowCmd(c *commander.Command, args []string) error {
	if len(args) < 1 {
		return fmt.Errorf("%v: requires <hash> argument", c.FullName())
	}

	hash := args[0]
	if !IsHash(hash) {
		return fmt.Errorf("%v: invalid hash '%s'", c.FullName(), hash)
	}

	dataIndex, err := NewMainDataIndex()
	if err != nil {
		return err
	}

	return dataIndex.copyBlob(hash, os.Stdout)
}

func blobHashCmd(c *commander.Command, args []string) error {
	if len(args) < 1 {
		return fmt.Errorf("%v: requires <file> argument", c.FullName())
	}

	hash, err := hashFile(args[0])
	if err != nil {
		return err
	}
	pOut("%s\n", hash)
	return nil
}

func blobCheckCmd(c *commander.Command, args []string) error {
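	// Resolve the <hash> <file> arguments (or --all) into a path:hash map,
	// then verify each blob's checksum.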
	blobs, err := blobCmd(c, args)
	if err != nil {
		return err
	}
	if len(args) == 1 {
		return fmt.Errorf("%v: requires <file> argument", c.FullName())
	}
	return checkBlobs(blobs)
}

// Uploads all blobs to blobstore
func putBlobs(blobs blobPaths) error {
	blobs = validBlobHashes(blobs)

	dataIndex, err := NewMainDataIndex()
	if err != nil {
		return err
	}

	// flip map, to skip dupes
	flipped := map[string]string{}
	for path, hash := range blobs {
		flipped[hash] = path
	}

	for hash, path := range flipped {
		err = dataIndex.putBlob(hash, path)
		if err != nil {
			return err
		}
	}

	return nil
}

// Downloads all blobs from blobstore
func getBlobs(blobs blobPaths) error {
	blobs = validBlobHashes(blobs)

	dataIndex, err := NewMainDataIndex()
	if err != nil {
		return err
	}

	// group map, to copy dupes
	grouped := map[string][]string{}
	for path, hash := range blobs {
		g := grouped[hash]
		grouped[hash] = append(g, path)
	}

	for hash, paths := range grouped {

		// download one blob
		err = dataIndex.getBlob(hash, paths[0])
		if err != nil {
			return err
		}

		// copy what we got to others
		for _, path := range paths[1:] {
			pErr("copy blob %.7s %s\n", hash, path)
			err := copyFile(paths[0], path)
			if err != nil {
				return err
			}
		}
	}

	return nil
}

// Shows all urls for blobs
func urlBlobs(blobs blobPaths) error {
	blobs = validBlobHashes(blobs)

	dataIndex, err := NewMainDataIndex()
	if err != nil {
		return err
	}

	for _, hash := range blobs {
		pErr("%v\n", dataIndex.urlBlob(hash))
	}

	return nil
}

// Checks all blob hashes.
func checkBlobs(blobs blobPaths) error {

	failures := 0
	for fpath, hash := range blobs {
		pass, err := checkBlob(hash, fpath)
		if err != nil {
			return err
		}

		if !pass {
			failures++
		}
	}

	count := len(blobs)
	if failures > 0 {
		return fmt.Errorf("data blob: %v/%v checksums failed!", failures, count)
	}

	pOut("data blob: %v checksums pass.\n", count)
	return nil
}

func checkBlob(oldHash string, fpath string) (bool, error) {
	mfmt := "check %.7s %s %s"

	newHash, err := hashFile(fpath)
	if err != nil {
		switch err.(type) {
		case *os.PathError:
			// non-existent files count as not hashing correctly.
			pErr(mfmt, oldHash, fpath, "FAIL - not found\n")
			return false, nil
		default:
			return false, err
		}
	}

	if newHash != oldHash {
		pErr(mfmt, oldHash, fpath, "FAIL\n")
		return false, nil
	}

	dOut(mfmt, oldHash, fpath, "PASS\n")
	return true, nil
}

// DataIndex extension to handle putting blob
func (i *DataIndex) putBlob(hash string, fpath string) error {

	// disallow empty paths
	// (stdin doesn't make sense when hashing must have already occurred)
	if len(fpath) == 0 {
		return fmt.Errorf("put blob %.7s - error: no path supplied", hash)
	}

	fpath = path.Clean(fpath)

	// first, check the blobstore doesn't already have it.
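	// (hasBlob delegates to BlobStore.Has, keyed by BlobKey(hash); see below.)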
	exists, err := i.hasBlob(hash)
	if err != nil {
		return err
	}

	if exists {
		pErr("put blob %.7s %s - exists\n", hash, fpath)
		return nil
	}

	// must verify hash before uploading (for integrity).
	// (note that there is a TOCTTOU bug here, so not safe. just helps.)
	vh, err := hashFile(fpath)
	if err != nil {
		return err
	}

	if vh != hash {
		m := "put blob: %s hash error (expected %s, got %s)"
		return fmt.Errorf(m, fpath, hash, vh)
	}

	pErr("put blob %.7s %s - uploading\n", hash, fpath)

	f, err := os.Open(fpath)
	if err != nil {
		return err
	}
	defer f.Close()

	bf := bufio.NewReader(f)
	err = i.BlobStore.Put(BlobKey(hash), bf)
	if err != nil {
		return err
	}

	err = f.Close()
	if err != nil {
		return err
	}

	return nil
}

// DataIndex extension to handle getting blob
func (i *DataIndex) getBlob(hash string, fpath string) error {

	// disallow empty paths
	if len(fpath) == 0 {
		return fmt.Errorf("get blob %.7s - error: no path supplied", hash)
	}

	fpath = path.Clean(fpath)

	pErr("get blob %.7s %s\n", hash, fpath)
	w, err := createFile(fpath)
	if err != nil {
		return err
	}
	defer w.Close()

	return i.copyBlob(hash, w)
}

func (i *DataIndex) copyBlob(hash string, w io.WriteCloser) error {
	r, err := i.findBlob(hash)
	if err != nil {
		return err
	}

	br := bufio.NewReader(r)
	_, err = io.Copy(w, br)
	if err != nil {
		return err
	}

	err = w.Close()
	if err != nil {
		return err
	}

	err = r.Close()
	if err != nil {
		return err
	}

	return nil
}

func (i *DataIndex) findBlob(hash string) (io.ReadCloser, error) {

	mf := NewDefaultManifest()
	paths := mf.PathsForHash(hash)
	for _, p := range paths {
		dOut("found local blob copy. verifying hash. %s\n", p)
		h, err := hashFile(p)
		if err != nil {
			continue
		}

		if hash == h {
			f, err := os.Open(p)
			if err != nil {
				continue
			}

			return f, nil
		}
	}

	dOut("no local blob copy. fetch from remote blobstore.\n")
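	// (Fall back to the remote blobstore; Get returns an io.ReadCloser
	// streaming the blob contents, per the blobStore interface above.)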
	return i.BlobStore.Get(BlobKey(hash))
}

// DataIndex extension to check if blob exists
func (i *DataIndex) hasBlob(hash string) (bool, error) {
	return i.BlobStore.Has(BlobKey(hash))
}

// DataIndex extension to handle getting blob url
func (i *DataIndex) urlBlob(hash string) string {
	return i.BlobStore.Url(BlobKey(hash))
}

// Returns all paths associated with blob
func allBlobPaths(hash string) ([]string, error) {
	mf := NewDefaultManifest()

	paths := mf.PathsForHash(hash)

	mfh, err := mf.ManifestHash()
	if err != nil {
		return []string{}, err
	}

	if mfh == hash {
		paths = append(paths, mf.Path)
	}

	return paths, nil
}

// Returns the blobstore key for blob
func BlobKey(hash string) string {
	return fmt.Sprintf("/blob/%s", hash)
}

// Prune out invalid blob paths (bad hashes, bad paths)
func validBlobHashes(blobs blobPaths) blobPaths {
	pruned := blobPaths{}
	for fpath, hash := range blobs {
		if IsHash(hash) {
			pruned[fpath] = hash
		}
	}
	return pruned
}
--------------------------------------------------------------------------------