├── .gitignore ├── platforms ├── .gitignore ├── darwin_386 │ └── .gitignore ├── darwin_amd64 │ └── .gitignore ├── installers │ └── osx │ │ ├── .gitignore │ │ ├── resources │ │ ├── welcome.html │ │ ├── conclusion.html │ │ └── license.html │ │ ├── Makefile │ │ └── distribution.xml ├── linux_386 │ ├── .gitignore │ └── Vagrantfile ├── linux_amd64 │ ├── .gitignore │ └── Vagrantfile ├── README.md ├── Makefile ├── tar.README.md └── archive.py ├── data └── data.go ├── data_version.go ├── data_info.go ├── Makefile ├── regexp.go ├── data_list.go ├── dev ├── changelog.md ├── cli.md ├── formats.md ├── roadmap.md └── designdoc.md ├── serialize.go ├── data_handle.go ├── datafile.go ├── s3store.go ├── field_user_input.go ├── data_index.go ├── commands.go ├── data_publish.go ├── data_ref.go ├── data_get.go ├── data_config.go ├── util.go ├── data_user.go ├── data_manifest.go ├── data_pack.go ├── README.md └── data_blob.go /.gitignore: -------------------------------------------------------------------------------- 1 | datasets/ 2 | data/data 3 | -------------------------------------------------------------------------------- /platforms/.gitignore: -------------------------------------------------------------------------------- 1 | archives/ 2 | -------------------------------------------------------------------------------- /platforms/darwin_386/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | data-v*-*/ 3 | -------------------------------------------------------------------------------- /platforms/darwin_amd64/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | data-v*-*/ 3 | -------------------------------------------------------------------------------- /platforms/installers/osx/.gitignore: -------------------------------------------------------------------------------- 1 | *.pkg 2 | root/ 3 | -------------------------------------------------------------------------------- /platforms/linux_386/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | data-v*-*/ 3 | .vagrant/ 4 | -------------------------------------------------------------------------------- /platforms/linux_amd64/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | data-v*-*/ 3 | .vagrant/ 4 | -------------------------------------------------------------------------------- /data/data.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "github.com/jbenet/data" 6 | "os" 7 | ) 8 | 9 | // This package (data/data) builds the `data` commandline tool. 10 | // Everything is in the proper data library package. This extra 11 | // package is necessary because packages must yield _either_ a 12 | // library or executable. `data` needed to be both, hence this. 13 | 14 | func main() { 15 | err := data.Cmd_data.Dispatch(os.Args[1:]) 16 | if err != nil { 17 | if len(err.Error()) > 0 { 18 | fmt.Fprintf(os.Stderr, "%v\n", err) 19 | } 20 | os.Exit(1) 21 | } 22 | return 23 | } 24 | -------------------------------------------------------------------------------- /platforms/installers/osx/resources/welcome.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 |

data

11 |

dataset package manager

12 | Publish and download datasets as easily as: 13 |
14 |
15 |
16 | > data publish foo/bar
17 | > data get foo/bar
18 | 
19 |
20 | data is a simple commandline tool with a supporting package index on the web. Use data get to find, download, and manage your datasets. When you're ready to publish data to the world, just run data publish. 21 | 22 |
23 |
24 | Press the "Continue" button below. 25 | 26 | 27 | -------------------------------------------------------------------------------- /platforms/README.md: -------------------------------------------------------------------------------- 1 | # building data 2 | 3 | At present (Go 1.2), cross-compiling go does not work with cgo. It seems 4 | (not actually sure, as not very familiar with all the deps) data uses cgo 5 | extensively. While there seems to be a gcc work-around, it would be useful 6 | to test the tool in all the platforms. Thus, for now, all supported archs 7 | will have a vm in this directory. The process, then, is: 8 | 9 | 1. setup + launch the vm 10 | 1. compile + test data in vm 11 | 1. place release binary in `/platforms//data` 12 | 1. `make -tar` + `make dist` to package bins up 13 | 14 | ## TODO 15 | 16 | 17 | Add VMs: 18 | 19 | - Windows: http://www.modern.ie/en-us/virtualization-tools#downloads 20 | - Darwin 21 | - BSD 22 | -------------------------------------------------------------------------------- /data_version.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "github.com/gonuts/flag" 5 | "github.com/jbenet/commander" 6 | ) 7 | 8 | const Version = "0.1.1" 9 | 10 | var cmd_data_version = &commander.Command{ 11 | UsageLine: "version", 12 | Short: "Show data version information.", 13 | Long: `data version - Show data version information. 14 | 15 | Returns the current version of data and exits. 16 | `, 17 | Run: versionCmd, 18 | Flag: *flag.NewFlagSet("data-user-auth", flag.ExitOnError), 19 | } 20 | 21 | func init() { 22 | cmd_data_version.Flag.Bool("number", false, "show only the number") 23 | } 24 | 25 | func versionCmd(c *commander.Command, _ []string) error { 26 | number := c.Flag.Lookup("number").Value.Get().(bool) 27 | if !number { 28 | pOut("data version ") 29 | } 30 | pOut("%s\n", Version) 31 | return nil 32 | } 33 | -------------------------------------------------------------------------------- /data_info.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "fmt" 5 | "github.com/jbenet/commander" 6 | ) 7 | 8 | var cmd_data_info = &commander.Command{ 9 | UsageLine: "info []", 10 | Short: "Show dataset information.", 11 | Long: `data info - Show dataset information. 12 | 13 | Returns the Datafile corresponding to (or in current 14 | directory) and exits. 15 | `, 16 | Run: infoCmd, 17 | } 18 | 19 | func infoCmd(c *commander.Command, args []string) error { 20 | if len(args) < 1 { 21 | return datasetInfo(DatafileName) 22 | } 23 | 24 | return datasetInfo(DatafilePath(args[0])) 25 | } 26 | 27 | func datasetInfo(path string) error { 28 | df, err := NewDatafile(path) 29 | if err != nil { 30 | dErr("Error: %s\n", err) 31 | return fmt.Errorf("Invalid dataset path: %s", path) 32 | } 33 | 34 | buf, err := df.Marshal() 35 | if err != nil { 36 | return err 37 | } 38 | 39 | pOut("%s\n", buf) 40 | return nil 41 | } 42 | -------------------------------------------------------------------------------- /platforms/installers/osx/resources/conclusion.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 |

Installation complete!

11 | data is now installed on your system. To see a list of available commands, open a terminal and enter: 12 |
13 |
14 |
15 | > data
16 | 
17 |
18 | You can now install any datadex dataset from the commandline. For example, to get the MNIST dataset, run: 19 |
20 |
21 |
22 | > data get jbenet/mnist
23 | 
24 |
25 | Find datasets and more information at http://datadex.io, or in these documents: 26 |
27 |
28 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /platforms/installers/osx/Makefile: -------------------------------------------------------------------------------- 1 | ifneq ($(shell uname),Darwin) 2 | $(error "Making osx installers is only supported in osx.") 3 | endif 4 | 5 | all: pkg 6 | 7 | # for now, amd64. lipo both together later. 8 | BIN=../../darwin_amd64/data 9 | VERSION=$(shell $(BIN) version --number) 10 | PKGNAME=data-v$(VERSION)-OSX-Installer 11 | 12 | # setup the package contents 13 | root: root/usr/bin/data 14 | 15 | root/usr/bin/data: $(BIN) 16 | -mkdir -p $(dir $@) 17 | cp $< $@ 18 | 19 | # build the packages 20 | data.pkg: root root/usr/bin/data 21 | pkgbuild \ 22 | --root root \ 23 | --identifier io.datadex.data \ 24 | --version $(VERSION) \ 25 | --ownership recommended \ 26 | data.pkg 27 | 28 | $(PKGNAME).pkg: data.pkg distribution.xml 29 | productbuild \ 30 | --distribution distribution.xml \ 31 | --resources resources \ 32 | --package-path data.pkg \ 33 | --version $(VERSION) \ 34 | $(PKGNAME).pkg 35 | 36 | pkg: $(PKGNAME).pkg 37 | 38 | clean: 39 | rm -rf -- root/ 40 | rm -f -- *.pkg 41 | -------------------------------------------------------------------------------- /platforms/Makefile: -------------------------------------------------------------------------------- 1 | 2 | VERSION=$(shell data version --number) 3 | 4 | PLATFORMS= \ 5 | darwin_amd64 \ 6 | linux_386 \ 7 | linux_amd64 \ 8 | # darwin_386 \ 9 | # windows_386 \ 10 | # windows_amd64 \ 11 | 12 | BINS=$(addsuffix /data,$(PLATFORMS)) 13 | 14 | ARCHIVES=$(patsubst %,archives/data-v$(VERSION)-%.tar.gz,$(PLATFORMS)) 15 | 16 | OTHER= \ 17 | installers/osx/data-v$(VERSION)-OSX-Installer.pkg 18 | 19 | all: $(ARCHIVES) $(OTHER) 20 | 21 | %s: %s/data 22 | 23 | linux_%/data: 24 | -rm $@ 25 | cd $(dir $@) && \ 26 | vagrant up && \ 27 | vagrant ssh -c "source ~/.bashrc; cd data; make deps; make;" && \ 28 | vagrant suspend 29 | 30 | darwin_%/data: 31 | cd ../ && $(MAKE) 32 | 33 | windows_%/data: 34 | $(error not implemented) 35 | 36 | archives/data-v$(VERSION)-%.tar.gz: %/data 37 | ./archive.py $( help 41 | 42 | To see a reference of all data commands run: 43 | 44 | data commands help | less 45 | -------------------------------------------------------------------------------- /regexp.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "regexp" 5 | ) 6 | 7 | var UserRegexp *regexp.Regexp 8 | var IdentRegexp *regexp.Regexp 9 | var PathRegexp *regexp.Regexp 10 | var EmailRegexp *regexp.Regexp 11 | var HandleRegexp *regexp.Regexp 12 | var NonIdentRegexp *regexp.Regexp 13 | 14 | func init() { 15 | identRE := "[A-Za-z0-9-_.]+" 16 | pathRE := "((" + identRE + ")/(" + identRE + "))" 17 | handleRE := pathRE + "(\\." + identRE + ")?(@" + identRE + ")?" 
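// handleRE matches "author/dataset" handles with optional ".format" and "@version" suffixes.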
18 | emailRE := `(?i)[A-Z0-9._%+-]+@(?:[A-Z0-9-]+\.)+[A-Z]{2,6}` 19 | nonIdentRE := "[^A-Za-z0-9-_.]+" 20 | 21 | UserRegexp = compileRegexp("^" + identRE + "$") 22 | IdentRegexp = compileRegexp("^" + identRE + "$") 23 | PathRegexp = compileRegexp("^" + pathRE + "$") 24 | EmailRegexp = compileRegexp("^" + emailRE + "$") 25 | HandleRegexp = compileRegexp("^" + handleRE + "$") 26 | NonIdentRegexp = compileRegexp(nonIdentRE) 27 | } 28 | 29 | func compileRegexp(s string) *regexp.Regexp { 30 | r, err := regexp.Compile(s) 31 | if err != nil { 32 | pOut("%s", err) 33 | pOut("%v", r) 34 | panic("Regex does not compile: " + s) 35 | } 36 | return r 37 | } 38 | -------------------------------------------------------------------------------- /platforms/linux_386/Vagrantfile: -------------------------------------------------------------------------------- 1 | # github.com/jbenet/platform-vms/i386/linux_ubuntu/go 2 | 3 | Vagrant.configure("2") do |config| 4 | config.vm.box = 'precise32' 5 | config.vm.box_url = 'http://files.vagrantup.com/precise32.box' 6 | 7 | # synced files 8 | config.vm.synced_folder "../../", "/home/vagrant/go/src/github.com/jbenet/data" 9 | 10 | # increase VM memory to 512 MB 11 | config.vm.provider "virtualbox" do |v| 12 | v.customize ["modifyvm", :id, "--memory", "512"] 13 | end 14 | 15 | # run provisioning scripts 16 | config.vm.provision :shell, :inline => <<-eos 17 | 18 | # install tools 19 | apt-get install -y make 20 | apt-get install -y git bzr mercurial # for go get 21 | 22 | # install go 23 | echo "installing go..." 24 | cd /tmp 25 | wget -q https://go.googlecode.com/files/go1.2.linux-386.tar.gz 26 | tar xf go1.2.linux-386.tar.gz 27 | mv go /usr/local/go 28 | chown -R vagrant /home/vagrant/go 29 | ln -s go/src/github.com/jbenet/data /home/vagrant/data 30 | 31 | # setup go workspace 32 | echo "export GOROOT=/usr/local/go" >> /home/vagrant/.bash_profile 33 | echo "export GOPATH=/home/vagrant/go" >> /home/vagrant/.bash_profile 34 | echo "export PATH=\\$PATH:\\$GOROOT/bin:\\$GOPATH/bin" >> /home/vagrant/.bash_profile 35 | 36 | eos 37 | 38 | end 39 | -------------------------------------------------------------------------------- /platforms/linux_amd64/Vagrantfile: -------------------------------------------------------------------------------- 1 | # github.com/jbenet/platform-vms/amd64/linux_ubuntu/go 2 | 3 | Vagrant.configure("2") do |config| 4 | config.vm.box = 'precise64' 5 | config.vm.box_url = 'http://files.vagrantup.com/precise64.box' 6 | 7 | # synced files 8 | config.vm.synced_folder "../../", "/home/vagrant/go/src/github.com/jbenet/data" 9 | 10 | # increase VM memory to 512 MB 11 | config.vm.provider "virtualbox" do |v| 12 | v.customize ["modifyvm", :id, "--memory", "512"] 13 | end 14 | 15 | # run provisioning scripts 16 | config.vm.provision :shell, :inline => <<-eos 17 | 18 | # install tools 19 | apt-get install -y make 20 | apt-get install -y git bzr mercurial # for go get 21 | 22 | # install go 23 | echo "installing go..." 
24 | cd /tmp 25 | wget -q https://go.googlecode.com/files/go1.2.linux-amd64.tar.gz 26 | tar xf go1.2.linux-amd64.tar.gz 27 | mv go /usr/local/go 28 | chown -R vagrant /home/vagrant/go 29 | ln -s go/src/github.com/jbenet/data /home/vagrant/data 30 | 31 | # setup go workspace 32 | echo "export GOROOT=/usr/local/go" >> /home/vagrant/.bash_profile 33 | echo "export GOPATH=/home/vagrant/go" >> /home/vagrant/.bash_profile 34 | echo "export PATH=\\$PATH:\\$GOROOT/bin:\\$GOPATH/bin" >> /home/vagrant/.bash_profile 35 | 36 | eos 37 | 38 | end 39 | -------------------------------------------------------------------------------- /platforms/installers/osx/resources/license.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 |

The MIT License

11 |

Copyright (c) 2014 Juan Batiz-Benet

12 |

Permission is hereby granted, free of charge, to any person obtaining a copy 13 | of this software and associated documentation files (the "Software"), to deal 14 | in the Software without restriction, including without limitation the rights 15 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 16 | copies of the Software, and to permit persons to whom the Software is 17 | furnished to do so, subject to the following conditions:

18 |

The above copyright notice and this permission notice shall be included in 19 | all copies or substantial portions of the Software.

20 |

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 | THE SOFTWARE.

27 | 28 | 29 | -------------------------------------------------------------------------------- /platforms/installers/osx/distribution.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | data -- dataset package manager 4 | io.datadex 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | data.pkg 16 | 18 | 19 | 20 | 21 | 22 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /data_list.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "github.com/jbenet/commander" 5 | "io/ioutil" 6 | "path" 7 | ) 8 | 9 | var cmd_data_list = &commander.Command{ 10 | UsageLine: "list ", 11 | Short: "List installed datasets.", 12 | Long: `data list - List insalled datasets. 13 | 14 | Returns all the datasets installed in the dataset working directory, 15 | end exits. 16 | `, 17 | Run: listCmd, 18 | } 19 | 20 | func listCmd(*commander.Command, []string) error { 21 | return listDatasets(DatasetDir) 22 | } 23 | 24 | func listDatasets(dir string) error { 25 | authors, err := ioutil.ReadDir(dir) 26 | 27 | if err != nil { 28 | pErr("data: error reading dataset directory \"%s\"\n", dir) 29 | return err 30 | } 31 | 32 | // for each author dir 33 | for _, a := range authors { 34 | // skip hidden files 35 | if a.Name()[0] == '.' { 36 | continue 37 | } 38 | 39 | author := path.Join(dir, a.Name()) 40 | datasets, err := ioutil.ReadDir(author) 41 | if err != nil { 42 | continue 43 | } 44 | 45 | // for each dataset dir 46 | for _, d := range datasets { 47 | // skip hidden files 48 | if d.Name()[0] == '.' { 49 | continue 50 | } 51 | 52 | dataset := path.Join(a.Name(), d.Name()) 53 | datafile, err := NewDatafile(DatafilePath(dataset)) 54 | if err != nil { 55 | pErr("Error: %s\n", err) 56 | continue 57 | } 58 | 59 | pOut("%s\n", datafile.Dataset) 60 | } 61 | } 62 | 63 | return nil 64 | } 65 | -------------------------------------------------------------------------------- /dev/changelog.md: -------------------------------------------------------------------------------- 1 | # data changelog 2 | 3 | ## v0.1.1 2014-02-05 4 | 5 | - data help: groups commands 6 | - publish guide messages 7 | - default dataset id to cwd basename 8 | - changed Manifest -> .data/Manifest filename 9 | - data get: install path is handle 10 | - data get: no littering if not found 11 | - data blob: creates dir(path) 12 | - data config flexibility 13 | - semver support 14 | 15 | 16 | ## v0.1.0 2014-01-21 17 | 18 | First preview (alpha) 19 | 20 | - release builds 21 | - data commands (for reference) 22 | - data pack make -- Datafile defaults 23 | - datadex api suffix 24 | - data blob put -- verify hash 25 | - data blob {hash, check} 26 | - datadex interop 27 | - data config: env var, --edit 28 | - s3 token based auth for uploading 29 | - s3 anonymous downloading 30 | 31 | ## v0.0.5 2014-01-09 32 | 33 | Publishing + downloading packages. 34 | 35 | - data pack publish 36 | - data publish 37 | - data get (using pack) 38 | - data user {add, auth, pass, info, url} 39 | - data config 40 | 41 | ## v0.0.4 2014-01-03 42 | 43 | Manifest manipulation and packaging. 44 | 45 | - data manifest {add, rm, hash, check} 46 | - data pack {make, manifest, upload, download, check} 47 | 48 | ## v0.0.3 2013-12-13 49 | 50 | Uploading datasets. 51 | 52 | - data manifest (list + hash files) 53 | - data blob (blobs to storage service) 54 | 55 | 56 | ## v0.0.2 2013-11-24 57 | 58 | Downloading datasets. 
59 | 60 | - data get (downloads + installs a dataset) 61 | 62 | ## v0.0.1 2013-11-22 63 | 64 | Initial version. 65 | 66 | - command dispatch 67 | - datafile format (yml + structure) 68 | - datafile parsing (loading/dumping) 69 | - data version 70 | - data help (just usage for now) 71 | - data list (show installed datasets) 72 | - data info (loads/dumps dataset's Datafile) 73 | -------------------------------------------------------------------------------- /platforms/archive.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import re 5 | 6 | # constants 7 | readme_file = 'tar.README.md' 8 | semver_regx = r'^[0-9]+\.[0-9]+\.[0-9]+$' # lacks pre-releases/builds 9 | valid_archs = [ 10 | 'darwin_amd64', 11 | 'darwin_386', 12 | 'linux_amd64', 13 | 'linux_386', 14 | 'windows_amd64', 15 | 'windows_386', 16 | ] 17 | 18 | 19 | def check(cond, msg): 20 | if not cond: 21 | print 'Error:', msg 22 | exit(-1) 23 | 24 | def write_readme(output, arch, version): 25 | with open(output, 'w') as out: 26 | with open('../%s' % readme_file) as inp: 27 | txt = inp.read() 28 | txt = txt % {'arch': arch, 'version': version} 29 | out.write(txt) 30 | 31 | 32 | def make_archive(arch, vers): 33 | if arch not in valid_archs: 34 | print "Error: arch '%s' not supported" % arch 35 | return -1 36 | 37 | if not re.match(semver_regx, vers): 38 | print "Error: version '%s' is not like X.X.X" % vers 39 | return -1 40 | 41 | if not os.path.exists('%s/data' % arch): 42 | print "Error: binary '%s/data' not found" % arch 43 | return -1 44 | 45 | # move into arch dir 46 | os.chdir(arch) 47 | 48 | # setup directory 49 | dir = 'data-v%s-%s' % (vers, arch) 50 | os.system('mkdir -p %s' % dir) 51 | 52 | # write files 53 | os.system('cp data %s/data' % dir) 54 | write_readme('%s/README.md' % dir, arch, vers) 55 | 56 | # tar 57 | tar = '%s.tar.gz' % dir 58 | os.system('tar czf %s %s' % (tar, dir)) 59 | 60 | # move into place 61 | os.chdir('..') 62 | os.system('mkdir -p archives') 63 | os.system('mv %s/%s archives/%s' % (arch, tar, tar)) 64 | os.system('rm -rf %s/%s' % (arch, dir)) 65 | 66 | print 'packaged archives/%s' % tar 67 | return dir 68 | 69 | 70 | def main(): 71 | import sys 72 | if '-h' in sys.argv or len(sys.argv) < 3: 73 | print 'Usage: %s ' % sys.argv[0] 74 | print 'Prepares the release archive for a given architecture.' 
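# -h prints usage and exits 0; missing <arch> <version> args are an error (exit -1)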
75 | exit(0 if '-h' in sys.argv else -1) 76 | 77 | arch = sys.argv[1] 78 | vers = sys.argv[2] 79 | 80 | archs = valid_archs if arch == 'all' else [arch] 81 | 82 | for arch in archs: 83 | make_archive(arch, vers) 84 | 85 | 86 | if __name__ == '__main__': 87 | main() 88 | -------------------------------------------------------------------------------- /serialize.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "io/ioutil" 8 | "launchpad.net/goyaml" 9 | "os" 10 | "path" 11 | ) 12 | 13 | type SerializedFile struct { 14 | Path string "-" 15 | Format interface{} "-" 16 | } 17 | 18 | func (f *SerializedFile) Marshal() ([]byte, error) { 19 | dOut("Marshalling %s\n", f.Path) 20 | return goyaml.Marshal(f.Format) 21 | } 22 | 23 | func (f *SerializedFile) Unmarshal(buf []byte) error { 24 | err := goyaml.Unmarshal(buf, f.Format) 25 | if err != nil { 26 | return err 27 | } 28 | 29 | dOut("Unmarshalling %s\n", f.Path) 30 | return nil 31 | } 32 | 33 | func (f *SerializedFile) Write(w io.Writer) error { 34 | buf, err := f.Marshal() 35 | if err != nil { 36 | return err 37 | } 38 | 39 | _, err = w.Write(buf) 40 | return err 41 | } 42 | 43 | func (f *SerializedFile) Read(r io.Reader) error { 44 | buf, err := ioutil.ReadAll(r) 45 | if err != nil { 46 | return err 47 | } 48 | 49 | return f.Unmarshal(buf) 50 | } 51 | 52 | func (f *SerializedFile) WriteFile() error { 53 | if len(f.Path) < 1 { 54 | return fmt.Errorf("SerializedFile: No path provided for writing.") 55 | } 56 | 57 | buf, err := f.Marshal() 58 | if err != nil { 59 | return err 60 | } 61 | 62 | err = os.MkdirAll(path.Dir(f.Path), 0777) 63 | if err != nil { 64 | return err 65 | } 66 | 67 | return ioutil.WriteFile(f.Path, buf, 0666) 68 | } 69 | 70 | func (f *SerializedFile) ReadFile() error { 71 | if len(f.Path) < 1 { 72 | return fmt.Errorf("SerializedFile: No path provided for reading.") 73 | } 74 | 75 | buf, err := ioutil.ReadFile(f.Path) 76 | if err != nil { 77 | return err 78 | } 79 | 80 | return f.Unmarshal(buf) 81 | } 82 | 83 | func (f *SerializedFile) ReadBlob(ref string) error { 84 | i, err := NewMainDataIndex() 85 | if err != nil { 86 | return err 87 | } 88 | 89 | r, err := i.BlobStore.Get(BlobKey(ref)) 90 | if err != nil { 91 | return err 92 | } 93 | 94 | err = f.Read(r) 95 | if err != nil { 96 | return err 97 | } 98 | 99 | return nil 100 | } 101 | 102 | func Marshal(in interface{}) (io.Reader, error) { 103 | buf, err := goyaml.Marshal(in) 104 | if err != nil { 105 | return nil, err 106 | } 107 | 108 | // pOut("\n") 109 | // pOut("%s\n", buf) 110 | // pOut("\n") 111 | return bytes.NewReader(buf), nil 112 | } 113 | 114 | func Unmarshal(in io.Reader, out interface{}) error { 115 | buf, err := ioutil.ReadAll(in) 116 | if err != nil { 117 | return err 118 | } 119 | 120 | // pOut("\n") 121 | // pOut("%s\n", buf) 122 | // pOut("\n") 123 | return goyaml.Unmarshal(buf, out) 124 | } 125 | 126 | // Userful for converting between representations 127 | func MarshalUnmarshal(in interface{}, out interface{}) error { 128 | // struct -> yaml -> map for easy access 129 | rdr, err := Marshal(in) 130 | if err != nil { 131 | return err 132 | } 133 | 134 | return Unmarshal(rdr, out) 135 | } 136 | -------------------------------------------------------------------------------- /data_handle.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "fmt" 5 | "path" 6 | "strings" 7 | ) 8 | 9 | // 
/[.][@] 10 | 11 | type Handle struct { 12 | Author string 13 | Name string 14 | Format string 15 | Version string 16 | } 17 | 18 | // There are problems with goyaml setters/getters. 19 | // Unmarshaling fails. 20 | // 21 | // func (d Handle) GetYAML() (string, interface{}) { 22 | // pOut("GetYAML\n") 23 | // return "", d.string 24 | // } 25 | // 26 | // func (d Handle) SetYAML(tag string, value interface{}) bool { 27 | // s, ok := value.(string) 28 | // d.string = s 29 | // pOut("SetYAML %s %s\n", d.string, &d) 30 | // return ok 31 | // } 32 | 33 | func NewHandle(s string) *Handle { 34 | d := new(Handle) 35 | d.SetDataset(s) 36 | return d 37 | } 38 | 39 | func (d *Handle) Dataset() string { 40 | s := d.Path() 41 | 42 | if len(d.Format) > 0 { 43 | s = fmt.Sprintf("%s.%s", s, d.Format) 44 | } 45 | 46 | if len(d.Version) > 0 { 47 | s = fmt.Sprintf("%s@%s", s, d.Version) 48 | } 49 | 50 | return s 51 | } 52 | 53 | func (d *Handle) Path() string { 54 | return path.Join(d.Author, d.Name) 55 | } 56 | 57 | func (d *Handle) InstallPath() string { 58 | return path.Join(DatasetDir, d.Dataset()) 59 | } 60 | 61 | // order: rsplit @, split /, rsplit . 62 | func (d *Handle) SetDataset(s string) { 63 | // no / is invalid 64 | if strings.Index(s, "/") == 0 { 65 | return 66 | } 67 | 68 | nam_idx := strings.Index(s, "/") 69 | if nam_idx < 0 { 70 | nam_idx = 0 71 | } 72 | 73 | ver_idx := strings.LastIndex(s, "@") 74 | if ver_idx < 0 { 75 | ver_idx = len(s) // no version in handle. 76 | } 77 | 78 | // this precludes names that have periods... use different delimiter? 79 | fmt_idx := strings.LastIndex(s[nam_idx+1:ver_idx], ".") 80 | if fmt_idx < 0 { 81 | fmt_idx = ver_idx // no format in handle. 82 | } else { 83 | fmt_idx += nam_idx + 1 84 | } 85 | 86 | // parts 87 | d.Author = slice(s, 0, nam_idx) 88 | d.Name = slice(s, nam_idx+1, fmt_idx) 89 | d.Format = slice(s, fmt_idx+1, ver_idx) 90 | d.Version = slice(s, ver_idx+1, len(s)) 91 | } 92 | 93 | func (d *Handle) GoString() string { 94 | return d.Dataset() 95 | } 96 | 97 | func (d *Handle) Valid() bool { 98 | return IsDatasetHandle(d.Dataset()) 99 | } 100 | 101 | // utils 102 | 103 | func slice(s string, from int, to int) string { 104 | from = maxInt(from, 0) 105 | to = minInt(to, len(s)) 106 | return s[minInt(from, to):to] 107 | } 108 | 109 | // https://groups.google.com/forum/#!topic/golang-nuts/dbyqx_LGUxM is silly. 110 | func minInt(x, y int) (r int) { 111 | if x < y { 112 | return x 113 | } 114 | return y 115 | } 116 | 117 | func maxInt(x, y int) (r int) { 118 | if x > y { 119 | return x 120 | } 121 | return y 122 | } 123 | 124 | func handleError(handle string, problem string) error { 125 | return fmt.Errorf("Invalid handle (%s): %s", problem, handle) 126 | } 127 | 128 | func IsDatasetHandle(str string) bool { 129 | return HandleRegexp.MatchString(str) 130 | } 131 | -------------------------------------------------------------------------------- /datafile.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "path" 5 | ) 6 | 7 | /* 8 | # Datafile format 9 | # A YAML (inc json) doc with the following keys: 10 | 11 | # required 12 | handle: /[.][@] 13 | title: Dataset Title 14 | 15 | # optional functionality 16 | dependencies: [] 17 | formats: { : } 18 | 19 | # optional information 20 | description: Text describing dataset. 21 | repository: 22 | website: 23 | license: 24 | contributors: ["Author Name [] [(url)]>", ...] 
25 | sources: [] 26 | */ 27 | 28 | // Serializable into YAML 29 | type datafileContents struct { 30 | Dataset string 31 | Tagline string 32 | 33 | Mirrors []string ",omitempty" 34 | Dependencies []string ",omitempty" 35 | Formats map[string]string ",omitempty" 36 | 37 | Description string ",omitempty" 38 | Repository string ",omitempty" 39 | Website string ",omitempty" 40 | License string ",omitempty" 41 | Authors []string ",omitempty" 42 | Contributors []string ",omitempty" 43 | Sources []string ",omitempty" 44 | } 45 | 46 | type Datafile struct { 47 | SerializedFile "-" 48 | datafileContents ",inline" 49 | } 50 | 51 | const DatasetDir = "datasets" 52 | const DatafileName = "Datafile" 53 | 54 | func DatafilePath(dataset string) string { 55 | return path.Join(DatasetDir, dataset, DatafileName) 56 | } 57 | 58 | func NewDatafile(path string) (*Datafile, error) { 59 | df := &Datafile{SerializedFile: SerializedFile{Path: path}} 60 | df.SerializedFile.Format = df 61 | 62 | if len(path) > 0 { 63 | err := df.ReadFile() 64 | if err != nil { 65 | return df, err 66 | } 67 | } 68 | return df, nil 69 | } 70 | 71 | func NewDefaultDatafile() (*Datafile, error) { 72 | return NewDatafile(DatafileName) 73 | } 74 | 75 | func NewDatafileWithRef(ref string) (*Datafile, error) { 76 | f, _ := NewDatafile("") 77 | err := f.ReadBlob(ref) 78 | if err != nil { 79 | return nil, err 80 | } 81 | return f, nil 82 | } 83 | 84 | func (d *Datafile) Handle() *Handle { 85 | return NewHandle(d.Dataset) 86 | } 87 | 88 | func (d *Datafile) Valid() bool { 89 | return d.Handle().Valid() 90 | } 91 | 92 | // datafile manipulation utils 93 | 94 | // Return array of all Datafiles 95 | func NewDatafiles(filenames []string) ([]*Datafile, error) { 96 | files := []*Datafile{} 97 | for _, p := range filenames { 98 | f, err := NewDatafile(p) 99 | if err != nil { 100 | return nil, err 101 | } 102 | 103 | files = append(files, f) 104 | } 105 | return files, nil 106 | } 107 | 108 | // group Datafiles { path : [Datafile, ], } 109 | type DatafileGroupMap map[string][]*Datafile 110 | 111 | func GroupedDatafiles(files []*Datafile) *DatafileGroupMap { 112 | grouped := DatafileGroupMap{} 113 | 114 | for _, f := range files { 115 | group := f.Handle().Path() 116 | grouped[group] = append(grouped[group], f) 117 | } 118 | 119 | return &grouped 120 | } 121 | -------------------------------------------------------------------------------- /dev/cli.md: -------------------------------------------------------------------------------- 1 | ``` 2 | data 3 | 4 | version Show data version information. 5 | config Manage data configuration. 6 | info Show dataset information. 7 | list List installed datasets. 8 | get Download and install dataset. 9 | publish Guided dataset publishing. 10 | 11 | user Manage users and credentials. 12 | add Register new user with index. 13 | auth Authenticate user account. 14 | pass Change user password. 15 | info Show (or edit) public user information. 16 | url Output user profile url. 17 | 18 | manifest Generate and manipulate dataset manifest. 19 | add Adds to manifest (does not hash). 20 | rm Removes from manifest. 21 | hash Hashes and adds checksum to manifest. 22 | check Verifies checksum matches manifest. 23 | 24 | pack Dataset packaging, upload, and download. 25 | make Create or update package description. 26 | manifest Show current package manifest. 27 | upload Upload package to remote storage. 28 | download Download package from remote storage. 29 | checksum Verify all file checksums match. 
30 | 31 | blob Manage blobs in the blobstore. 32 | put Upload blob named by to blobstore. 33 | get Download blob named by from blobstore. 34 | url Output Url for blob named by . 35 | check Verify blob contents named by match . 36 | show Output blob contents named by . 37 | ``` 38 | 39 | git backed (use git internally to manage repository changes) 40 | 41 | ``` 42 | data 43 | 44 | version Show data version information. 45 | config Manage data configuration. 46 | info Show dataset information. 47 | list List installed datasets. 48 | get Download and install dataset. 49 | publish Guided dataset publishing. 50 | 51 | user Manage users and credentials. 52 | add Register new user with index. 53 | auth Authenticate user account. 54 | pass Change user password. 55 | info Show (or edit) public user information. 56 | url Output user profile url. 57 | 58 | pack Dataset packaging, upload, and download. 59 | add Add contents to package staging. 60 | rm Removes contents from package staging. 61 | status Show the working tree status. 62 | commit Record changes to package repository. 63 | upload Upload package to remote storage and post to index. 64 | download Download package from remote storage. 65 | checksum Verify all file checksums match. 66 | 67 | blob Manage blobs in the blobstore (unaware of pack) 68 | put Upload blob named from to blobstore. 69 | get Download blob named from blobstore to . 70 | check Verify blob contents in match . 71 | url Output Url for blob named by . 72 | show Output blob contents named by . 73 | ``` 74 | -------------------------------------------------------------------------------- /s3store.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "fmt" 5 | "github.com/jbenet/s3" 6 | "github.com/jbenet/s3/s3util" 7 | "io" 8 | "strings" 9 | ) 10 | 11 | type S3Store struct { 12 | bucket string 13 | domain string 14 | config *s3util.Config 15 | 16 | // used for auth credentials 17 | dataIndex *DataIndex 18 | } 19 | 20 | // format from `aws sts` cmd 21 | type AwsCredentials struct { 22 | SecretAccessKey string 23 | SessionToken string 24 | AccessKeyId string 25 | } 26 | 27 | func NewS3Store(bucket string, index *DataIndex) (*S3Store, error) { 28 | 29 | if len(bucket) < 1 { 30 | return nil, fmt.Errorf("Invalid (empty) S3 Bucket name.") 31 | } 32 | 33 | if index == nil { 34 | return nil, fmt.Errorf("Invalid (nil) DataIndex.") 35 | } 36 | 37 | s := &S3Store{ 38 | bucket: bucket, 39 | domain: "s3.amazonaws.com", 40 | dataIndex: index, 41 | } 42 | 43 | s.config = &s3util.Config{ 44 | Service: s3.DefaultService, 45 | Keys: new(s3.Keys), 46 | } 47 | 48 | return s, nil 49 | } 50 | 51 | func (s *S3Store) SetAwsCredentials(c *AwsCredentials) { 52 | s.config.AccessKey = c.AccessKeyId 53 | s.config.SecretKey = c.SecretAccessKey 54 | s.config.SecurityToken = c.SessionToken 55 | 56 | // pOut("Got Aws Credentials:\n") 57 | // pOut(" AccessKey: %s\n", s.config.AccessKey) 58 | // pOut(" SecretKey: %s\n", s.config.SecretKey) 59 | // pOut(" SessToken: %s\n\n", s.config.SecurityToken) 60 | } 61 | 62 | func (s *S3Store) AwsCredentials() *AwsCredentials { 63 | if s.config == nil || len(s.config.AccessKey) == 0 { 64 | return nil 65 | } 66 | 67 | return &AwsCredentials{ 68 | AccessKeyId: s.config.AccessKey, 69 | SecretAccessKey: s.config.SecretKey, 70 | SessionToken: s.config.SecurityToken, 71 | } 72 | } 73 | 74 | func (s *S3Store) Url(key string) string { 75 | if !strings.HasPrefix(key, "/") { 76 | key = "/" + key 77 | } 78 | return 
fmt.Sprintf("http://%s.%s%s", s.bucket, s.domain, key) 79 | } 80 | 81 | func (s *S3Store) Has(key string) (bool, error) { 82 | url := s.Url(key) 83 | rc, err := s3util.Open(url, s.config) 84 | 85 | if err == nil { 86 | rc.Close() 87 | return true, nil 88 | } 89 | 90 | if strings.Contains(err.Error(), "unwanted http status 404:") { 91 | return false, nil 92 | } 93 | 94 | return false, err 95 | } 96 | 97 | func (s *S3Store) Put(key string, value io.Reader) error { 98 | err := s.ensureUserAwsCredentials() 99 | if err != nil { 100 | return fmt.Errorf("aws credentials error: %v", err) 101 | } 102 | 103 | url := s.Url(key) 104 | w, err := s3util.Create(url, nil, s.config) 105 | if err != nil { 106 | return err 107 | } 108 | 109 | _, err = io.Copy(w, value) 110 | if err != nil { 111 | return err 112 | } 113 | 114 | err = w.Close() 115 | if err != nil { 116 | return err 117 | } 118 | 119 | return nil 120 | } 121 | 122 | func (s *S3Store) Get(key string) (io.ReadCloser, error) { 123 | url := s.Url(key) 124 | return s3util.Open(url, s.config) 125 | } 126 | 127 | func (s *S3Store) getUserAwsCredentials() error { 128 | u := configUser() 129 | if !isNamedUser(u) { 130 | return fmt.Errorf("must be signed in to request aws credentials") 131 | } 132 | 133 | ui := s.dataIndex.NewUserIndex(u) 134 | c, err := ui.AwsCred() 135 | if err != nil { 136 | return err 137 | } 138 | 139 | s.SetAwsCredentials(c) 140 | return nil 141 | } 142 | 143 | func (s *S3Store) ensureUserAwsCredentials() error { 144 | // if we already have credentials, do nothing. 145 | if s.AwsCredentials() != nil { 146 | return nil 147 | } 148 | 149 | return s.getUserAwsCredentials() 150 | } 151 | -------------------------------------------------------------------------------- /field_user_input.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "os" 5 | "regexp" 6 | "strings" 7 | ) 8 | 9 | type InputField struct { 10 | Prompt string 11 | Value *string 12 | Pattern *regexp.Regexp 13 | Help string 14 | } 15 | 16 | func ensureDatafileInPath(path string) error { 17 | _, err := os.Stat(path) 18 | if err == nil { 19 | return nil 20 | } 21 | 22 | // if it doesn't exist, create it. 23 | f, err := os.Create(path) 24 | defer f.Close() 25 | 26 | return nil 27 | } 28 | 29 | func fillOutDatafileInPath(path string) error { 30 | 31 | err := ensureDatafileInPath(path) 32 | if err != nil { 33 | return err 34 | } 35 | 36 | df, err := NewDatafile(path) 37 | if err != nil { 38 | return err 39 | } 40 | 41 | return fillOutDatafile(df) 42 | } 43 | 44 | func fillOutDatafile(df *Datafile) error { 45 | pOut("Writing Datafile fields...\n") 46 | pOut("'Field description [current value]'\n") 47 | 48 | h := df.Handle() 49 | fields := []InputField{ 50 | InputField{ 51 | "owner id (required)", 52 | &h.Author, 53 | UserRegexp, 54 | "Must be a valid username. Can only contain [a-z0-9-_.].", 55 | }, 56 | InputField{ 57 | "dataset id (required)", 58 | &h.Name, 59 | IdentRegexp, 60 | "Must be a valid dataset id. Can only contain [a-z0-9-_.].", 61 | }, 62 | InputField{ 63 | "dataset version (required)", 64 | &h.Version, 65 | IdentRegexp, 66 | "Must be a valid version. Can only contain [a-z0-9-_.].", 67 | }, 68 | InputField{"tagline description (required)", &df.Tagline, nil, 69 | `A tagline is required to describe your package to others. 
70 | Good taglines are like titles: short, descriptive phrases.`}, 71 | InputField{"long description (optional)", &df.Description, nil, ""}, 72 | InputField{"license name (optional)", &df.License, nil, ""}, 73 | } 74 | 75 | for _, field := range fields { 76 | err := fillOutField(field) 77 | if err != nil { 78 | return err 79 | } 80 | 81 | df.Dataset = h.Dataset() 82 | if df.Valid() { 83 | err = df.WriteFile() 84 | if err != nil { 85 | return err 86 | } 87 | } 88 | } 89 | 90 | return nil 91 | } 92 | 93 | func fillOutField(f InputField) error { 94 | 95 | // validator function 96 | valid := func(val string) bool { 97 | if strings.Contains(f.Prompt, "required") && len(val) < 1 { 98 | return false 99 | } 100 | 101 | if f.Pattern != nil && !f.Pattern.MatchString(val) { 102 | return false 103 | } 104 | 105 | return true 106 | } 107 | 108 | for { 109 | pOut("Enter %s [%s]: ", f.Prompt, *f.Value) 110 | line, err := readInput() 111 | if err != nil { 112 | return err 113 | } 114 | 115 | // if not required, and entered nothing, get out. 116 | if len(line) == 0 && valid(*f.Value) { 117 | break 118 | } 119 | 120 | // if valid input 121 | if valid(line) { 122 | *f.Value = line 123 | break 124 | } 125 | 126 | if len(f.Help) > 0 { 127 | pOut(" Error: %s\n", f.Help) 128 | } else { 129 | pOut(" Error: Invalid input.\n") 130 | } 131 | } 132 | 133 | dOut("entered: %s\n", *f.Value) 134 | return nil 135 | } 136 | 137 | func fillOutUserProfile(p *UserProfile) error { 138 | pOut("Editing user profile. [Current value].\n") 139 | 140 | fields := []InputField{ 141 | InputField{"Full Name", &p.Name, nil, ""}, 142 | // "Email (required)": &p.Email, 143 | InputField{"Website Url", &p.Website, nil, ""}, 144 | InputField{"Github username", &p.Github, nil, ""}, 145 | InputField{"Twitter username", &p.Twitter, nil, ""}, 146 | } 147 | 148 | for _, f := range fields { 149 | err := fillOutField(f) 150 | if err != nil { 151 | return err 152 | } 153 | } 154 | 155 | return nil 156 | } 157 | -------------------------------------------------------------------------------- /data_index.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "io/ioutil" 7 | "net/http" 8 | "strings" 9 | ) 10 | 11 | type DataIndex struct { 12 | Name string 13 | Http *HttpClient 14 | 15 | // For now, use S3Store directly. clean up interface later. 16 | // BlobStore blobStore 17 | BlobStore *S3Store 18 | } 19 | 20 | var mainDataIndex *DataIndex 21 | 22 | const mainIndexName = "datadex" 23 | 24 | // why not use `func init()`? some commands don't need an index 25 | // is annoying to error out on an S3 key when S3 isn't needed. 26 | func NewMainDataIndex() (*DataIndex, error) { 27 | if mainDataIndex != nil { 28 | return mainDataIndex, nil 29 | } 30 | 31 | i := &DataIndex{Name: mainIndexName} 32 | err := error(nil) 33 | 34 | i.Http, err = NewHttpClient(i.Name) 35 | if err != nil { 36 | return nil, err 37 | } 38 | 39 | i.BlobStore, err = NewS3Store("datadex.archives", i) 40 | if err != nil { 41 | return nil, err 42 | } 43 | 44 | mainDataIndex = i 45 | return mainDataIndex, nil 46 | } 47 | 48 | const HttpHeaderUser = "X-Data-User" 49 | const HttpHeaderToken = "X-Data-Token" 50 | const HttpHeaderContentType = "Content-Type" 51 | const HttpHeaderContentTypeYaml = "application/yaml" 52 | const ApiUrlSuffix = "/api/v1" 53 | 54 | // Controls authenticated http accesses. 
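// It normalizes the configured index url (prefixing "http://" and appending
// the "/api/v1" suffix when missing), and sends the X-Data-User and
// X-Data-Token headers with every request.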
55 | type HttpClient struct { 56 | BaseUrl string 57 | Url string 58 | User string 59 | AuthToken string 60 | } 61 | 62 | func NewHttpClient(index string) (*HttpClient, error) { 63 | i, err := configGetIndex(index) 64 | if err != nil { 65 | return nil, err 66 | } 67 | 68 | h := &HttpClient{ 69 | BaseUrl: strings.ToLower(i["url"]), 70 | User: i["user"], 71 | AuthToken: i["token"], 72 | } 73 | 74 | // ensure url has protocol prefix 75 | if !strings.HasPrefix(h.BaseUrl, "http://") && 76 | !strings.HasPrefix(h.BaseUrl, "https://") { 77 | h.BaseUrl = "http://" + h.BaseUrl 78 | } 79 | h.Url = h.BaseUrl 80 | 81 | // ensure url has api suffix 82 | if !strings.HasSuffix(strings.ToLower(h.Url), ApiUrlSuffix) { 83 | h.Url = h.Url + ApiUrlSuffix 84 | } 85 | 86 | return h, nil 87 | } 88 | 89 | func (h HttpClient) SubUrl(path string) string { 90 | return h.Url + "/" + path 91 | } 92 | 93 | func (h *HttpClient) Get(path string) (*http.Response, error) { 94 | dOut("http index get %s\n", h.SubUrl(path)) 95 | 96 | req, err := http.NewRequest("GET", h.SubUrl(path), nil) 97 | if err != nil { 98 | return nil, err 99 | } 100 | 101 | req.Header.Add(HttpHeaderToken, h.AuthToken) 102 | req.Header.Add(HttpHeaderUser, h.User) 103 | return h.DoRequest(req) 104 | } 105 | 106 | func (h *HttpClient) Post(path string, body interface{}) (*http.Response, error) { 107 | dOut("http index post %s\n", h.SubUrl(path)) 108 | 109 | rdr := io.Reader(nil) 110 | var err error 111 | if body != nil { 112 | rdr, err = Marshal(body) 113 | if err != nil { 114 | return nil, err 115 | } 116 | } 117 | 118 | req, err := http.NewRequest("POST", h.SubUrl(path), rdr) 119 | if err != nil { 120 | return nil, err 121 | } 122 | 123 | req.Header.Add(HttpHeaderContentType, HttpHeaderContentTypeYaml) 124 | req.Header.Add(HttpHeaderToken, h.AuthToken) 125 | req.Header.Add(HttpHeaderUser, h.User) 126 | return h.DoRequest(req) 127 | } 128 | 129 | func (h *HttpClient) DoRequest(req *http.Request) (*http.Response, error) { 130 | resp, err := http.DefaultClient.Do(req) 131 | if err != nil { 132 | return nil, err 133 | } 134 | 135 | c := resp.StatusCode 136 | if 200 <= c && c < 400 { 137 | return resp, nil 138 | } 139 | 140 | e, _ := ioutil.ReadAll(resp.Body) 141 | resp.Body.Close() 142 | 143 | s := strings.TrimSpace(string(e[:])) 144 | return nil, fmt.Errorf("HTTP error status code: %d (%s)", c, s) 145 | } 146 | -------------------------------------------------------------------------------- /commands.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "github.com/gonuts/flag" 5 | "github.com/jbenet/commander" 6 | "strings" 7 | "time" 8 | ) 9 | 10 | var Cmd_data = &commander.Command{ 11 | UsageLine: "data [] []", 12 | Short: "dataset package manager", 13 | Long: `data - dataset package manager 14 | 15 | Basic commands: 16 | 17 | get Download and install dataset. 18 | list List installed datasets. 19 | info Show dataset information. 20 | publish Guided dataset publishing. 21 | 22 | Tool commands: 23 | 24 | version Show data version information. 25 | config Manage data configuration. 26 | user Manage users and credentials. 27 | commands List all available commands. 28 | 29 | Advanced Commands: 30 | 31 | blob Manage blobs in the blobstore. 32 | manifest Generate and manipulate dataset manifest. 33 | pack Dataset packaging, upload, and download. 34 | 35 | Use "data help " for more information about a command. 
36 | `, 37 | Run: dataCmd, 38 | Subcommands: []*commander.Command{ 39 | cmd_data_version, 40 | cmd_data_config, 41 | cmd_data_info, 42 | cmd_data_list, 43 | cmd_data_get, 44 | cmd_data_manifest, 45 | cmd_data_pack, 46 | cmd_data_blob, 47 | cmd_data_publish, 48 | cmd_data_user, 49 | cmd_data_commands, 50 | }, 51 | Flag: *flag.NewFlagSet("data", flag.ExitOnError), 52 | } 53 | 54 | func dataCmd(c *commander.Command, args []string) error { 55 | pOut(c.Long) 56 | return nil 57 | } 58 | 59 | var cmd_root *commander.Command 60 | 61 | func init() { 62 | // this funky alias is to resolve cyclical decl references. 63 | cmd_root = Cmd_data 64 | } 65 | 66 | var cmd_data_commands = &commander.Command{ 67 | UsageLine: "commands", 68 | Short: "List all available commands.", 69 | Long: `data commands - List all available commands. 70 | 71 | Lists all available commands (and sub-commands) and exits. 72 | `, 73 | Run: commandsCmd, 74 | Subcommands: []*commander.Command{ 75 | cmd_data_commands_help, 76 | }, 77 | } 78 | 79 | var cmd_data_commands_help = &commander.Command{ 80 | UsageLine: "help", 81 | Short: "List all available commands' help pages.", 82 | Long: `data commands help - List all available commands's help pages. 83 | 84 | Shows the pages of all available commands (and sub-commands) and exits. 85 | Outputs a markdown document, also viewable at http://datadex.io/doc/ref 86 | `, 87 | Run: commandsHelpCmd, 88 | } 89 | 90 | func commandsCmd(c *commander.Command, args []string) error { 91 | var listCmds func(c *commander.Command) 92 | listCmds = func(c *commander.Command) { 93 | pOut("%s\n", c.FullSpacedName()) 94 | for _, sc := range c.Subcommands { 95 | listCmds(sc) 96 | } 97 | } 98 | 99 | listCmds(c.Parent) 100 | return nil 101 | } 102 | 103 | func commandsHelpCmd(c *commander.Command, args []string) error { 104 | pOut(referenceHeaderMsg) 105 | pOut("Generated on %s.\n\n", time.Now().UTC().Format("2006-01-02")) 106 | 107 | var printCmds func(*commander.Command, int) 108 | printCmds = func(c *commander.Command, level int) { 109 | pOut("%s ", strings.Repeat("#", level)) 110 | pOut("%s\n\n", c.FullSpacedName()) 111 | pOut("```\n") 112 | pOut("%s\n", c.Long) 113 | pOut("```\n\n") 114 | 115 | for _, sc := range c.Subcommands { 116 | printCmds(sc, level+1) 117 | } 118 | } 119 | 120 | printCmds(c.Parent.Parent, 1) 121 | return nil 122 | } 123 | 124 | const referenceHeaderMsg = ` 125 | # data command reference 126 | 127 | This document lists every data command (including subcommands), along with 128 | its help page. It can be viewed by running 'data commands help', and 129 | at http://datadex.io/doc/ref 130 | 131 | ` 132 | -------------------------------------------------------------------------------- /dev/formats.md: -------------------------------------------------------------------------------- 1 | # data formats 2 | 3 | 4 | One of the important design goals is format-fluidity: ability to store datasets in various formats and transfer between them. Suppose a graph of formats, datasets should be able to traverse strongly connected components. So, if a dataset is published in XML, I should be able to request it in json.[1] This is easy for homogeneous datasets, but gets complicated when one dataset includes files in multiple formats, or it has metadata separated out. 5 | 6 | This is complicated further when thinking about how datasets get authored/published to the index, and retrieved thereafter. In brief, the idea is to follow github pattern: `/`, which reduces namespace problems. 
This includes versions (tags/branches): `/@`. Note: this handle will be used in projects' Datafiles, to specify dependencies (datasets composed of other datasets[2]), etc. 7 | 8 | 9 | Some possibilities: 10 | 11 | 12 | 1. Let formats be branches like any other. `/@`. Since version and format are now in the same namespace, would see things like: `foo/bar@1.0-json`, `foo/bar@1.2-xml`. This complicates maintenance: both new versions or new formats require a "row" of "commits" along the formats or versions, respectively. 13 | 14 | 2. Let formats be dimensions (see [3]). `/#format:`. Would see things like: `foo/bar@format:json`, `foo/bar@format:xml` There would be dimensional 'defaults' (as HEAD is default tag) that could be specified in the package description file. 15 | 16 | 3. Let formats be specified separately. `/.`. e.g. `foo/bar.json`, `foo/bar.xml`. This seems neat and nice. 17 | 18 | 4. Punt. let authors choose their formats in the dataset. Would see things like: `foo/bar-json`, `foo/xmlbar`. Would not have format-fluidity :(. Naming wont be held to standard if users control it... 19 | 20 | 21 | So far, I like 2 and 3 the best. 2 implies building [3] below, or at least a subset of the functionality. Building [3] would also make it easier to convert between formats. Just unclear how likely data across domains would be generalizable to this DIR. Would genomics/proteomics data fit this? 22 | 23 | 24 | 25 | [1] implementation detail to choose where to be in the `index stores one fmt and tool converts locally <--> index stores every format` spectrum. Most likely in between: index stores every format but constructs them lazily) 26 | 27 | [2] think of docker images. datasets can be expressed as instructions that construct it (some files from foo/dataset1 + some from bar/dataset2). This implies that a selecting sub-portions of a dataset could be a really useful mechanic.[3] 28 | 29 | ### selecting 30 | 31 | [3] imagine selecting [n-m] rows of a given dataset. Unclear yet how this should work exactly, but i've ideas along a dataset intermediate representation (DIR), where data is expressed as points in a multi-dimensional space, and a dataset is expressed as a subspace, or intervals across some dimensions. This would work well even for tables, allowing one to select slices of a dataset with something like: /#[:[:]]` e.g. 32 | 33 | lecun/norb#class # points that have a class 34 | lecun/norb#class:car # points that have class `car` 35 | lecun/norb#set:training # points in the training set 36 | lecun/norb#y:0:10 # points where `0 <= y <= 10` 37 | 38 | (and of course, can specify multiple comma-delimited dimensions) 39 | 40 | Or: 41 | 42 | lecun/norb#class # points that have a class 43 | lecun/norb#class[car] # points that have class `car` 44 | lecun/norb#set[training] # points in the training set 45 | lecun/norb#y[0, 10] # points where `0 <= y <= 10` 46 | lecun/norb#y]0, 10[ # points where `0 < y < 10` 47 | 48 | This seems like a really powerful thing to enable. Unclear how to do it well at present. Lots and lots of edge cases. This can come in later versions but must not close doors to it now. (Another note: i realize this basically is a dumber query string `?param=val`, problem with using a query string is these handles may have to be embedded in URLs :/ though i guess hashes are out in that case...) 
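To make the slicing idea concrete, here is a rough go sketch of parsing such handles (the `sliceSpec` type and `parseSliceHandle` helper are made up for illustration; nothing like this exists in data yet):

    // sliceSpec holds one "<dim>[:<start>[:<end>]]" selector, e.g. "y:0:10".
    // Hypothetical type, sketched only to illustrate the syntax above.
    type sliceSpec struct {
        Dim        string
        Start, End string // empty means unbounded
    }

    // parseSliceHandle splits "lecun/norb#set:training,y:0:10" into the
    // dataset handle and its comma-delimited dimension selectors.
    // (uses the standard "strings" package)
    func parseSliceHandle(s string) (dataset string, specs []sliceSpec) {
        idx := strings.Index(s, "#")
        if idx < 0 {
            return s, nil // no slice part; plain dataset handle
        }
        dataset = s[:idx]
        for _, sel := range strings.Split(s[idx+1:], ",") {
            parts := strings.SplitN(sel, ":", 3)
            spec := sliceSpec{Dim: parts[0]}
            if len(parts) > 1 {
                spec.Start = parts[1]
            }
            if len(parts) > 2 {
                spec.End = parts[2]
            }
            specs = append(specs, spec)
        }
        return dataset, specs
    }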
49 | 50 | -------------------------------------------------------------------------------- /data_publish.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "fmt" 5 | "github.com/gonuts/flag" 6 | "github.com/jbenet/commander" 7 | ) 8 | 9 | var cmd_data_publish = &commander.Command{ 10 | UsageLine: "publish", 11 | Short: "Guided dataset publishing.", 12 | Long: `data publish - Guided dataset publishing. 13 | 14 | This command guides the user through the necessary steps to 15 | create a data package (Datafile and Manifest), uploads it, 16 | and publishes it to the dataset index. 17 | 18 | See 'data pack'. 19 | `, 20 | Run: publishCmd, 21 | Flag: *flag.NewFlagSet("data-pack-publish", flag.ExitOnError), 22 | } 23 | 24 | func init() { 25 | cmd_data_publish.Flag.Bool("clean", true, 26 | "rebuild manifest (data pack make --clean)") 27 | cmd_data_publish.Flag.Bool("force", false, 28 | "force publish (data pack publish --force)") 29 | } 30 | 31 | func publishCmd(c *commander.Command, args []string) error { 32 | u := configUser() 33 | if !isNamedUser(u) { 34 | return fmt.Errorf(NotLoggedInErr) 35 | } 36 | 37 | pOut("==> Guided Data Package Publishing.\n") 38 | pOut(PublishMsgWelcome) 39 | 40 | pOut("\n==> Step 1/3: Creating the package.\n") 41 | pOut(PublishMsgDatafile) 42 | err := packMakeCmd(c, []string{}) 43 | if err != nil { 44 | return err 45 | } 46 | 47 | pOut("\n==> Step 2/3: Uploading the package contents.\n") 48 | pOut(PublishMsgUpload) 49 | err = packUploadCmd(c, []string{}) 50 | if err != nil { 51 | return err 52 | } 53 | 54 | pOut("\n==> Step 3/3: Publishing the package to the index.\n") 55 | pOut(PublishMsgPublish) 56 | return packPublishCmd(c, []string{}) 57 | } 58 | 59 | const NotLoggedInErr = `You are not logged in. First, either: 60 | 61 | - Run 'data user add' to create a new user account. 62 | - Run 'data user auth' to log in to an existing user account. 63 | 64 | 65 | Why does publishing require a registered user account (and email)? The index 66 | service needs to distinguish users to perform many of its tasks. For example: 67 | 68 | - Verify who can or cannot publish datasets, or modify already published ones. 69 | (i.e. the creator + collaborators should be able to, others should not). 70 | - Profiles credit people for the datasets they have published. 71 | - Malicious users can be removed, and their email addresses blacklisted to 72 | prevent further abuse. 73 | ` 74 | 75 | const PublishMsgWelcome = ` 76 | Welcome to Data Package Publishing. You should read these short 77 | messages carefully, as they contain important information about 78 | how data works, and how your data package will be published. 79 | 80 | First, a 'data package' is a collection of files, containing: 81 | - various files with your data, in any format. 82 | - 'Datafile', a file with descriptive information about the package. 83 | - 'Manifest', a file listing the other files in the package and their checksums. 84 | 85 | This tool will automatically: 86 | 1. Create the package 87 | - Generate a 'Datafile', with information you will provide. 88 | - Generate a 'Manifest', with all the files in the current directory. 89 | 2. Upload the package contents 90 | 3. Publish the package to the index 91 | 92 | (Note: to specify which files are part of the package, and other advanced 93 | features, use the 'data pack' command directly. See 'data pack help'.) 
94 | 95 | ` 96 | 97 | const PublishMsgDatafile = ` 98 | First, let's write the package's Datafile, which contains important 99 | information about the package. The 'owner id' is the username of the 100 | package's owner (usually your username). The 'dataset id' is the identifier 101 | which defines this dataset. Good 'dataset ids' are like names: short, unique, 102 | and memorable. For example: "mnist" or "cifar". Choose it carefully. 103 | 104 | ` 105 | 106 | const PublishMsgUpload = ` 107 | Now, data will upload the contents of the package (this directory) to the index 108 | sotrage service. This may take a while, if the files are large (over 100MB). 109 | 110 | ` 111 | 112 | const PublishMsgPublish = ` 113 | Finally, data will publish the package to the index, where others can find 114 | and download your package. The index is available through data, and on the web. 115 | 116 | ` 117 | -------------------------------------------------------------------------------- /dev/roadmap.md: -------------------------------------------------------------------------------- 1 | # data roadmap 2 | 3 | This document briefly outlines desired features to implement. 4 | 5 | 6 | ## command dispatch 7 | 8 | Need to implement the skeleton of the project: command parsing/dispatch. 9 | 10 | ## data list 11 | 12 | data list 13 | 14 | List the datasets in the current project 15 | 16 | ## data config 17 | 18 | data config user.name = 'jbenet' 19 | data config --global user.name = 'jbenet' 20 | 21 | Allow the configuration of `data`, using (`git` like) config files. 22 | Consider using a `~/.dataconfig` global config file. 23 | Consider using a `data/config` (or `.dataconfig`) local config file. 24 | 25 | ## data update 26 | 27 | data update 28 | 29 | Download and install newer version. 30 | Also, check whether data is up-to-date on every run (inc option to silence). 31 | 32 | ## data get 33 | 34 | data get / 35 | data get http://datadex.io// 36 | 37 | Download and install packages from the dataset index (datadex, configurable). 38 | No arguments looks into the directory's `Datafile` (configurable) 39 | Allow installation of packages using `/` ref-naming. 40 | Allow installation of packages using `https?://...//` urls. 41 | Use a `--save` flag to store into a `Datafile`. 42 | Installed datasets go into the `data/` directory (configurable) of the project. 43 | Should download compressed files, and use array of mirrors. 44 | 45 | 46 | ## data manifest 47 | 48 | data manifest 49 | 50 | Generate the data manifest file (`.data/manifest`? `Manifest`?), a list of 51 | 52 | 53 | 54 | Hash function? `sha1` for now. Discuss whether to use `sha256`. 55 | This manifest file is all that is needed to reconstruct the dataset. 56 | 57 | (Manifest files can support a poor-man's version control (changesets of 58 | filename/filehashes). Basically, a much simpler `git-annex`. Potentially use 59 | git to store Datafile + manifest: 60 | - making data refs = git refs 61 | - repository abstraction great for storing all package versions) 62 | 63 | Subcommands: 64 | 65 | data manifest add [ | -a, --all] 66 | data manifest rm [ | -m, --missing | -a, --all] 67 | data manifest hash [ | -r, --rehash] 68 | data manifest check [ | -a, --all] 69 | 70 | ## data blob 71 | 72 | data blob 73 | 74 | Manipulate blobs in the (remote) blobstore. 
75 | 76 | Subcommands: 77 | 78 | data blob get [ | --all] # download blob from blobstore 79 | data blob put [ | --all] # upload blob to blobstore 80 | 81 | 82 | ## data pack 83 | 84 | data upload 85 | 86 | Upload package archive to the chosen storage service. This ensures the final 87 | archive URL is listed as a mirror in the Datafile. 88 | 89 | Use a datadex-specific s3 bucket: 90 | 91 | data upload 92 | data upload datadex 93 | # PUTs to http://datadex.archives.s3.amazonaws.com////-.tar.gz 94 | 95 | Things to do: 96 | 97 | - ensure there is a version 98 | - ensure there is a title 99 | - prompt for a description 100 | - prompt for a license 101 | 102 | 103 | ## data publish 104 | 105 | data publish 106 | 107 | Upload and register this package to the dataset index (datadex, configurable). 108 | Registered packages require extra definitions in their `Datafile`. 109 | 110 | Things to do: 111 | 112 | - ensure uploaded 113 | - post datafile to datadex/publish 114 | 115 | 116 | 117 | ## data format 118 | 119 | data format / 120 | data put /. 121 | ref: /. 122 | 123 | Convert a dataset from one format to another. 124 | Allow datasets to have multiple formats. 125 | Formats should be convertible -- `f : f(dataset.fmt1) -> dataset.fmt2` 126 | Formats should be defined/enabled per-dataset (in their Datafile). 127 | 128 | ## data tag 129 | 130 | data tag 131 | data get /@ 132 | data put /@[:] 133 | ref: /@ 134 | 135 | List the available (named) tags. 136 | Allow referencing of datasets using specific tags. 137 | Unnamed tags are version hashes. 138 | Named tags are aliases to version hashes. 139 | Put tags to create aliases. 140 | 141 | ## data slice 142 | 143 | ref: /# 144 | 145 | See [`dev/formats`](formats.md). 146 | -------------------------------------------------------------------------------- /data_ref.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "fmt" 5 | "io/ioutil" 6 | ) 7 | 8 | const RefLatest = "latest" 9 | 10 | // serializable into YAML 11 | type DatasetRefs struct { 12 | 13 | // All published refs are listed here. { ref-hash : iso-timestamp } 14 | Published map[string]string 15 | 16 | // Automatic named pointers to published references. { version : ref-hash } 17 | // Generated from dataset handle versions. 18 | Versions map[string]string 19 | } 20 | 21 | func (r DatasetRefs) LastUpdated() string { 22 | pl := sortMapByValue(r.Published) 23 | if len(pl) > 0 { 24 | return pl[len(pl)-1].Value 25 | } 26 | return "" 27 | } 28 | 29 | func (r DatasetRefs) LatestPublished() string { 30 | s := r.SortedPublished() 31 | if len(s) == 0 { 32 | return "" 33 | } 34 | return s[len(s)-1] 35 | } 36 | 37 | func (r DatasetRefs) SortedPublished() []string { 38 | vs := []string{} 39 | pl := sortMapByValue(r.Published) 40 | for _, p := range pl { 41 | vs = append(vs, p.Key) 42 | } 43 | return vs 44 | } 45 | 46 | // Resolves a ref. If not found, returns "" 47 | func (r DatasetRefs) ResolveRef(ref string) string { 48 | 49 | // default to latest (like HEAD) 50 | if len(ref) == 0 { 51 | ref = RefLatest 52 | } 53 | 54 | // latest -> timestamp sorted 55 | if ref == RefLatest { 56 | return r.LatestPublished() 57 | } 58 | 59 | // look it up in versions table 60 | if ref2, found := r.Versions[ref]; found { 61 | return ref2 62 | } 63 | 64 | // Guess we have no link, check it's a published ref. 
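	// Resolution order, recapped with hypothetical refs:
	//   ResolveRef("")       -> same as ResolveRef("latest")
	//   ResolveRef("latest") -> newest hash in Published (timestamp order)
	//   ResolveRef("1.0")    -> Versions["1.0"], if present
	//   ResolveRef(hash)     -> hash, if it is a published ref
	//   anything else        -> "" (not found)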
65 | if _, found := r.Published[ref]; found { 66 | return ref 67 | } 68 | 69 | // Ref not found 70 | return "" 71 | } 72 | 73 | // Return the named version for ref, or ref if not found. 74 | func (r DatasetRefs) ResolveVersion(ref string) string { 75 | 76 | // Resolve ref first. 77 | ref = r.ResolveRef(ref) 78 | 79 | // Find version for ref. 80 | for v, r := range r.Versions { 81 | if r == ref { 82 | return v 83 | } 84 | } 85 | return ref 86 | } 87 | 88 | type HttpRefIndex struct { 89 | Http *HttpClient 90 | Dataset string 91 | Refs *DatasetRefs 92 | } 93 | 94 | func (h *HttpRefIndex) FetchRefs(refresh bool) error { 95 | // already fetched? 96 | if h.Refs != nil && !refresh { 97 | return nil 98 | } 99 | 100 | resp, err := h.Http.Get("") 101 | if err != nil { 102 | return err 103 | } 104 | defer resp.Body.Close() 105 | 106 | refs := &DatasetRefs{} 107 | err = Unmarshal(resp.Body, refs) 108 | if err != nil { 109 | return err 110 | } 111 | 112 | // set at the end, once we're sure no errors happened 113 | h.Refs = refs 114 | return nil 115 | } 116 | 117 | func (h *HttpRefIndex) Has(ref string) (bool, error) { 118 | return httpExists(h.Http.SubUrl(ref)) 119 | } 120 | 121 | func (h *HttpRefIndex) Get(ref string) (string, error) { 122 | resp, err := h.Http.Get(ref) 123 | if err != nil { 124 | return "", err 125 | } 126 | defer resp.Body.Close() 127 | 128 | buf, err := ioutil.ReadAll(resp.Body) 129 | if err != nil { 130 | return "", err 131 | } 132 | 133 | return string(buf[:]), nil 134 | } 135 | 136 | func (h *HttpRefIndex) Put(ref string) error { 137 | resp, err := h.Http.Post(ref, nil) 138 | if err != nil { 139 | return err 140 | } 141 | 142 | resp.Body.Close() 143 | return nil 144 | } 145 | 146 | func (h *HttpRefIndex) VersionRef(version string) (string, error) { 147 | err := h.FetchRefs(false) 148 | if err != nil { 149 | return "", err 150 | } 151 | 152 | ref := h.Refs.ResolveRef(version) 153 | if ref == "" { 154 | return ref, fmt.Errorf("No ref for version: %s", version) 155 | } 156 | return ref, nil 157 | } 158 | 159 | func (h *HttpRefIndex) RefVersion(ref string) (string, error) { 160 | err := h.FetchRefs(false) 161 | if err != nil { 162 | return "", err 163 | } 164 | 165 | ver := h.Refs.ResolveVersion(ref) 166 | if ver == "" { 167 | return ver, fmt.Errorf("No version for ref: %s", ref) 168 | } 169 | return ver, nil 170 | } 171 | 172 | func (h *HttpRefIndex) RefTimestamp(ref string) (string, error) { 173 | err := h.FetchRefs(false) 174 | if err != nil { 175 | return "", err 176 | } 177 | 178 | time, _ := h.Refs.Published[ref] 179 | return time, nil 180 | } 181 | 182 | func (h *HttpRefIndex) SortedPublished() []string { 183 | return h.Refs.SortedPublished() 184 | } 185 | 186 | // DataIndex extension to generate a RefIndex 187 | func (d *DataIndex) RefIndex(dataset string) *HttpRefIndex { 188 | ri := &HttpRefIndex{ 189 | Http: &HttpClient{ 190 | BaseUrl: d.Http.BaseUrl, 191 | Url: d.Http.Url + "/" + dataset + "/" + "refs", 192 | User: d.Http.User, 193 | AuthToken: d.Http.AuthToken, 194 | }, 195 | Dataset: dataset, 196 | } 197 | return ri 198 | } 199 | -------------------------------------------------------------------------------- /data_get.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "fmt" 5 | "github.com/jbenet/commander" 6 | "os" 7 | "path" 8 | "strings" 9 | ) 10 | 11 | var cmd_data_get = &commander.Command{ 12 | UsageLine: "get []", 13 | Short: "Download and install dataset.", 14 | Long: `data get - Download 
and install dataset. 15 | 16 | Downloads the dataset specified, and installs its files into the 17 | current dataset working directory. The dataset argument is a Handle 18 | of the form: 19 | 20 | <author>/<dataset>[@<version>]. 21 | 22 | For example: 23 | 24 | jbenet/foo 25 | jbenet/foo@latest 26 | jbenet/foo@1.0 27 | 28 | Loosely, data-get's process is: 29 | 30 | - Locate dataset Datafile and Manifest (via provided argument). 31 | - Download Datafile and Manifest, to local Repository. 32 | - Download Blobs, listed in Manifest, to local Repository. 33 | - Reconstruct Files, listed in Manifest. 34 | - Install Files, into working directory. 35 | 36 | `, 37 | Run: getCmd, 38 | } 39 | 40 | func getCmd(c *commander.Command, args []string) error { 41 | var datasets []string 42 | 43 | if len(args) > 0 { 44 | // if args, get those datasets. 45 | datasets = args 46 | } else { 47 | // if no args, use Datafile dependencies 48 | df, _ := NewDefaultDatafile() 49 | for _, dep := range df.Dependencies { 50 | if NewHandle(dep).Valid() { 51 | datasets = append(datasets, dep) 52 | } 53 | } 54 | } 55 | 56 | if len(datasets) == 0 { 57 | return fmt.Errorf("%v: no datasets specified.\nEither enter a <dataset> "+ 58 | "argument, or add dependencies in a Datafile.", c.FullName()) 59 | } 60 | 61 | installed_datasets := []string{} 62 | for _, ds := range datasets { 63 | ds, err := GetDataset(ds) 64 | if err != nil { 65 | return err 66 | } 67 | installed_datasets = append(installed_datasets, ds) 68 | } 69 | 70 | if len(datasets) == 0 { 71 | return nil 72 | } 73 | 74 | // If many, Installation Summary 75 | pErr("---------\n") 76 | for _, ds := range installed_datasets { 77 | err := installedDatasetMessage(ds) 78 | if err != nil { 79 | pErr("%v\n", err) 80 | } 81 | } 82 | return nil 83 | } 84 | 85 | func GetDataset(dataset string) (string, error) { 86 | dataset = strings.ToLower(dataset) 87 | 88 | // add lookup in datadex here. 89 | h := NewHandle(dataset) 90 | if h.Valid() { 91 | // handle version can get resolved 92 | err := GetDatasetFromIndex(h) 93 | return h.Dataset(), err 94 | } 95 | 96 | return "", fmt.Errorf("Unclear how to handle dataset identifier: %s", dataset) 97 | } 98 | 99 | func GetDatasetFromIndex(h *Handle) error { 100 | di, err := NewMainDataIndex() 101 | if err != nil { 102 | return err 103 | } 104 | 105 | pErr("Downloading %s from %s (%s).\n", h.Dataset(), di.Name, di.Http.Url) 106 | 107 | // Get manifest ref 108 | mref, err := di.handleRef(h) 109 | if err != nil { 110 | return err 111 | } 112 | 113 | // Prepare local directories 114 | dir := h.InstallPath() 115 | if err := os.RemoveAll(dir); err != nil { 116 | return err 117 | } 118 | 119 | if err := os.MkdirAll(dir, 0777); err != nil { 120 | return err 121 | } 122 | 123 | cwd, err := os.Getwd() 124 | if err != nil { 125 | return err 126 | } 127 | 128 | if err := os.Chdir(dir); err != nil { 129 | return err 130 | } 131 | 132 | // move back out 133 | defer os.Chdir(cwd) 134 | 135 | // download manifest 136 | if err := downloadManifest(di, mref); err != nil { 137 | return err 138 | } 139 | 140 | // download pack 141 | p, err := NewPack() 142 | if err != nil { 143 | return err 144 | } 145 | 146 | if err := p.Download(); err != nil { 147 | return err 148 | } 149 | 150 | pErr("\n") 151 | return nil 152 | } 153 | 154 | func (d *DataIndex) handleRef(h *Handle) (string, error) { 155 | ri := d.RefIndex(h.Path()) 156 | 157 | // Fetch refs first.
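	// FetchRefs(false) reuses the cached ref listing if one was already
	// fetched for this index; passing true would force a refresh.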
158 | err := ri.FetchRefs(false) 159 | if err != nil { 160 | if strings.Contains(err.Error(), "404 page not found") { 161 | return "", fmt.Errorf("Error: %v not found.", h.Dataset()) 162 | } 163 | return "", fmt.Errorf("Error finding manifest for %v. %s", h.Dataset(), err) 164 | } 165 | 166 | // Resolve named version first. 167 | h.Version, err = ri.RefVersion(h.Version) 168 | if err != nil { 169 | return "", fmt.Errorf("Error finding version %v. %s", h.Dataset(), err) 170 | } 171 | 172 | // Resolve ref. 173 | ref, err := ri.VersionRef(h.Version) 174 | if err != nil { 175 | return "", fmt.Errorf("Error finding manifest for %v. %s", h.Dataset(), err) 176 | } 177 | return ref, nil 178 | } 179 | 180 | func downloadManifest(d *DataIndex, ref string) error { 181 | return d.getBlob(ref, ManifestFileName) 182 | } 183 | 184 | func installedDatasetMessage(dataset string) error { 185 | h := NewHandle(dataset) 186 | fpath := DatafilePath(h.Dataset()) 187 | df, err := NewDatafile(fpath) 188 | if err != nil { 189 | return err 190 | } 191 | 192 | pOut("Installed %s at %s\n", df.Dataset, path.Dir(fpath)) 193 | return nil 194 | } 195 | -------------------------------------------------------------------------------- /dev/designdoc.md: -------------------------------------------------------------------------------- 1 | ## WARNING - WIP 2 | 3 | data is in very early development. 4 | This document is too. Track ideas here, and in the [roadmap](roadmap.md). 5 | 6 | # data designdoc 7 | 8 | data - a package manager for datasets 9 | datahub - a centralized dataset hosting service 10 | 11 | ## Abstract 12 | 13 | data : datasets :: git : source code 14 | data : datahub :: git : github 15 | 16 | 17 | ## Introduction 18 | 19 | ### Prerequisites 20 | 21 | This document assumes strong familiarity with the following software 22 | engineering concepts and systems: 23 | 24 | - data formats 25 | - datasets 26 | - version control: `git, hg` 27 | - central source code repositories: `github, google code` 28 | - package managers: `aptitude, pip, npm, brew` 29 | - package indices: `ubuntu packages, pypi, npm registry, docker index` 30 | - containers: `LXC, docker` 31 | 32 | Dataset management is a mess. There are millions of datasets strewn across the 33 | internet, encoded in thousands of formats. [more gripes here] 34 | 35 | 36 | ## Design Goals 37 | 38 | data must be 39 | 40 | - **format agnostic**: no special treatment of specific formats. ideally, data 41 | itself does not understand formats. 42 | - **domain agnostic**: no special treatment of specific application domains 43 | and 44 | their biases (e.g. machine learning vs genomics vs neuroscience). 45 | - **platform agnostic**: no special treatment of specific platforms/stacks 46 | (*nix, windows, etc). 47 | 48 | 49 | - **decentralized**: no requirement on one central package index. There will 50 | be one (a default), but data should be capable of pointing to other indices. 51 | - **intuitive UX**: to facilitate adoption in a massively competitive/ 52 | entrenched landscape, it is key to craft a highly intuitive user experience, 53 | with as gradual a learning curve as possible. 54 | - **simple to use**: simplicity is key. `data get norb/simple-norb` 55 | 56 | - **modular**: to support development and feature exploration, data should be 57 | modular, isolating functionality. Learn from git. 58 | - **infrastructure**: data is a general infrastructure tool. it aims to solve 59 | core, wide problems. special cases can be handled by sub-tools / 60 | applications on top.
61 | - **command line**: data is a unix-style tool. 62 | 63 | 64 | ## datadex - data index 65 | 66 | (The name datadex is worse than datahub, but datahub seems to be taken by 67 | a related project. Perhaps collaborate? TODO: talk to datahub people.) 68 | 69 | The important power data brings to the table is publishing and downloading 70 | datasets from a repository, public or private. This is achieved by the use 71 | of `datadex`, `data`'s sister tool and a simple website. The plan is to run 72 | one main, global `datadex` (much like most successful package managers out 73 | there) but allow users of `data` to point to whatever `datadex` (repository) 74 | they wish to use. 75 | 76 | The datadex is where data finds datasets when you run: 77 | 78 | data get jbenet/foobar 79 | 80 | Dataset foobar is looked up at the default data index: 81 | http://datadex.io/jbenet/foobar. Users should be able to point to an 82 | entirely different datadex, or even list a secondary one. This is useful in 83 | case the main datadex is {down, unmaintained, controlled by evil baboons}, 84 | and in case a user wishes to run her own private datadex for private datasets. 85 | 86 | See more at https://github.com/jbenet/datadex/blob/master/dev/roadmap.md 87 | 88 | 89 | ## data handles 90 | 91 | building on the roads paved by git and github, data introduces a standard way 92 | to reference every unique dataset version. This is accomplished with 93 | *data handles*: unique, impure, url-friendly identifiers. 94 | 95 | data handle structure: 96 | 97 | <author>/<dataset>[.<format>][@<ref>] 98 | 99 | Where: 100 | 101 | - <author> is the datadex username of the author/packager e.g. `feynman` 102 | - <dataset> is a unique shortname for the dataset e.g. `spinning-plates` 103 | - <format> is an optional format. details TBD, see [`dev/formats`](formats.md); 104 | defaults to `default` e.g. `json` 105 | - <ref> is an optional reference (hash, version, tag, etc.); 106 | defaults to `latest` e.g. `1.0` 107 | 108 | Examples: 109 | 110 | jbenet/cifar-10 111 | jbenet/cifar-10.matlab 112 | jbenet/cifar-10@latest 113 | jbenet/cifar-10@1.0 114 | jbenet/cifar-10.matlab@0.8.2-rc1 115 | 116 | ### URL handling 117 | 118 | data handles are meant to be embedded in URLs, as in the datadex: 119 | 120 | http://datadex.io/jbenet/cifar-10@1.0 121 | 122 | (yes @ symbols get encoded) 123 | 124 | 125 | ## data hashes and refs 126 | 127 | data borrows more git concepts: object hashes and references. 128 | 129 | **data hashes**: In git, objects are identified by their hashes (sha1); one can 130 | retrieve an object with `git show <hash>`. In data, unique datasets -- 131 | including different versions of the same dataset -- are identified by the hash 132 | value of their dataset archive (i.e. `hash(tar(dataset directory))`). All 133 | published versions of a dataset are hosted in the datadex, e.g.: 134 | 135 | # these are all different datasets: 136 | http://datadex.io/jbenet/cifar-10@49be4be15ec96b72323698a710b650ca5a46f9e6 137 | http://datadex.io/jbenet/cifar-10@e9db19b48ced2631513d2a165e0386686e8a0c8a 138 | http://datadex.io/jbenet/cifar-10@5b13d6abb15dccabb6aaf8573d5a01cd0d74c86d 139 | 140 | These are three different versions of the same named dataset. They may differ 141 | only slightly, or be completely different. 142 | 143 | **data references**: It is very useful to reference object versions (their 144 | hashes) via human-friendly and even logical names, like `2.0`. These names are 145 | simply references (pointers, symlinks) to hashes.
data is designed to 146 | understand (and de-reference) named references wherever it would normally 147 | expect a hash. Moreover, the term `ref` is used throughout to mean `reference, 148 | hash, version, tag, etc`. e.g. 149 | 150 | # while these could all point to the same dataset: 151 | http://datadex.io/jbenet/cifar-10 // defaults to @latest 152 | http://datadex.io/jbenet/cifar-10@latest 153 | http://datadex.io/jbenet/cifar-10@1.0 154 | http://datadex.io/jbenet/cifar-10@e9db19b48ced2631513d2a165e0386686e8a0c8a 155 | 156 | 157 | **default ref**: it is worth noting that often the "default reference" will be 158 | used when a reference is expected but not provided. The "default ref" is 159 | `latest`, and it points to the latest published version. 160 | 161 | **tags**: tags are user-specified references, e.g. version numbers like `1.0`. 162 | 163 | ## data manifest 164 | 165 | data uses a manifest of `{<filepath>: <hash>}` in order to: 166 | 167 | - account for what files are part of a dataset 168 | - detect data corruption (check hashes match) 169 | - provide minimal version control (manifest changesets) 170 | 171 | data functions somewhat like `git-annex`: 172 | 173 | - stores (version-controls) the path and object hash in the "repository" 174 | - fetches the large blobs from a storage service 175 | 176 | The blobs from all the datasets are stored in the same object store. (Blobs from 177 | different datasets are not segregated into separate bundles). This greatly 178 | reduces storage needs, de-duplicating common blobs across datasets. This is 179 | particularly useful for versions of the same dataset, as not all files change 180 | between versions. 181 | 182 | This design reduces storage both remotely (the datadex service de-duplicates 183 | across all indexed datasets) and locally (users' computers keep one blob cache 184 | for all installed datasets). 185 | 186 | Note: why version control ourselves rather than using git? git isn't great for 187 | huge files yet. It could begin supporting a bup-like rolling checksum, and thus 188 | better version large files. Also, there is another project in the works that 189 | data could leverage. The manifest versioning solution is good enough until one 190 | of those strategies pans out. 191 | -------------------------------------------------------------------------------- /data_config.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | // "code.google.com/p/gcfg" 5 | "fmt" 6 | "github.com/gonuts/flag" 7 | "github.com/jbenet/commander" 8 | "io" 9 | "os" 10 | "os/exec" 11 | "os/user" 12 | "strings" 13 | ) 14 | 15 | // WARNING: the config format will be ini eventually. Go parsers 16 | // don't currently allow writing (modifying) of files. 17 | // Thus, for now, using yaml. Expect this to change. 18 | 19 | var cmd_data_config = &commander.Command{ 20 | UsageLine: "config <key> [<value>]", 21 | Short: "Manage data configuration.", 22 | Long: `data config - Manage data configuration. 23 | 24 | Usage: 25 | 26 | data config <key> [<value>] 27 | 28 | Get or set configuration option values. 29 | If <value> is not provided, print <key>'s value, and exit. 30 | If <value> is provided, set <key> to <value>, and exit. 31 | 32 | # sets foo.bar = baz 33 | > data config foo.bar baz 34 | 35 | # gets foo.bar 36 | > data config foo.bar 37 | baz 38 | 39 | Config options are stored in the user's configuration file (~/.dataconfig). 40 | This file is formatted in YAML, and uses the goyaml parser.
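For example, a freshly installed ~/.dataconfig contains:

    index:
      datadex:
        url: http://datadex.io
        user: ""
        token: ""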
(In the future, 41 | it may be formatted like .gitconfig (INI style), using the gcfg parser.) 42 | 43 | `, 44 | Run: configCmd, 45 | Flag: *flag.NewFlagSet("data-config", flag.ExitOnError), 46 | } 47 | 48 | func init() { 49 | cmd_data_config.Flag.Bool("show", false, "show config file") 50 | cmd_data_config.Flag.Bool("edit", false, "edit config file in $EDITOR") 51 | } 52 | 53 | func configCmd(c *commander.Command, args []string) error { 54 | if c.Flag.Lookup("show").Value.Get().(bool) { 55 | return printConfig(&Config) 56 | } 57 | 58 | if c.Flag.Lookup("edit").Value.Get().(bool) { 59 | return configEditor() 60 | } 61 | 62 | if len(args) == 0 { 63 | return fmt.Errorf("%s: requires argument.", c.Name()) 64 | } 65 | 66 | if len(args) == 1 { 67 | value := ConfigGet(args[0]) 68 | if value == nil { 69 | return fmt.Errorf("") // empty string prints out nothing. 70 | } 71 | 72 | m, err := Marshal(value) 73 | if err != nil { 74 | return err 75 | } 76 | io.Copy(os.Stdout, m) 77 | return nil 78 | } 79 | 80 | return ConfigSet(args[0], args[1]) 81 | } 82 | 83 | func printConfig(c *ConfigFormat) error { 84 | f, _ := NewConfigfile("") 85 | f.Config = *c 86 | return f.Write(os.Stdout) 87 | } 88 | 89 | func configEditor() error { 90 | ed := os.Getenv("EDITOR") 91 | if len(ed) < 1 { 92 | pErr("No $EDITOR defined. Defaulting to `nano`.") 93 | ed = "nano" 94 | } 95 | 96 | ed, args := execCmdArgs(ed, []string{globalConfigFile}) 97 | cmd := exec.Command(ed, args...) 98 | cmd.Stdin, cmd.Stdout, cmd.Stderr = os.Stdin, os.Stdout, os.Stderr 99 | return cmd.Run() 100 | } 101 | 102 | func ConfigGetString(key string, default_ string) string { 103 | val := ConfigGet(key) 104 | if val == nil { 105 | return default_ 106 | } 107 | return fmt.Sprintf("%s", val) 108 | } 109 | 110 | func ConfigGet(key string) interface{} { 111 | // struct -> map for dynamic walking 112 | m := map[interface{}]interface{}{} 113 | err := MarshalUnmarshal(Config, &m) 114 | if err != nil { 115 | pErr("data config: error serializing: %s", err) 116 | return nil 117 | } 118 | 119 | var cursor interface{} 120 | var exists bool 121 | cursor = m 122 | for _, part := range strings.Split(key, ".") { 123 | cursor, exists = cursor.(map[interface{}]interface{})[part] 124 | if !exists { 125 | return nil 126 | } 127 | } 128 | 129 | return cursor 130 | } 131 | 132 | func ConfigSet(key string, value string) error { 133 | // struct -> map for dynamic walking 134 | m := map[interface{}]interface{}{} 135 | if err := MarshalUnmarshal(Config, &m); err != nil { 136 | return fmt.Errorf("error serializing config: %s", err) 137 | } 138 | 139 | var cursor interface{} 140 | var exists bool 141 | cursor = m 142 | 143 | parts := strings.Split(key, ".") 144 | for n, part := range parts { 145 | mcursor := cursor.(map[interface{}]interface{}) 146 | // last part, set here. 147 | if n == (len(parts) - 1) { 148 | mcursor[part] = value 149 | break 150 | } 151 | 152 | cursor, exists = mcursor[part] 153 | if !exists { // create map if not here. 154 | mcursor[part] = map[interface{}]interface{}{} 155 | cursor = mcursor[part] 156 | } 157 | } 158 | 159 | // write back. 
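	// The edit above happened on the generic map copy; serialize it back
	// into the live Config so the in-memory value and the file agree.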
160 | if err := MarshalUnmarshal(&m, Config); err != nil { 161 | return fmt.Errorf("error serializing config: %s", err) 162 | } 163 | 164 | return WriteConfigFile(globalConfigFile, &Config) 165 | } 166 | 167 | var globalConfigFile = "~/.dataconfig" 168 | 169 | // type ConfigFormat struct { 170 | // Index map[string]*struct { 171 | // Url string 172 | // User string 173 | // Token string 174 | // Disabled bool ",omitempty" 175 | // } 176 | // } 177 | 178 | type ConfigFormat map[string]interface{} 179 | 180 | var Config = ConfigFormat{} 181 | 182 | // var DefaultConfigText = `[index "datadex.io:8080"] 183 | // user = 184 | // token = 185 | // ` 186 | var DefaultConfigText = `index: 187 | datadex: 188 | url: http://datadex.io 189 | user: "" 190 | token: "" 191 | ` 192 | 193 | // Load config file on startup 194 | func init() { 195 | 196 | // alt config file path 197 | if cf := os.Getenv("DATA_CONFIG"); len(cf) > 0 { 198 | globalConfigFile = cf 199 | pErr("Using config file path: %s\n", globalConfigFile) 200 | } 201 | 202 | // expand ~/ 203 | usr, err := user.Current() 204 | if err != nil { 205 | panic("error: user context. " + err.Error()) 206 | } 207 | dir := usr.HomeDir + "/" 208 | globalConfigFile = strings.Replace(globalConfigFile, "~/", dir, 1) 209 | 210 | // install config if it doesn't exist 211 | if _, err := os.Stat(globalConfigFile); os.IsNotExist(err) { 212 | err := WriteConfigFileText(globalConfigFile, DefaultConfigText) 213 | if err != nil { 214 | panic("error: failed to write config " + globalConfigFile + 215 | ". " + err.Error()) 216 | } 217 | pErr("Wrote new config file: %s\n", globalConfigFile) 218 | } 219 | 220 | // load config 221 | err = ReadConfigFile(globalConfigFile, &Config) 222 | if err != nil { 223 | panic("error: failed to load config " + globalConfigFile + 224 | ". " + err.Error()) 225 | } 226 | } 227 | 228 | func WriteConfigFileText(filename string, text string) error { 229 | file, err := os.Create(filename) 230 | if err != nil { 231 | return err 232 | } 233 | defer file.Close() 234 | _, err = file.Write([]byte(text)) 235 | return err 236 | } 237 | 238 | func WriteConfigFile(filename string, fmt *ConfigFormat) error { 239 | // return gcfg.WriteFile(fmt, filename) 240 | 241 | f, _ := NewConfigfile(filename) 242 | f.Config = *fmt 243 | return f.WriteFile() 244 | } 245 | 246 | func ReadConfigFile(filename string, fmt *ConfigFormat) error { 247 | // return gcfg.ReadFileInto(fmt, filename) 248 | 249 | f, err := NewConfigfile(filename) 250 | if err != nil { 251 | return err 252 | } 253 | 254 | *fmt = f.Config 255 | return nil 256 | } 257 | 258 | // for use with YAML-based config 259 | type Configfile struct { 260 | SerializedFile "-" 261 | Config ConfigFormat "" 262 | } 263 | 264 | func NewConfigfile(path string) (*Configfile, error) { 265 | f := &Configfile{SerializedFile: SerializedFile{Path: path}} 266 | f.Config = ConfigFormat{} 267 | f.SerializedFile.Format = &f.Config 268 | 269 | if len(path) > 0 { 270 | err := f.ReadFile() 271 | if err != nil { 272 | return f, err 273 | } 274 | } 275 | return f, nil 276 | } 277 | 278 | // nice helpers 279 | const AnonymousUser = "anonymous" 280 | 281 | func configUser() string { 282 | return ConfigGetString(fmt.Sprintf("index.%s.user", mainIndexName), "") 283 | } 284 | 285 | func configGetIndex(name string) (map[string]string, error) { 286 | idx_raw := ConfigGet("index."
+ name) 287 | idx, ok := idx_raw.(map[interface{}]interface{}) 288 | if idx_raw == nil || !ok { 289 | return nil, fmt.Errorf("Config error: invalid index.%s", name) 290 | } 291 | sidx := map[string]string{} 292 | for k, v := range idx { 293 | sidx[k.(string)] = fmt.Sprintf("%s", v) 294 | } 295 | return sidx, nil 296 | } 297 | 298 | func isNamedUser(user string) bool { 299 | return len(user) > 0 && user != AnonymousUser 300 | } 301 | -------------------------------------------------------------------------------- /util.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "bufio" 5 | "crypto/sha1" 6 | "fmt" 7 | "github.com/aeden/go-semver" 8 | "github.com/dotcloud/docker/pkg/term" 9 | "github.com/xeonx/timeago" 10 | "io" 11 | "io/ioutil" 12 | "net/http" 13 | "os" 14 | "os/exec" 15 | "path" 16 | "sort" 17 | "strings" 18 | "time" 19 | "unicode" 20 | ) 21 | 22 | var Debug bool 23 | var NotImplementedError = fmt.Errorf("Error: not implemented yet.") 24 | 25 | // Shorthand printing functions. 26 | func pErr(format string, a ...interface{}) { 27 | fmt.Fprintf(os.Stderr, format, a...) 28 | } 29 | 30 | func pOut(format string, a ...interface{}) { 31 | fmt.Fprintf(os.Stdout, format, a...) 32 | } 33 | 34 | func dErr(format string, a ...interface{}) { 35 | if Debug { 36 | pErr(format, a...) 37 | } 38 | } 39 | 40 | func dOut(format string, a ...interface{}) { 41 | if Debug { 42 | pOut(format, a...) 43 | } 44 | } 45 | 46 | // human-readable time ago 47 | func TimeAgo(s string) string { 48 | t, _ := time.Parse("2006-01-02 15:04:05.999999999 -0700 MST", s) 49 | return timeago.English.Format(t) 50 | } 51 | 52 | // Version comparison function 53 | func VersionLess(i, j string) bool { 54 | // attempt to use semver 55 | vi, erri := semver.New(i) 56 | vj, errj := semver.New(j) 57 | if erri == nil && errj == nil { 58 | return vi.LessThan(vj) 59 | } 60 | 61 | // Nope. Compare lexicographically. Gross. 62 | // (There should perhaps be a looser, semver-like comparison, 63 | // which attempts to compare any number of dot-separated ints. 
64 | // Semver expects three exactly, but perhaps we want to compare 65 | // "1.8" < "1.10" 66 | return i < j 67 | } 68 | 69 | // Checks whether string is a hash (sha1) 70 | func IsHash(hash string) bool { 71 | if len(hash) != 40 { 72 | return false 73 | } 74 | 75 | for _, r := range hash { 76 | if !unicode.Is(unicode.ASCII_Hex_Digit, r) { 77 | return false 78 | } 79 | } 80 | 81 | return true 82 | } 83 | 84 | func shortHash(hash string) string { 85 | return hash[:7] 86 | } 87 | 88 | func readerHash(r io.Reader) (string, error) { 89 | bf := bufio.NewReader(r) 90 | h := sha1.New() 91 | _, err := bf.WriteTo(h) 92 | if err != nil { 93 | return "", err 94 | } 95 | 96 | hex := fmt.Sprintf("%x", h.Sum(nil)) 97 | return hex, nil 98 | } 99 | 100 | func StringHash(s string) (string, error) { 101 | r := strings.NewReader(s) 102 | h := sha1.New() 103 | _, err := r.WriteTo(h) 104 | if err != nil { 105 | return "", err 106 | } 107 | 108 | hex := fmt.Sprintf("%x", h.Sum(nil)) 109 | return hex, nil 110 | } 111 | 112 | func hashFile(path string) (string, error) { 113 | f, err := os.Open(path) 114 | if err != nil { 115 | return "", err 116 | } 117 | defer f.Close() 118 | 119 | return readerHash(f) 120 | } 121 | 122 | func catFile(path string) error { 123 | f, err := os.Open(path) 124 | if err != nil { 125 | return err 126 | } 127 | defer f.Close() 128 | 129 | br := bufio.NewReader(f) 130 | _, err = io.Copy(os.Stdout, br) 131 | return err 132 | } 133 | 134 | func copyFile(src string, dst string) error { 135 | cmd := exec.Command("cp", src, dst) 136 | return cmd.Run() 137 | } 138 | 139 | // clean up ident string 140 | func identString(ident string) string { 141 | return NonIdentRegexp.ReplaceAllString(ident, "") 142 | } 143 | 144 | // remove duplicates in a string slice 145 | func set(slice []string) []string { 146 | dedup := []string{} 147 | elems := map[string]bool{} 148 | for _, elem := range slice { 149 | _, seen := elems[elem] 150 | if !seen { 151 | dedup = append(dedup, elem) 152 | elems[elem] = true 153 | } 154 | } 155 | return dedup 156 | } 157 | 158 | func validHashes(hashes []string) (valid []string, err error) { 159 | hashes = set(hashes) 160 | 161 | // append only valid hashes 162 | for _, hash := range hashes { 163 | if IsHash(hash) { 164 | valid = append(valid, hash) 165 | } else { 166 | err = fmt.Errorf("invalid : %v", hash) 167 | } 168 | } 169 | 170 | return 171 | } 172 | 173 | // Url utils 174 | 175 | const ArchiveSuffix = ".tar.gz" 176 | 177 | func IsArchiveUrl(str string) bool { 178 | return isUrl(str) && strings.HasSuffix(str, ArchiveSuffix) 179 | } 180 | 181 | func isUrl(str string) bool { 182 | return strings.HasPrefix(str, "http://") || strings.HasPrefix(str, "https://") 183 | } 184 | 185 | func httpExists(url string) (bool, error) { 186 | resp, err := http.Get(url) 187 | if err != nil { 188 | return false, err 189 | } 190 | defer resp.Body.Close() 191 | 192 | c := resp.StatusCode 193 | switch { 194 | case 200 <= c && c < 400: 195 | return true, nil 196 | case 400 <= c && c < 500: 197 | return false, nil 198 | default: 199 | return false, fmt.Errorf("Network or server error retrieving: %s", url) 200 | } 201 | } 202 | 203 | func httpGet(url string) (*http.Response, error) { 204 | dOut("http get %s\n", url) 205 | resp, err := http.Get(url) 206 | if err != nil { 207 | return nil, err 208 | } 209 | 210 | c := resp.StatusCode 211 | if 200 <= c && c < 400 { 212 | return resp, nil 213 | } 214 | 215 | e, _ := ioutil.ReadAll(resp.Body) 216 | resp.Body.Close() 217 | 218 | s := 
strings.TrimSpace(string(e[:])) 219 | return nil, fmt.Errorf("HTTP error status code: %d (%s)", c, s) 220 | } 221 | 222 | func httpPost(url string, bt string, b io.Reader) (*http.Response, error) { 223 | dOut("http post %s\n", url) 224 | resp, err := http.Post(url, bt, b) 225 | if err != nil { 226 | return nil, err 227 | } 228 | 229 | c := resp.StatusCode 230 | if 200 <= c && c < 400 { 231 | return resp, nil 232 | } 233 | 234 | e, _ := ioutil.ReadAll(resp.Body) 235 | resp.Body.Close() 236 | 237 | s := strings.TrimSpace(string(e[:])) 238 | return nil, fmt.Errorf("HTTP error status code: %d (%s)", c, s) 239 | } 240 | 241 | func httpReadAll(url string) ([]byte, error) { 242 | resp, err := httpGet(url) 243 | if err != nil { 244 | return nil, err 245 | } 246 | defer resp.Body.Close() 247 | 248 | contents, err := ioutil.ReadAll(resp.Body) 249 | if err != nil { 250 | return nil, err 251 | } 252 | 253 | return contents, nil 254 | } 255 | 256 | func httpWriteToFile(url string, filename string) error { 257 | resp, err := httpGet(url) 258 | if err != nil { 259 | return err 260 | } 261 | defer resp.Body.Close() 262 | 263 | file, err := createFile(filename) 264 | if err != nil { 265 | return err 266 | } 267 | defer file.Close() 268 | 269 | _, err = io.Copy(file, resp.Body) 270 | return err 271 | } 272 | 273 | func createFile(filename string) (*os.File, error) { 274 | err := os.MkdirAll(path.Dir(filename), 0777) 275 | if err != nil { 276 | return nil, err 277 | } 278 | 279 | return os.Create(filename) 280 | } 281 | 282 | // Extraction 283 | func extractArchive(filename string) error { 284 | file, err := os.Open(filename) 285 | if err != nil { 286 | return err 287 | } 288 | defer file.Close() 289 | 290 | dst := strings.TrimSuffix(filename, ArchiveSuffix) 291 | err = os.MkdirAll(dst, 0777) 292 | if err != nil { 293 | return err 294 | } 295 | 296 | dst = path.Base(dst) 297 | src := path.Base(filename) 298 | cmd := exec.Command("tar", "xzf", src, "--strip-components", "1", "-C", dst) 299 | cmd.Dir = path.Dir(filename) 300 | out, err := cmd.CombinedOutput() 301 | if err != nil { 302 | outs := string(out) 303 | if strings.Contains(outs, "Error opening archive:") { 304 | return fmt.Errorf(outs) 305 | } 306 | 307 | return err 308 | } 309 | 310 | return nil 311 | } 312 | 313 | // Input 314 | func readInput() (string, error) { 315 | reader := bufio.NewReader(os.Stdin) 316 | line, _, err := reader.ReadLine() 317 | if err != nil { 318 | return "", err 319 | } 320 | return string(line), nil 321 | } 322 | 323 | func readInputSilent() (string, error) { 324 | fd := os.Stdin.Fd() 325 | s, _ := term.SaveState(fd) 326 | term.DisableEcho(fd, s) 327 | 328 | input, err := readInput() 329 | term.RestoreTerminal(fd, s) 330 | 331 | pOut("\n") 332 | return input, err 333 | } 334 | 335 | // Exec helper 336 | func execCmdArgs(path string, args []string) (string, []string) { 337 | if args == nil { 338 | args = []string{} 339 | } 340 | 341 | parts := strings.Split(path, " ") 342 | if len(parts) > 1 { 343 | path = parts[0] 344 | args = append(parts[1:], args...) 345 | } 346 | 347 | return path, args 348 | } 349 | 350 | // Map sorting -- lifted from 351 | // https://groups.google.com/d/msg/golang-nuts/FT7cjmcL7gw/Gj4_aEsE_IsJ 352 | 353 | // A data structure to hold a key/value pair. 354 | type pair struct { 355 | Key string 356 | Value string 357 | } 358 | 359 | // A slice of Pairs that implements sort.Interface to sort by Value. 
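// Sorting by value (not key) lets callers order a refs map by its ISO-8601
// timestamps; see DatasetRefs.LastUpdated in data_ref.go.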
360 | type pairList []pair 361 | 362 | func (p pairList) Swap(i, j int) { p[i], p[j] = p[j], p[i] } 363 | func (p pairList) Len() int { return len(p) } 364 | func (p pairList) Less(i, j int) bool { return p[i].Value < p[j].Value } 365 | 366 | // A function to turn a map into a PairList, then sort and return it. 367 | func sortMapByValue(m map[string]string) pairList { 368 | p := make(pairList, len(m)) 369 | i := 0 370 | for k, v := range m { 371 | p[i] = pair{k, v} 372 | i++ 373 | } 374 | sort.Sort(p) 375 | return p 376 | } 377 | -------------------------------------------------------------------------------- /data_user.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "fmt" 5 | "github.com/gonuts/flag" 6 | "github.com/jbenet/commander" 7 | "io" 8 | "io/ioutil" 9 | "os" 10 | "strings" 11 | ) 12 | 13 | var cmd_data_user = &commander.Command{ 14 | UsageLine: "user ", 15 | Short: "Manage users and credentials.", 16 | Long: `data user - Manage users and credentials. 17 | 18 | Usage: 19 | 20 | data user 21 | 22 | Commands: 23 | 24 | add [] Register new user with index. 25 | auth [] Authenticate user account. 26 | pass [] Change user password. 27 | info [] Show (or edit) public user information. 28 | url [] Output user profile url. 29 | 30 | If no argument is provided, data will ask for the username. 31 | 32 | User accounts are needed in order to publish dataset packages to the 33 | dataset index. Packages are listed under their owner's username: 34 | '/'. 35 | 36 | `, 37 | Subcommands: []*commander.Command{ 38 | cmd_data_user_add, 39 | cmd_data_user_auth, 40 | cmd_data_user_pass, 41 | cmd_data_user_info, 42 | cmd_data_user_url, 43 | }, 44 | } 45 | 46 | var cmd_data_user_add = &commander.Command{ 47 | UsageLine: "add []", 48 | Short: "Register new user with index.", 49 | Long: `data user add - Register new user with index. 50 | 51 | Guided process to register a new user account with dataset index. 52 | 53 | See data user. 54 | `, 55 | Run: userAddCmd, 56 | } 57 | 58 | var cmd_data_user_auth = &commander.Command{ 59 | UsageLine: "auth []", 60 | Short: "Authenticate user account.", 61 | Long: `data user auth - Authenticate user account. 62 | 63 | Authenticate (login) user account to index. An auth token is retrieved 64 | and stored in the local config file. 65 | 66 | See data user. 67 | `, 68 | Run: userAuthCmd, 69 | Flag: *flag.NewFlagSet("data-user-auth", flag.ExitOnError), 70 | } 71 | 72 | var cmd_data_user_pass = &commander.Command{ 73 | UsageLine: "pass []", 74 | Short: "Change user password.", 75 | Long: `data user pass - Change user password. 76 | 77 | Guided process to change user account password with dataset index. 78 | 79 | See data user. 80 | `, 81 | Run: userPassCmd, 82 | } 83 | 84 | var cmd_data_user_info = &commander.Command{ 85 | UsageLine: "info []", 86 | Short: "Show (or edit) public user information.", 87 | Long: `data user info - Show (or edit) public user information. 88 | 89 | Output or edit the profile information of a user. Note that profiles 90 | are publicly viewable. User profiles include: 91 | 92 | Full Name 93 | Email Address 94 | Github Username 95 | Twitter Username 96 | Website Url 97 | Packages List 98 | 99 | See data user. 
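For example, a profile prints as YAML (values illustrative):

  name: Richard Feynman
  email: feynman@example.com
  github: feynman
  website: http://example.com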
100 | `, 101 | Run: userInfoCmd, 102 | Flag: *flag.NewFlagSet("data-user-info", flag.ExitOnError), 103 | } 104 | 105 | var cmd_data_user_url = &commander.Command{ 106 | UsageLine: "url []", 107 | Short: "Output user profile url.", 108 | Long: `data user url - Output user profile url. 109 | 110 | Output the dataset index url for the profile of user named by . 111 | 112 | See data user. 113 | `, 114 | Run: userUrlCmd, 115 | } 116 | 117 | func init() { 118 | cmd_data_user_info.Flag.Bool("edit", false, "edit user info") 119 | cmd_data_user_auth.Flag.Bool("clear", false, "clear authentication") 120 | } 121 | 122 | func userCmdUserIndex(args []string) (*UserIndex, error) { 123 | var user string 124 | var err error 125 | 126 | if len(args) > 0 && len(args[0]) > 0 { 127 | user = args[0] 128 | } 129 | 130 | for !UserRegexp.MatchString(user) { 131 | pOut("Username: ") 132 | user, err = readInput() 133 | if err != nil { 134 | return nil, err 135 | } 136 | } 137 | 138 | di, err := NewMainDataIndex() 139 | if err != nil { 140 | return nil, err 141 | } 142 | 143 | ui := di.NewUserIndex(user) 144 | return ui, nil 145 | } 146 | 147 | func userAddCmd(c *commander.Command, args []string) error { 148 | ui, err := userCmdUserIndex(args) 149 | if err != nil { 150 | return err 151 | } 152 | 153 | pass, err := inputNewPassword() 154 | if err != nil { 155 | return err 156 | } 157 | 158 | email, err := inputNewEmail() 159 | if err != nil { 160 | return err 161 | } 162 | 163 | err = ui.Add(pass, email) 164 | if err != nil { 165 | return err 166 | } 167 | 168 | pOut("Registered %s.\n", ui.User) 169 | err = ui.Auth(pass) 170 | if err != nil { 171 | return err 172 | } 173 | 174 | pOut("Authenticated as %s.\n", ui.User) 175 | return nil 176 | } 177 | 178 | func userAuthCmd(c *commander.Command, args []string) error { 179 | // clear flag? sign out 180 | if c.Flag.Lookup("clear").Value.Get().(bool) { 181 | if err := ConfigSet("index.datadex.user", ""); err != nil { 182 | return err 183 | } 184 | if err := ConfigSet("index.datadex.token", ""); err != nil { 185 | return err 186 | } 187 | pOut("Signed out.\n") 188 | return nil 189 | } 190 | 191 | ui, err := userCmdUserIndex(args) 192 | if err != nil { 193 | return err 194 | } 195 | 196 | pOut("Password: ") 197 | pass, err := readInputSilent() 198 | if err != nil { 199 | return err 200 | } 201 | 202 | err = ui.Auth(pass) 203 | if err != nil { 204 | return err 205 | } 206 | 207 | pOut("Authenticated as %s.\n", ui.User) 208 | return nil 209 | } 210 | 211 | func userPassCmd(c *commander.Command, args []string) error { 212 | ui, err := userCmdUserIndex(args) 213 | if err != nil { 214 | return err 215 | } 216 | 217 | pOut("Current Password: ") 218 | curp, err := readInputSilent() 219 | if err != nil { 220 | return err 221 | } 222 | 223 | pOut("New ") 224 | newp, err := inputNewPassword() 225 | if err != nil { 226 | return err 227 | } 228 | 229 | err = ui.Pass(curp, newp) 230 | if err != nil { 231 | return err 232 | } 233 | 234 | pOut("Password changed. 
You will receive an email notification.\n") 235 | return nil 236 | } 237 | 238 | func userInfoCmd(c *commander.Command, args []string) error { 239 | // default to user on config 240 | cu := configUser() 241 | if len(args) == 0 && isNamedUser(cu) { 242 | args = append(args, cu) 243 | } 244 | 245 | ui, err := userCmdUserIndex(args) 246 | if err != nil { 247 | return err 248 | } 249 | 250 | p, err := ui.GetInfo() 251 | if err != nil { 252 | return err 253 | } 254 | 255 | // not editing 256 | if !c.Flag.Lookup("edit").Value.Get().(bool) { 257 | rdr, err := Marshal(p) 258 | if err != nil { 259 | return err 260 | } 261 | 262 | _, err = io.Copy(os.Stdout, rdr) 263 | return err 264 | } 265 | 266 | if cu != ui.User { 267 | return fmt.Errorf("Authenticated as %s."+ 268 | " Reauthenticate with 'data user auth'", cu) 269 | } 270 | 271 | // editing own profile. 272 | err = fillOutUserProfile(p) 273 | if err != nil { 274 | return err 275 | } 276 | 277 | err = ui.PostInfo(p) 278 | if err != nil { 279 | return err 280 | } 281 | 282 | pOut("Profile saved.\n") 283 | return nil 284 | } 285 | 286 | func userUrlCmd(c *commander.Command, args []string) error { 287 | // default to user on config 288 | if len(args) == 0 { 289 | cu := configUser() 290 | if isNamedUser(cu) { 291 | args = append(args, cu) 292 | } 293 | } 294 | 295 | ui, err := userCmdUserIndex(args) 296 | if err != nil { 297 | return err 298 | } 299 | 300 | pOut("%s\n", strings.Replace(ui.Http.Url, "/user", "", 1)) 301 | return nil 302 | } 303 | 304 | const PasswordMinLength = 6 305 | 306 | func inputNewPassword() (string, error) { 307 | var pass string 308 | for len(pass) < PasswordMinLength { 309 | pOut("Password (%d char min): ", PasswordMinLength) 310 | var err error 311 | pass, err = readInputSilent() 312 | if err != nil { 313 | return "", err 314 | } 315 | } 316 | return pass, nil 317 | } 318 | 319 | func inputNewEmail() (string, error) { 320 | var email string 321 | 322 | for !EmailRegexp.MatchString(email) { 323 | pOut("Email (for security): ") 324 | var err error 325 | 326 | email, err = readInput() 327 | if err != nil { 328 | return "", err 329 | } 330 | } 331 | return email, nil 332 | } 333 | 334 | // serializable into YAML 335 | type UserProfile struct { 336 | Name string 337 | Email string 338 | Github string ",omitempty" 339 | Twitter string ",omitempty" 340 | Website string ",omitempty" 341 | Packages []string ",omitempty" 342 | } 343 | 344 | type UserIndex struct { 345 | Http *HttpClient 346 | User string 347 | Refs *DatasetRefs 348 | } 349 | 350 | func (i UserIndex) Passhash(pass string) (string, error) { 351 | // additional hashing of the password before sending. 352 | // this resulting `passhash` is really the user's password. 
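	// (e.g. StringHash("hunter2" + "feynman") -- a sha1 hex digest --
	// is what actually travels to the server; values illustrative.)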
353 | // this is so that passwords are never seen by the server as plaintext 354 | return StringHash(pass + i.User) 355 | } 356 | 357 | func (i *UserIndex) GetInfo() (*UserProfile, error) { 358 | resp, err := i.Http.Get("info") 359 | if err != nil { 360 | return nil, err 361 | } 362 | defer resp.Body.Close() 363 | 364 | profile := &UserProfile{} 365 | err = Unmarshal(resp.Body, profile) 366 | if err != nil { 367 | return nil, err 368 | } 369 | 370 | return profile, nil 371 | } 372 | 373 | func (i *UserIndex) PostInfo(p *UserProfile) error { 374 | _, err := i.Http.Post("info", p) 375 | return err 376 | } 377 | 378 | func (i *UserIndex) Auth(pass string) error { 379 | ph, err := i.Passhash(pass) 380 | if err != nil { 381 | return err 382 | } 383 | 384 | resp, err := i.Http.Post("auth", ph) 385 | if err != nil { 386 | return err 387 | } 388 | 389 | buf, err := ioutil.ReadAll(resp.Body) 390 | if err != nil { 391 | return fmt.Errorf("Error reading token. %s", err) 392 | } 393 | 394 | token := string(buf[:]) 395 | if !IsHash(token) { 396 | return fmt.Errorf("Invalid token received %s", token) 397 | } 398 | 399 | if err := ConfigSet("index.datadex.user", i.User); err != nil { 400 | return fmt.Errorf("Error setting user. %s", err) 401 | } 402 | 403 | if err := ConfigSet("index.datadex.token", token); err != nil { 404 | return fmt.Errorf("Error setting token. %s", err) 405 | } 406 | 407 | return nil 408 | } 409 | 410 | func (i *UserIndex) Pass(cp string, np string) error { 411 | cph, err := i.Passhash(cp) 412 | if err != nil { 413 | return err 414 | } 415 | 416 | nph, err := i.Passhash(np) 417 | if err != nil { 418 | return err 419 | } 420 | 421 | _, err = i.Http.Post("pass", &NewPassMsg{cph, nph}) 422 | return err 423 | } 424 | 425 | func (i *UserIndex) Add(pass string, email string) error { 426 | ph, err := i.Passhash(pass) 427 | if err != nil { 428 | return err 429 | } 430 | 431 | _, err = i.Http.Post("add", &NewUserMsg{ph, email}) 432 | if err != nil { 433 | if strings.Contains(err.Error(), "user exists") { 434 | m := "Error: username '%s' already in use. Try another." 
435 | return fmt.Errorf(m, i.User) 436 | } 437 | } 438 | return err 439 | } 440 | 441 | func (i *UserIndex) AwsCred() (*AwsCredentials, error) { 442 | resp, err := i.Http.Get("awscred") 443 | if err != nil { 444 | return nil, err 445 | } 446 | defer resp.Body.Close() 447 | 448 | creds := &AwsCredentials{} 449 | err = Unmarshal(resp.Body, creds) 450 | if err != nil { 451 | return nil, err 452 | } 453 | 454 | return creds, nil 455 | } 456 | 457 | // DataIndex extension to generate a UserIndex 458 | func (d *DataIndex) NewUserIndex(user string) *UserIndex { 459 | return &UserIndex{ 460 | Http: &HttpClient{ 461 | BaseUrl: d.Http.BaseUrl, 462 | Url: d.Http.Url + "/" + user + "/" + "user", 463 | User: d.Http.User, 464 | AuthToken: d.Http.AuthToken, 465 | }, 466 | User: user, 467 | } 468 | } 469 | 470 | type NewUserMsg struct { 471 | Pass string 472 | Email string 473 | } 474 | 475 | type NewPassMsg struct { 476 | Current string 477 | New string 478 | } 479 | -------------------------------------------------------------------------------- /data_manifest.go: -------------------------------------------------------------------------------- 1 | package data 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "github.com/gonuts/flag" 7 | "github.com/jbenet/commander" 8 | "os" 9 | "path/filepath" 10 | "strings" 11 | ) 12 | 13 | const ManifestFileName = ".data/Manifest" 14 | const noHash = "" 15 | 16 | var cmd_data_manifest = &commander.Command{ 17 | UsageLine: "manifest [[ add | remove | hash | check ] ]", 18 | Short: "Generate and manipulate dataset manifest.", 19 | Long: `data manifest - Generate and manipulate dataset manifest. 20 | 21 | Generates and manipulates this dataset's manifest. The manifest 22 | is a mapping of { : }, and describes all files 23 | that compose a dataset. This mapping is generated by adding and 24 | hashing (checksum) files. 25 | 26 | Running data-manifest without arguments will generate (or patch) 27 | the manifest. Note that already hashed files will not be re-hashed 28 | unless forced to. Some files may be massive, and hashing every run 29 | would be prohibitively expensive. 30 | 31 | Commands: 32 | 33 | add Adds to manifest (does not hash). 34 | rm Removes from manifest. 35 | hash Hashes and adds checksum to manifest. 36 | check Verifies checksum matches manifest. 37 | 38 | (use the --all flag to do it to all available files) 39 | 40 | Loosely, data-manifest's process is: 41 | 42 | - List all files in the working directory. 43 | - Add files to the manifest (effectively tracking them). 44 | - Hash tracked files, adding checksums to the manifest. 45 | `, 46 | Run: manifestCmd, 47 | Subcommands: []*commander.Command{ 48 | cmd_data_manifest_add, 49 | cmd_data_manifest_rm, 50 | cmd_data_manifest_hash, 51 | cmd_data_manifest_check, 52 | }, 53 | } 54 | 55 | var cmd_data_manifest_add = &commander.Command{ 56 | UsageLine: "add ", 57 | Short: "Adds to manifest (does not hash).", 58 | Long: `data manifest add - Adds to manifest (does not hash). 59 | 60 | Adding files to the manifest ensures they are tracked. This command 61 | adds the given to the manifest, saves it, and exits. It does 62 | not automatically hash the file (run 'data manifest hash'). 63 | 64 | See 'data manifest'. 65 | 66 | Arguments: 67 | 68 | path of the file to add. 
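For example (illustrative path):

    > data manifest add images/train.csv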
69 | 70 | `, 71 | Run: manifestAddCmd, 72 | Flag: *flag.NewFlagSet("data-manifest-add", flag.ExitOnError), 73 | } 74 | 75 | var cmd_data_manifest_rm = &commander.Command{ 76 | UsageLine: "rm ", 77 | Short: "Removes from manifest.", 78 | Long: `data manifest rm - Removes from manifest. 79 | 80 | Removing files from the manifest stops tracking them. This command 81 | removes the given (and hash) from the manifest, and exits. 82 | 83 | See 'data manifest'. 84 | 85 | Arguments: 86 | 87 | path of the file to remove. 88 | 89 | `, 90 | Run: manifestRmCmd, 91 | Flag: *flag.NewFlagSet("data-manifest-rm", flag.ExitOnError), 92 | } 93 | 94 | var cmd_data_manifest_hash = &commander.Command{ 95 | UsageLine: "hash ", 96 | Short: "Hashes and adds checksum to manifest.", 97 | Long: `data manifest hash - Hashes and adds checksum to manifest. 98 | 99 | Hashing files in the manifest calculates the file checksums. This command 100 | hashes the given , adds it to the manifest, and exits. 101 | 102 | See 'data manifest'. 103 | 104 | Arguments: 105 | 106 | path of the file to hash. 107 | 108 | `, 109 | Run: manifestHashCmd, 110 | Flag: *flag.NewFlagSet("data-manifest-hash", flag.ExitOnError), 111 | } 112 | 113 | var cmd_data_manifest_check = &commander.Command{ 114 | UsageLine: "check ", 115 | Short: "Verifies checksum matches manifest.", 116 | Long: `data manifest check - Verifies checksum matches manifest. 117 | 118 | The manifest lists the files and their checksums. This command 119 | hashes the given , and prints whether its checksum matches the 120 | stored checksum. 121 | 122 | See 'data manifest'. 123 | 124 | Arguments: 125 | 126 | path of the file to check. 127 | 128 | `, 129 | Run: manifestCheckCmd, 130 | Flag: *flag.NewFlagSet("data-manifest-check", flag.ExitOnError), 131 | } 132 | 133 | func init() { 134 | cmd_data_manifest_add.Flag.Bool("all", false, "add all available files") 135 | cmd_data_manifest_rm.Flag.Bool("all", false, "remove all tracked files") 136 | cmd_data_manifest_hash.Flag.Bool("all", false, "hash all tracked files") 137 | cmd_data_manifest_check.Flag.Bool("all", false, "check all tracked files") 138 | } 139 | 140 | func manifestCmd(c *commander.Command, args []string) error { 141 | mf := NewDefaultManifest() 142 | return mf.Generate() 143 | } 144 | 145 | func manifestCmdPaths(c *commander.Command, args []string) ([]string, error) { 146 | mf := NewDefaultManifest() 147 | paths := args 148 | 149 | // Use all files available if --all is passed in. 150 | all := c.Flag.Lookup("all").Value.Get().(bool) 151 | if all { 152 | paths = []string{} 153 | for path, _ := range mf.Files { 154 | paths = append(paths, path) 155 | } 156 | } 157 | 158 | if len(paths) < 1 { 159 | return nil, fmt.Errorf("%v: no files specified.", c.FullName()) 160 | } 161 | 162 | return paths, nil 163 | } 164 | 165 | func manifestAddCmd(c *commander.Command, args []string) error { 166 | mf := NewDefaultManifest() 167 | paths := args 168 | 169 | // Use all files available if --all is passed in. 
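	// Note: unlike manifestCmdPaths, which expands --all to paths already
	// tracked in the manifest, add's --all walks the working directory to
	// pick up new files.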
170 | all := c.Flag.Lookup("all").Value.Get().(bool) 171 | if all { 172 | paths = listAllFiles(".") 173 | } 174 | 175 | if len(paths) < 1 { 176 | return fmt.Errorf("%v: no files specified.", c.FullName()) 177 | } 178 | 179 | // add files to manifest file 180 | for _, f := range paths { 181 | err := mf.Add(f) 182 | if err != nil { 183 | return err 184 | } 185 | } 186 | 187 | return nil 188 | } 189 | 190 | func manifestRmCmd(c *commander.Command, args []string) error { 191 | mf := NewDefaultManifest() 192 | 193 | paths, err := manifestCmdPaths(c, args) 194 | if err != nil { 195 | return err 196 | } 197 | 198 | // remove files from manifest file 199 | for _, f := range paths { 200 | err := mf.Remove(f) 201 | if err != nil { 202 | return err 203 | } 204 | } 205 | 206 | return nil 207 | } 208 | 209 | func manifestHashCmd(c *commander.Command, args []string) error { 210 | mf := NewDefaultManifest() 211 | 212 | paths, err := manifestCmdPaths(c, args) 213 | if err != nil { 214 | return err 215 | } 216 | 217 | // hash files in manifest file 218 | for _, f := range paths { 219 | err := mf.Hash(f) 220 | if err != nil { 221 | return err 222 | } 223 | } 224 | 225 | return nil 226 | } 227 | 228 | func manifestCheckCmd(c *commander.Command, args []string) error { 229 | mf := NewDefaultManifest() 230 | 231 | paths, err := manifestCmdPaths(c, args) 232 | if err != nil { 233 | return err 234 | } 235 | 236 | // check files in manifest file 237 | failed := 0 238 | for _, f := range paths { 239 | pass, err := mf.Check(f) 240 | if err != nil { 241 | // return err 242 | } 243 | 244 | if !pass { 245 | failed++ 246 | } 247 | } 248 | 249 | if failed > 0 { 250 | return fmt.Errorf("data manifest check: %d/%d checksums failed.", 251 | failed, len(paths)) 252 | } 253 | 254 | return nil 255 | } 256 | 257 | type Manifest struct { 258 | SerializedFile "-" 259 | Files blobPaths "" 260 | } 261 | 262 | func NewManifest(path string) *Manifest { 263 | mf := &Manifest{SerializedFile: SerializedFile{Path: path}} 264 | 265 | // initialize map 266 | mf.Files = blobPaths{} 267 | mf.SerializedFile.Format = mf.Files 268 | 269 | // attempt to load 270 | if len(path) > 0 { 271 | mf.ReadFile() 272 | } 273 | return mf 274 | } 275 | 276 | func NewDefaultManifest() *Manifest { 277 | return NewManifest(ManifestFileName) 278 | } 279 | 280 | func NewManifestWithRef(ref string) (*Manifest, error) { 281 | f := NewManifest("") 282 | err := f.ReadBlob(ref) 283 | if err != nil { 284 | return nil, err 285 | } 286 | return f, nil 287 | } 288 | 289 | func (mf *Manifest) Generate() error { 290 | pErr("Generating Manifest file...\n") 291 | 292 | // add new files to manifest file 293 | // (for now add everything. `data manifest {add,rm}` in future) 294 | for _, f := range listAllFiles(".") { 295 | err := mf.Add(f) 296 | if err != nil { 297 | return err 298 | } 299 | } 300 | 301 | // warn about manifest-listed files missing from directory 302 | // (basically, missing things. User removes individually, or `rm --missing`) 303 | 304 | // Once all files are listed, hash all the files, storing the hashes. 305 | for f, h := range mf.Files { 306 | if IsHash(h) && h != noHash { 307 | continue 308 | } 309 | 310 | err := mf.Hash(f) 311 | if err != nil { 312 | return err 313 | } 314 | } 315 | 316 | if len(mf.Files) == 0 { 317 | err := mf.WriteFile() 318 | if err != nil { 319 | return err 320 | } 321 | 322 | pErr("Warning: no files in directory.
	} else {
		pErr("%d files in Manifest.\n", len(mf.Files))
	}

	return nil
}

func (mf *Manifest) Clear() error {
	for f := range mf.Files {
		delete(mf.Files, f)
	}
	return mf.WriteFile()
}

func (mf *Manifest) Add(path string) error {
	// check; don't override (could have hash value)
	_, exists := (mf.Files)[path]
	if exists {
		return nil
	}

	(mf.Files)[path] = noHash

	// Write out file (store incrementally)
	err := mf.WriteFile()
	if err != nil {
		return err
	}

	pErr("data manifest: added %s\n", path)
	return nil
}

func (mf *Manifest) Remove(path string) error {
	// check; don't remove nonexistent path
	_, exists := (mf.Files)[path]
	if !exists {
		return nil
	}

	delete(mf.Files, path)

	// Write out file (store incrementally)
	err := mf.WriteFile()
	if err != nil {
		return err
	}

	pErr("data manifest: removed %s\n", path)
	return nil
}

func (mf *Manifest) Hash(path string) error {
	h, err := hashFile(path)
	if err != nil {
		return err
	}

	(mf.Files)[path] = h

	// Write out file (store incrementally)
	err = mf.WriteFile()
	if err != nil {
		return err
	}

	pErr("data manifest: hashed %.7s %s\n", h, path)
	return nil
}

func (mf *Manifest) Check(path string) (bool, error) {
	oldHash, found := (mf.Files)[path]
	if !found {
		return false, fmt.Errorf("data manifest: file not in manifest %s", path)
	}

	mfmt := "data manifest: check %.7s %s %s"

	newHash, err := hashFile(path)
	if err != nil {
		switch err.(type) {
		case *os.PathError:
			// non-existent files count as not hashing correctly.
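			// (A missing file surfaces as *os.PathError from the underlying
			// open call, so it is reported as a FAIL rather than an error.)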
			pErr(mfmt, oldHash, path, "FAIL - not found\n")
			return false, nil
		default:
			return false, err
		}
	}

	if newHash != oldHash {
		pErr(mfmt, oldHash, path, "FAIL\n")
		return false, nil
	}

	dOut(mfmt, oldHash, path, "PASS\n")
	return true, nil
}

func (mf *Manifest) PathsForHash(hash string) []string {
	l := []string{}
	for path, h := range mf.Files {
		if h == hash {
			l = append(l, path)
		}
	}
	return l
}

func (mf *Manifest) HashForPath(path string) string {
	hash, exists := (mf.Files)[path]
	if exists {
		return hash
	}
	return ""
}

func (mf *Manifest) HashForPathCaseInsensitive(path string) string {
	path = strings.ToLower(path)
	for opath, h := range mf.Files {
		opath = strings.ToLower(opath)
		if opath == path {
			return h
		}
	}
	return ""
}

func (mf *Manifest) AllPaths() []string {
	l := []string{}
	for p := range mf.Files {
		l = append(l, p)
	}
	return l
}

func (mf *Manifest) AllHashes() []string {
	l := []string{}
	for _, h := range mf.Files {
		l = append(l, h)
	}
	return l
}

func (mf *Manifest) Complete() bool {
	// must have at least one file (Datafile)
	if len(mf.Files) < 1 {
		return false
	}

	// all hashes must be computed
	for _, h := range mf.Files {
		if !IsHash(h) || h == noHash {
			return false
		}
	}

	return true
}

func listAllFiles(path string) []string {

	files := []string{}
	walkFn := func(path string, info os.FileInfo, err error) error {

		if info.IsDir() {

			// entirely skip hidden dirs (but not the "." walk root)
			if len(info.Name()) > 1 && strings.HasPrefix(info.Name(), ".") {
				dOut("data manifest: skipping %s/\n", info.Name())
				return filepath.SkipDir
			}

			// skip datasets/
			if path == DatasetDir {
				dOut("data manifest: skipping %s/\n", info.Name())
				return filepath.SkipDir
			}

			// don't store dirs
			return nil
		}

		// skip manifest file
		if path == ManifestFileName {
			dOut("data manifest: skipping %s\n", info.Name())
			return nil
		}

		// skip hidden files
		if strings.HasPrefix(info.Name(), ".") {
			dOut("data manifest: skipping %s\n", info.Name())
			return nil
		}

		files = append(files, path)
		return nil
	}

	filepath.Walk(path, walkFn)
	return files
}

func (mf *Manifest) ManifestHash() (string, error) {
	buf, err := mf.Marshal()
	if err != nil {
		return "", err
	}

	r := bytes.NewReader(buf)
	return readerHash(r)
}
-------------------------------------------------------------------------------- /data_pack.go: --------------------------------------------------------------------------------
package data

import (
	"fmt"
	"github.com/gonuts/flag"
	"github.com/jbenet/commander"
	"os"
	"path"
	"strings"
)

var cmd_data_pack = &commander.Command{
	UsageLine: "pack <command>",
	Short:     "Dataset packaging, upload, and download.",
	Long: `data pack - Dataset packaging, upload, and download.

Commands:

    pack make        Create or update package description.
    pack manifest    Show current package manifest.
    pack upload      Upload package to remote storage.
    pack download    Download package from remote storage.
    pack publish     Publish package to dataset index.
    pack check       Verify all file checksums match.


What is a data package?

A data package represents a single dataset, a unit of information.
data makes it easy to find, download, create, publish, and maintain
these datasets/packages.

Dataset packages are simply file directories with two extra files:
- Datafile, containing dataset description and metadata
- Manifest, containing dataset file paths and checksums
(See 'data help datafile' and 'data help manifest'.)

data pack make

'Packing' is the process of generating the package's Datafile and
Manifest. The Manifest is built automatically, but the Datafile
requires user input, to specify name, author, description, etc.

data pack manifest

Shows the current package manifest. This may be out of date with the
current directory contents.

data pack upload

Packages, once 'packed' (Datafile + Manifest created), can be uploaded
to a remote storage service (by default, the datadex). This means
uploading all the package's files (blobs) not already present in the
storage service. This is determined using a checksum.

data pack download

Similarly, packages can be downloaded or reconstructed in any directory
from the Datafile and Manifest. Running 'data pack download' ensures
all files listed in the Manifest are downloaded to the directory.

data pack publish

Packages can be published to the dataset index. Running 'data pack
publish' posts the current manifest reference (hash) to the index.
The package should already be uploaded (to the storage service).
Publishing requires index credentials (see 'data user').

data pack check

Packages can be verified entirely by calling the 'data pack check'
command. It re-hashes every file and ensures the checksums match.
`,

	Subcommands: []*commander.Command{
		cmd_data_pack_make,
		cmd_data_pack_manifest,
		cmd_data_pack_upload,
		cmd_data_pack_download,
		cmd_data_pack_publish,
		cmd_data_pack_check,
	},
}

var cmd_data_pack_make = &commander.Command{
	UsageLine: "make",
	Short:     "Create or update package description.",
	Long: `data pack make - Create or update package description.

Makes the package's description files:
- Datafile, containing dataset description and metadata (prompts)
- Manifest, containing dataset file paths and checksums (generated)

See 'data pack'.
`,
	Run:  packMakeCmd,
	Flag: *flag.NewFlagSet("data-pack-make", flag.ExitOnError),
}

var cmd_data_pack_manifest = &commander.Command{
	UsageLine: "manifest",
	Short:     "Show current package manifest.",
	Long: `data pack manifest - Show current package manifest.

Shows the package's manifest file and exits.
If no manifest file exists, exit with an error.

See 'data pack'.
`,
	Run: packManifestCmd,
}

var cmd_data_pack_upload = &commander.Command{
	UsageLine: "upload",
	Short:     "Upload package contents to remote storage.",
	Long: `data pack upload - Upload package contents to remote storage.

Uploads the package's files (blobs) to a remote storage service (datadex).
Blobs are named by their hash (checksum), so data can deduplicate.
This means data can easily tell whether the service already has each
file, avoiding redundant uploads, saving bandwidth, and leveraging
the data uploaded along with other datasets.

See 'data pack'.
`,
	Run: packUploadCmd,
}

var cmd_data_pack_download = &commander.Command{
	UsageLine: "download",
	Short:     "Download package contents from remote storage.",
	Long: `data pack download - Download package contents from remote storage.

Downloads the package's files (blobs) from a remote storage service (datadex).
Blobs are named by their hash (checksum), so data can deduplicate and
ensure integrity. This means data can avoid redundant downloads, saving
bandwidth and time, and can verify the correctness of files with
their checksum, preventing corruption.

See 'data pack'.
`,
	Run: packDownloadCmd,
}

var cmd_data_pack_publish = &commander.Command{
	UsageLine: "publish",
	Short:     "Publish package reference to dataset index.",
	Long: `data pack publish - Publish package reference to dataset index.

Publishes the package's manifest reference (hash) to the dataset index.
The package manifest (and all blobs) should already be uploaded. If any
blob has not been uploaded, publish will exit with an error.

Note: publishing requires data index credentials; see 'data user'.

See 'data pack'.
`,
	Run:  packPublishCmd,
	Flag: *flag.NewFlagSet("data-pack-publish", flag.ExitOnError),
}

var cmd_data_pack_check = &commander.Command{
	UsageLine: "check",
	Short:     "Verify all file checksums match.",
	Long: `data pack check - Verify all file checksums match.

Verifies that all the package's file (blob) checksums match the hashes
stored in the Manifest. This is the way to check package-wide integrity.
If any checksums FAIL, it is suggested that the files be re-downloaded
(using 'data pack download' or 'data blob get <hash>').

See 'data pack'.
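
Example:

    > data pack check
    data pack: 4 checksums pass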
`,
	Run: packCheckCmd,
}

func init() {
	cmd_data_pack_make.Flag.Bool("clean", false, "make pack from scratch")
	cmd_data_pack_publish.Flag.Bool("force", false, "overwrite published version")
}

func packMakeCmd(c *commander.Command, args []string) error {
	p, err := NewPack()
	if err != nil {
		return err
	}

	return p.Make(c.Flag.Lookup("clean").Value.Get().(bool))
}

func packManifestCmd(c *commander.Command, args []string) error {
	p, err := NewPack()
	if err != nil {
		return err
	}

	buf, err := p.manifest.Marshal()
	if err != nil {
		return err
	}

	pOut("%s", buf)
	return nil
}

func packUploadCmd(c *commander.Command, args []string) error {
	p, err := NewPack()
	if err != nil {
		return err
	}
	return p.Upload()
}

func packDownloadCmd(c *commander.Command, args []string) error {
	p, err := NewPack()
	if err != nil {
		return err
	}
	return p.Download()
}

func packPublishCmd(c *commander.Command, args []string) error {
	p, err := NewPack()
	if err != nil {
		return err
	}

	force := c.Flag.Lookup("force").Value.Get().(bool)
	err = p.Publish(force)
	if err != nil {
		if strings.Contains(err.Error(), "forbidden") {
			u := configUser()
			d := p.datafile.Handle().Path()
			o := p.datafile.Handle().Author
			return fmt.Errorf(PublishingForbiddenMsg, u, d, o, err.Error())
		}
		return err
	}

	return nil
}

func packCheckCmd(c *commander.Command, args []string) error {
	p, err := NewPack()
	if err != nil {
		return err
	}

	if !p.manifest.Complete() {
		pErr("Warning: manifest incomplete. Checksums may be incorrect.\n")
	}

	failures := 0

	for _, file := range p.manifest.AllPaths() {
		pass, err := p.manifest.Check(file)
		if err != nil {
			return err
		}

		if !pass {
			failures++
		}
	}

	count := len(p.manifest.Files)
	if failures > 0 {
		return fmt.Errorf("data pack: %v/%v checksums failed!", failures, count)
	}

	pOut("data pack: %v checksums pass\n", count)
	return nil
}

type Pack struct {
	manifest *Manifest
	datafile *Datafile
	index    *DataIndex
}

func NewPack() (p *Pack, err error) {
	p = &Pack{}
	p.manifest = NewDefaultManifest()

	p.datafile, _ = NewDefaultDatafile()
	// ignore error loading datafile

	p.index, err = NewMainDataIndex()
	if err != nil {
		return nil, err
	}

	return p, nil
}

func (p *Pack) BlobPaths() (blobPaths, error) {
	mfh, err := p.manifest.ManifestHash()
	if err != nil {
		return blobPaths{}, err
	}

	blobs := validBlobHashes(p.manifest.Files)
	blobs[p.manifest.Path] = mfh
	return blobs, nil
}

func (p *Pack) Make(clean bool) error {
	if clean {
		err := p.manifest.Clear()
		if err != nil {
			return err
		}
	}

	// fill out datafile defaults.
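	// (When the Datafile has no handle yet, the code below defaults it to
	// "<user>/<dirname>@1.0", from the configured user and the directory name.)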
	if len(p.datafile.Dataset) == 0 {
		cwd, _ := os.Getwd()
		cwd = path.Base(cwd)
		name := identString(cwd)
		p.datafile.Dataset = configUser() + "/" + name + "@1.0"
	}

	// ensure the dataset has required information
	err := fillOutDatafile(p.datafile)
	if err != nil {
		return err
	}

	// fill out default website
	if len(p.datafile.Website) == 0 {
		h := p.datafile.Handle()
		p.datafile.Website = "http://datadex.io/" + h.Dataset()
		p.datafile.WriteFile() // ignore error. best effort.
	}

	// generate manifest
	err = p.manifest.Generate()
	if err != nil {
		return err
	}

	return nil
}

// Checks the blobstore to determine which blobs in the pack have not been uploaded.
func (p *Pack) blobsToUpload() ([]string, error) {
	missing := []string{}

	blobs, err := p.BlobPaths()
	if err != nil {
		return []string{}, err
	}

	for _, hash := range blobs {
		exists, err := p.index.hasBlob(hash)
		if err != nil {
			return []string{}, err
		}

		if !exists {
			dOut("blobstore missing %s\n", hash)
			missing = append(missing, hash)
		}
	}
	return missing, nil
}

// Uploads the pack's blobs to the blobstore.
func (p *Pack) Upload() error {
	if !p.manifest.Complete() {
		return fmt.Errorf(ManifestIncompleteMsg)
	}

	blobs, err := p.BlobPaths()
	if err != nil {
		return err
	}

	return putBlobs(blobs)
}

// Downloads the pack's blobs from the blobstore.
func (p *Pack) Download() error {
	if !p.manifest.Complete() {
		return fmt.Errorf(`Manifest incomplete. Get a new manifest copy.`)
	}

	blobs, err := p.BlobPaths()
	if err != nil {
		return err
	}

	return getBlobs(blobs)
}

// Publishes the pack's manifest reference to the index.
func (p *Pack) Publish(force bool) error {

	// ensure datafile has required info
	if !p.datafile.Valid() {
		return fmt.Errorf(`Datafile invalid. Try running 'data pack make'`)
	}

	// ensure manifest is complete
	if !p.manifest.Complete() {
		return fmt.Errorf(`Manifest incomplete. Before publishing, either:
- Generate new package manifest with 'data pack make' (uses all files).
- Finish manifest with 'data manifest' (add and hash specific files).
`)
	}

	// ensure all blobs have been uploaded
	missing, err := p.blobsToUpload()
	if err != nil {
		return err
	}
	if len(missing) > 0 {
		return fmt.Errorf("%d objects must be uploaded first."+
			" Run 'data pack upload'.", len(missing))
	}

	mfh, err := p.manifest.ManifestHash()
	if err != nil {
		return err
	}

	// Check dataset version isn't already taken.
	h := p.datafile.Handle()
	ri := p.index.RefIndex(h.Path())
	ref, err := ri.VersionRef(h.Version)
	if err != nil {
		switch {
		// http errors fail.
		case strings.Contains(err.Error(), "connection refused"):
			return fmt.Errorf(NetErrMsg, p.index.Http.Url)

		// ok if no ref for version.
		case strings.Contains(err.Error(), "No ref for version"):

		// ok if not found.
		case strings.Contains(err.Error(), "HTTP error status code: 404"):

		default:
			return err
		}
	}

	if ref != "" {
		pOut("Found published version %s (%.7s).\n", h.Version, ref)
		if ref == mfh {
			pOut(PublishedVersionSameMsg, h.Version, ref)
			return nil
		}

		if !force {
			return fmt.Errorf(PublishedVersionDiffersMsg, h.Version, ref, h.Dataset())
		}

		pOut("Using --force. Overwriting %s (%.7s -> %.7s).\n", h.Version, ref, mfh)
	}

	// ok, seems good to go.
	err = ri.Put(mfh)
	if err != nil {
		return err
	}

	pOut("data pack: published %s (%.7s).\n", h.Dataset(), mfh)
	pOut("Webpage at %s/%s\n", p.index.Http.BaseUrl, h.Dataset())
	return nil
}

const PublishedVersionDiffersMsg = `Version %s (%.7s) already published, but contents differ.
If you're trying to publish a new version, increment the version
number in Datafile, and then try again:

    Dataset: %s <--- change this number

If you're trying to _overwrite_ the published version with this one,
you may do so with the '--force' flag. However, this is not advised.
Make sure you are aware of all side-effects; you might break compatibility
for everyone else using this dataset. You have been warned.`

const PublishedVersionSameMsg = `Version %s (%.7s) already published.
It has the same contents you're trying to publish, so it seems
your work here is done :)
`

const PublishingForbiddenMsg = `You (%s) lack permissions to publish to %s.
Either fork your own copy of the dataset (see 'data fork'),
or ask the owner (%s) for collaboration privileges.
(%s)`

const NetErrMsg = `Connection to the index refused.
Are you connected to the internet?
Is the dataset index down? Check %s`

const ManifestIncompleteMsg = `Manifest incomplete. Before uploading, either:
- Generate new package manifest with 'data pack make' (uses all files).
- Finish manifest with 'data manifest' (add and hash specific files).`
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
# data - package manager for datasets

Imagine installing datasets like this:

    data get jbenet/norb

It's about time we used all we've learned making package managers to fix the
awful data management problem. Read the [designdoc](dev/designdoc.md) and
the [roadmap](dev/roadmap.md).

#### Table of Contents

- [Install](#install)
- [Usage](#usage)
- [Datafile](#datafile)
- [Development](#development)
- [About](#about)
- [Examples](#examples)

## Install

Two ways to install:
- [from pre-built binary distributions](http://datadex.io/doc/install) (the easy way)
- [from source](http://datadex.io/doc/source-install) (the hard way)

## Usage

Please see the [command reference](http://datadex.io/doc/ref).
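
The tool also documents itself: `data help <command>` prints detailed usage
for any command (sample listings appear in [Examples](#examples) below), e.g.:

```
> data help get
```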

### Downloading datasets

Downloading datasets is trivial:
```
> data get jbenet/mnist
Installed jbenet/mnist@1.0 at datasets/jbenet/mnist@1.0
```

### Add datasets to projects

Or, if you want to add datasets to a project, create a Datafile like this one:
```
> cat Datafile
dependencies:
- jbenet/mnist@1.0
- jbenet/cifar-10
- jbenet/cifar-100
```

Then, run `data get` to install the dependencies (it works like `npm install`):
```
> data get
Installed jbenet/mnist@1.0 at datasets/jbenet/mnist@1.0
Installed jbenet/cifar-10@1.0 at datasets/jbenet/cifar-10@1.0
Installed jbenet/cifar-100@1.0 at datasets/jbenet/cifar-100@1.0
```

You can even commit the Datafile to version control, so your collaborators or users can easily get the data:
```
> git clone github.com/jbenet/ml-vision-comparisons
> cd ml-vision-comparisons
> data get
Installed jbenet/mnist@1.0 at datasets/jbenet/mnist@1.0
Installed jbenet/cifar-10@1.0 at datasets/jbenet/cifar-10@1.0
Installed jbenet/cifar-100@1.0 at datasets/jbenet/cifar-100@1.0
```

### Publishing datasets

Publishing datasets is simple:

1. make a directory with all the files you want to publish.
2. `cd` into it, and run `data publish` within the directory.
3. `data` will guide you through creating a Datafile.
4. Then, `data` will upload and publish the package.

```
> data publish

Published jbenet/mnist@1.0 (b5f84c2).
```

Note that uploading can take a long while, as we'll upload all the files to S3, ensuring others can always get them.


## Datafile

data tracks the definition of dataset packages, and their dependencies, in a
`Datafile` (in the style of `Makefile`, `Vagrantfile`, `Procfile`, and
friends). Both published dataset packages and regular projects use it.
In a way, your project defines a dataset made up of other datasets, like
`package.json` in `npm`.

```
# Datafile format
# A YAML (inc json) doc with the following keys:

# required
handle: <author>/<name>[.<format>][@<version>]
title: Dataset Title

# optional functionality
dependencies: [<dataset handles>]
formats: {<format>: <url>}

# optional information
description: Text describing dataset.
repository: <url>
website: <url>
license: <license name>
contributors: ["Author Name [<email>] [(url)]", ...]
sources: [<urls>]
```
May be outdated. See [datafile.go](datafile.go).

### why yaml?

YAML is much more readable than json. One of `data`'s [design goals
](https://github.com/jbenet/data/blob/master/dev/designdoc.md#design-goals)
is an Intuitive UX. Since the target users are scientists in various domains,
any extra syntax, parse errors, and other annoyances would erode
I've always found this 122 | 123 | ``` 124 | dataset: feynman/spinning-plate-measurements 125 | title: Measurements of Plate Rotation 126 | contributors: 127 | - Richard Feynman 128 | website: http://caltech.edu/~feynman/not-girls/plate-stuff/trial3 129 | ``` 130 | 131 | much more friendly and approachable than this 132 | 133 | ``` 134 | { 135 | "dataset": "feynman/spinning-plate-measurements", 136 | "title": "Measurements of Plate Rotation", 137 | "contributors": [ 138 | "Richard Feynman " 139 | ], 140 | "website": "http://caltech.edu/~feynman/not-girls/plate-stuff/trial3" 141 | } 142 | ``` 143 | 144 | It's already hard enough to get anyone to do anything. Don't add more hoops to 145 | jump through than necessary. Each step will cause significant dropoff in 146 | conversion funnels. (Remember, [Apple pays Amazon for 1-click buy](https://www.apple.com/pr/library/2000/09/18Apple-Licenses-Amazon-com-1-Click-Patent-and-Trademark.html)...) 147 | 148 | And, since YAML is a superset of json, you can do whatever you want. 149 | 150 | 151 | ## Development 152 | 153 | Setup: 154 | 155 | 1. [install go](http://golang.org/doc/install) 156 | 2. Run 157 | 158 | ``` 159 | git clone https://github.com/jbenet/data 160 | cd data 161 | make deps 162 | make install 163 | ``` 164 | 165 | You'll want to run [datadex](https://github.com/jbenet/datadex) too. 166 | 167 | ## About 168 | 169 | This project started because data management is a massive problem in science. 170 | It should be **trivial** to (a) find, (b) download, (c) track, (d) manage, 171 | (e) re-format, (f) publish, (g) cite, and (h) collaborate on datasets. Data 172 | management is a problem in other domains (engineering, civics, etc), and `data` 173 | seeks to be general enough to be used with any kind of dataset, but the target 174 | use case is saving scientists' time. 175 | 176 | Many people agree we direly need the 177 | "[GitHub for Science](http://static.benet.ai/t/github-for-science.md)"; 178 | scientific collaboration problems are large and numerous. 179 | It is not entirely clear how, and in which order, to tackle these 180 | challenges, or even how to drive adoption of solutions across fields. I think 181 | simple and powerful tools can solve large problems neatly. Perhaps the best 182 | way to tackle scientific collaboration is by decoupling interconnected 183 | problems, and building simple tools to solve them. Over time, reliable 184 | infrastructure can be built with these. git, github, and arxiv are great 185 | examples to follow. 186 | 187 | `data` is an attempt to solve the fairly self-contained issue of downloading, 188 | publishing, and managing datasets. Let's take what computer scientists have 189 | learned about version control and distributed collaboration on source code, 190 | and apply it to the data management problem. Let's build new data tools and 191 | infrastructure with the software engineering and systems design principles 192 | that made git, apt, npm, and github successful. 193 | 194 | ### Acknowledgements 195 | 196 | `data` is released under the MIT License. 197 | 198 | Authored by [@jbenet](https://github.com/jbenet). Feel free to contact me 199 | at , but please post 200 | [issues](https://github.com/jbenet/data/issues) on github first. 

Special thanks to
[@colah](https://github.com/colah) (original idea and
[data.py](https://github.com/colah/data)),
[@damodei](https://github.com/damodei), and
[@davidad](https://github.com/davidad),
who provided valuable thoughts + discussion on this problem.

## Examples

```
data - dataset package manager

Basic commands:

    get         Download and install dataset.
    list        List installed datasets.
    info        Show dataset information.
    publish     Guided dataset publishing.

Tool commands:

    version     Show data version information.
    config      Manage data configuration.
    user        Manage users and credentials.
    commands    List all available commands.

Advanced Commands:

    blob        Manage blobs in the blobstore.
    manifest    Generate and manipulate dataset manifest.
    pack        Dataset packaging, upload, and download.

Use "data help <command>" for more information about a command.
```

### data get

```
# author/dataset
> data get jbenet/foo
Downloading jbenet/foo from datadex.
get blob b53ce99 Manifest
get blob 2183ea8 Datafile
get blob 63443e4 data.csv
copy blob 63443e4 data.txt
copy blob 63443e4 data.xsl
get blob b53ce99 Manifest

Installed jbenet/foo@1.0 at datasets/jbenet/foo
```

### data list

```
> data list
jbenet/foo@1.0
```

### data info

```
> data info jbenet/foo
dataset: jbenet/foo@1.0
title: Foo Dataset
description: The first dataset to use data.
license: MIT

# shows the Datafile
> cat datasets/foo/bar/Datafile
dataset: foo/bar@1.1
```

### data publish

```
> data publish
==> Guided Data Package Publishing.

==> Step 1/3: Creating the package.
Verifying Datafile fields...
Generating manifest...
data manifest: added Datafile
data manifest: added data.csv
data manifest: added data.txt
data manifest: added data.xsl
data manifest: hashed 2183ea8 Datafile
data manifest: hashed 63443e4 data.csv
data manifest: hashed 63443e4 data.txt
data manifest: hashed 63443e4 data.xsl

==> Step 2/3: Uploading the package contents.
put blob 2183ea8 Datafile - uploading
put blob 63443e4 data.csv - exists
put blob b53ce99 Manifest - uploading

==> Step 3/3: Publishing the package to the index.
data pack: published jbenet/foo@1.0 (b53ce99).
```

Et voila! You can now use `data get jbenet/foo` to retrieve it!

### data config

```
> data config index.datadex.url http://localhost:8080
> data config index.datadex.url
http://localhost:8080
```

### data user

```
> data user
data user - Manage users and credentials.

Commands:

    add     Register new user with index.
    auth    Authenticate user account.
    pass    Change user password.
    info    Show (or edit) public user information.
    url     Output user profile url.

Use "user help <command>" for more information about a command.

> data user add
Username: juan
Password (6 char min):
Email (for security): juan@benet.ai
juan registered.

> data user auth
Username: juan
Password:
Authenticated as juan.

> data user info
name: ""
email: juan@benet.ai

> data user info jbenet
name: Juan
email: juan@benet.ai
github: jbenet
twitter: '@jbenet'
website: benet.ai

> data user info --edit
Editing user profile. [Current value].
Full Name: [] Juan Batiz-Benet
Website Url: []
Github username: []
Twitter username: []
Profile saved.

> data user info
name: Juan Batiz-Benet
email: juan@benet.ai

> data user pass
Username: juan
Current Password:
New Password (6 char min):
Password changed. You will receive an email notification.

> data user url
http://datadex.io:8080/juan
```

### data manifest (plumbing)

```
> data manifest add filename
data manifest: added filename

> data manifest hash filename
data manifest: hashed 61a66fd filename

> cat .data-manifest
filename: 61a66fda64e397a82d9f0c8b7b3f7ba6bca79b12

> data manifest rm filename
data manifest: removed filename
```

### data blob (plumbing)
```
data blob - Manage blobs in the blobstore.

Commands:

    put     Upload blobs to a remote blobstore.
    get     Download blobs from a remote blobstore.
    url     Output Url for blob named by <hash>.

Use "blob help <command>" for more information about a command.
```

```
> cat Manifest
Datafile: 0d0c669b4c2b05402d9cc87298f3d7ce372a4c80
data.csv: 63443e4d74c3a170499fa9cfde5ae2224060b09e
data.txt: 63443e4d74c3a170499fa9cfde5ae2224060b09e
data.xsl: 63443e4d74c3a170499fa9cfde5ae2224060b09e

> data blob put --all
put blob 0d0c669 Datafile
put blob 63443e4 data.csv

> data blob get 63443e4d74c3a170499fa9cfde5ae2224060b09e
get blob 63443e4 data.csv
copy blob 63443e4 data.txt
copy blob 63443e4 data.xsl

> data blob url --all
http://datadex.archives.s3.amazonaws.com/blob/0d0c669b4c2b05402d9cc87298f3d7ce372a4c80
http://datadex.archives.s3.amazonaws.com/blob/63443e4d74c3a170499fa9cfde5ae2224060b09e
```

### data pack (plumbing)

This is probably the most informative command to look at.

```
data pack - Dataset packaging, upload, and download.

Commands:

    make        Create or update package description.
    manifest    Show current package manifest.
    upload      Upload package contents to remote storage.
    download    Download package contents from remote storage.
    publish     Publish package reference to dataset index.
    check       Verify all file checksums match.

Use "pack help <command>" for more information about a command.
```


```
> ls
data.csv data.txt data.xsl

> cat data.*
BAR BAR BAR
BAR BAR BAR
BAR BAR BAR

> data pack make # interactive
Verifying Datafile fields...
Enter author name (required): foo
Enter dataset id (required): bar
Enter dataset version (required): 1.1
Enter dataset title (optional): Barrr
Enter description (optional): A bar dataset.
Enter license name (optional): MIT
Generating manifest...
data manifest: hashed 0d0c669 Datafile
data manifest: hashed 63443e4 data.csv
data manifest: hashed 63443e4 data.txt
data manifest: hashed 63443e4 data.xsl

> ls
Datafile Manifest data.csv data.txt data.xsl

> data pack manifest
Datafile: 0d0c669b4c2b05402d9cc87298f3d7ce372a4c80
data.csv: 63443e4d74c3a170499fa9cfde5ae2224060b09e
data.txt: 63443e4d74c3a170499fa9cfde5ae2224060b09e
data.xsl: 63443e4d74c3a170499fa9cfde5ae2224060b09e

> data pack upload
put blob 0d0c669 Datafile
put blob 63443e4 data.csv
put blob 8a2e6f6 Manifest

> rm data.*

> ls
Datafile Manifest

> data pack download
get blob 63443e4 data.csv
copy blob 63443e4 data.txt
copy blob 63443e4 data.xsl

> ls
Datafile Manifest data.csv data.txt data.xsl

> data pack check
data pack: 4 checksums pass

> echo "FOO FOO FOO" > data.csv

> data pack check
data manifest: check 63443e4 data.csv FAIL
data pack: 1/4 checksums failed!

> data pack download
copy blob 63443e4 data.csv

> data pack check
data pack: 4 checksums pass

> data pack publish
data pack: published foo/bar@1.1 (8a2e6f6).
```
-------------------------------------------------------------------------------- /data_blob.go: --------------------------------------------------------------------------------
package data

import (
	"bufio"
	"fmt"
	"github.com/gonuts/flag"
	"github.com/jbenet/commander"
	"io"
	"os"
	"path"
)

var cmd_data_blob = &commander.Command{
	UsageLine: "blob <command>",
	Short:     "Manage blobs in the blobstore.",
	Long: `data blob - Manage blobs in the blobstore.

Commands:

    put <hash> [<file>]     Upload blob named by <hash> to blobstore.
    get <hash> [<file>]     Download blob named by <hash> from blobstore.
    check <hash> [<file>]   Verify blob <file> matches <hash>.
    url <hash>              Output Url for blob named by <hash>.
    show <hash>             Output blob contents for hash.
    hash <file>             Output hash for blob contents.

Arguments:

    The <hash> argument is the blob's checksum, and its id.
    The <file> argument is the blob's target file.
    If <file> is omitted, stdin/stdout are used.


What is a blob?

Datasets are made up of files, which are made up of blobs.
(For now, 1 file is 1 blob. Chunking to be implemented)
Blobs are basically blocks of data, which are checksummed
(for integrity, de-duplication, and addressing) using a crypto-
graphic hash function (sha1, for now). If git comes to mind,
that's exactly right.

Local Blobstores

data stores blobs in blobstores. Every local dataset has a
blobstore (local caching with links TBI). Like in git, the blobs
are stored safely in the blobstore (different directory) and can
be used to reconstruct any corrupted/deleted/modified dataset files.

Remote Blobstores

data uses remote blobstores to distribute datasets across users.
The datadex service includes a blobstore (currently an S3 bucket).
By default, the global datadex blobstore is where things are
uploaded to and retrieved from.

Since blobs are uniquely identified by their hash, maintaining one
global blobstore helps reduce data redundancy. However, users can
run their own datadex service. (The index and blobstore are tied
together to ensure consistency.
Please do not publish datasets to
an index if blobs aren't in that index)

data can use any remote blobstore you wish. (For now, you have to
recompile, but in the future, you will be able to.) Just change the
datadex configuration variable, or pass in "-s <url>" per command.

(data-blob is part of the plumbing, lower-level tools.
Use it directly if you know what you're doing.)
`,

	Flag: *flag.NewFlagSet("data-blob", flag.ExitOnError),

	Subcommands: []*commander.Command{
		cmd_data_blob_put,
		cmd_data_blob_get,
		cmd_data_blob_url,
		cmd_data_blob_show,
		cmd_data_blob_hash,
		cmd_data_blob_check,
	},
}

var cmd_data_blob_put = &commander.Command{
	UsageLine: "put <hash> [<file>]",
	Short:     "Upload blobs to a remote blobstore.",
	Long: `data blob put - Upload blobs to a remote blobstore.

Upload the blob contents named by <hash> to a remote blobstore.
Blob contents are stored locally, to be used to reconstruct files.
In the future, the blobstore will be able to be changed. For now,
the default blobstore/datadex is used.

See data blob.

Arguments:

    <hash>   name (cryptographic hash, checksum) of the blob.
    <file>   path of the blob contents to upload.

`,
	Run:  blobPutCmd,
	Flag: *flag.NewFlagSet("data-blob-put", flag.ExitOnError),
}

var cmd_data_blob_get = &commander.Command{
	UsageLine: "get <hash> [<file>]",
	Short:     "Download blobs from a remote blobstore.",
	Long: `data blob get - Download blobs from a remote blobstore.

Download the blob contents named by <hash> from a remote blobstore.
Blob contents are stored locally, to be used to reconstruct files.
In the future, the blobstore will be able to be changed. For now,
the default blobstore/datadex is used.

See data blob.

Arguments:

    <hash>   name (cryptographic hash, checksum) of the blob.
    <file>   path to put the blob contents in.

`,
	Run:  blobGetCmd,
	Flag: *flag.NewFlagSet("data-blob-get", flag.ExitOnError),
}

var cmd_data_blob_url = &commander.Command{
	UsageLine: "url <hash>",
	Short:     "Output Url for blob named by <hash>.",
	Long: `data blob url - Output Url for blob named by <hash>.

Output the remote storage url for the blob contents named by <hash>.
In the future, the blobstore will be able to be changed. For now,
the default blobstore/datadex is used.

See data blob.

Arguments:

    <hash>   name (cryptographic hash, checksum) of the blob.

`,
	Run:  blobUrlCmd,
	Flag: *flag.NewFlagSet("data-blob-url", flag.ExitOnError),
}

var cmd_data_blob_show = &commander.Command{
	UsageLine: "show <hash>",
	Short:     "Output blob contents for hash.",
	Long: `data blob show - Output blob contents for hash.

Output the blob contents stored in the blobstore for <hash>.
If the blob is available locally, that copy is used (after
hashing to verify correctness). Otherwise, it is downloaded
from the blobstore.

See data blob.

Arguments:

    <hash>   name (cryptographic hash, checksum) of the blob.

`,
	Run: blobShowCmd,
}

var cmd_data_blob_hash = &commander.Command{
	UsageLine: "hash <file>",
	Short:     "Output hash for blob contents.",
	Long: `data blob hash - Output hash for blob contents.

Output the hash of the blob contents stored in <file>.

See data blob.

Arguments:

    <file>   path of the blob contents.

`,
	Run: blobHashCmd,
}

var cmd_data_blob_check = &commander.Command{
	UsageLine: "check <hash> <file>",
	Short:     "Verify blob <file> matches <hash>.",
	Long: `data blob check - Verify blob <file> matches <hash>.

Verify the hash of the blob contents stored in <file> matches
<hash>.

See data blob.

Arguments:

    <hash>   name (cryptographic hash, checksum) of the blob.
    <file>   path of the blob contents.

`,
	Run: blobCheckCmd,
}

func init() {
	cmd_data_blob.Flag.Bool("all", false, "all available blobs")
	cmd_data_blob_get.Flag.Bool("all", false, "get all available blobs")
	cmd_data_blob_put.Flag.Bool("all", false, "put all available blobs")
	cmd_data_blob_url.Flag.Bool("all", false, "urls for all available blobs")
	cmd_data_blob_check.Flag.Bool("all", false, "check all available blobs")
}

type blobStore interface {
	Has(key string) (bool, error)
	Put(key string, value io.Reader) error
	Get(key string) (io.ReadCloser, error)
	Url(key string) string
}

// map { path : hash } (backward because of dup hashes)
type blobPaths map[string]string

// Handles arguments and dispatches subcommand.
func blobCmd(c *commander.Command, args []string) (blobPaths, error) {

	blobs := blobPaths{}

	// Use all blobs in the manifest if --all is passed in.
	all := c.Flag.Lookup("all").Value.Get().(bool)
	if all {
		mf := NewDefaultManifest()
		blobs = validBlobHashes(mf.Files)
		if len(blobs) < 1 {
			return nil, fmt.Errorf("%v: no blobs tracked in manifest.", c.FullName())
		}
	} else {
		switch len(args) {
		case 2:
			blobs[args[1]] = args[0]
		case 1:
			blobs[""] = args[0]
		case 0:
			return nil,
				fmt.Errorf("%v: requires <hash> argument (or --all)", c.FullName())
		}
	}

	return blobs, nil
}

func blobGetCmd(c *commander.Command, args []string) error {
	blobs, err := blobCmd(c, args)
	if err != nil {
		return err
	}
	return getBlobs(blobs)
}

func blobPutCmd(c *commander.Command, args []string) error {
	blobs, err := blobCmd(c, args)
	if err != nil {
		return err
	}
	return putBlobs(blobs)
}

func blobUrlCmd(c *commander.Command, args []string) error {
	blobs, err := blobCmd(c, args)
	if err != nil {
		return err
	}
	return urlBlobs(blobs)
}

func blobShowCmd(c *commander.Command, args []string) error {
	if len(args) < 1 {
		return fmt.Errorf("%v: requires <hash> argument", c.FullName())
	}

	hash := args[0]
	if !IsHash(hash) {
		return fmt.Errorf("%v: invalid hash '%s'", c.FullName(), hash)
	}

	dataIndex, err := NewMainDataIndex()
	if err != nil {
		return err
	}

	return dataIndex.copyBlob(hash, os.Stdout)
}

func blobHashCmd(c *commander.Command, args []string) error {
	if len(args) < 1 {
		return fmt.Errorf("%v: requires <file> argument", c.FullName())
	}

	hash, err := hashFile(args[0])
	if err != nil {
		return err
	}
	pOut("%s\n", hash)
	return nil
}

func blobCheckCmd(c *commander.Command, args []string) error {
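	// Resolve the <hash> <file> arguments (or --all) into a path:hash map,
	// then verify each blob's checksum.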
	blobs, err := blobCmd(c, args)
	if err != nil {
		return err
	}
	if len(args) == 1 {
		return fmt.Errorf("%v: requires <file> argument", c.FullName())
	}
	return checkBlobs(blobs)
}

// Uploads all blobs to blobstore
func putBlobs(blobs blobPaths) error {
	blobs = validBlobHashes(blobs)

	dataIndex, err := NewMainDataIndex()
	if err != nil {
		return err
	}

	// flip map, to skip dupes
	flipped := map[string]string{}
	for path, hash := range blobs {
		flipped[hash] = path
	}

	for hash, path := range flipped {
		err = dataIndex.putBlob(hash, path)
		if err != nil {
			return err
		}
	}

	return nil
}

// Downloads all blobs from blobstore
func getBlobs(blobs blobPaths) error {
	blobs = validBlobHashes(blobs)

	dataIndex, err := NewMainDataIndex()
	if err != nil {
		return err
	}

	// group map, to copy dupes
	grouped := map[string][]string{}
	for path, hash := range blobs {
		g := grouped[hash]
		grouped[hash] = append(g, path)
	}

	for hash, paths := range grouped {

		// download one blob
		err = dataIndex.getBlob(hash, paths[0])
		if err != nil {
			return err
		}

		// copy what we got to others
		for _, path := range paths[1:] {
			pErr("copy blob %.7s %s\n", hash, path)
			err := copyFile(paths[0], path)
			if err != nil {
				return err
			}
		}
	}

	return nil
}

// Shows all urls for blobs
func urlBlobs(blobs blobPaths) error {
	blobs = validBlobHashes(blobs)

	dataIndex, err := NewMainDataIndex()
	if err != nil {
		return err
	}

	for _, hash := range blobs {
		pErr("%v\n", dataIndex.urlBlob(hash))
	}

	return nil
}

// Checks all blob hashes.
func checkBlobs(blobs blobPaths) error {

	failures := 0
	for fpath, hash := range blobs {
		pass, err := checkBlob(hash, fpath)
		if err != nil {
			return err
		}

		if !pass {
			failures++
		}
	}

	count := len(blobs)
	if failures > 0 {
		return fmt.Errorf("data blob: %v/%v checksums failed!", failures, count)
	}

	pOut("data blob: %v checksums pass.\n", count)
	return nil
}

func checkBlob(oldHash string, fpath string) (bool, error) {
	mfmt := "check %.7s %s %s"

	newHash, err := hashFile(fpath)
	if err != nil {
		switch err.(type) {
		case *os.PathError:
			// non-existent files count as not hashing correctly.
			pErr(mfmt, oldHash, fpath, "FAIL - not found\n")
			return false, nil
		default:
			return false, err
		}
	}

	if newHash != oldHash {
		pErr(mfmt, oldHash, fpath, "FAIL\n")
		return false, nil
	}

	dOut(mfmt, oldHash, fpath, "PASS\n")
	return true, nil
}

// DataIndex extension to handle putting blob
func (i *DataIndex) putBlob(hash string, fpath string) error {

	// disallow empty paths
	// (stdin doesn't make sense when hashing must have already occurred)
	if len(fpath) == 0 {
		return fmt.Errorf("put blob %.7s - error: no path supplied", hash)
	}

	fpath = path.Clean(fpath)

	// first, check the blobstore doesn't already have it.
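	// (hasBlob delegates to BlobStore.Has, keyed by BlobKey(hash); see below.)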
	exists, err := i.hasBlob(hash)
	if err != nil {
		return err
	}

	if exists {
		pErr("put blob %.7s %s - exists\n", hash, fpath)
		return nil
	}

	// must verify hash before uploading (for integrity).
	// (note that there is a TOCTTOU bug here, so not safe. just helps.)
	vh, err := hashFile(fpath)
	if err != nil {
		return err
	}

	if vh != hash {
		m := "put blob: %s hash error (expected %s, got %s)"
		return fmt.Errorf(m, fpath, hash, vh)
	}

	pErr("put blob %.7s %s - uploading\n", hash, fpath)

	f, err := os.Open(fpath)
	if err != nil {
		return err
	}
	defer f.Close()

	bf := bufio.NewReader(f)
	err = i.BlobStore.Put(BlobKey(hash), bf)
	if err != nil {
		return err
	}

	err = f.Close()
	if err != nil {
		return err
	}

	return nil
}

// DataIndex extension to handle getting blob
func (i *DataIndex) getBlob(hash string, fpath string) error {

	// disallow empty paths
	if len(fpath) == 0 {
		return fmt.Errorf("get blob %.7s - error: no path supplied", hash)
	}

	fpath = path.Clean(fpath)

	pErr("get blob %.7s %s\n", hash, fpath)
	w, err := createFile(fpath)
	if err != nil {
		return err
	}
	defer w.Close()

	return i.copyBlob(hash, w)
}

func (i *DataIndex) copyBlob(hash string, w io.WriteCloser) error {
	r, err := i.findBlob(hash)
	if err != nil {
		return err
	}

	br := bufio.NewReader(r)
	_, err = io.Copy(w, br)
	if err != nil {
		return err
	}

	err = w.Close()
	if err != nil {
		return err
	}

	err = r.Close()
	if err != nil {
		return err
	}

	return nil
}

func (i *DataIndex) findBlob(hash string) (io.ReadCloser, error) {

	mf := NewDefaultManifest()
	paths := mf.PathsForHash(hash)
	for _, p := range paths {
		dOut("found local blob copy. verifying hash. %s\n", p)
		h, err := hashFile(p)
		if err != nil {
			continue
		}

		if hash == h {
			f, err := os.Open(p)
			if err != nil {
				continue
			}

			return f, nil
		}
	}

	dOut("no local blob copy. fetch from remote blobstore.\n")
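	// (Fall back to the remote blobstore; Get returns an io.ReadCloser
	// streaming the blob contents, per the blobStore interface above.)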
	return i.BlobStore.Get(BlobKey(hash))
}

// DataIndex extension to check if blob exists
func (i *DataIndex) hasBlob(hash string) (bool, error) {
	return i.BlobStore.Has(BlobKey(hash))
}

// DataIndex extension to handle getting blob url
func (i *DataIndex) urlBlob(hash string) string {
	return i.BlobStore.Url(BlobKey(hash))
}

// Returns all paths associated with blob
func allBlobPaths(hash string) ([]string, error) {
	mf := NewDefaultManifest()

	paths := mf.PathsForHash(hash)

	mfh, err := mf.ManifestHash()
	if err != nil {
		return []string{}, err
	}

	if mfh == hash {
		paths = append(paths, mf.Path)
	}

	return paths, nil
}

// Returns the blobstore key for blob
func BlobKey(hash string) string {
	return fmt.Sprintf("/blob/%s", hash)
}

// Prune out invalid blob paths (bad hashes, bad paths)
func validBlobHashes(blobs blobPaths) blobPaths {
	pruned := blobPaths{}
	for fpath, hash := range blobs {
		if IsHash(hash) {
			pruned[fpath] = hash
		}
	}
	return pruned
}
--------------------------------------------------------------------------------