Permission is hereby granted, free of charge, to any person obtaining a copy
13 | of this software and associated documentation files (the "Software"), to deal
14 | in the Software without restriction, including without limitation the rights
15 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 | copies of the Software, and to permit persons to whom the Software is
17 | furnished to do so, subject to the following conditions:
18 |
The above copyright notice and this permission notice shall be included in
19 | all copies or substantial portions of the Software.
20 |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 | THE SOFTWARE.
27 |
28 |
29 |
--------------------------------------------------------------------------------
/platforms/installers/osx/distribution.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | data -- dataset package manager
4 | io.datadex
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 | data.pkg
16 |
18 |
19 |
20 |
21 |
22 |
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/data_list.go:
--------------------------------------------------------------------------------
1 | package data
2 |
3 | import (
4 | "github.com/jbenet/commander"
5 | "io/ioutil"
6 | "path"
7 | )
8 |
9 | var cmd_data_list = &commander.Command{
10 | UsageLine: "list ",
11 | Short: "List installed datasets.",
12 | Long: `data list - List insalled datasets.
13 |
14 | Returns all the datasets installed in the dataset working directory,
15 | end exits.
16 | `,
17 | Run: listCmd,
18 | }
19 |
20 | func listCmd(*commander.Command, []string) error {
21 | return listDatasets(DatasetDir)
22 | }
23 |
// listDatasets prints the handle of every dataset installed under dir.
//
// Expected layout is dir/<author>/<dataset>/Datafile. Both directory
// levels are scanned; hidden entries (leading '.') are skipped.
// Unreadable author directories are silently skipped, and Datafile
// load errors are reported to stderr without aborting the listing.
func listDatasets(dir string) error {
	authors, err := ioutil.ReadDir(dir)

	if err != nil {
		pErr("data: error reading dataset directory \"%s\"\n", dir)
		return err
	}

	// for each author dir
	for _, a := range authors {
		// skip hidden files
		if a.Name()[0] == '.' {
			continue
		}

		author := path.Join(dir, a.Name())
		datasets, err := ioutil.ReadDir(author)
		if err != nil {
			continue
		}

		// for each dataset dir
		for _, d := range datasets {
			// skip hidden files
			if d.Name()[0] == '.' {
				continue
			}

			// dataset handle is <author>/<name>; load its Datafile to
			// print the canonical handle recorded inside it.
			dataset := path.Join(a.Name(), d.Name())
			datafile, err := NewDatafile(DatafilePath(dataset))
			if err != nil {
				pErr("Error: %s\n", err)
				continue
			}

			pOut("%s\n", datafile.Dataset)
		}
	}

	return nil
}
65 |
--------------------------------------------------------------------------------
/dev/changelog.md:
--------------------------------------------------------------------------------
1 | # data changelog
2 |
3 | ## v0.1.1 2014-02-05
4 |
5 | - data help: groups commands
6 | - publish guide messages
7 | - default dataset id to cwd basename
8 | - changed Manifest -> .data/Manifest filename
- data get: install path is the dataset handle
10 | - data get: no littering if not found
11 | - data blob: creates dir(path)
12 | - data config flexibility
13 | - semver support
14 |
15 |
16 | ## v0.1.0 2014-01-21
17 |
18 | First preview (alpha)
19 |
20 | - release builds
21 | - data commands (for reference)
22 | - data pack make -- Datafile defaults
23 | - datadex api suffix
24 | - data blob put -- verify hash
25 | - data blob {hash, check}
26 | - datadex interop
27 | - data config: env var, --edit
28 | - s3 token based auth for uploading
29 | - s3 anonymous downloading
30 |
31 | ## v0.0.5 2014-01-09
32 |
33 | Publishing + downloading packages.
34 |
35 | - data pack publish
36 | - data publish
37 | - data get (using pack)
38 | - data user {add, auth, pass, info, url}
39 | - data config
40 |
41 | ## v0.0.4 2014-01-03
42 |
43 | Manifest manipulation and packaging.
44 |
45 | - data manifest {add, rm, hash, check}
46 | - data pack {make, manifest, upload, download, check}
47 |
48 | ## v0.0.3 2013-12-13
49 |
50 | Uploading datasets.
51 |
52 | - data manifest (list + hash files)
53 | - data blob (blobs to storage service)
54 |
55 |
56 | ## v0.0.2 2013-11-24
57 |
58 | Downloading datasets.
59 |
60 | - data get (downloads + installs a dataset)
61 |
62 | ## v0.0.1 2013-11-22
63 |
64 | Initial version.
65 |
66 | - command dispatch
67 | - datafile format (yml + structure)
68 | - datafile parsing (loading/dumping)
69 | - data version
70 | - data help (just usage for now)
71 | - data list (show installed datasets)
72 | - data info (loads/dumps dataset's Datafile)
73 |
--------------------------------------------------------------------------------
/platforms/archive.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import os
4 | import re
5 |
6 | # constants
7 | readme_file = 'tar.README.md'
8 | semver_regx = r'^[0-9]+\.[0-9]+\.[0-9]+$' # lacks pre-releases/builds
9 | valid_archs = [
10 | 'darwin_amd64',
11 | 'darwin_386',
12 | 'linux_amd64',
13 | 'linux_386',
14 | 'windows_amd64',
15 | 'windows_386',
16 | ]
17 |
18 |
def check(cond, msg):
    """Exit the whole program with an error message if `cond` is falsy."""
    if not cond:
        print 'Error:', msg
        exit(-1)
def write_readme(output, arch, version):
    """Render ../tar.README.md with `arch` and `version` substituted,
    writing the result to `output`.

    The README is a %-format template keyed on 'arch' and 'version'.
    Called from inside the arch directory, hence the '../' prefix.
    """
    with open(output, 'w') as out:
        with open('../%s' % readme_file) as inp:
            txt = inp.read()
            txt = txt % {'arch': arch, 'version': version}
            out.write(txt)
30 |
31 |
def make_archive(arch, vers):
    """Build data-v<vers>-<arch>.tar.gz from the prebuilt binary at
    <arch>/data and move it into ./archives.

    Returns the archive directory name on success, -1 on validation
    failure. NOTE(review): the return types are inconsistent; the sole
    caller (main) ignores the return value.
    """
    if arch not in valid_archs:
        print "Error: arch '%s' not supported" % arch
        return -1

    if not re.match(semver_regx, vers):
        print "Error: version '%s' is not like X.X.X" % vers
        return -1

    if not os.path.exists('%s/data' % arch):
        print "Error: binary '%s/data' not found" % arch
        return -1

    # move into arch dir
    os.chdir(arch)

    # setup directory
    dir = 'data-v%s-%s' % (vers, arch)
    os.system('mkdir -p %s' % dir)

    # write files (binary + rendered README)
    os.system('cp data %s/data' % dir)
    write_readme('%s/README.md' % dir, arch, vers)

    # tar
    tar = '%s.tar.gz' % dir
    os.system('tar czf %s %s' % (tar, dir))

    # move into place, back at the parent dir
    os.chdir('..')
    os.system('mkdir -p archives')
    os.system('mv %s/%s archives/%s' % (arch, tar, tar))
    os.system('rm -rf %s/%s' % (arch, dir))

    print 'packaged archives/%s' % tar
    return dir
68 |
69 |
def main():
    """CLI entry point: `archive.py <arch> <vers>`.

    <arch> may be 'all' to build archives for every entry in
    valid_archs with the same version.
    """
    import sys
    if '-h' in sys.argv or len(sys.argv) < 3:
        # NOTE(review): the usage string appears to have lost its
        # '<arch> <vers>' placeholders -- confirm against original.
        print 'Usage: %s ' % sys.argv[0]
        print 'Prepares the release archive for a given architecture.'
        exit(0 if '-h' in sys.argv else -1)

    arch = sys.argv[1]
    vers = sys.argv[2]

    archs = valid_archs if arch == 'all' else [arch]

    for arch in archs:
        make_archive(arch, vers)
88 |
--------------------------------------------------------------------------------
/serialize.go:
--------------------------------------------------------------------------------
1 | package data
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "io"
7 | "io/ioutil"
8 | "launchpad.net/goyaml"
9 | "os"
10 | "path"
11 | )
12 |
// SerializedFile binds an in-memory structure (Format) to an on-disk
// YAML representation at Path. The "-" tags exclude both fields from
// serialization (goyaml's string-literal tag syntax).
type SerializedFile struct {
	Path   string      "-"
	Format interface{} "-"
}

// Marshal serializes f.Format to YAML bytes.
func (f *SerializedFile) Marshal() ([]byte, error) {
	dOut("Marshalling %s\n", f.Path)
	return goyaml.Marshal(f.Format)
}

// Unmarshal parses the YAML in buf into f.Format.
func (f *SerializedFile) Unmarshal(buf []byte) error {
	err := goyaml.Unmarshal(buf, f.Format)
	if err != nil {
		return err
	}

	dOut("Unmarshalling %s\n", f.Path)
	return nil
}

// Write serializes f.Format and writes the YAML bytes to w.
func (f *SerializedFile) Write(w io.Writer) error {
	buf, err := f.Marshal()
	if err != nil {
		return err
	}

	_, err = w.Write(buf)
	return err
}

// Read consumes all of r and unmarshals it into f.Format.
// NOTE(review): reads the entire stream into memory.
func (f *SerializedFile) Read(r io.Reader) error {
	buf, err := ioutil.ReadAll(r)
	if err != nil {
		return err
	}

	return f.Unmarshal(buf)
}

// WriteFile serializes f.Format and writes it to f.Path, creating
// parent directories as needed. Fails if Path is unset.
func (f *SerializedFile) WriteFile() error {
	if len(f.Path) < 1 {
		return fmt.Errorf("SerializedFile: No path provided for writing.")
	}

	buf, err := f.Marshal()
	if err != nil {
		return err
	}

	err = os.MkdirAll(path.Dir(f.Path), 0777)
	if err != nil {
		return err
	}

	return ioutil.WriteFile(f.Path, buf, 0666)
}

// ReadFile loads f.Path from disk and unmarshals its contents into
// f.Format. Fails if Path is unset.
func (f *SerializedFile) ReadFile() error {
	if len(f.Path) < 1 {
		return fmt.Errorf("SerializedFile: No path provided for reading.")
	}

	buf, err := ioutil.ReadFile(f.Path)
	if err != nil {
		return err
	}

	return f.Unmarshal(buf)
}
82 |
83 | func (f *SerializedFile) ReadBlob(ref string) error {
84 | i, err := NewMainDataIndex()
85 | if err != nil {
86 | return err
87 | }
88 |
89 | r, err := i.BlobStore.Get(BlobKey(ref))
90 | if err != nil {
91 | return err
92 | }
93 |
94 | err = f.Read(r)
95 | if err != nil {
96 | return err
97 | }
98 |
99 | return nil
100 | }
101 |
102 | func Marshal(in interface{}) (io.Reader, error) {
103 | buf, err := goyaml.Marshal(in)
104 | if err != nil {
105 | return nil, err
106 | }
107 |
108 | // pOut("\n")
109 | // pOut("%s\n", buf)
110 | // pOut("\n")
111 | return bytes.NewReader(buf), nil
112 | }
113 |
114 | func Unmarshal(in io.Reader, out interface{}) error {
115 | buf, err := ioutil.ReadAll(in)
116 | if err != nil {
117 | return err
118 | }
119 |
120 | // pOut("\n")
121 | // pOut("%s\n", buf)
122 | // pOut("\n")
123 | return goyaml.Unmarshal(buf, out)
124 | }
125 |
126 | // Userful for converting between representations
127 | func MarshalUnmarshal(in interface{}, out interface{}) error {
128 | // struct -> yaml -> map for easy access
129 | rdr, err := Marshal(in)
130 | if err != nil {
131 | return err
132 | }
133 |
134 | return Unmarshal(rdr, out)
135 | }
136 |
--------------------------------------------------------------------------------
/data_handle.go:
--------------------------------------------------------------------------------
1 | package data
2 |
3 | import (
4 | "fmt"
5 | "path"
6 | "strings"
7 | )
8 |
// Handle identifies a dataset. Its string form (see Dataset/SetDataset)
// combines all four parts; Format and Version are optional.
// NOTE(review): the format comment here lost its placeholders in
// extraction; SetDataset shows the shape is
// <author>/<name>[.<format>][@<version>].
type Handle struct {
	Author  string // dataset owner id
	Name    string // dataset id
	Format  string // optional format, after '.'
	Version string // optional version, after '@'
}
17 |
18 | // There are problems with goyaml setters/getters.
19 | // Unmarshaling fails.
20 | //
21 | // func (d Handle) GetYAML() (string, interface{}) {
22 | // pOut("GetYAML\n")
23 | // return "", d.string
24 | // }
25 | //
26 | // func (d Handle) SetYAML(tag string, value interface{}) bool {
27 | // s, ok := value.(string)
28 | // d.string = s
29 | // pOut("SetYAML %s %s\n", d.string, &d)
30 | // return ok
31 | // }
32 |
33 | func NewHandle(s string) *Handle {
34 | d := new(Handle)
35 | d.SetDataset(s)
36 | return d
37 | }
38 |
// Dataset returns the full handle string:
// author/name, plus ".format" and "@version" when those parts are set.
func (d *Handle) Dataset() string {
	s := d.Path()

	if len(d.Format) > 0 {
		s = fmt.Sprintf("%s.%s", s, d.Format)
	}

	if len(d.Version) > 0 {
		s = fmt.Sprintf("%s@%s", s, d.Version)
	}

	return s
}

// Path returns the author/name portion of the handle.
func (d *Handle) Path() string {
	return path.Join(d.Author, d.Name)
}

// InstallPath returns where this dataset is installed locally,
// rooted at the dataset working directory.
func (d *Handle) InstallPath() string {
	return path.Join(DatasetDir, d.Dataset())
}
60 |
// SetDataset parses a handle string of the form
// <author>/<name>[.<format>][@<version>] into d's fields.
// Parse order: rsplit @, split /, rsplit .
func (d *Handle) SetDataset(s string) {
	// Reject handles beginning with "/".
	// NOTE(review): the original comment said "no / is invalid", but
	// this condition only catches a *leading* slash; a string with no
	// slash at all falls through and yields an empty Author. Confirm
	// intended behavior before changing.
	if strings.Index(s, "/") == 0 {
		return
	}

	nam_idx := strings.Index(s, "/")
	if nam_idx < 0 {
		nam_idx = 0
	}

	ver_idx := strings.LastIndex(s, "@")
	if ver_idx < 0 {
		ver_idx = len(s) // no version in handle.
	}

	// this precludes names that have periods... use different delimiter?
	// NOTE(review): for an empty s this expression is s[1:0] and
	// panics; callers appear to pass non-empty handles.
	fmt_idx := strings.LastIndex(s[nam_idx+1:ver_idx], ".")
	if fmt_idx < 0 {
		fmt_idx = ver_idx // no format in handle.
	} else {
		fmt_idx += nam_idx + 1
	}

	// parts (slice clamps its indices, so degenerate cases yield "")
	d.Author = slice(s, 0, nam_idx)
	d.Name = slice(s, nam_idx+1, fmt_idx)
	d.Format = slice(s, fmt_idx+1, ver_idx)
	d.Version = slice(s, ver_idx+1, len(s))
}
92 |
// GoString implements fmt.GoStringer (%#v) as the handle string.
func (d *Handle) GoString() string {
	return d.Dataset()
}

// Valid reports whether the handle string matches HandleRegexp.
func (d *Handle) Valid() bool {
	return IsDatasetHandle(d.Dataset())
}
100 |
101 | // utils
102 |
// slice returns s[from:to], clamping both bounds to [0, len(s)] and
// collapsing inverted ranges to "", so out-of-range arguments never
// panic.
func slice(s string, from int, to int) string {
	if from < 0 {
		from = 0
	}
	if to > len(s) {
		to = len(s)
	}
	if from > to {
		from = to
	}
	return s[from:to]
}

// minInt returns the smaller of x and y.
// https://groups.google.com/forum/#!topic/golang-nuts/dbyqx_LGUxM is silly.
func minInt(x, y int) int {
	if y < x {
		return y
	}
	return x
}

// maxInt returns the larger of x and y.
func maxInt(x, y int) int {
	if y > x {
		return y
	}
	return x
}
123 |
// handleError formats a descriptive error for an invalid handle.
func handleError(handle string, problem string) error {
	return fmt.Errorf("Invalid handle (%s): %s", problem, handle)
}

// IsDatasetHandle reports whether str is a well-formed dataset handle.
func IsDatasetHandle(str string) bool {
	return HandleRegexp.MatchString(str)
}
131 |
--------------------------------------------------------------------------------
/datafile.go:
--------------------------------------------------------------------------------
1 | package data
2 |
3 | import (
4 | "path"
5 | )
6 |
7 | /*
8 | # Datafile format
9 | # A YAML (inc json) doc with the following keys:
10 |
11 | # required
12 | handle: /[.][@]
13 | title: Dataset Title
14 |
15 | # optional functionality
16 | dependencies: []
17 | formats: { : }
18 |
19 | # optional information
20 | description: Text describing dataset.
21 | repository:
22 | website:
23 | license:
24 | contributors: ["Author Name [] [(url)]>", ...]
25 | sources: []
26 | */
27 |
// datafileContents is the YAML-serializable body of a Datafile.
// The string tags are goyaml's old literal tag syntax (",omitempty"
// drops empty fields on output).
type datafileContents struct {
	Dataset string // dataset handle: author/name[.format][@version]
	Tagline string // short one-line description

	Mirrors      []string          ",omitempty"
	Dependencies []string          ",omitempty"
	Formats      map[string]string ",omitempty"

	Description  string   ",omitempty"
	Repository   string   ",omitempty"
	Website      string   ",omitempty"
	License      string   ",omitempty"
	Authors      []string ",omitempty"
	Contributors []string ",omitempty"
	Sources      []string ",omitempty"
}

// Datafile is a datafileContents backed by a YAML file on disk
// (embedded SerializedFile handles the I/O; "-" excludes it from
// serialization, ",inline" flattens the contents fields).
type Datafile struct {
	SerializedFile   "-"
	datafileContents ",inline"
}
50 |
// DatasetDir is the directory datasets are installed under;
// DatafileName is the canonical name of a dataset's description file.
const DatasetDir = "datasets"
const DatafileName = "Datafile"

// DatafilePath returns the path to the named dataset's Datafile,
// rooted at the dataset working directory.
func DatafilePath(dataset string) string {
	return path.Join(DatasetDir, dataset, DatafileName)
}
57 |
// NewDatafile constructs a Datafile bound to the given file path and,
// when path is non-empty, loads it from disk immediately.
// On a read error the partially-constructed Datafile is returned
// alongside the error (callers rely on this to create new files).
func NewDatafile(path string) (*Datafile, error) {
	df := &Datafile{SerializedFile: SerializedFile{Path: path}}
	// the SerializedFile (un)marshals into the Datafile itself.
	df.SerializedFile.Format = df

	if len(path) > 0 {
		err := df.ReadFile()
		if err != nil {
			return df, err
		}
	}
	return df, nil
}
70 |
// NewDefaultDatafile loads the Datafile in the current directory.
func NewDefaultDatafile() (*Datafile, error) {
	return NewDatafile(DatafileName)
}

// NewDatafileWithRef constructs a Datafile loaded from the blobstore
// entry named by ref rather than from disk.
func NewDatafileWithRef(ref string) (*Datafile, error) {
	// error ignored: NewDatafile with an empty path performs no I/O.
	f, _ := NewDatafile("")
	err := f.ReadBlob(ref)
	if err != nil {
		return nil, err
	}
	return f, nil
}
83 |
// Handle parses this Datafile's Dataset string into a Handle.
func (d *Datafile) Handle() *Handle {
	return NewHandle(d.Dataset)
}

// Valid reports whether this Datafile's handle is well-formed.
func (d *Datafile) Valid() bool {
	return d.Handle().Valid()
}
91 |
92 | // datafile manipulation utils
93 |
94 | // Return array of all Datafiles
95 | func NewDatafiles(filenames []string) ([]*Datafile, error) {
96 | files := []*Datafile{}
97 | for _, p := range filenames {
98 | f, err := NewDatafile(p)
99 | if err != nil {
100 | return nil, err
101 | }
102 |
103 | files = append(files, f)
104 | }
105 | return files, nil
106 | }
107 |
108 | // group Datafiles { path : [Datafile, ], }
109 | type DatafileGroupMap map[string][]*Datafile
110 |
111 | func GroupedDatafiles(files []*Datafile) *DatafileGroupMap {
112 | grouped := DatafileGroupMap{}
113 |
114 | for _, f := range files {
115 | group := f.Handle().Path()
116 | grouped[group] = append(grouped[group], f)
117 | }
118 |
119 | return &grouped
120 | }
121 |
--------------------------------------------------------------------------------
/dev/cli.md:
--------------------------------------------------------------------------------
1 | ```
2 | data
3 |
4 | version Show data version information.
5 | config Manage data configuration.
6 | info Show dataset information.
7 | list List installed datasets.
8 | get Download and install dataset.
9 | publish Guided dataset publishing.
10 |
11 | user Manage users and credentials.
12 | add Register new user with index.
13 | auth Authenticate user account.
14 | pass Change user password.
15 | info Show (or edit) public user information.
16 | url Output user profile url.
17 |
18 | manifest Generate and manipulate dataset manifest.
19 | add Adds to manifest (does not hash).
20 | rm Removes from manifest.
21 | hash Hashes and adds checksum to manifest.
22 | check Verifies checksum matches manifest.
23 |
24 | pack Dataset packaging, upload, and download.
25 | make Create or update package description.
26 | manifest Show current package manifest.
27 | upload Upload package to remote storage.
28 | download Download package from remote storage.
29 | checksum Verify all file checksums match.
30 |
31 | blob Manage blobs in the blobstore.
32 | put Upload blob named by to blobstore.
33 | get Download blob named by from blobstore.
34 | url Output Url for blob named by .
35 | check Verify blob contents named by match .
36 | show Output blob contents named by .
37 | ```
38 |
39 | git backed (use git internally to manage repository changes)
40 |
41 | ```
42 | data
43 |
44 | version Show data version information.
45 | config Manage data configuration.
46 | info Show dataset information.
47 | list List installed datasets.
48 | get Download and install dataset.
49 | publish Guided dataset publishing.
50 |
51 | user Manage users and credentials.
52 | add Register new user with index.
53 | auth Authenticate user account.
54 | pass Change user password.
55 | info Show (or edit) public user information.
56 | url Output user profile url.
57 |
58 | pack Dataset packaging, upload, and download.
59 | add Add contents to package staging.
60 | rm Removes contents from package staging.
61 | status Show the working tree status.
62 | commit Record changes to package repository.
63 | upload Upload package to remote storage and post to index.
64 | download Download package from remote storage.
65 | checksum Verify all file checksums match.
66 |
67 | blob Manage blobs in the blobstore (unaware of pack)
68 | put Upload blob named from to blobstore.
69 | get Download blob named from blobstore to .
70 | check Verify blob contents in match .
71 | url Output Url for blob named by .
72 | show Output blob contents named by .
73 | ```
74 |
--------------------------------------------------------------------------------
/s3store.go:
--------------------------------------------------------------------------------
1 | package data
2 |
3 | import (
4 | "fmt"
5 | "github.com/jbenet/s3"
6 | "github.com/jbenet/s3/s3util"
7 | "io"
8 | "strings"
9 | )
10 |
// S3Store reads and writes blobs in a single S3 bucket.
type S3Store struct {
	bucket string         // S3 bucket name
	domain string         // S3 endpoint domain
	config *s3util.Config // holds service + auth keys

	// used for auth credentials
	dataIndex *DataIndex
}

// AwsCredentials holds temporary AWS credentials.
// format from `aws sts` cmd
type AwsCredentials struct {
	SecretAccessKey string
	SessionToken    string
	AccessKeyId     string
}
26 |
27 | func NewS3Store(bucket string, index *DataIndex) (*S3Store, error) {
28 |
29 | if len(bucket) < 1 {
30 | return nil, fmt.Errorf("Invalid (empty) S3 Bucket name.")
31 | }
32 |
33 | if index == nil {
34 | return nil, fmt.Errorf("Invalid (nil) DataIndex.")
35 | }
36 |
37 | s := &S3Store{
38 | bucket: bucket,
39 | domain: "s3.amazonaws.com",
40 | dataIndex: index,
41 | }
42 |
43 | s.config = &s3util.Config{
44 | Service: s3.DefaultService,
45 | Keys: new(s3.Keys),
46 | }
47 |
48 | return s, nil
49 | }
50 |
// SetAwsCredentials installs c's keys/token into the store's s3 config.
func (s *S3Store) SetAwsCredentials(c *AwsCredentials) {
	s.config.AccessKey = c.AccessKeyId
	s.config.SecretKey = c.SecretAccessKey
	s.config.SecurityToken = c.SessionToken

	// pOut("Got Aws Credentials:\n")
	// pOut("  AccessKey: %s\n", s.config.AccessKey)
	// pOut("  SecretKey: %s\n", s.config.SecretKey)
	// pOut("  SessToken: %s\n\n", s.config.SecurityToken)
}

// AwsCredentials returns the currently installed credentials, or nil
// if none have been set (empty AccessKey is treated as unset).
func (s *S3Store) AwsCredentials() *AwsCredentials {
	if s.config == nil || len(s.config.AccessKey) == 0 {
		return nil
	}

	return &AwsCredentials{
		AccessKeyId:     s.config.AccessKey,
		SecretAccessKey: s.config.SecretKey,
		SessionToken:    s.config.SecurityToken,
	}
}
73 |
74 | func (s *S3Store) Url(key string) string {
75 | if !strings.HasPrefix(key, "/") {
76 | key = "/" + key
77 | }
78 | return fmt.Sprintf("http://%s.%s%s", s.bucket, s.domain, key)
79 | }
80 |
// Has reports whether key exists in the bucket, by attempting to open
// it. A 404 yields (false, nil); other errors are returned.
func (s *S3Store) Has(key string) (bool, error) {
	url := s.Url(key)
	rc, err := s3util.Open(url, s.config)

	if err == nil {
		rc.Close()
		return true, nil
	}

	// NOTE(review): distinguishing 404s by substring-matching the
	// error message is fragile; it depends on s3util's exact wording.
	if strings.Contains(err.Error(), "unwanted http status 404:") {
		return false, nil
	}

	return false, err
}
96 |
97 | func (s *S3Store) Put(key string, value io.Reader) error {
98 | err := s.ensureUserAwsCredentials()
99 | if err != nil {
100 | return fmt.Errorf("aws credentials error: %v", err)
101 | }
102 |
103 | url := s.Url(key)
104 | w, err := s3util.Create(url, nil, s.config)
105 | if err != nil {
106 | return err
107 | }
108 |
109 | _, err = io.Copy(w, value)
110 | if err != nil {
111 | return err
112 | }
113 |
114 | err = w.Close()
115 | if err != nil {
116 | return err
117 | }
118 |
119 | return nil
120 | }
121 |
// Get opens the blob stored under key for reading.
// The caller must Close the returned reader.
func (s *S3Store) Get(key string) (io.ReadCloser, error) {
	url := s.Url(key)
	return s3util.Open(url, s.config)
}

// getUserAwsCredentials fetches upload credentials for the configured
// (signed-in) user from the data index and installs them.
func (s *S3Store) getUserAwsCredentials() error {
	u := configUser()
	if !isNamedUser(u) {
		return fmt.Errorf("must be signed in to request aws credentials")
	}

	ui := s.dataIndex.NewUserIndex(u)
	c, err := ui.AwsCred()
	if err != nil {
		return err
	}

	s.SetAwsCredentials(c)
	return nil
}

// ensureUserAwsCredentials fetches credentials only when none are
// already installed.
func (s *S3Store) ensureUserAwsCredentials() error {
	// if we already have credentials, do nothing.
	if s.AwsCredentials() != nil {
		return nil
	}

	return s.getUserAwsCredentials()
}
151 |
--------------------------------------------------------------------------------
/field_user_input.go:
--------------------------------------------------------------------------------
1 | package data
2 |
3 | import (
4 | "os"
5 | "regexp"
6 | "strings"
7 | )
8 |
// InputField describes one interactively prompted value: the prompt
// text, a pointer to the value being edited (shown as the default and
// overwritten on input), an optional validation regexp, and help text
// shown on invalid input.
type InputField struct {
	Prompt  string
	Value   *string
	Pattern *regexp.Regexp
	Help    string
}
15 |
16 | func ensureDatafileInPath(path string) error {
17 | _, err := os.Stat(path)
18 | if err == nil {
19 | return nil
20 | }
21 |
22 | // if it doesn't exist, create it.
23 | f, err := os.Create(path)
24 | defer f.Close()
25 |
26 | return nil
27 | }
28 |
29 | func fillOutDatafileInPath(path string) error {
30 |
31 | err := ensureDatafileInPath(path)
32 | if err != nil {
33 | return err
34 | }
35 |
36 | df, err := NewDatafile(path)
37 | if err != nil {
38 | return err
39 | }
40 |
41 | return fillOutDatafile(df)
42 | }
43 |
// fillOutDatafile interactively prompts for each Datafile field
// (handle parts, tagline, description, license), writing the file to
// disk after every answer once the handle becomes valid — so partial
// progress survives an aborted session.
func fillOutDatafile(df *Datafile) error {
	pOut("Writing Datafile fields...\n")
	pOut("'Field description [current value]'\n")

	// Edit the handle parts through pointers into a parsed Handle,
	// then re-serialize it into df.Dataset below.
	h := df.Handle()
	fields := []InputField{
		InputField{
			"owner id (required)",
			&h.Author,
			UserRegexp,
			"Must be a valid username. Can only contain [a-z0-9-_.].",
		},
		InputField{
			"dataset id (required)",
			&h.Name,
			IdentRegexp,
			"Must be a valid dataset id. Can only contain [a-z0-9-_.].",
		},
		InputField{
			"dataset version (required)",
			&h.Version,
			IdentRegexp,
			"Must be a valid version. Can only contain [a-z0-9-_.].",
		},
		InputField{"tagline description (required)", &df.Tagline, nil,
			`A tagline is required to describe your package to others.
Good taglines are like titles: short, descriptive phrases.`},
		InputField{"long description (optional)", &df.Description, nil, ""},
		InputField{"license name (optional)", &df.License, nil, ""},
	}

	for _, field := range fields {
		err := fillOutField(field)
		if err != nil {
			return err
		}

		// save progress after each field, once the handle is valid.
		df.Dataset = h.Dataset()
		if df.Valid() {
			err = df.WriteFile()
			if err != nil {
				return err
			}
		}
	}

	return nil
}
92 |
// fillOutField prompts for a single field until valid input is given.
// The current value (*f.Value) is shown as the default; entering
// nothing keeps it if it already validates. A field is "required"
// when its Prompt contains the word "required".
func fillOutField(f InputField) error {

	// validator function
	valid := func(val string) bool {
		if strings.Contains(f.Prompt, "required") && len(val) < 1 {
			return false
		}

		if f.Pattern != nil && !f.Pattern.MatchString(val) {
			return false
		}

		return true
	}

	for {
		pOut("Enter %s [%s]: ", f.Prompt, *f.Value)
		line, err := readInput()
		if err != nil {
			return err
		}

		// if not required, and entered nothing, get out.
		if len(line) == 0 && valid(*f.Value) {
			break
		}

		// if valid input
		if valid(line) {
			*f.Value = line
			break
		}

		// invalid: show field-specific help if available, then re-prompt.
		if len(f.Help) > 0 {
			pOut("  Error: %s\n", f.Help)
		} else {
			pOut("  Error: Invalid input.\n")
		}
	}

	dOut("entered: %s\n", *f.Value)
	return nil
}
136 |
137 | func fillOutUserProfile(p *UserProfile) error {
138 | pOut("Editing user profile. [Current value].\n")
139 |
140 | fields := []InputField{
141 | InputField{"Full Name", &p.Name, nil, ""},
142 | // "Email (required)": &p.Email,
143 | InputField{"Website Url", &p.Website, nil, ""},
144 | InputField{"Github username", &p.Github, nil, ""},
145 | InputField{"Twitter username", &p.Twitter, nil, ""},
146 | }
147 |
148 | for _, f := range fields {
149 | err := fillOutField(f)
150 | if err != nil {
151 | return err
152 | }
153 | }
154 |
155 | return nil
156 | }
157 |
--------------------------------------------------------------------------------
/data_index.go:
--------------------------------------------------------------------------------
1 | package data
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | "io/ioutil"
7 | "net/http"
8 | "strings"
9 | )
10 |
// DataIndex is a remote dataset index plus its associated blob storage.
type DataIndex struct {
	Name string      // index name (e.g. "datadex")
	Http *HttpClient // authenticated api client for the index

	// For now, use S3Store directly. clean up interface later.
	// BlobStore blobStore
	BlobStore *S3Store
}
19 |
// mainDataIndex caches the singleton returned by NewMainDataIndex.
var mainDataIndex *DataIndex

const mainIndexName = "datadex"

// NewMainDataIndex lazily constructs (and memoizes) the main datadex
// index, with its http client and S3-backed blob store.
//
// why not use `func init()`? some commands don't need an index
// is annoying to error out on an S3 key when S3 isn't needed.
func NewMainDataIndex() (*DataIndex, error) {
	if mainDataIndex != nil {
		return mainDataIndex, nil
	}

	i := &DataIndex{Name: mainIndexName}
	err := error(nil)

	i.Http, err = NewHttpClient(i.Name)
	if err != nil {
		return nil, err
	}

	i.BlobStore, err = NewS3Store("datadex.archives", i)
	if err != nil {
		return nil, err
	}

	mainDataIndex = i
	return mainDataIndex, nil
}
47 |
// Custom headers carrying index auth, and the api url layout.
const HttpHeaderUser = "X-Data-User"
const HttpHeaderToken = "X-Data-Token"
const HttpHeaderContentType = "Content-Type"
const HttpHeaderContentTypeYaml = "application/yaml"
const ApiUrlSuffix = "/api/v1"

// Controls authenticated http accesses.
type HttpClient struct {
	BaseUrl   string // index url without the api suffix
	Url       string // BaseUrl + ApiUrlSuffix
	User      string // authenticated username, sent as X-Data-User
	AuthToken string // auth token, sent as X-Data-Token
}
61 |
// NewHttpClient builds a client for the named index from its config
// entry (url, user, token), normalizing the url to include an http
// protocol prefix and the api suffix.
func NewHttpClient(index string) (*HttpClient, error) {
	i, err := configGetIndex(index)
	if err != nil {
		return nil, err
	}

	h := &HttpClient{
		BaseUrl:   strings.ToLower(i["url"]),
		User:      i["user"],
		AuthToken: i["token"],
	}

	// ensure url has protocol prefix
	if !strings.HasPrefix(h.BaseUrl, "http://") &&
		!strings.HasPrefix(h.BaseUrl, "https://") {
		h.BaseUrl = "http://" + h.BaseUrl
	}
	h.Url = h.BaseUrl

	// ensure url has api suffix
	if !strings.HasSuffix(strings.ToLower(h.Url), ApiUrlSuffix) {
		h.Url = h.Url + ApiUrlSuffix
	}

	return h, nil
}
88 |
// SubUrl returns the api url for the given sub-path.
func (h HttpClient) SubUrl(path string) string {
	return h.Url + "/" + path
}

// Get issues an authenticated GET to the api path.
func (h *HttpClient) Get(path string) (*http.Response, error) {
	dOut("http index get %s\n", h.SubUrl(path))

	req, err := http.NewRequest("GET", h.SubUrl(path), nil)
	if err != nil {
		return nil, err
	}

	// auth headers are sent on every request.
	req.Header.Add(HttpHeaderToken, h.AuthToken)
	req.Header.Add(HttpHeaderUser, h.User)
	return h.DoRequest(req)
}
105 |
// Post issues an authenticated POST to the api path. A non-nil body
// is marshalled to YAML and sent with the yaml content type.
func (h *HttpClient) Post(path string, body interface{}) (*http.Response, error) {
	dOut("http index post %s\n", h.SubUrl(path))

	rdr := io.Reader(nil)
	var err error
	if body != nil {
		rdr, err = Marshal(body)
		if err != nil {
			return nil, err
		}
	}

	req, err := http.NewRequest("POST", h.SubUrl(path), rdr)
	if err != nil {
		return nil, err
	}

	req.Header.Add(HttpHeaderContentType, HttpHeaderContentTypeYaml)
	req.Header.Add(HttpHeaderToken, h.AuthToken)
	req.Header.Add(HttpHeaderUser, h.User)
	return h.DoRequest(req)
}
128 |
// DoRequest executes req, returning the response for status codes in
// [200, 400) — note this treats 3xx as success — and otherwise
// converts the response body into an error. On the error path the
// body is drained and closed; on success the caller must close it.
func (h *HttpClient) DoRequest(req *http.Request) (*http.Response, error) {
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}

	c := resp.StatusCode
	if 200 <= c && c < 400 {
		return resp, nil
	}

	// read error: best-effort; body text becomes part of the message.
	e, _ := ioutil.ReadAll(resp.Body)
	resp.Body.Close()

	s := strings.TrimSpace(string(e[:]))
	return nil, fmt.Errorf("HTTP error status code: %d (%s)", c, s)
}
146 |
--------------------------------------------------------------------------------
/commands.go:
--------------------------------------------------------------------------------
1 | package data
2 |
3 | import (
4 | "github.com/gonuts/flag"
5 | "github.com/jbenet/commander"
6 | "strings"
7 | "time"
8 | )
9 |
// Cmd_data is the root `data` command: its Long text is the top-level
// help page, and all subcommands hang off it.
// NOTE(review): the UsageLine placeholders appear to have been lost
// in extraction (likely `data <command> [<args>]`).
var Cmd_data = &commander.Command{
	UsageLine: "data [] []",
	Short:     "dataset package manager",
	Long: `data - dataset package manager

Basic commands:

get Download and install dataset.
list List installed datasets.
info Show dataset information.
publish Guided dataset publishing.

Tool commands:

version Show data version information.
config Manage data configuration.
user Manage users and credentials.
commands List all available commands.

Advanced Commands:

blob Manage blobs in the blobstore.
manifest Generate and manipulate dataset manifest.
pack Dataset packaging, upload, and download.

Use "data help " for more information about a command.
`,
	Run: dataCmd,
	Subcommands: []*commander.Command{
		cmd_data_version,
		cmd_data_config,
		cmd_data_info,
		cmd_data_list,
		cmd_data_get,
		cmd_data_manifest,
		cmd_data_pack,
		cmd_data_blob,
		cmd_data_publish,
		cmd_data_user,
		cmd_data_commands,
	},
	Flag: *flag.NewFlagSet("data", flag.ExitOnError),
}
53 |
// dataCmd runs bare `data`: prints the top-level help text.
func dataCmd(c *commander.Command, args []string) error {
	pOut(c.Long)
	return nil
}

// cmd_root aliases the root command for use by subcommand files.
var cmd_root *commander.Command

func init() {
	// this funky alias is to resolve cyclical decl references.
	cmd_root = Cmd_data
}
65 |
// cmd_data_commands implements 'data commands', which recursively lists
// the full names of all registered commands (see commandsCmd).
var cmd_data_commands = &commander.Command{
	UsageLine: "commands",
	Short: "List all available commands.",
	Long: `data commands - List all available commands.

Lists all available commands (and sub-commands) and exits.
`,
	Run: commandsCmd,
	Subcommands: []*commander.Command{
		cmd_data_commands_help,
	},
}
78 |
79 | var cmd_data_commands_help = &commander.Command{
80 | UsageLine: "help",
81 | Short: "List all available commands' help pages.",
82 | Long: `data commands help - List all available commands's help pages.
83 |
84 | Shows the pages of all available commands (and sub-commands) and exits.
85 | Outputs a markdown document, also viewable at http://datadex.io/doc/ref
86 | `,
87 | Run: commandsHelpCmd,
88 | }
89 |
90 | func commandsCmd(c *commander.Command, args []string) error {
91 | var listCmds func(c *commander.Command)
92 | listCmds = func(c *commander.Command) {
93 | pOut("%s\n", c.FullSpacedName())
94 | for _, sc := range c.Subcommands {
95 | listCmds(sc)
96 | }
97 | }
98 |
99 | listCmds(c.Parent)
100 | return nil
101 | }
102 |
103 | func commandsHelpCmd(c *commander.Command, args []string) error {
104 | pOut(referenceHeaderMsg)
105 | pOut("Generated on %s.\n\n", time.Now().UTC().Format("2006-01-02"))
106 |
107 | var printCmds func(*commander.Command, int)
108 | printCmds = func(c *commander.Command, level int) {
109 | pOut("%s ", strings.Repeat("#", level))
110 | pOut("%s\n\n", c.FullSpacedName())
111 | pOut("```\n")
112 | pOut("%s\n", c.Long)
113 | pOut("```\n\n")
114 |
115 | for _, sc := range c.Subcommands {
116 | printCmds(sc, level+1)
117 | }
118 | }
119 |
120 | printCmds(c.Parent.Parent, 1)
121 | return nil
122 | }
123 |
// referenceHeaderMsg is the markdown preamble printed before the
// generated command reference (see commandsHelpCmd).
const referenceHeaderMsg = `
# data command reference

This document lists every data command (including subcommands), along with
its help page. It can be viewed by running 'data commands help', and
at http://datadex.io/doc/ref

`
132 |
--------------------------------------------------------------------------------
/dev/formats.md:
--------------------------------------------------------------------------------
1 | # data formats
2 |
3 |
One of the important design goals is format-fluidity: the ability to store datasets in various formats and transfer between them. Given a graph of formats, datasets should be able to traverse its strongly connected components. So, if a dataset is published in XML, I should be able to request it in json.[1] This is easy for homogeneous datasets, but gets complicated when one dataset includes files in multiple formats, or it has metadata separated out.
5 |
6 | This is complicated further when thinking about how datasets get authored/published to the index, and retrieved thereafter. In brief, the idea is to follow github pattern: `/`, which reduces namespace problems. This includes versions (tags/branches): `/@`. Note: this handle will be used in projects' Datafiles, to specify dependencies (datasets composed of other datasets[2]), etc.
7 |
8 |
9 | Some possibilities:
10 |
11 |
12 | 1. Let formats be branches like any other. `/@`. Since version and format are now in the same namespace, would see things like: `foo/bar@1.0-json`, `foo/bar@1.2-xml`. This complicates maintenance: both new versions or new formats require a "row" of "commits" along the formats or versions, respectively.
13 |
14 | 2. Let formats be dimensions (see [3]). `/#format:`. Would see things like: `foo/bar@format:json`, `foo/bar@format:xml` There would be dimensional 'defaults' (as HEAD is default tag) that could be specified in the package description file.
15 |
16 | 3. Let formats be specified separately. `/.`. e.g. `foo/bar.json`, `foo/bar.xml`. This seems neat and nice.
17 |
4. Punt. Let authors choose their formats in the dataset. Would see things like: `foo/bar-json`, `foo/xmlbar`. Would not have format-fluidity :(. Naming won't be held to a standard if users control it...
19 |
20 |
21 | So far, I like 2 and 3 the best. 2 implies building [3] below, or at least a subset of the functionality. Building [3] would also make it easier to convert between formats. Just unclear how likely data across domains would be generalizable to this DIR. Would genomics/proteomics data fit this?
22 |
23 |
24 |
25 | [1] implementation detail to choose where to be in the `index stores one fmt and tool converts locally <--> index stores every format` spectrum. Most likely in between: index stores every format but constructs them lazily)
26 |
[2] Think of docker images. Datasets can be expressed as instructions that construct them (some files from foo/dataset1 + some from bar/dataset2). This implies that selecting sub-portions of a dataset could be a really useful mechanic.[3]
28 |
29 | ### selecting
30 |
[3] Imagine selecting [n-m] rows of a given dataset. Unclear yet how this should work exactly, but I have ideas along the lines of a dataset intermediate representation (DIR), where data is expressed as points in a multi-dimensional space, and a dataset is expressed as a subspace, or intervals across some dimensions. This would work well even for tables, allowing one to select slices of a dataset with something like: `/#[:[:]]`, e.g.
32 |
33 | lecun/norb#class # points that have a class
34 | lecun/norb#class:car # points that have class `car`
35 | lecun/norb#set:training # points in the training set
36 | lecun/norb#y:0:10 # points where `0 <= y <= 10`
37 |
38 | (and of course, can specify multiple comma-delimited dimensions)
39 |
40 | Or:
41 |
42 | lecun/norb#class # points that have a class
43 | lecun/norb#class[car] # points that have class `car`
44 | lecun/norb#set[training] # points in the training set
45 | lecun/norb#y[0, 10] # points where `0 <= y <= 10`
46 | lecun/norb#y]0, 10[ # points where `0 < y < 10`
47 |
This seems like a really powerful thing to enable. Unclear how to do it well at present. Lots and lots of edge cases. This can come in later versions but must not close doors to it now. (Another note: I realize this basically is a dumber query string `?param=val`; the problem with using a query string is that these handles may have to be embedded in URLs :/ though I guess hashes are out in that case...)
49 |
50 |
--------------------------------------------------------------------------------
/data_publish.go:
--------------------------------------------------------------------------------
1 | package data
2 |
3 | import (
4 | "fmt"
5 | "github.com/gonuts/flag"
6 | "github.com/jbenet/commander"
7 | )
8 |
// cmd_data_publish implements 'data publish', a guided three-step flow
// (pack make, pack upload, pack publish) driven by publishCmd.
var cmd_data_publish = &commander.Command{
	UsageLine: "publish",
	Short: "Guided dataset publishing.",
	Long: `data publish - Guided dataset publishing.

This command guides the user through the necessary steps to
create a data package (Datafile and Manifest), uploads it,
and publishes it to the dataset index.

See 'data pack'.
`,
	Run: publishCmd,
	Flag: *flag.NewFlagSet("data-pack-publish", flag.ExitOnError),
}
23 |
func init() {
	// --clean defaults to true: rebuild the manifest from scratch.
	// Forwarded to 'data pack make --clean'.
	cmd_data_publish.Flag.Bool("clean", true,
		"rebuild manifest (data pack make --clean)")
	// --force defaults to false. Forwarded to 'data pack publish --force'.
	cmd_data_publish.Flag.Bool("force", false,
		"force publish (data pack publish --force)")
}
30 |
31 | func publishCmd(c *commander.Command, args []string) error {
32 | u := configUser()
33 | if !isNamedUser(u) {
34 | return fmt.Errorf(NotLoggedInErr)
35 | }
36 |
37 | pOut("==> Guided Data Package Publishing.\n")
38 | pOut(PublishMsgWelcome)
39 |
40 | pOut("\n==> Step 1/3: Creating the package.\n")
41 | pOut(PublishMsgDatafile)
42 | err := packMakeCmd(c, []string{})
43 | if err != nil {
44 | return err
45 | }
46 |
47 | pOut("\n==> Step 2/3: Uploading the package contents.\n")
48 | pOut(PublishMsgUpload)
49 | err = packUploadCmd(c, []string{})
50 | if err != nil {
51 | return err
52 | }
53 |
54 | pOut("\n==> Step 3/3: Publishing the package to the index.\n")
55 | pOut(PublishMsgPublish)
56 | return packPublishCmd(c, []string{})
57 | }
58 |
// NotLoggedInErr is returned by publishCmd when no named user is
// configured; it explains how to log in and why an account is required.
const NotLoggedInErr = `You are not logged in. First, either:

- Run 'data user add' to create a new user account.
- Run 'data user auth' to log in to an existing user account.


Why does publishing require a registered user account (and email)? The index
service needs to distinguish users to perform many of its tasks. For example:

- Verify who can or cannot publish datasets, or modify already published ones.
(i.e. the creator + collaborators should be able to, others should not).
- Profiles credit people for the datasets they have published.
- Malicious users can be removed, and their email addresses blacklisted to
prevent further abuse.
`
74 |
// PublishMsgWelcome is printed at the start of the guided publishing
// flow; it explains what a data package is and what the tool will do.
const PublishMsgWelcome = `
Welcome to Data Package Publishing. You should read these short
messages carefully, as they contain important information about
how data works, and how your data package will be published.

First, a 'data package' is a collection of files, containing:
- various files with your data, in any format.
- 'Datafile', a file with descriptive information about the package.
- 'Manifest', a file listing the other files in the package and their checksums.

This tool will automatically:
1. Create the package
- Generate a 'Datafile', with information you will provide.
- Generate a 'Manifest', with all the files in the current directory.
2. Upload the package contents
3. Publish the package to the index

(Note: to specify which files are part of the package, and other advanced
features, use the 'data pack' command directly. See 'data pack help'.)

`
96 |
// PublishMsgDatafile is printed before step 1/3 (writing the Datafile).
const PublishMsgDatafile = `
First, let's write the package's Datafile, which contains important
information about the package. The 'owner id' is the username of the
package's owner (usually your username). The 'dataset id' is the identifier
which defines this dataset. Good 'dataset ids' are like names: short, unique,
and memorable. For example: "mnist" or "cifar". Choose it carefully.

`
105 |
// PublishMsgUpload is printed before step 2/3 (uploading the package
// contents). Fixed typo in the user-facing text: "sotrage" -> "storage".
const PublishMsgUpload = `
Now, data will upload the contents of the package (this directory) to the index
storage service. This may take a while, if the files are large (over 100MB).

`
111 |
// PublishMsgPublish is printed before step 3/3 (publishing to the index).
const PublishMsgPublish = `
Finally, data will publish the package to the index, where others can find
and download your package. The index is available through data, and on the web.

`
117 |
--------------------------------------------------------------------------------
/dev/roadmap.md:
--------------------------------------------------------------------------------
1 | # data roadmap
2 |
3 | This document briefly outlines desired features to implement.
4 |
5 |
6 | ## command dispatch
7 |
8 | Need to implement the skeleton of the project: command parsing/dispatch.
9 |
10 | ## data list
11 |
12 | data list
13 |
14 | List the datasets in the current project
15 |
16 | ## data config
17 |
18 | data config user.name = 'jbenet'
19 | data config --global user.name = 'jbenet'
20 |
21 | Allow the configuration of `data`, using (`git` like) config files.
22 | Consider using a `~/.dataconfig` global config file.
23 | Consider using a `data/config` (or `.dataconfig`) local config file.
24 |
25 | ## data update
26 |
27 | data update
28 |
29 | Download and install newer version.
Also, check whether data is up-to-date on every run (including an option to silence this check).
31 |
32 | ## data get
33 |
34 | data get /
35 | data get http://datadex.io//
36 |
37 | Download and install packages from the dataset index (datadex, configurable).
38 | No arguments looks into the directory's `Datafile` (configurable)
39 | Allow installation of packages using `/` ref-naming.
40 | Allow installation of packages using `https?://...//` urls.
41 | Use a `--save` flag to store into a `Datafile`.
42 | Installed datasets go into the `data/` directory (configurable) of the project.
43 | Should download compressed files, and use array of mirrors.
44 |
45 |
46 | ## data manifest
47 |
48 | data manifest
49 |
50 | Generate the data manifest file (`.data/manifest`? `Manifest`?), a list of
51 |
52 |