├── .gitignore ├── detect ├── testdata │ ├── invalid.cbor │ ├── array.json │ ├── object.json │ ├── cbor_array.cbor │ ├── cbor_object.cbor │ ├── cbor_array.structure.json │ ├── cbor_object.structure.json │ ├── sitemap_array.structure.json │ ├── sitemap_object.structure.json │ ├── hours.csv │ ├── hours-with-header.csv │ ├── spelling.structure.json │ ├── hours.structure.json │ ├── hours-with-header.structure.json │ ├── police.structure.json │ ├── spelling.csv │ ├── daily_wind_2011.structure.json │ └── daily_wind_2011.csv ├── xlsx.go ├── cbor.go ├── json_test.go ├── json.go └── determineFields_test.go ├── testdata ├── vizs │ ├── invalidJSON.json │ ├── visconfig2.json │ ├── visconfig3.json │ └── visconfig1.json ├── readmes │ ├── invalidJSON.json │ ├── readmeconfig2.json │ ├── readmeconfig3.json │ └── readmeconfig1.json ├── zip │ └── exported.zip ├── metadata │ ├── hours.json │ ├── airport-codes.json │ └── continent-codes.json ├── datasets │ ├── hours.json │ ├── continent-codes.json │ ├── airport-codes.json │ └── complete.json └── structures │ ├── continent-codes.json │ ├── hours.json │ └── airport-codes.json ├── dsdiff ├── exp0.txt ├── got0.txt ├── Makefile ├── README.md ├── testdata │ ├── structureJsonSchemaOrig.json │ ├── structureJsonSchemaNew.json │ ├── newStructure.json │ ├── orig.json │ ├── newData.json │ ├── newDescription.json │ ├── newTitle.json │ ├── newVisConfig.json │ └── newTransform.json └── LICENSE ├── dstest ├── testdata │ └── complete │ │ ├── transform.star │ │ ├── template.html │ │ ├── body.csv │ │ ├── rendered.html │ │ ├── readme.md │ │ ├── expect.dataset.json │ │ └── input.dataset.json ├── template.go ├── golden.go ├── priv_key.go ├── compare.go └── dstest_test.go ├── stepfile ├── testdata │ ├── steps.txt │ └── steps.json ├── stepfile.go └── stepfile_test.go ├── dsio ├── testdata │ ├── movies │ │ └── body.cbor │ ├── cbor │ │ ├── array │ │ │ └── body.cbor │ │ ├── city │ │ │ └── body.cbor │ │ ├── movies │ │ │ └── body.cbor │ │ ├── object │ │ │ └── 
body.cbor │ │ ├── sitemap │ │ │ └── body.cbor │ │ ├── craigslist │ │ │ └── body.cbor │ │ ├── links_array │ │ │ └── body.cbor │ │ ├── links_object │ │ │ └── body.cbor │ │ ├── sitemap_object │ │ │ └── body.cbor │ │ └── flourinated_compounds_in_fast_food_packaging │ │ │ ├── body.cbor │ │ │ └── input.dataset.json │ ├── xlsx │ │ ├── simple │ │ │ └── body.xlsx │ │ └── obj_cell │ │ │ └── body.xlsx │ ├── json │ │ ├── city │ │ │ ├── input.dataset.json │ │ │ └── body.json │ │ ├── array │ │ │ └── body.json │ │ ├── object │ │ │ └── body.json │ │ ├── links_array │ │ │ └── body.json │ │ └── links_object │ │ │ └── body.json │ └── csv │ │ ├── cities_unique │ │ └── cities_unique.csv │ │ ├── cities │ │ ├── data.csv │ │ └── input.dataset.json │ │ ├── movies │ │ ├── input.dataset.json │ │ └── body.csv │ │ ├── movies_sorted_duration_desc │ │ └── body.csv │ │ ├── movies_sorted_movie_title │ │ └── body.csv │ │ ├── movies_sorted_movie_title_desc │ │ └── body.csv │ │ └── movies_sorted_duration_movie_title │ │ └── body.csv ├── replacecr │ ├── replace_cr_test.go │ └── replace_cr.go ├── tracked_reader_test.go ├── fuzz.go ├── tracked_reader.go ├── entry.go ├── entry_test.go ├── streams.go ├── entry_buffer_test.go ├── entry_buffer.go ├── README.md ├── identity.go ├── ndjson_test.go ├── ndjson.go └── xlsx_test.go ├── dsviz ├── testdata │ ├── custom │ │ ├── body.json │ │ ├── template.html │ │ ├── rendered.html │ │ └── input.dataset.json │ └── default │ │ ├── body.json │ │ ├── input.dataset.json │ │ ├── rendered.html │ │ └── template.html └── doc.go ├── validate ├── testdata │ ├── city │ │ ├── input.dataset.json │ │ └── body.json │ ├── flourinated_compounds_in_fast_food_packaging │ │ ├── body.cbor │ │ └── input.dataset.json │ └── movies │ │ ├── input.dataset.json │ │ └── body.csv ├── csv_test.go ├── validate_test.go ├── validate.go ├── csv.go ├── data_test.go ├── testdata_test.go ├── data.go └── dataset.go ├── codecov.yml ├── Makefile ├── vals ├── object_value.go ├── object_value_test.go ├── 
compare.go ├── compare_test.go ├── coding_test.go └── coding.go ├── hash_test.go ├── .codeclimate.yml ├── LICENSE ├── go.mod ├── hash.go ├── .circleci └── config.yml ├── generate ├── tabular_test.go ├── value.go ├── dsgen │ └── main.go └── tabular.go ├── kind_test.go ├── preview ├── preview_test.go ├── testdata │ └── earthquakes │ │ ├── readme.md │ │ └── input.dataset.json └── preview.go ├── readme.md ├── stats.go ├── kind.go ├── compression ├── compression_test.go └── compression.go ├── data_format.go └── data_format_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | coverage.txt -------------------------------------------------------------------------------- /detect/testdata/invalid.cbor: -------------------------------------------------------------------------------- 1 | 9af4f5f6fb41 -------------------------------------------------------------------------------- /testdata/vizs/invalidJSON.json: -------------------------------------------------------------------------------- 1 | Invalid Json -------------------------------------------------------------------------------- /detect/testdata/array.json: -------------------------------------------------------------------------------- 1 | [ 2 | "foo" 3 | ] -------------------------------------------------------------------------------- /testdata/readmes/invalidJSON.json: -------------------------------------------------------------------------------- 1 | Invalid Json -------------------------------------------------------------------------------- /detect/testdata/object.json: -------------------------------------------------------------------------------- 1 | { 2 | "foo" : "bar" 3 | } -------------------------------------------------------------------------------- /dsdiff/exp0.txt: -------------------------------------------------------------------------------- 1 | VisConfig: 1 change 2 | - modified format 
-------------------------------------------------------------------------------- /dstest/testdata/complete/transform.star: -------------------------------------------------------------------------------- 1 | commit([1,2,3,4,5]) -------------------------------------------------------------------------------- /stepfile/testdata/steps.txt: -------------------------------------------------------------------------------- 1 | I am a step 2 | --- 3 | I am another step 4 | --- 5 | I am a third step -------------------------------------------------------------------------------- /dsdiff/got0.txt: -------------------------------------------------------------------------------- 1 | { 2 | - "format": "abc", 3 | + "format": "new thing", 4 | "qri": "" 5 | } 6 | -------------------------------------------------------------------------------- /testdata/zip/exported.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qri-io/dataset/HEAD/testdata/zip/exported.zip -------------------------------------------------------------------------------- /testdata/vizs/visconfig2.json: -------------------------------------------------------------------------------- 1 | { 2 | "format": "bar", 3 | "qri": "vz:0", 4 | "scriptPath": "two" 5 | } -------------------------------------------------------------------------------- /testdata/vizs/visconfig3.json: -------------------------------------------------------------------------------- 1 | { 2 | "format": "bar", 3 | "qri": "vz:0", 4 | "scriptPath": "three" 5 | } -------------------------------------------------------------------------------- /detect/testdata/cbor_array.cbor: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qri-io/dataset/HEAD/detect/testdata/cbor_array.cbor -------------------------------------------------------------------------------- /dsio/testdata/movies/body.cbor: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/movies/body.cbor -------------------------------------------------------------------------------- /testdata/readmes/readmeconfig2.json: -------------------------------------------------------------------------------- 1 | { 2 | "format": "bar", 3 | "qri": "rm:0", 4 | "scriptPath": "two" 5 | } -------------------------------------------------------------------------------- /detect/testdata/cbor_object.cbor: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qri-io/dataset/HEAD/detect/testdata/cbor_object.cbor -------------------------------------------------------------------------------- /dsio/testdata/cbor/array/body.cbor: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/array/body.cbor -------------------------------------------------------------------------------- /dsio/testdata/cbor/city/body.cbor: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/city/body.cbor -------------------------------------------------------------------------------- /dsviz/testdata/custom/body.json: -------------------------------------------------------------------------------- 1 | [ 2 | [2017, 7500000000], 3 | [2016, 7444000000], 4 | [2015, 7358000000] 5 | ] -------------------------------------------------------------------------------- /dsviz/testdata/default/body.json: -------------------------------------------------------------------------------- 1 | [ 2 | [2017, 7500000000], 3 | [2016, 7444000000], 4 | [2015, 7358000000] 5 | ] -------------------------------------------------------------------------------- /testdata/readmes/readmeconfig3.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "format": "bar", 3 | "qri": "rm:0", 4 | "scriptPath": "three" 5 | } -------------------------------------------------------------------------------- /detect/testdata/cbor_array.structure.json: -------------------------------------------------------------------------------- 1 | { 2 | "format": "cbor", 3 | "schema": { 4 | "type": "array" 5 | } 6 | } -------------------------------------------------------------------------------- /dsio/testdata/cbor/movies/body.cbor: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/movies/body.cbor -------------------------------------------------------------------------------- /dsio/testdata/cbor/object/body.cbor: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/object/body.cbor -------------------------------------------------------------------------------- /dsio/testdata/cbor/sitemap/body.cbor: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/sitemap/body.cbor -------------------------------------------------------------------------------- /dsio/testdata/xlsx/simple/body.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/xlsx/simple/body.xlsx -------------------------------------------------------------------------------- /detect/testdata/cbor_object.structure.json: -------------------------------------------------------------------------------- 1 | { 2 | "format": "cbor", 3 | "schema": { 4 | "type": "object" 5 | } 6 | } -------------------------------------------------------------------------------- 
/detect/testdata/sitemap_array.structure.json: -------------------------------------------------------------------------------- 1 | { 2 | "format": "json", 3 | "schema": { 4 | "type": "array" 5 | } 6 | } -------------------------------------------------------------------------------- /detect/testdata/sitemap_object.structure.json: -------------------------------------------------------------------------------- 1 | { 2 | "format": "json", 3 | "schema": { 4 | "type": "object" 5 | } 6 | } -------------------------------------------------------------------------------- /dsio/testdata/cbor/craigslist/body.cbor: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/craigslist/body.cbor -------------------------------------------------------------------------------- /dsio/testdata/xlsx/obj_cell/body.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/xlsx/obj_cell/body.xlsx -------------------------------------------------------------------------------- /dsio/testdata/cbor/links_array/body.cbor: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/links_array/body.cbor -------------------------------------------------------------------------------- /dsio/testdata/cbor/links_object/body.cbor: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/links_object/body.cbor -------------------------------------------------------------------------------- /dsio/testdata/cbor/sitemap_object/body.cbor: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/sitemap_object/body.cbor 
-------------------------------------------------------------------------------- /testdata/vizs/visconfig1.json: -------------------------------------------------------------------------------- 1 | { 2 | "format": "foo", 3 | "qri": "vz:0", 4 | "scriptPath": "one", 5 | "renderedPath": "one" 6 | } -------------------------------------------------------------------------------- /testdata/readmes/readmeconfig1.json: -------------------------------------------------------------------------------- 1 | { 2 | "format": "foo", 3 | "qri": "rm:0", 4 | "scriptPath": "one", 5 | "renderedPath": "one" 6 | } -------------------------------------------------------------------------------- /dsio/testdata/json/city/input.dataset.json: -------------------------------------------------------------------------------- 1 | { 2 | "structure": { 3 | "format": "json", 4 | "schema": { 5 | "type": "object" 6 | } 7 | } 8 | } -------------------------------------------------------------------------------- /validate/testdata/city/input.dataset.json: -------------------------------------------------------------------------------- 1 | { 2 | "structure": { 3 | "format": "json", 4 | "schema": { 5 | "type": "object" 6 | } 7 | } 8 | } -------------------------------------------------------------------------------- /dsdiff/Makefile: -------------------------------------------------------------------------------- 1 | # Let's keep all our changelog commands the same across all our packages: 2 | update-changelog: 3 | conventional-changelog -p angular -i CHANGELOG.md -s -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | ci: 3 | - "ci/circle-ci" 4 | notify: 5 | require_ci_to_pass: no 6 | after_n_builds: 1 7 | coverage: 8 | range: "80...100" 9 | comment: off -------------------------------------------------------------------------------- 
/dsio/testdata/cbor/flourinated_compounds_in_fast_food_packaging/body.cbor: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/flourinated_compounds_in_fast_food_packaging/body.cbor -------------------------------------------------------------------------------- /dstest/testdata/complete/template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | {{ .Meta.Title }} 5 | 6 | 7 |

{{ .Meta.Title }}

8 | 9 | -------------------------------------------------------------------------------- /validate/testdata/flourinated_compounds_in_fast_food_packaging/body.cbor: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qri-io/dataset/HEAD/validate/testdata/flourinated_compounds_in_fast_food_packaging/body.cbor -------------------------------------------------------------------------------- /stepfile/testdata/steps.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "script": "I am a step" 4 | }, 5 | { 6 | "script": "I am another step" 7 | }, 8 | { 9 | "script": "I am a third step" 10 | } 11 | ] -------------------------------------------------------------------------------- /dstest/testdata/complete/body.csv: -------------------------------------------------------------------------------- 1 | city,pop,avg_age,in_usa 2 | toronto,40000000,55.5,false 3 | new york,8500000,44.4,true 4 | chicago,300000,44.4,true 5 | chatham,35000,65.25,true 6 | raleigh,250000,50.65,true 7 | -------------------------------------------------------------------------------- /testdata/metadata/hours.json: -------------------------------------------------------------------------------- 1 | { 2 | "qri": "md:0", 3 | "title": "hours", 4 | "accessURL": "https://example.com/not/a/url", 5 | "downloadURL": "https://example.com/not/a/url", 6 | "readmeURL": "/ipfs/notahash" 7 | } -------------------------------------------------------------------------------- /dsio/testdata/csv/cities_unique/cities_unique.csv: -------------------------------------------------------------------------------- 1 | city,pop,avg_age,in_usa 2 | toronto,40000000,55.5,false 3 | new york,8500000,44.4,true 4 | chicago,300000,44.4,true 5 | chatham,35000,65.25,true 6 | raleigh,250000,50.65,true 7 | -------------------------------------------------------------------------------- /dstest/testdata/complete/rendered.html: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | dataset with all submodels example 5 | 6 | 7 |

dataset with all submodels example

8 | 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Let's keep all our changelog commands the same across all our packages: 2 | update-changelog: 3 | conventional-changelog -p angular -i CHANGELOG.md -s 4 | 5 | test: 6 | go test ./... -v --coverprofile=coverage.txt --covermode=atomic -------------------------------------------------------------------------------- /dsio/testdata/csv/cities/data.csv: -------------------------------------------------------------------------------- 1 | city,pop,avg_age,in_usa 2 | toronto,40000000,55.5,false 3 | toronto,40000000,55.5,false 4 | new york,8500000,44.4,true 5 | chicago,300000,44.4,true 6 | chatham,35000,65.25,true 7 | raleigh,250000,50.65,true 8 | -------------------------------------------------------------------------------- /testdata/metadata/airport-codes.json: -------------------------------------------------------------------------------- 1 | { 2 | "qri": "md:0", 3 | "homeURL": "http://www.ourairports.com/", 4 | "license": { 5 | "type":"PDDL-1.0" 6 | }, 7 | "title": "Airport Codes", 8 | "citations": [ 9 | { 10 | "name" : "Our Airports", 11 | "url" : "http://ourairports.com/data/" 12 | } 13 | ] 14 | } -------------------------------------------------------------------------------- /testdata/datasets/hours.json: -------------------------------------------------------------------------------- 1 | { 2 | "qri": "ds:0", 3 | "meta": { 4 | "qri": "md:0", 5 | "title": "hours", 6 | "accessURL": "https://example.com/not/a/url", 7 | "downloadURL": "https://example.com/not/a/url", 8 | "readmeURL": "/ipfs/notahash" 9 | }, 10 | "bodyPath": "/ipfs/QmS1dVa1xemo7gQzJgjimj1WwnVBF3TwRTGsyKa1uEBWbJ" 11 | } -------------------------------------------------------------------------------- /detect/xlsx.go: -------------------------------------------------------------------------------- 1 | package detect 2 | 3 
| import ( 4 | "io" 5 | 6 | "github.com/qri-io/dataset" 7 | ) 8 | 9 | // XLSXSchema determines any schema information for an excel spreadsheet 10 | // TODO (b5): currently unimplemented 11 | func XLSXSchema(r *dataset.Structure, data io.Reader) (schema map[string]interface{}, n int, err error) { 12 | return dataset.BaseSchemaArray, 0, nil 13 | } 14 | -------------------------------------------------------------------------------- /testdata/structures/continent-codes.json: -------------------------------------------------------------------------------- 1 | { 2 | "format": "csv", 3 | "schema": { 4 | "type": "array", 5 | "items": { 6 | "type": "array", 7 | "items": [ 8 | { 9 | "title": "Code", 10 | "type": "string" 11 | }, 12 | { 13 | "title": "Name", 14 | "type": "string" 15 | } 16 | ] 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /detect/testdata/hours.csv: -------------------------------------------------------------------------------- 1 | 11 Jan 16 12:00 EST, 1.0, dev, working on some cool stuff 2 | 11 Jan 16 12:00 EST, 1.0, dev, working 3 | 11 Jan 16 12:00 EST, 1.0, dev, other stuff 4 | 11 Jan 16 12:00 EST, 1.0, dev, moar work 5 | 11 Jan 16 12:00 EST, 1.0, dev, youtube work? 6 | 11 Jan 16 12:00 EST, 1.0, dev, is this really work? 
7 | 11 Jan 16 12:00 EST, 1.0, dev, werd 8 | 11 Jan 16 12:00 EST, 1.0, dev, making more work -------------------------------------------------------------------------------- /testdata/metadata/continent-codes.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Continent Codes", 3 | "qri": "md:0", 4 | "description": "list of continents with corresponding two letter codes", 5 | "license": { 6 | "type": "odc-pddl", 7 | "url": "http://opendatacommons.org/licenses/pddl/" 8 | }, 9 | "keywords": [ 10 | "Continents", 11 | "Two letter code", 12 | "Continent codes", 13 | "Continent code list" 14 | ] 15 | } -------------------------------------------------------------------------------- /vals/object_value.go: -------------------------------------------------------------------------------- 1 | package vals 2 | 3 | // ObjectValue is a special value that represents a value in the context of a parent object 4 | // It wraps a value, adding a property "Key" that holds the value's key in the parent object 5 | type ObjectValue struct { 6 | Key string 7 | Value 8 | } 9 | 10 | // NewObjectValue allocates a new Object Value 11 | func NewObjectValue(key string, v Value) Value { 12 | return ObjectValue{key, v} 13 | } 14 | -------------------------------------------------------------------------------- /detect/testdata/hours-with-header.csv: -------------------------------------------------------------------------------- 1 | timestamp,hours,category,comments 2 | 11 Jan 16 12:00 EST, 1.0, dev, working on some cool stuff 3 | 11 Jan 16 12:00 EST, 1.0, dev, working 4 | 11 Jan 16 12:00 EST, 1.0, dev, other stuff 5 | 11 Jan 16 12:00 EST, 1.0, dev, moar work 6 | 11 Jan 16 12:00 EST, 1.0, dev, youtube work? 7 | 11 Jan 16 12:00 EST, 1.0, dev, is this really work? 
8 | 11 Jan 16 12:00 EST, 1.0, dev, werd 9 | 11 Jan 16 12:00 EST, 1.0, dev, making more work -------------------------------------------------------------------------------- /detect/testdata/spelling.structure.json: -------------------------------------------------------------------------------- 1 | { 2 | "format": "csv", 3 | "formatConfig" : { 4 | "headerRow" : true, 5 | "lazyQuotes": true 6 | }, 7 | "schema": { 8 | "type": "array", 9 | "items": { 10 | "type":"array", 11 | "items": [ 12 | { 13 | "title": "state", 14 | "type": "string" 15 | }, 16 | { 17 | "title": "search_interest", 18 | "type": "number" 19 | } 20 | ] 21 | } 22 | } 23 | } -------------------------------------------------------------------------------- /testdata/datasets/continent-codes.json: -------------------------------------------------------------------------------- 1 | { 2 | "qri": "ds:0", 3 | "meta": { 4 | "title": "Continent Codes", 5 | "qri": "md:0", 6 | "description": "list of continents with corresponding two letter codes", 7 | "license": { 8 | "type": "odc-pddl", 9 | "url": "http://opendatacommons.org/licenses/pddl/" 10 | }, 11 | "keywords": [ 12 | "Continents", 13 | "Two letter code", 14 | "Continent codes", 15 | "Continent code list" 16 | ] 17 | } 18 | } -------------------------------------------------------------------------------- /dsio/testdata/json/city/body.json: -------------------------------------------------------------------------------- 1 | [ 2 | { "city" : "toronto", "pop" : 40000000, "avg_age" : 55.5 , "in_usa" : false }, 3 | { "city" : "toronto", "pop" : 40000000, "avg_age" : 55.5 , "in_usa" : false }, 4 | { "city" : "new york", "pop" : 8500000, "avg_age": 44.4, "in_usa" : true }, 5 | { "city" : "chicago", "pop" : 300000, "avg_age" : 44.4 , "in_usa" : true }, 6 | { "city" : "chatham", "pop" : 35000, "avg_age" : 65.25 , "in_usa" : false }, 7 | { "city" : "raleigh", "pop" : 250000, "avg_age" : 50.65 , "in_usa" : true } 8 | ] 
-------------------------------------------------------------------------------- /validate/testdata/city/body.json: -------------------------------------------------------------------------------- 1 | [ 2 | { "city" : "toronto", "pop" : 40000000, "avg_age" : 55.5 , "in_usa" : false }, 3 | { "city" : "toronto", "pop" : 40000000, "avg_age" : 55.5 , "in_usa" : false }, 4 | { "city" : "new york", "pop" : 8500000, "avg_age": 44.4, "in_usa" : true }, 5 | { "city" : "chicago", "pop" : 300000, "avg_age" : 44.4 , "in_usa" : true }, 6 | { "city" : "chatham", "pop" : 35000, "avg_age" : 65.25 , "in_usa" : false }, 7 | { "city" : "raleigh", "pop" : 250000, "avg_age" : 50.65 , "in_usa" : true } 8 | ] -------------------------------------------------------------------------------- /vals/object_value_test.go: -------------------------------------------------------------------------------- 1 | package vals 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestNewObjectValue(t *testing.T) { 8 | v := NewObjectValue("foo", String("")) 9 | if v.Type() != TypeString { 10 | t.Errorf("type mismatch. expected: %s. got: %s", TypeString, v.Type()) 11 | } 12 | 13 | if ov, ok := v.(ObjectValue); ok { 14 | if ov.Key != "foo" { 15 | t.Errorf("key mismatch. expected: %s, got: %s", "foo", ov.Key) 16 | } 17 | } else { 18 | 19 | t.Errorf("expected ObjectValue type") 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /dstest/testdata/complete/readme.md: -------------------------------------------------------------------------------- 1 | # Lorem ipsum dolor 2 | 3 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Maecenas maximus erat ut rhoncus blandit. Duis aliquet vulputate leo eu volutpat. Praesent in mollis metus, non convallis lectus. Vestibulum malesuada mauris quis nisl auctor pellentesque. Duis lacinia nec justo in viverra. Quisque quis aliquet ante. Donec semper scelerisque laoreet. 
Praesent dapibus interdum mi, sit amet lacinia odio malesuada vitae. Proin eu erat quis nisi tristique mollis. Donec sed eleifend augue, at convallis ex. 4 | -------------------------------------------------------------------------------- /dsviz/testdata/custom/template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | {{ ds.meta.title }} 5 | 6 | 7 |

{{ ds.meta.title }}

8 |

First Row:

9 | 10 | {{- range bodyEntries 0 1 }} 11 | 12 | {{ range . }}{{ end }} 13 | 14 | {{- end }} 15 |
{{ . }}
16 |

Full Body:

17 | 18 | {{- range allBodyEntries }} 19 | 20 | {{ range . }}{{ end }} 21 | 22 | {{- end }} 23 |
{{ . }}
24 | 25 | -------------------------------------------------------------------------------- /dsviz/testdata/custom/rendered.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | World Population 5 | 6 | 7 |

World Population

8 |

First Row:

9 | 10 | 11 | 12 | 13 |
20177500000000
14 |

Full Body:

15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 |
20177500000000
20167444000000
20157358000000
26 | 27 | -------------------------------------------------------------------------------- /dsio/testdata/csv/movies/input.dataset.json: -------------------------------------------------------------------------------- 1 | { 2 | "qri": "ds:0", 3 | "meta": { 4 | "title": "example movie data" 5 | }, 6 | "structure": { 7 | "format": "csv", 8 | "formatConfig": { 9 | "headerRow": true 10 | }, 11 | "schema": { 12 | "type": "array", 13 | "items": { 14 | "type": "array", 15 | "items": [ 16 | { 17 | "title": "movie_title", 18 | "type": "string" 19 | }, 20 | { 21 | "title": "duration", 22 | "type": "integer" 23 | } 24 | ] 25 | } 26 | } 27 | } 28 | } -------------------------------------------------------------------------------- /validate/testdata/movies/input.dataset.json: -------------------------------------------------------------------------------- 1 | { 2 | "qri": "ds:0", 3 | "meta": { 4 | "title": "example movie data" 5 | }, 6 | "structure": { 7 | "format": "csv", 8 | "formatConfig": { 9 | "headerRow": true 10 | }, 11 | "schema": { 12 | "type": "array", 13 | "items": { 14 | "type": "array", 15 | "items": [ 16 | { 17 | "title": "movie_title", 18 | "type": "string" 19 | }, 20 | { 21 | "title": "duration", 22 | "type": "integer" 23 | } 24 | ] 25 | } 26 | } 27 | } 28 | } -------------------------------------------------------------------------------- /hash_test.go: -------------------------------------------------------------------------------- 1 | package dataset 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestHashBytes(t *testing.T) { 8 | cases := []struct { 9 | in []byte 10 | out string 11 | err error 12 | }{ 13 | {[]byte(""), "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n", nil}, 14 | } 15 | 16 | for i, c := range cases { 17 | got, err := HashBytes(c.in) 18 | if err != c.err { 19 | t.Errorf("case %d error mismatch. expected: %s got: %s", i, c.err, err) 20 | continue 21 | } 22 | 23 | if got != c.out { 24 | t.Errorf("case %d result mismatch. 
expected: %s got: %s", i, c.out, got) 25 | continue 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /testdata/structures/hours.json: -------------------------------------------------------------------------------- 1 | { 2 | "format": "csv", 3 | "formatOptions": null, 4 | "depth" : 2, 5 | "schema": { 6 | "type": "array", 7 | "items": { 8 | "type": "array", 9 | "items": [ 10 | { 11 | "title": "field_1", 12 | "type": "string" 13 | }, 14 | { 15 | "title": "field_2", 16 | "type": "number" 17 | }, 18 | { 19 | "title": "field_3", 20 | "type": "string" 21 | }, 22 | { 23 | "title": "field_4", 24 | "type": "string" 25 | } 26 | ] 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /dstest/template.go: -------------------------------------------------------------------------------- 1 | package dstest 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | "text/template" 7 | ) 8 | 9 | // Template executes & returns a template string, failing the test if the 10 | // template fails to compile 11 | func Template(t *testing.T, tmplStr string, data interface{}) string { 12 | t.Helper() 13 | tmpl, err := template.New("tmpl").Parse(tmplStr) 14 | if err != nil { 15 | t.Fatalf("error parsing dstest template: %s", err) 16 | } 17 | 18 | w := &bytes.Buffer{} 19 | if err := tmpl.Execute(w, data); err != nil { 20 | t.Fatalf("error executing dstest template: %s", err) 21 | } 22 | 23 | return w.String() 24 | } 25 | -------------------------------------------------------------------------------- /dsviz/testdata/custom/input.dataset.json: -------------------------------------------------------------------------------- 1 | { 2 | "peername" : "steve", 3 | "name" : "world_pop", 4 | "meta": { 5 | "title": "World Population", 6 | "description": "a dataset showing the population of the world" 7 | }, 8 | "structure" : { 9 | "format": "json", 10 | "schema": { 11 | "type": "array", 12 | "items": { 13 | 
"type": "array", 14 | "items": [ 15 | { "title": "year", "type": "integer" }, 16 | { "title": "population", "type": "integer" } 17 | ] 18 | } 19 | } 20 | }, 21 | "viz":{ 22 | "format": "html", 23 | "scriptPath": "bar" 24 | } 25 | } -------------------------------------------------------------------------------- /detect/testdata/hours.structure.json: -------------------------------------------------------------------------------- 1 | { 2 | "format": "csv", 3 | "formatConfig": { 4 | "lazyQuotes" : true 5 | }, 6 | "schema": { 7 | "type": "array", 8 | "items": { 9 | "type": "array", 10 | "items": [ 11 | { 12 | "title": "field_1", 13 | "type": "string" 14 | }, 15 | { 16 | "title": "field_2", 17 | "type": "number" 18 | }, 19 | { 20 | "title": "field_3", 21 | "type": "string" 22 | }, 23 | { 24 | "title": "field_4", 25 | "type": "string" 26 | } 27 | ] 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /dsio/replacecr/replace_cr_test.go: -------------------------------------------------------------------------------- 1 | package replacecr 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | ) 7 | 8 | func TestReader(t *testing.T) { 9 | input := []byte("foo\r\rbar\r\nbaz\r\r") 10 | expect := []byte("foo\r\n\r\nbar\r\nbaz\r\n\r\n") 11 | 12 | got := make([]byte, 19) 13 | n, err := Reader(bytes.NewReader(input)).Read(got) 14 | if err != nil && err.Error() != "EOF" { 15 | t.Errorf("unexpected error: %s", err.Error()) 16 | } 17 | if n != 19 { 18 | t.Errorf("length error. expected: %d, got: %d", 19, n) 19 | } 20 | if !bytes.Equal(expect, got) { 21 | t.Errorf("byte mismatch. 
expected:\n%v\ngot:\n%v", expect, got) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /dsio/tracked_reader_test.go: -------------------------------------------------------------------------------- 1 | package dsio 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | ) 7 | 8 | func TestTrackedReader(t *testing.T) { 9 | r := strings.NewReader("0123456789") 10 | tr := NewTrackedReader(r) 11 | 12 | buf := make([]byte, 4) 13 | tr.Read(buf) 14 | if tr.BytesRead() != 4 { 15 | t.Errorf("expected bytes read to equal 4, got: %d", tr.BytesRead()) 16 | } 17 | tr.Read(buf) 18 | if tr.BytesRead() != 8 { 19 | t.Errorf("expected bytes read to equal 4, got: %d", tr.BytesRead()) 20 | } 21 | tr.Read(buf) 22 | if tr.BytesRead() != 10 { 23 | t.Errorf("expected bytes read to equal 4, got: %d", tr.BytesRead()) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /validate/testdata/movies/body.csv: -------------------------------------------------------------------------------- 1 | movie_title,duration 2 | Avatar , 3 | Pirates of the Caribbean: At World's End , 4 | Spectre ,148 5 | The Dark Knight Rises ,164 6 | Star Wars: Episode VII - The Force Awakens ,0 7 | John Carter ,132 8 | Spider-Man 3 ,156 9 | Tangled ,100 10 | Avengers: Age of Ultron ,141 11 | Harry Potter and the Half-Blood Prince ,153 12 | Batman v Superman: Dawn of Justice ,183 13 | Superman Returns ,169 14 | Quantum of Solace ,106 15 | Pirates of the Caribbean: Dead Man's Chest ,151 16 | The Lone Ranger ,150 17 | Man of Steel ,143 18 | The Chronicles of Narnia: Prince Caspian ,150 19 | The Avengers ,173 20 | Pirates of the Caribbean: On Stranger Tides ,136 21 | -------------------------------------------------------------------------------- /dsio/testdata/csv/movies/body.csv: -------------------------------------------------------------------------------- 1 | movie_title,duration 2 | Avatar ,178 3 | Pirates of the Caribbean: At 
World's End ,169 4 | Spectre ,148 5 | The Dark Knight Rises ,164 6 | Star Wars: Episode VII - The Force Awakens , 7 | John Carter ,132 8 | Spider-Man 3 ,156 9 | Tangled ,100 10 | Avengers: Age of Ultron ,141 11 | Harry Potter and the Half-Blood Prince ,153 12 | Batman v Superman: Dawn of Justice ,183 13 | Superman Returns ,169 14 | Quantum of Solace ,106 15 | Pirates of the Caribbean: Dead Man's Chest ,151 16 | The Lone Ranger ,150 17 | Man of Steel ,143 18 | The Chronicles of Narnia: Prince Caspian ,150 19 | The Avengers ,173 20 | Pirates of the Caribbean: On Stranger Tides ,136 21 | -------------------------------------------------------------------------------- /detect/testdata/hours-with-header.structure.json: -------------------------------------------------------------------------------- 1 | { 2 | "format": "csv", 3 | "formatConfig" : { 4 | "headerRow" : true, 5 | "lazyQuotes" : true 6 | }, 7 | "schema": { 8 | "type": "array", 9 | "items": { 10 | "type": "array", 11 | "items": [ 12 | { 13 | "title": "timestamp", 14 | "type": "string" 15 | }, 16 | { 17 | "title": "hours", 18 | "type": "number" 19 | }, 20 | { 21 | "title": "category", 22 | "type": "string" 23 | }, 24 | { 25 | "title": "comments", 26 | "type": "string" 27 | } 28 | ] 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /dsio/fuzz.go: -------------------------------------------------------------------------------- 1 | package dsio 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | 7 | "github.com/qri-io/dataset" 8 | ) 9 | 10 | // Fuzz is the entry-point for go-fuzz. Return 1 for a successful parse and 0 for failures. 
11 | func Fuzz(data []byte) int { 12 | r := bytes.NewReader(data) 13 | st := &dataset.Structure{Format: dataset.JSONDataFormat.String(), Schema: dataset.BaseSchemaObject} 14 | reader, err := NewJSONReader(st, r) 15 | if err != nil { 16 | return 0 17 | } 18 | for { 19 | _, err = reader.ReadEntry() 20 | if err != nil { 21 | if err.Error() == "EOF" { 22 | break 23 | } 24 | fmt.Printf("Error: %s\n", err.Error()) 25 | return 0 26 | } 27 | } 28 | return 1 29 | } 30 | -------------------------------------------------------------------------------- /dsio/testdata/csv/movies_sorted_duration_desc/body.csv: -------------------------------------------------------------------------------- 1 | movie_title,duration 2 | Batman v Superman: Dawn of Justice ,183 3 | Avatar ,178 4 | The Avengers ,173 5 | Superman Returns ,169 6 | Pirates of the Caribbean: At World's End ,169 7 | The Dark Knight Rises ,164 8 | Spider-Man 3 ,156 9 | Harry Potter and the Half-Blood Prince ,153 10 | Pirates of the Caribbean: Dead Man's Chest ,151 11 | The Chronicles of Narnia: Prince Caspian ,150 12 | The Lone Ranger ,150 13 | Spectre ,148 14 | Man of Steel ,143 15 | Avengers: Age of Ultron ,141 16 | Pirates of the Caribbean: On Stranger Tides ,136 17 | John Carter ,132 18 | Quantum of Solace ,106 19 | Tangled ,100 20 | Star Wars: Episode VII - The Force Awakens , 21 | -------------------------------------------------------------------------------- /dsio/testdata/csv/movies_sorted_movie_title/body.csv: -------------------------------------------------------------------------------- 1 | movie_title,duration 2 | Avatar ,178 3 | Avengers: Age of Ultron ,141 4 | Batman v Superman: Dawn of Justice ,183 5 | Harry Potter and the Half-Blood Prince ,153 6 | John Carter ,132 7 | Man of Steel ,143 8 | Pirates of the Caribbean: At World's End ,169 9 | Pirates of the Caribbean: Dead Man's Chest ,151 10 | Pirates of the Caribbean: On Stranger Tides ,136 11 | Quantum of Solace ,106 12 | Spectre ,148 13 | Spider-Man 3 
,156 14 | Star Wars: Episode VII - The Force Awakens , 15 | Superman Returns ,169 16 | Tangled ,100 17 | The Avengers ,173 18 | The Chronicles of Narnia: Prince Caspian ,150 19 | The Dark Knight Rises ,164 20 | The Lone Ranger ,150 21 | -------------------------------------------------------------------------------- /.codeclimate.yml: -------------------------------------------------------------------------------- 1 | ratings: 2 | paths: 3 | - "**/*.go" 4 | 5 | engines: 6 | fixme: 7 | enabled: true 8 | config: 9 | strings: 10 | - TODO 11 | golint: 12 | enabled: true 13 | govet: 14 | enabled: true 15 | gofmt: 16 | enabled: true 17 | 18 | version: "2" 19 | checks: 20 | argument-count: 21 | enabled: false 22 | complex-logic: 23 | enabled: false 24 | file-lines: 25 | enabled: false 26 | method-complexity: 27 | enabled: false 28 | method-count: 29 | enabled: false 30 | method-lines: 31 | enabled: false 32 | nested-control-flow: 33 | enabled: false 34 | return-statements: 35 | enabled: false 36 | similar-code: 37 | enabled: false -------------------------------------------------------------------------------- /dsio/testdata/csv/movies_sorted_movie_title_desc/body.csv: -------------------------------------------------------------------------------- 1 | movie_title,duration 2 | The Lone Ranger ,150 3 | The Dark Knight Rises ,164 4 | The Chronicles of Narnia: Prince Caspian ,150 5 | The Avengers ,173 6 | Tangled ,100 7 | Superman Returns ,169 8 | Star Wars: Episode VII - The Force Awakens , 9 | Spider-Man 3 ,156 10 | Spectre ,148 11 | Quantum of Solace ,106 12 | Pirates of the Caribbean: On Stranger Tides ,136 13 | Pirates of the Caribbean: Dead Man's Chest ,151 14 | Pirates of the Caribbean: At World's End ,169 15 | Man of Steel ,143 16 | John Carter ,132 17 | Harry Potter and the Half-Blood Prince ,153 18 | Batman v Superman: Dawn of Justice ,183 19 | Avengers: Age of Ultron ,141 20 | Avatar ,178 21 | 
-------------------------------------------------------------------------------- /dsio/testdata/csv/movies_sorted_duration_movie_title/body.csv: -------------------------------------------------------------------------------- 1 | movie_title,duration 2 | Star Wars: Episode VII - The Force Awakens , 3 | Tangled ,100 4 | Quantum of Solace ,106 5 | John Carter ,132 6 | Pirates of the Caribbean: On Stranger Tides ,136 7 | Avengers: Age of Ultron ,141 8 | Man of Steel ,143 9 | Spectre ,148 10 | The Chronicles of Narnia: Prince Caspian ,150 11 | The Lone Ranger ,150 12 | Pirates of the Caribbean: Dead Man's Chest ,151 13 | Harry Potter and the Half-Blood Prince ,153 14 | Spider-Man 3 ,156 15 | The Dark Knight Rises ,164 16 | Pirates of the Caribbean: At World's End ,169 17 | Superman Returns ,169 18 | The Avengers ,173 19 | Avatar ,178 20 | Batman v Superman: Dawn of Justice ,183 21 | -------------------------------------------------------------------------------- /validate/csv_test.go: -------------------------------------------------------------------------------- 1 | package validate 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | ) 7 | 8 | func TestCheckCsvRowLengths(t *testing.T) { 9 | cases := []struct { 10 | input string 11 | err string 12 | }{ 13 | {rawText1, ""}, 14 | {rawText2, ""}, 15 | {rawText2b, ""}, 16 | {rawText3, ""}, //Note: since there are no commas this should pass 17 | {rawText4, "error: inconsistent column length on line 4 of length 2 (rather than 1). ensure all csv columns same length"}, 18 | } 19 | 20 | for i, c := range cases { 21 | r := strings.NewReader(c.input) 22 | err := CheckCsvRowLengths(r) 23 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) { 24 | t.Errorf("case [%d] error mismatch. 
expected: '%s', got: '%s'", i, c.err, err) 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /dsdiff/README.md: -------------------------------------------------------------------------------- 1 | [![Qri](https://img.shields.io/badge/made%20by-qri-magenta.svg?style=flat-square)](https://qri.io) 2 | [![GoDoc](https://godoc.org/github.com/qri-io/dsdiff?status.svg)](http://godoc.org/github.com/qri-io/dsdiff) 3 | [![License](https://img.shields.io/github/license/qri-io/dsdiff.svg?style=flat-square)](./LICENSE) 4 | [![Codecov](https://img.shields.io/codecov/c/github/qri-io/dsdiff.svg?style=flat-square)](https://codecov.io/gh/qri-io/dsdiff) 5 | [![CI](https://img.shields.io/circleci/project/github/qri-io/dsdiff.svg?style=flat-square)](https://circleci.com/gh/qri-io/dsdiff) 6 | [![Go Report Card](https://goreportcard.com/badge/github.com/qri-io/dsdiff)](https://goreportcard.com/report/github.com/qri-io/dsdiff) 7 | 8 | # dsdiff 9 | 10 | Utility for Diffing Datasets, currently a very basic placeholder 11 | -------------------------------------------------------------------------------- /dsdiff/testdata/structureJsonSchemaOrig.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "qri:ds:0", 3 | "BodyPath": "abc", 4 | "path": "123", 5 | "format": "csv", 6 | "formatConfig": { 7 | "headerRow": true 8 | }, 9 | "structure": { 10 | "schema": { 11 | "title": "Person", 12 | "type": "object", 13 | "properties": { 14 | "firstName": { 15 | "type": "string" 16 | }, 17 | "lastName": { 18 | "type": "string" 19 | }, 20 | "age": { 21 | "description": "Age in years", 22 | "type": "integer", 23 | "minimum": 0 24 | }, 25 | "friends": { 26 | "type" : "array", 27 | "items" : { "title" : "REFERENCE", "$ref" : "#" } 28 | } 29 | }, 30 | "required": ["firstName", "lastName"] 31 | } 32 | } 33 | } -------------------------------------------------------------------------------- 
/dsviz/testdata/default/input.dataset.json: -------------------------------------------------------------------------------- 1 | { 2 | "path" : "/ipfs/QmSH2WNg8x3ckC8GYTZDY6kVtxfMo2RNJSMgcc2Ewb7iiJ", 3 | "peername" : "steve", 4 | "name" : "world_pop", 5 | "commit" : { 6 | "timestamp": "2019-03-20T20:02:24.689938Z" 7 | }, 8 | "meta": { 9 | "title": "World Population", 10 | "description": "a dataset showing the population of the world" 11 | }, 12 | "structure" : { 13 | "format": "json", 14 | "length" : 234567, 15 | "entries" : 234, 16 | "schema": { 17 | "type": "array", 18 | "items": { 19 | "type": "array", 20 | "items": [ 21 | { "title": "year", "type": "integer" }, 22 | { "title": "population", "type": "integer" } 23 | ] 24 | } 25 | } 26 | }, 27 | "viz":{ 28 | "format": "html", 29 | "scriptPath": "bar" 30 | } 31 | } -------------------------------------------------------------------------------- /dsio/testdata/csv/cities/input.dataset.json: -------------------------------------------------------------------------------- 1 | { 2 | "qri": "ds:0", 3 | "structure": { 4 | "title": "example city data", 5 | "structure": { 6 | "format": "csv", 7 | "formatConfig": { 8 | "headerRow": true 9 | }, 10 | "schema": { 11 | "type": "array", 12 | "items": { 13 | "type": "array", 14 | "items": [ 15 | { 16 | "title": "city", 17 | "type": "string" 18 | }, 19 | { 20 | "title": "pop", 21 | "type": "integer" 22 | }, 23 | { 24 | "title": "avg_age", 25 | "type": "number" 26 | }, 27 | { 28 | "title": "in_usa", 29 | "type": "boolean" 30 | } 31 | ] 32 | } 33 | } 34 | } 35 | } 36 | } -------------------------------------------------------------------------------- /validate/validate_test.go: -------------------------------------------------------------------------------- 1 | package validate 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestValidName(t *testing.T) { 8 | cases := []struct { 9 | name string 10 | err string 11 | }{ 12 | {"", "error: name cannot be empty"}, 13 | {"9", "error: 
illegal name '9', names must start with a letter and consist of only a-z,0-9, and _. max length 144 characters"}, 14 | {"_", "error: illegal name '_', names must start with a letter and consist of only a-z,0-9, and _. max length 144 characters"}, 15 | {"_foo", "error: illegal name '_foo', names must start with a letter and consist of only a-z,0-9, and _. max length 144 characters"}, 16 | } 17 | 18 | for i, c := range cases { 19 | err := ValidName(c.name) 20 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) { 21 | t.Errorf("case %d error mismatch. expected: %s, got: %s", i, c.err, err) 22 | continue 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /dsdiff/testdata/structureJsonSchemaNew.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "qri:ds:0", 3 | "BodyPath": "abc", 4 | "path": "123", 5 | "format": "csv", 6 | "formatConfig": { 7 | "headerRow": true 8 | }, 9 | "structure": { 10 | "schema": { 11 | "title": "Person", 12 | "type": "object", 13 | "properties": { 14 | "firstName": { 15 | "type": "string" 16 | }, 17 | "middleName": { 18 | "type": "string" 19 | }, 20 | "lastName": { 21 | "type": "string" 22 | }, 23 | "age": { 24 | "description": "Age in years", 25 | "type": "integer", 26 | "minimum": 0 27 | }, 28 | "friends": { 29 | "type" : "array", 30 | "items" : { "title" : "REFERENCE", "$ref" : "#" } 31 | } 32 | }, 33 | "required": ["firstName", "lastName"] 34 | } 35 | } 36 | } -------------------------------------------------------------------------------- /dsviz/testdata/default/rendered.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 9 | World Population 10 | 11 | 12 |
13 |
14 |

World Population

15 |
16 |
17 |

a dataset showing the population of the world

18 |
19 | 26 |
27 | 28 | -------------------------------------------------------------------------------- /validate/validate.go: -------------------------------------------------------------------------------- 1 | package validate 2 | 3 | import ( 4 | "fmt" 5 | "regexp" 6 | 7 | logger "github.com/ipfs/go-log" 8 | ) 9 | 10 | var ( 11 | alphaNumericRegex = regexp.MustCompile(`^[a-zA-Z]\w{0,143}$`) 12 | log = logger.Logger("validate") 13 | ) 14 | 15 | // ValidName checks for a valid variable name 16 | // names must: 17 | // * start with a letter 18 | // * consist of only alpha-numeric characters and/or underscores 19 | // * have a total length of no more than 144 characters 20 | func ValidName(name string) error { 21 | if name == "" { 22 | err := fmt.Errorf("error: name cannot be empty") 23 | log.Debug(err.Error()) 24 | return err 25 | } 26 | if alphaNumericRegex.FindString(name) == "" { 27 | err := fmt.Errorf("error: illegal name '%s', names must start with a letter and consist of only a-z,0-9, and _. max length 144 characters", name) 28 | log.Debug(err.Error()) 29 | return err 30 | } 31 | return nil 32 | } 33 | -------------------------------------------------------------------------------- /dsio/tracked_reader.go: -------------------------------------------------------------------------------- 1 | package dsio 2 | 3 | import "io" 4 | 5 | // TrackedReader wraps a reader, keeping an internal count of the bytes read 6 | type TrackedReader struct { 7 | read int 8 | r io.Reader 9 | } 10 | 11 | // NewTrackedReader creates a new tracked reader 12 | func NewTrackedReader(r io.Reader) *TrackedReader { 13 | return &TrackedReader{r: r} 14 | } 15 | 16 | // Read implements the io.Reader interface 17 | func (tr *TrackedReader) Read(p []byte) (n int, err error) { 18 | n, err = tr.r.Read(p) 19 | tr.read += n 20 | return 21 | } 22 | 23 | // BytesRead gives the total number of bytes read from the underlying reader 24 | func (tr *TrackedReader) BytesRead() int { 25 | return tr.read 26 | } 27 | 28 | 
// Close implements the io.Closer interface, closes the underlying reader if 29 | // it's an io.Closer 30 | func (tr *TrackedReader) Close() error { 31 | if cl, ok := tr.r.(io.Closer); ok { 32 | return cl.Close() 33 | } 34 | return nil 35 | } 36 | -------------------------------------------------------------------------------- /detect/testdata/police.structure.json: -------------------------------------------------------------------------------- 1 | { 2 | "format": "csv", 3 | "schema": { 4 | "type": "array", 5 | "items": { 6 | "type": "array", 7 | "items": [ 8 | { 9 | "title": "city", 10 | "type": "string" 11 | }, 12 | { 13 | "title": "police_force_size", 14 | "type": "integer" 15 | }, 16 | { 17 | "title": "all", 18 | "type": "number" 19 | }, 20 | { 21 | "title": "white", 22 | "type": "number" 23 | }, 24 | { 25 | "title": "non_white", 26 | "type": "number" 27 | }, 28 | { 29 | "title": "black", 30 | "type": "number" 31 | }, 32 | { 33 | "title": "hispanic", 34 | "type": "number" 35 | }, 36 | { 37 | "title": "asian", 38 | "type": "number" 39 | } 40 | ] 41 | } 42 | } 43 | } -------------------------------------------------------------------------------- /validate/csv.go: -------------------------------------------------------------------------------- 1 | package validate 2 | 3 | import ( 4 | "encoding/csv" 5 | "fmt" 6 | "io" 7 | ) 8 | 9 | // CheckCsvRowLengths ensures that csv input has 10 | // the same number of columns in every row and otherwise 11 | // returns an error 12 | func CheckCsvRowLengths(r io.Reader) error { 13 | csvReader := csv.NewReader(r) 14 | csvReader.FieldsPerRecord = -1 15 | csvReader.TrimLeadingSpace = true 16 | //csvReader.LazyQuotes = true 17 | firstRow, err := csvReader.Read() 18 | rowLen := len(firstRow) 19 | if err != nil { 20 | return fmt.Errorf("error reading first row of csv: %s", err.Error()) 21 | } 22 | for i := 1; ; i++ { 23 | record, err := csvReader.Read() 24 | if err == io.EOF { 25 | return nil 26 | } 27 | if err != nil { 28 | return 
err 29 | } 30 | if len(record) != rowLen { 31 | return fmt.Errorf("error: inconsistent column length on line %d of length %d (rather than %d). ensure all csv columns same length", i, len(record), rowLen) 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /dsio/testdata/json/array/body.json: -------------------------------------------------------------------------------- 1 | [ 2 | false, 3 | true, 4 | null, 5 | 1234567890, 6 | -1234567890e3, 7 | "this is a very long string to make sure the bytes.Scanner needs to load more data at least once during the course of scanning. So now I'm just gonna paste the word puppy like 30x. puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy ", 8 | "oh hello there", 9 | { 10 | "key": "oh hello there" 11 | }, 12 | [ 13 | "key", 14 | "oh hello there" 15 | ], 16 | { 17 | "objects": { 18 | "within": { 19 | "objects": { 20 | "that": { 21 | "haz": [ 22 | "array" 23 | ] 24 | } 25 | } 26 | } 27 | } 28 | } 29 | ] -------------------------------------------------------------------------------- /dsio/testdata/json/object/body.json: -------------------------------------------------------------------------------- 1 | { 2 | "a": false, 3 | "b": true, 4 | "c": null, 5 | "g": "this is a very long string to make sure the bytes.Scanner needs to load more data at least once during the course of scanning. So now I'm just gonna paste the word puppy like 30x. 
puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy ", 6 | "d": "oh hello there", 7 | "e": { 8 | "key": "oh hello there" 9 | }, 10 | "f": [ 11 | "key", 12 | "oh hello there" 13 | ], 14 | "l": { 15 | "objects": { 16 | "within": { 17 | "objects": { 18 | "that": { 19 | "haz": [ 20 | "array" 21 | ] 22 | } 23 | } 24 | } 25 | } 26 | }, 27 | "m": 1234567890, 28 | "n": -1234567890 29 | } -------------------------------------------------------------------------------- /dsdiff/testdata/newStructure.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "qri:ds:0", 3 | "BodyPath": "abc", 4 | "path": "123", 5 | "structure": { 6 | "checksum": "QmRSLr53cRGhkwx1L3uGNCY7QGvup3XGy9Jcaaa", 7 | "entries": 35, 8 | "format": "csv", 9 | "formatConfig": { 10 | "headerRow": true 11 | }, 12 | "length": 1582, 13 | "qri": "st:0", 14 | "schema": { 15 | "items": { 16 | "items": [ 17 | { 18 | "title": "ranking", 19 | "type": "integer" 20 | }, 21 | { 22 | "title": "prob_of_automation", 23 | "type": "number" 24 | }, 25 | { 26 | "title": "soc_code", 27 | "type": "string" 28 | }, 29 | { 30 | "title": "job_title", 31 | "type": "string" 32 | } 33 | ], 34 | "type": "array" 35 | }, 36 | "type": "array" 37 | } 38 | }, 39 | "meta": { 40 | "title": "abc", 41 | "description": "I am a dataset" 42 | }, 43 | "visConfig": { 44 | "format": "abc" 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 Qri, Inc. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /dsdiff/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Qri.io 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/qri-io/dataset 2 | 3 | go 1.15 4 | 5 | require ( 6 | github.com/360EntSecGroup-Skylar/excelize v1.4.1 7 | github.com/axiomhq/hyperloglog v0.0.0-20191112132149-a4c4c47bc57f 8 | github.com/dgryski/go-sip13 v0.0.0-20200911182023-62edffca9245 // indirect 9 | github.com/dgryski/go-topk v0.0.0-20191119021947-593b4f2374c9 10 | github.com/google/go-cmp v0.5.5 11 | github.com/ipfs/go-log v1.0.5 12 | github.com/jinzhu/copier v0.0.0-20190924061706-b57f9002281a 13 | github.com/k0kubun/colorstring v0.0.0-20150214042306-9440f1994b88 // indirect 14 | github.com/klauspost/compress v1.13.0 15 | github.com/libp2p/go-libp2p-core v0.8.5 16 | github.com/mr-tron/base58 v1.2.0 17 | github.com/multiformats/go-multihash v0.0.15 18 | github.com/qri-io/compare v0.1.0 19 | github.com/qri-io/jsonschema v0.2.2-0.20210618085106-a515144d7449 20 | github.com/qri-io/qfs v0.6.1-0.20210629014446-45bdcdb57434 21 | github.com/qri-io/varName v0.1.0 22 | github.com/sergi/go-diff v1.1.0 // indirect 23 | github.com/ugorji/go/codec v1.1.7 24 | github.com/yudai/gojsondiff v1.0.0 25 | github.com/yudai/golcs v0.0.0-20170316035057-ecda9a501e82 // indirect 26 | github.com/yudai/pp v2.0.1+incompatible // indirect 27 | ) 28 | -------------------------------------------------------------------------------- /detect/cbor.go: 
-------------------------------------------------------------------------------- 1 | package detect 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | 8 | "github.com/qri-io/dataset" 9 | ) 10 | 11 | const ( 12 | cborBdIndefiniteArray byte = 0x9f 13 | cborBdIndefiniteMap = 0xbf 14 | cborBaseArray = 0x80 15 | cborBaseMap = 0xa0 16 | cborBaseTag = 0xc0 17 | ) 18 | 19 | // CBORSchema determines the field names and types of an io.Reader of CBOR-formatted data, returning a json schema 20 | func CBORSchema(resource *dataset.Structure, data io.Reader) (schema map[string]interface{}, n int, err error) { 21 | rd := bufio.NewReader(data) 22 | bd, err := rd.ReadByte() 23 | n++ 24 | if err != nil && err != io.EOF { 25 | log.Debugf(err.Error()) 26 | err = fmt.Errorf("error reading data: %s", err.Error()) 27 | return 28 | } 29 | 30 | switch { 31 | case bd >= cborBaseArray && bd < cborBaseMap, bd == cborBdIndefiniteArray: 32 | return dataset.BaseSchemaArray, n, nil 33 | case bd >= cborBaseMap && bd < cborBaseTag, bd == cborBdIndefiniteMap: 34 | return dataset.BaseSchemaObject, n, nil 35 | default: 36 | err = fmt.Errorf("invalid top-level type for CBOR data. 
cbor datasets must begin with either an array or map") 37 | log.Debugf(err.Error()) 38 | return 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /dsdiff/testdata/orig.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "qri:ds:0", 3 | "bodyPath": "abc", 4 | "path": "123", 5 | 6 | "structure": { 7 | "checksum": "QmRSLr53cRGhkwx1L3uGNCY7QGvup3XGy9Jcud9", 8 | "entries": 33, 9 | "format": "csv", 10 | "formatConfig": { 11 | "headerRow": true 12 | }, 13 | "length": 1582, 14 | "qri": "st:0", 15 | "Commit": { 16 | "title": "abc" 17 | }, 18 | "PreviousPath": "", 19 | "schema": { 20 | "items": { 21 | "items": [ 22 | { 23 | "title": "rank", 24 | "type": "integer" 25 | }, 26 | { 27 | "title": "probability_of_automation", 28 | "type": "number" 29 | }, 30 | { 31 | "title": "soc_code", 32 | "type": "string" 33 | }, 34 | { 35 | "title": "job_title", 36 | "type": "string" 37 | } 38 | ], 39 | "type": "array" 40 | }, 41 | "type": "array" 42 | } 43 | }, 44 | "meta": { 45 | "title": "abc", 46 | "description": "I am a dataset" 47 | }, 48 | "transform": { 49 | "syntax": "python", 50 | "data": "abc", 51 | "config": { 52 | "option": "value" 53 | } 54 | }, 55 | "viz": { 56 | "format": "abc" 57 | } 58 | } 59 | 60 | -------------------------------------------------------------------------------- /dsio/entry.go: -------------------------------------------------------------------------------- 1 | package dsio 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | ) 7 | 8 | // Entry is a "row" of a dataset 9 | type Entry struct { 10 | // Index represents this entry's numeric position in a dataset 11 | // this index may not necessarily refer to the overall position within the dataset 12 | // as things like offsets affect where the index begins 13 | Index int 14 | // Key is a string key for this entry 15 | // only present when the top level structure is a map 16 | Key string 17 | // Value is information 
contained within the row 18 | Value interface{} 19 | } 20 | 21 | // DataIteratorFunc is a function for each "row" of a resource's raw data 22 | type DataIteratorFunc func(int, Entry, error) error 23 | 24 | // EachEntry calls fn on each row of a given EntryReader 25 | func EachEntry(rr EntryReader, fn DataIteratorFunc) error { 26 | num := 0 27 | for { 28 | row, err := rr.ReadEntry() 29 | if err != nil { 30 | if err.Error() == io.EOF.Error() { 31 | return nil 32 | } 33 | err := fmt.Errorf("error reading row %d: %s", num, err.Error()) 34 | log.Debug(err.Error()) 35 | return err 36 | } 37 | 38 | if err := fn(num, row, err); err != nil { 39 | if err.Error() == io.EOF.Error() { 40 | return nil 41 | } 42 | log.Debug(err.Error()) 43 | return err 44 | } 45 | num++ 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /dsdiff/testdata/newData.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "qri:ds:0", 3 | "bodyPath": "abcdefg", 4 | "path": "123", 5 | 6 | "structure": { 7 | "checksum": "QmRSLr53cRGhkwx1L3uGNCY7QGvup3XGy9Jcud9", 8 | "entries": 33, 9 | "format": "csv", 10 | "formatConfig": { 11 | "headerRow": true 12 | }, 13 | "length": 1582, 14 | "qri": "st:0", 15 | "Commit": { 16 | "title": "abc" 17 | }, 18 | "PreviousPath": "", 19 | "schema": { 20 | "items": { 21 | "items": [ 22 | { 23 | "title": "rank", 24 | "type": "integer" 25 | }, 26 | { 27 | "title": "probability_of_automation", 28 | "type": "number" 29 | }, 30 | { 31 | "title": "soc_code", 32 | "type": "string" 33 | }, 34 | { 35 | "title": "job_title", 36 | "type": "string" 37 | } 38 | ], 39 | "type": "array" 40 | }, 41 | "type": "array" 42 | } 43 | }, 44 | "meta": { 45 | "title": "abc", 46 | "description": "I am a dataset" 47 | }, 48 | "transform": { 49 | "syntax": "python", 50 | "data": "abc", 51 | "config": { 52 | "option": "value" 53 | } 54 | }, 55 | "viz": { 56 | "format": "abc" 57 | } 58 | } 59 | 60 | 
-------------------------------------------------------------------------------- /dsdiff/testdata/newDescription.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "qri:ds:0", 3 | "BodyPath": "abc", 4 | "path": "123", 5 | 6 | "structure": { 7 | "checksum": "QmRSLr53cRGhkwx1L3uGNCY7QGvup3XGy9Jcud9", 8 | "entries": 33, 9 | "format": "csv", 10 | "formatConfig": { 11 | "headerRow": true 12 | }, 13 | "length": 1582, 14 | "qri": "st:0", 15 | "Commit": { 16 | "title": "abc" 17 | }, 18 | "PreviousPath": "", 19 | "schema": { 20 | "items": { 21 | "items": [ 22 | { 23 | "title": "rank", 24 | "type": "integer" 25 | }, 26 | { 27 | "title": "probability_of_automation", 28 | "type": "number" 29 | }, 30 | { 31 | "title": "soc_code", 32 | "type": "string" 33 | }, 34 | { 35 | "title": "job_title", 36 | "type": "string" 37 | } 38 | ], 39 | "type": "array" 40 | }, 41 | "type": "array" 42 | } 43 | }, 44 | "meta": { 45 | "title": "abc", 46 | "description": "I am a new description" 47 | }, 48 | "transform": { 49 | "syntax": "python", 50 | "data": "abc", 51 | "config": { 52 | "option": "value" 53 | } 54 | }, 55 | "viz": { 56 | "format": "abc" 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /hash.go: -------------------------------------------------------------------------------- 1 | package dataset 2 | 3 | import ( 4 | "crypto/sha256" 5 | "encoding/json" 6 | "fmt" 7 | 8 | "github.com/mr-tron/base58/base58" 9 | "github.com/multiformats/go-multihash" 10 | ) 11 | 12 | // JSONHash calculates the hash of a json.Marshaler 13 | // It's important to note that this is *NOT* the same as an IPFS hash, 14 | // These hash functions should be used for other things like 15 | // checksumming, in-memory content-addressing, etc. 
16 | func JSONHash(m json.Marshaler) (hash string, err error) { 17 | // marshal to cannoncical JSON representation 18 | data, err := m.MarshalJSON() 19 | if err != nil { 20 | return 21 | } 22 | return HashBytes(data) 23 | } 24 | 25 | // HashBytes generates the base-58 encoded SHA-256 hash of a byte slice 26 | // It's important to note that this is *NOT* the same as an IPFS hash, 27 | // These hash functions should be used for other things like 28 | // checksumming, in-memory content-addressing, etc. 29 | func HashBytes(data []byte) (hash string, err error) { 30 | h := sha256.New() 31 | 32 | if _, err = h.Write(data); err != nil { 33 | return 34 | } 35 | 36 | mhBuf, err := multihash.Encode(h.Sum(nil), multihash.SHA2_256) 37 | if err != nil { 38 | err = fmt.Errorf("error allocating multihash buffer: %s", err.Error()) 39 | return 40 | } 41 | 42 | hash = base58.Encode(mhBuf) 43 | return 44 | } 45 | -------------------------------------------------------------------------------- /dsdiff/testdata/newTitle.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "qri:ds:0", 3 | "BodyPath": "abc", 4 | "path": "123", 5 | "structure": { 6 | "checksum": "QmRSLr53cRGhkwx1L3uGNCY7QGvup3XGy9Jcud9", 7 | "entries": 33, 8 | "format": "csv", 9 | "formatConfig": { 10 | "headerRow": true 11 | }, 12 | "length": 1582, 13 | "qri": "st:0", 14 | "transform": { 15 | "syntax": "python", 16 | "data": "abc", 17 | "config": { 18 | "option": "value" 19 | } 20 | }, 21 | "Commit": { 22 | "title": "abc" 23 | }, 24 | "PreviousPath": "", 25 | "schema": { 26 | "items": { 27 | "items": [ 28 | { 29 | "title": "rank", 30 | "type": "integer" 31 | }, 32 | { 33 | "title": "probability_of_automation", 34 | "type": "number" 35 | }, 36 | { 37 | "title": "soc_code", 38 | "type": "string" 39 | }, 40 | { 41 | "title": "job_title", 42 | "type": "string" 43 | } 44 | ], 45 | "type": "array" 46 | }, 47 | "type": "array" 48 | } 49 | }, 50 | "meta": { 51 | "title": "data 
data data", 52 | "description": "I am a dataset" 53 | }, 54 | "visConfig": { 55 | "format": "abc" 56 | } 57 | } 58 | 59 | -------------------------------------------------------------------------------- /dsdiff/testdata/newVisConfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "qri:ds:0", 3 | "BodyPath": "abc", 4 | "path": "123", 5 | 6 | "structure": { 7 | "checksum": "QmRSLr53cRGhkwx1L3uGNCY7QGvup3XGy9Jcud9", 8 | "entries": 33, 9 | "format": "csv", 10 | "formatConfig": { 11 | "headerRow": true 12 | }, 13 | "length": 1582, 14 | "qri": "st:0", 15 | "Commit": { 16 | "title": "abc" 17 | }, 18 | "PreviousPath": "", 19 | "schema": { 20 | "items": { 21 | "items": [ 22 | { 23 | "title": "rank", 24 | "type": "integer" 25 | }, 26 | { 27 | "title": "probability_of_automation", 28 | "type": "number" 29 | }, 30 | { 31 | "title": "soc_code", 32 | "type": "string" 33 | }, 34 | { 35 | "title": "job_title", 36 | "type": "string" 37 | } 38 | ], 39 | "type": "array" 40 | }, 41 | "type": "array" 42 | } 43 | }, 44 | "meta": { 45 | "title": "abc", 46 | "description": "I am a dataset" 47 | }, 48 | "transform": { 49 | "syntax": "python", 50 | "data": "abc", 51 | "config": { 52 | "option": "value" 53 | } 54 | }, 55 | "viz": { 56 | "format": "new thing" 57 | } 58 | } 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /dsviz/testdata/default/template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | {{- block "stylesheet" . }} 5 | 10 | {{ end -}} 11 | {{ title }} 12 | 13 | 14 |
15 | {{ block "header" . -}} 16 |
17 |

{{ title }}

18 |
19 | {{ end -}} 20 | {{- block "summary" . -}} 21 |
22 |

{{ ds.meta.description }}

23 |
24 | {{ end -}} 25 | {{- block "stats" . }}{{ if ds.stats -}} 26 |

stats

27 | {{ end }}{{ end -}} 28 | {{- block "citations" . -}} 29 | 38 | {{- end }} 39 |
40 | 41 | -------------------------------------------------------------------------------- /dsdiff/testdata/newTransform.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "qri:ds:0", 3 | "BodyPath": "abc", 4 | "path": "123", 5 | 6 | "structure": { 7 | "checksum": "QmRSLr53cRGhkwx1L3uGNCY7QGvup3XGy9Jcud9", 8 | "entries": 33, 9 | "format": "csv", 10 | "formatConfig": { 11 | "headerRow": true 12 | }, 13 | "length": 1582, 14 | "qri": "st:0", 15 | "Commit": { 16 | "title": "abc" 17 | }, 18 | "PreviousPath": "", 19 | "schema": { 20 | "items": { 21 | "items": [ 22 | { 23 | "title": "rank", 24 | "type": "integer" 25 | }, 26 | { 27 | "title": "probability_of_automation", 28 | "type": "number" 29 | }, 30 | { 31 | "title": "soc_code", 32 | "type": "string" 33 | }, 34 | { 35 | "title": "job_title", 36 | "type": "string" 37 | } 38 | ], 39 | "type": "array" 40 | }, 41 | "type": "array" 42 | } 43 | }, 44 | "meta": { 45 | "title": "abc", 46 | "description": "I am a dataset" 47 | }, 48 | "transform": { 49 | "appVersion": "0.1.0", 50 | "syntax": "sql", 51 | "data": "xyz", 52 | "config": { 53 | "option": "new_value" 54 | } 55 | }, 56 | "viz": { 57 | "format": "abc" 58 | } 59 | } 60 | 61 | -------------------------------------------------------------------------------- /detect/json_test.go: -------------------------------------------------------------------------------- 1 | package detect 2 | 3 | import ( 4 | "io" 5 | "strings" 6 | "testing" 7 | 8 | "github.com/google/go-cmp/cmp" 9 | "github.com/qri-io/dataset" 10 | ) 11 | 12 | func TestJSONSchema(t *testing.T) { 13 | 14 | pr, _ := io.Pipe() 15 | pr.Close() 16 | _, _, err := JSONSchema(&dataset.Structure{}, pr) 17 | if err == nil { 18 | t.Error("expected error when reading bad reader") 19 | return 20 | } 21 | 22 | cases := []struct { 23 | st *dataset.Structure 24 | data string 25 | expect map[string]interface{} 26 | err string 27 | }{ 28 | {&dataset.Structure{}, "", nil, "invalid 
json data"}, 29 | {&dataset.Structure{}, "f", nil, "invalid json data"}, 30 | {&dataset.Structure{}, "{", dataset.BaseSchemaObject, ""}, 31 | {&dataset.Structure{}, "[", dataset.BaseSchemaArray, ""}, 32 | {&dataset.Structure{}, strings.Repeat(" ", 250) + "[", dataset.BaseSchemaArray, ""}, 33 | } 34 | 35 | for i, c := range cases { 36 | rdr := strings.NewReader(c.data) 37 | 38 | got, _, err := JSONSchema(c.st, rdr) 39 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) { 40 | t.Errorf("case %d error mismatch. expected: '%s', got: '%s'", i, c.err, err) 41 | return 42 | } 43 | 44 | if diff := cmp.Diff(c.expect, got); diff != "" { 45 | t.Errorf("case %d returned schema mismatch (-want +got):\n%s", i, diff) 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /dsio/testdata/json/links_array/body.json: -------------------------------------------------------------------------------- 1 | [ 2 | "http://datatogether.org", 3 | "https://datatogether.org/css/style.css", 4 | "https://datatogether.org/img/favicon.ico", 5 | "https://datatogether.org", 6 | "https://datatogether.org/public-record", 7 | "https://datatogether.org/activities", 8 | "https://datatogether.org/activities/harvesting", 9 | "https://datatogether.org/activities/monitoring", 10 | "https://datatogether.org/activities/storing", 11 | "https://datatogether.org/activities/rescuing", 12 | "http://2017.code4lib.org", 13 | "https://datatogether.org/presentations/Code4Lib%202017%20-%20Golden%20Age%20for%20Libraries%20-%20Storing%20Data%20Together.pdf", 14 | "https://datatogether.org/presentations/Code4Lib%202017%20-%20Golden%20Age%20for%20Libraries%20-%20Storing%20Data%20Together.key", 15 | "http://www.esipfed.org/meetings/upcoming-meetings/esip-summer-meeting-2017", 16 | "https://datatogether.org/presentations/Data%20Together%20-%20ESIP%20Summer%20Meeting%20July%202017.pdf", 17 | 
"https://datatogether.org/presentations/Data%20Together%20-%20ESIP%20Summer%20Meeting%20July%202017.key", 18 | "https://archive.org/details/ndsr-dc-2017", 19 | "https://datatogether.org/presentations/Data%20Together%20-%20NDSR%20-%20swadeshi.pdf", 20 | "https://datatogether.org/presentations/Data%20Together%20-%20NDSR%20-%20swadeshi.key", 21 | "https://github.com/datatogether" 22 | ] -------------------------------------------------------------------------------- /dsio/entry_test.go: -------------------------------------------------------------------------------- 1 | package dsio 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | "testing" 7 | 8 | "github.com/qri-io/dataset" 9 | "github.com/qri-io/dataset/dstest" 10 | ) 11 | 12 | func TestEachEntry(t *testing.T) { 13 | tc, err := dstest.NewTestCaseFromDir("testdata/json/city") 14 | if err != nil { 15 | t.Errorf("error reading test case: %s", err.Error()) 16 | return 17 | } 18 | 19 | st := &dataset.Structure{ 20 | Format: "json", 21 | Schema: dataset.BaseSchemaArray, 22 | } 23 | r, err := NewEntryReader(st, tc.BodyFile()) 24 | if err != nil { 25 | t.Errorf("error allocating RowReader: %s", err.Error()) 26 | return 27 | } 28 | 29 | err = EachEntry(r, func(i int, ent Entry, err error) error { 30 | if err != nil { 31 | return err 32 | } 33 | 34 | // if len(expect[i]) != len(data) { 35 | // return fmt.Errorf("data length mismatch. expected %d, got: %d", len(expect[i]), len(data)) 36 | // } 37 | 38 | // for j, cell := range data { 39 | // if !bytes.Equal(expect[i][j], cell) { 40 | // return fmt.Errorf("result mismatch. row: %d, cell: %d. 
%s != %s", i, j, string(expect[i][j]), string(cell)) 41 | // } 42 | // } 43 | 44 | return nil 45 | }) 46 | 47 | if err != nil { 48 | t.Errorf("eachrow error: %s", err.Error()) 49 | return 50 | } 51 | } 52 | 53 | func testdataFile(base string) string { 54 | return filepath.Join(os.Getenv("GOPATH"), "/src/github.com/qri-io/dataset/testdata/", base) 55 | } 56 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | jobs: 3 | build: 4 | working_directory: /go/src/github.com/qri-io/dataset 5 | docker: 6 | - image: circleci/golang:latest 7 | environment: 8 | TEST_RESULTS: /tmp/test-results 9 | GO111MODULE: "on" 10 | GOPROXY: "https://proxy.golang.org" 11 | steps: 12 | - checkout 13 | - run: mkdir -p $TEST_RESULTS 14 | - restore_cache: 15 | key: dependency-cache-{{ checksum "go.sum" }} 16 | - run: 17 | name: Get CI Deps 18 | command: go get github.com/jstemmer/go-junit-report golang.org/x/lint/golint 19 | - run: 20 | name: Lint 21 | command: golint -set_exit_status ./... 22 | - run: 23 | name: Go Vet 24 | command: go vet ./... 
25 | - run: 26 | name: Run Tests 27 | command: | 28 | trap "go-junit-report <${TEST_RESULTS}/go-test.out > ${TEST_RESULTS}/go-test-report.xml" EXIT 29 | make test | tee ${TEST_RESULTS}/go-test.out 30 | - save_cache: 31 | key: dependency-cache-{{ checksum "go.sum" }} 32 | paths: 33 | - /go/pkg/mod 34 | - run: 35 | name: Publish coverage info to codecov.io 36 | command: bash <(curl -s https://codecov.io/bash) 37 | - store_artifacts: 38 | path: /tmp/test-results 39 | destination: raw-test-output 40 | - store_test_results: 41 | path: /tmp/test-results 42 | -------------------------------------------------------------------------------- /dsio/testdata/json/links_object/body.json: -------------------------------------------------------------------------------- 1 | { 2 | "a": "http://datatogether.org", 3 | "b": "https://datatogether.org/css/style.css", 4 | "c": "https://datatogether.org/img/favicon.ico", 5 | "d": "https://datatogether.org", 6 | "e": "https://datatogether.org/public-record", 7 | "f": "https://datatogether.org/activities", 8 | "g": "https://datatogether.org/activities/harvesting", 9 | "h": "https://datatogether.org/activities/monitoring", 10 | "i": "https://datatogether.org/activities/storing", 11 | "j": "https://datatogether.org/activities/rescuing", 12 | "k": "http://2017.code4lib.org", 13 | "l": "https://datatogether.org/presentations/Code4Lib%202017%20-%20Golden%20Age%20for%20Libraries%20-%20Storing%20Data%20Together.pdf", 14 | "m": "https://datatogether.org/presentations/Code4Lib%202017%20-%20Golden%20Age%20for%20Libraries%20-%20Storing%20Data%20Together.key", 15 | "n": "http://www.esipfed.org/meetings/upcoming-meetings/esip-summer-meeting-2017", 16 | "o": "https://datatogether.org/presentations/Data%20Together%20-%20ESIP%20Summer%20Meeting%20July%202017.pdf", 17 | "p": "https://datatogether.org/presentations/Data%20Together%20-%20ESIP%20Summer%20Meeting%20July%202017.key", 18 | "q": "https://archive.org/details/ndsr-dc-2017", 19 | "r": 
"https://datatogether.org/presentations/Data%20Together%20-%20NDSR%20-%20swadeshi.pdf", 20 | "s": "https://datatogether.org/presentations/Data%20Together%20-%20NDSR%20-%20swadeshi.key", 21 | "t": "https://github.com/datatogether" 22 | } -------------------------------------------------------------------------------- /validate/data_test.go: -------------------------------------------------------------------------------- 1 | package validate 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/qri-io/dataset/dsio" 8 | "github.com/qri-io/dataset/dstest" 9 | ) 10 | 11 | func TestEntryReader(t *testing.T) { 12 | cases := []struct { 13 | name string 14 | err string 15 | errors []string 16 | }{ 17 | {"craigslist", "", nil}, 18 | {"movies", "", []string{ 19 | `/0/1: "" type should be integer, got string`, 20 | `/1/1: "" type should be integer, got string`, 21 | }}, 22 | } 23 | 24 | for _, c := range cases { 25 | tc, err := dstest.NewTestCaseFromDir(fmt.Sprintf("testdata/%s", c.name)) 26 | if err != nil { 27 | t.Errorf("%s: error loading %s", c.name, err.Error()) 28 | continue 29 | } 30 | 31 | r, err := dsio.NewEntryReader(tc.Input.Structure, tc.BodyFile()) 32 | if err != nil { 33 | t.Errorf("%s: error creating entry reader: %s", c.name, err.Error()) 34 | continue 35 | } 36 | 37 | errors, err := EntryReader(r) 38 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) { 39 | t.Errorf("%s error mismatch. expected: %s, got: %s", c.name, c.err, err) 40 | continue 41 | } 42 | 43 | if len(errors) != len(c.errors) { 44 | t.Errorf("%s: error length mismatch. expected: %d, got: %d", c.name, len(c.errors), len(errors)) 45 | continue 46 | } 47 | 48 | for j, e := range errors { 49 | if e.Error() != c.errors[j] { 50 | t.Errorf("%s: validation error %d mismatch. 
expected: %s, got: %s", c.name, j, c.errors[j], e.Error()) 51 | continue 52 | } 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /dsio/streams.go: -------------------------------------------------------------------------------- 1 | package dsio 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | 7 | "github.com/qri-io/dataset" 8 | ) 9 | 10 | // PagedReader wraps a reader, starting reads from offset, and only reads limit number of entries 11 | type PagedReader struct { 12 | Reader EntryReader 13 | Limit int 14 | Offset int 15 | } 16 | 17 | var _ EntryReader = (*PagedReader)(nil) 18 | 19 | // Structure returns the wrapped reader's structure 20 | func (r *PagedReader) Structure() *dataset.Structure { 21 | return r.Reader.Structure() 22 | } 23 | 24 | // ReadEntry returns an entry, taking offset and limit into account 25 | func (r *PagedReader) ReadEntry() (Entry, error) { 26 | for r.Offset > 0 { 27 | _, err := r.Reader.ReadEntry() 28 | if err != nil { 29 | return Entry{}, err 30 | } 31 | r.Offset-- 32 | } 33 | if r.Limit == 0 { 34 | return Entry{}, io.EOF 35 | } 36 | r.Limit-- 37 | return r.Reader.ReadEntry() 38 | } 39 | 40 | // Close finalizes the writer, indicating no more records 41 | // will be written 42 | func (r *PagedReader) Close() error { 43 | return r.Reader.Close() 44 | } 45 | 46 | // Copy reads all entries from the reader and writes them to the writer 47 | func Copy(reader EntryReader, writer EntryWriter) error { 48 | for { 49 | val, err := reader.ReadEntry() 50 | if err != nil { 51 | if err == io.EOF { 52 | break 53 | } 54 | return fmt.Errorf("row iteration error: %s", err.Error()) 55 | } 56 | if err := writer.WriteEntry(val); err != nil { 57 | return fmt.Errorf("error writing value to buffer: %s", err.Error()) 58 | } 59 | } 60 | return nil 61 | } 62 | -------------------------------------------------------------------------------- /dsviz/doc.go: 
-------------------------------------------------------------------------------- 1 | /*Package dsviz renders the viz component of a dataset, returning a qfs.File of 2 | data 3 | 4 | HTML rendering uses go's html/template package to generate html documents from 5 | an input dataset. It's API has been adjusted to use lowerCamelCase instead of 6 | UpperCamelCase naming conventions 7 | 8 | outline: html viz templates 9 | HTML template should expose a number of helper template functions, along 10 | with a dataset document at ds. Exposing the dataset document as "ds" 11 | matches our conventions for referring to a dataset elsewhere, and allows 12 | access to all defined parts of a dataset. 13 | HTML visualization is built atop the 14 | [go template syntax](https://golang.org/pkg/text/template/#hdr-Functions) 15 | types: 16 | {{ ds }} 17 | the dataset being visualized, ds can have a number of components like 18 | commit, meta, transform, body, all of which have helpful fields for 19 | visualization. Details of the dataset document are outlined in the 20 | dataset document definition 21 | functions: 22 | {{ allBodyEntries }} 23 | load the full dataset body 24 | {{ bodyEntries offset limit }} 25 | get body entries within an offset/limit range. 
passing offset: 0, 26 | limit: -1 returns the entire body 27 | {{ filesize }} 28 | convert byte count to kb/mb/etc string 29 | {{ title }} 30 | give the title of a dataset 31 | {{ isType $val "type" }} 32 | return true or false if the type of $val matches the given type string 33 | possible type values are "string", "object", "array", "boolean", "number" 34 | */ 35 | package dsviz 36 | -------------------------------------------------------------------------------- /testdata/structures/airport-codes.json: -------------------------------------------------------------------------------- 1 | { 2 | "errCount": 5, 3 | "format": "csv", 4 | "formatConfig": { 5 | "headerRow": true 6 | }, 7 | "qri": "st:0", 8 | "mediatype": "text/csv", 9 | "readme": "readme.md", 10 | "schema": { 11 | "type": "array", 12 | "items": { 13 | "type": "array", 14 | "items": [ 15 | { 16 | "title": "ident", 17 | "type": "string" 18 | }, 19 | { 20 | "title": "type", 21 | "type": "string" 22 | }, 23 | { 24 | "title": "name", 25 | "type": "string" 26 | }, 27 | { 28 | "title": "latitude_deg", 29 | "type": "number" 30 | }, 31 | { 32 | "title": "longitude_deg", 33 | "type": "number" 34 | }, 35 | { 36 | "title": "elevation_ft", 37 | "type": "integer" 38 | }, 39 | { 40 | "title": "continent", 41 | "type": "string" 42 | }, 43 | { 44 | "title": "iso_country", 45 | "type": "string" 46 | }, 47 | { 48 | "title": "iso_region", 49 | "type": "string" 50 | }, 51 | { 52 | "title": "municipality", 53 | "type": "string" 54 | }, 55 | { 56 | "title": "gps_code", 57 | "type": "string" 58 | }, 59 | { 60 | "title": "iata_code", 61 | "type": "string" 62 | }, 63 | { 64 | "title": "local_code", 65 | "type": "string" 66 | } 67 | ] 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /dsio/entry_buffer_test.go: -------------------------------------------------------------------------------- 1 | package dsio 2 | 3 | import ( 4 | "encoding/json" 5 | "testing" 6 | 7 | 
"github.com/qri-io/dataset" 8 | "github.com/qri-io/dataset/dstest" 9 | ) 10 | 11 | func TestEntryBuffer(t *testing.T) { 12 | tc, err := dstest.NewTestCaseFromDir("testdata/csv/movies") 13 | if err != nil { 14 | t.Errorf("error loading test case: %s", err.Error()) 15 | return 16 | } 17 | 18 | ds := tc.Input 19 | 20 | outst := &dataset.Structure{ 21 | Format: "json", 22 | Schema: ds.Structure.Schema, 23 | } 24 | 25 | rbuf, err := NewEntryBuffer(outst) 26 | if err != nil { 27 | t.Errorf("error allocating EntryBuffer: %s", err.Error()) 28 | return 29 | } 30 | 31 | rr, err := NewEntryReader(ds.Structure, tc.BodyFile()) 32 | if err != nil { 33 | t.Errorf("error allocating RowReader: %s", err.Error()) 34 | return 35 | } 36 | 37 | if err = EachEntry(rr, func(i int, val Entry, err error) error { 38 | if err != nil { 39 | return err 40 | } 41 | return rbuf.WriteEntry(val) 42 | }); err != nil { 43 | t.Errorf("error writing rows: %s", err.Error()) 44 | return 45 | } 46 | 47 | bst := rbuf.Structure() 48 | if diff := dstest.CompareStructures(outst, bst); diff != "" { 49 | t.Errorf("buffer structure mismatch (-wnt +got):\n%s", diff) 50 | return 51 | } 52 | 53 | if err := rbuf.Close(); err != nil { 54 | t.Errorf("error closing buffer: %s", err.Error()) 55 | return 56 | } 57 | 58 | out := []interface{}{} 59 | if err := json.Unmarshal(rbuf.Bytes(), &out); err != nil { 60 | t.Errorf("error unmarshaling encoded bytes: %s", err.Error()) 61 | return 62 | } 63 | 64 | if _, err = json.Marshal(out); err != nil { 65 | t.Errorf("error marshaling json data: %s", err.Error()) 66 | return 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /generate/tabular_test.go: -------------------------------------------------------------------------------- 1 | package generate 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | 7 | "github.com/google/go-cmp/cmp" 8 | "github.com/qri-io/dataset" 9 | "github.com/qri-io/dataset/dsio" 10 | ) 11 | 12 | // Compile time 
check that Generator satisfies the EntryReader interace. 13 | var _ dsio.EntryReader = (*TabularGenerator)(nil) 14 | 15 | func TestGeneratorForBaseSchemaArray(t *testing.T) { 16 | cases := []struct { 17 | index int 18 | key string 19 | value interface{} 20 | }{ 21 | {0, "", []interface{}{"gltBH"}}, 22 | {1, "", []interface{}{"VJQV"}}, 23 | {2, "", []interface{}{"dv8A"}}, 24 | } 25 | 26 | st := &dataset.Structure{Format: "json", Schema: map[string]interface{}{ 27 | "type": "array", 28 | "items": map[string]interface{}{ 29 | "type": "array", 30 | "items": []interface{}{ 31 | map[string]interface{}{"type": "string", "title": "col_one,"}, 32 | }, 33 | }, 34 | }} 35 | 36 | g, err := NewTabularGenerator(st, AssignSeed, AssignMaxLen) 37 | if err != nil { 38 | t.Fatal(err) 39 | } 40 | defer g.Close() 41 | 42 | if diff := cmp.Diff(st, g.Structure()); diff != "" { 43 | t.Errorf("expected returned structure to match input. (-want +got)P:\n%s", diff) 44 | } 45 | 46 | for i, c := range cases { 47 | t.Run(fmt.Sprintf("%d", c.index), func(t *testing.T) { 48 | e, _ := g.ReadEntry() 49 | if e.Index != c.index { 50 | t.Errorf("case %d index mismatch. expected: %d. got: %d", i, c.index, e.Index) 51 | } 52 | if e.Key != c.key { 53 | t.Errorf("case %d key mismatch. expected: %s. got: %s", i, c.key, e.Key) 54 | } 55 | if diff := cmp.Diff(c.value, e.Value); diff != "" { 56 | t.Errorf("case result mismatch. (-want +got):\n%s", diff) 57 | } 58 | }) 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /detect/json.go: -------------------------------------------------------------------------------- 1 | package detect 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | 7 | "github.com/qri-io/dataset" 8 | ) 9 | 10 | // JSONSchema determines the field names and types of an io.Reader of JSON-formatted data, returning a json schema 11 | // This is currently a suuuuuuuuper simple interpretation that spits out a generic schema that'll work. 
In the future 12 | // we can do all sorts of stuff here to make better inferences about the shape of a dataset, but for now, this'll work, 13 | // and we'll instead focus on making it easier for users to provide hand-built schemas 14 | func JSONSchema(resource *dataset.Structure, data io.Reader) (schema map[string]interface{}, n int, err error) { 15 | var ( 16 | count = 0 17 | buf = make([]byte, 100) 18 | ) 19 | 20 | for { 21 | count, err = data.Read(buf) 22 | n += count 23 | if err != nil { 24 | if err == io.EOF { 25 | // possible that data length is less than 100 bytes, 26 | // if we've read more than 0 bytes, we should check it 27 | if count > 0 { 28 | err = nil 29 | } else { 30 | err = fmt.Errorf("invalid json data") 31 | return 32 | } 33 | } else { 34 | log.Debugf(err.Error()) 35 | err = fmt.Errorf("error reading data: %s", err.Error()) 36 | return 37 | } 38 | } 39 | 40 | for _, b := range buf { 41 | switch b { 42 | case '[': 43 | return dataset.BaseSchemaArray, n, nil 44 | case '{': 45 | return dataset.BaseSchemaObject, n, nil 46 | case ' ', '\t', '\n', '\r': 47 | continue 48 | default: 49 | err = fmt.Errorf("invalid json data") 50 | return 51 | } 52 | } 53 | } 54 | } 55 | 56 | // NDJSONSchema returns an array identity schema 57 | func NDJSONSchema(resource *dataset.Structure, data io.Reader) (schema map[string]interface{}, n int, err error) { 58 | return dataset.BaseSchemaArray, 0, nil 59 | } 60 | -------------------------------------------------------------------------------- /detect/testdata/spelling.csv: -------------------------------------------------------------------------------- 1 | state,search interest 2 | Idaho (United States),100 3 | Wyoming (United States),96.91 4 | South Dakota (United States),91.89 5 | Iowa (United States),90.46 6 | Utah (United States),88.18 7 | Nebraska (United States),86.05 8 | New Mexico (United States),86 9 | Montana (United States),85.58 10 | Missouri (United States),83.44 11 | West Virginia (United States),83.23 12 | 
Arkansas (United States),82.55 13 | Georgia (United States),81.51 14 | North Dakota (United States),81.5 15 | Mississippi (United States),80.09 16 | Wisconsin (United States),79.32 17 | Maine (United States),78.89 18 | Indiana (United States),78.11 19 | South Carolina (United States),76.54 20 | Michigan (United States),75.09 21 | Alabama (United States),74.38 22 | Nevada (United States),74.29 23 | Ohio (United States),73.83 24 | Kansas (United States),73.36 25 | Colorado (United States),73.35 26 | Arizona (United States),73.13 27 | Louisiana (United States),72.47 28 | Tennessee (United States),71.93 29 | North Carolina (United States),70.81 30 | Illinois (United States),70.73 31 | Minnesota (United States),70.64 32 | Oklahoma (United States),70.43 33 | Kentucky (United States),69.95 34 | Texas (United States),67.34 35 | Pennsylvania (United States),64.82 36 | New Hampshire (United States),63.37 37 | Delaware (United States),63.02 38 | California (United States),62.64 39 | Rhode Island (United States),61.53 40 | Florida (United States),59.74 41 | Alaska (United States),59.53 42 | Washington (United States),59.47 43 | Vermont (United States),58.59 44 | Hawaii (United States),56.26 45 | Connecticut (United States),56.04 46 | New Jersey (United States),55.17 47 | Maryland (United States),54.56 48 | District of Columbia (United States),52.75 49 | Massachusetts (United States),52.53 50 | New York (United States),50.75 51 | Virginia (United States),33.39 52 | Oregon (United States),28.42 -------------------------------------------------------------------------------- /dstest/golden.go: -------------------------------------------------------------------------------- 1 | package dstest 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "io/ioutil" 7 | "os" 8 | "testing" 9 | 10 | "github.com/qri-io/dataset" 11 | ) 12 | 13 | // UpdateGoldenFileEnvVarname is the envionment variable that dstest checks 14 | // before writing 15 | const UpdateGoldenFileEnvVarname = 
"QRI_UPDATE_GOLDEN_FILES" 16 | 17 | // CompareGoldenDatasetAndUpdateIfEnvVarSet is a convenience wrapper for the 18 | // common case of loading a golden file, comparing it to a dataset, and updating 19 | // the dataset if it fails and the "update" enviornment variable is set 20 | func CompareGoldenDatasetAndUpdateIfEnvVarSet(t *testing.T, goldenFilepath string, got *dataset.Dataset) { 21 | t.Helper() 22 | expect := LoadGoldenFile(t, goldenFilepath) 23 | if diff := CompareDatasets(expect, got); diff != "" { 24 | t.Errorf("dataset golden file mismatch (-want +got):\n%s", diff) 25 | UpdateGoldenFileIfEnvVarSet(goldenFilepath, got) 26 | } 27 | } 28 | 29 | // LoadGoldenFile loads a dataset from a JSON file 30 | func LoadGoldenFile(t *testing.T, filename string) *dataset.Dataset { 31 | t.Helper() 32 | data, err := ioutil.ReadFile(filename) 33 | if err != nil { 34 | t.Fatalf("opening JSON golden file: %s", err) 35 | } 36 | 37 | ds := &dataset.Dataset{} 38 | if err := json.Unmarshal(data, ds); err != nil { 39 | t.Fatalf("unmarshaling JSON golden file: %s", err) 40 | } 41 | 42 | return ds 43 | } 44 | 45 | // UpdateGoldenFileIfEnvVarSet overwrites the given filename if 46 | // QRI_UPDATED_GOLDEN_FILES env var is set 47 | func UpdateGoldenFileIfEnvVarSet(filename string, got *dataset.Dataset) { 48 | if os.Getenv(UpdateGoldenFileEnvVarname) != "" { 49 | fmt.Printf("updating golden file: %q\n", filename) 50 | data, err := json.MarshalIndent(got, "", " ") 51 | if err != nil { 52 | panic(err) 53 | } 54 | if err := ioutil.WriteFile(filename, data, 0644); err != nil { 55 | panic(err) 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /validate/testdata_test.go: -------------------------------------------------------------------------------- 1 | package validate 2 | 3 | import ( 4 | "github.com/qri-io/dataset" 5 | ) 6 | 7 | var emptyRawText = `` 8 | 9 | // has lazy quotes 10 | var rawText1 = `first_name,last_name,username,age 
11 | "Rob","Pike",rob, 100 12 | Ken,Thompson,ken, 75.5 13 | "Robert","Griesemer","gri", 100` 14 | 15 | var namesStructure = &dataset.Structure{ 16 | Format: "csv", 17 | FormatConfig: map[string]interface{}{ 18 | "headerRow": true, 19 | }, 20 | Schema: map[string]interface{}{ 21 | "type": "array", 22 | "items": map[string]interface{}{ 23 | "type": "array", 24 | "items": []interface{}{ 25 | map[string]interface{}{"title": "first_name", "type": "string"}, 26 | map[string]interface{}{"title": "last_name", "type": "string"}, 27 | map[string]interface{}{"title": "username", "type": "string"}, 28 | map[string]interface{}{"title": "age", "type": "integer"}, 29 | }, 30 | }, 31 | }, 32 | } 33 | 34 | // has nonNumeric quotes and comma inside quotes on last line 35 | var rawText2 = `"first_name","last_name","username","age" 36 | "Rob","Pike","rob", 22 37 | "Robert","Griesemer","gri", 100 38 | "abc","def,ghi","jkl",1000` 39 | 40 | // same as above but with spaces in last line 41 | var rawText2b = `"first_name","last_name","username","age" 42 | "Rob","Pike","rob", 22 43 | "Robert","Griesemer","gri", 100 44 | "abc", "def,ghi", "jkl", 1000` 45 | 46 | // error in last row "age" column 47 | var rawText2c = `first_name,last_name,username,age 48 | "Rob","Pike","rob",22 49 | "Robert","Griesemer","gri",100 50 | "abc","def,ghi","jkl",_` 51 | 52 | // NOTE: technically this is valid csv and we should be catching this at an earlier filter 53 | var rawText3 = ` 54 | 55 | 56 | col 58 | 59 |
57 |
60 | 61 | ` 62 | 63 | var rawText4 = ` 64 | 65 | 66 | Last Name, First 68 | 69 | 70 |
67 |
71 | 72 | ` 73 | -------------------------------------------------------------------------------- /dsio/replacecr/replace_cr.go: -------------------------------------------------------------------------------- 1 | // Package replacecr defines a wrapper for replacing solo carriage return characters (\r) 2 | // with carriage-return + line feed (\r\n) 3 | package replacecr 4 | 5 | import ( 6 | "bufio" 7 | "io" 8 | ) 9 | 10 | // Reader wraps an io.Reader. on every call of Read. it looks for 11 | // for instances of lonely \r replacing them with \r\n before returning to the end consumer 12 | // lots of files in the wild will come without "proper" line breaks, which irritates go's 13 | // standard csv package. This'll fix by wrapping the reader passed to csv.NewReader: 14 | // rdr, err := csv.NewReader(replacecr.Reader(r)) 15 | // because Reader adds '\n' characters, the number of bytes reported from the underlying 16 | // reader can/will differ from what the underlyng reader would return 17 | // if read from directly. This can cause issues with checksums and byte counts. 18 | // Use with caution. 
19 | func Reader(data io.Reader) io.Reader { 20 | return crlfReplaceReader{ 21 | rdr: bufio.NewReader(data), 22 | } 23 | } 24 | 25 | // ReaderWithSize instaties a reader with a given buffer size 26 | func ReaderWithSize(data io.Reader, size int) io.Reader { 27 | return crlfReplaceReader{ 28 | rdr: bufio.NewReaderSize(data, size), 29 | } 30 | } 31 | 32 | // crlfReplaceReader wraps a reader 33 | type crlfReplaceReader struct { 34 | rdr *bufio.Reader 35 | } 36 | 37 | // Read implements io.Reader for crlfReplaceReader 38 | func (c crlfReplaceReader) Read(p []byte) (n int, err error) { 39 | lenP := len(p) 40 | if lenP == 0 { 41 | return 42 | } 43 | 44 | for { 45 | if n == lenP { 46 | return 47 | } 48 | 49 | p[n], err = c.rdr.ReadByte() 50 | if err != nil { 51 | return 52 | } 53 | 54 | // any time we encounter \r & still have space, check to see if \n follows 55 | // ff next char is not \n, add it in manually 56 | if p[n] == '\r' && n < lenP-1 { 57 | if pk, err := c.rdr.Peek(1); (err == nil && pk[0] != '\n') || (err != nil && err.Error() == "EOF") { 58 | n++ 59 | p[n] = '\n' 60 | } 61 | } 62 | 63 | n++ 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /stepfile/stepfile.go: -------------------------------------------------------------------------------- 1 | // Package stepfile provides utilities for reading and writing an ordered set of 2 | // transform steps to and from a flat file representation 3 | // 4 | // A stepfile file consists of one or more steps of input text separated by 5 | // "---" lines. 
6 | // 7 | // Example: 8 | // 9 | // "step" 10 | // --- 11 | // "another step" 12 | // --- 13 | // "and another step" 14 | package stepfile 15 | 16 | import ( 17 | "fmt" 18 | "io" 19 | "io/ioutil" 20 | "os" 21 | "strings" 22 | 23 | "github.com/qri-io/dataset" 24 | ) 25 | 26 | // ReadFile opens a stepfile and returns steps 27 | func ReadFile(filename string) (steps []*dataset.TransformStep, err error) { 28 | f, err := os.Open(filename) 29 | if err != nil { 30 | return nil, err 31 | } 32 | defer f.Close() 33 | return Read(f) 34 | } 35 | 36 | // Read consumes a reader into steps 37 | func Read(r io.Reader) (steps []*dataset.TransformStep, err error) { 38 | data, err := ioutil.ReadAll(r) 39 | if err != nil { 40 | return nil, err 41 | } 42 | 43 | for _, chunk := range strings.Split(string(data), "\n---\n") { 44 | steps = append(steps, &dataset.TransformStep{ 45 | Script: chunk, 46 | }) 47 | } 48 | return steps, nil 49 | } 50 | 51 | // Write prints transform steps as a stepfile to a writer 52 | func Write(steps []*dataset.TransformStep, w io.Writer) error { 53 | for i, step := range steps { 54 | if err := writeStepScript(step, w); err != nil { 55 | return err 56 | } 57 | if i != len(steps)-1 { 58 | w.Write([]byte("\n---\n")) 59 | } 60 | } 61 | return nil 62 | } 63 | 64 | func writeStepScript(s *dataset.TransformStep, w io.Writer) error { 65 | if r, ok := s.Script.(io.Reader); ok { 66 | if closer, ok := s.Script.(io.Closer); ok { 67 | defer closer.Close() 68 | } 69 | _, err := io.Copy(w, r) 70 | return err 71 | } 72 | 73 | switch v := s.Script.(type) { 74 | case string: 75 | _, err := w.Write([]byte(v)) 76 | return err 77 | case []byte: 78 | _, err := w.Write(v) 79 | return err 80 | } 81 | return fmt.Errorf("unrecognized script type: %T", s.Script) 82 | } 83 | -------------------------------------------------------------------------------- /dsio/entry_buffer.go: -------------------------------------------------------------------------------- 1 | package dsio 2 | 3 | 
import ( 4 | "bytes" 5 | 6 | "github.com/qri-io/dataset" 7 | ) 8 | 9 | // EntryBuffer mimics the behaviour of bytes.Buffer, but with structured Dataa 10 | // Read and Write are replaced with ReadEntry and WriteEntry. It's worth noting 11 | // that different data formats have idisyncrcies that affect the behavior 12 | // of buffers and their output. For example, EntryBuffer won't write things like 13 | // CSV header rows or enclosing JSON arrays until after the writer's 14 | // Close method has been called. 15 | type EntryBuffer struct { 16 | structure *dataset.Structure 17 | r EntryReader 18 | w EntryWriter 19 | buf *bytes.Buffer 20 | } 21 | 22 | // NewEntryBuffer allocates a buffer, buffers should always be created with 23 | // NewEntryBuffer, which will error if the provided structure is invalid for 24 | // reading / writing 25 | func NewEntryBuffer(st *dataset.Structure) (*EntryBuffer, error) { 26 | buf := &bytes.Buffer{} 27 | r, err := NewEntryReader(st, buf) 28 | if err != nil { 29 | log.Debug(err.Error()) 30 | return nil, err 31 | } 32 | w, err := NewEntryWriter(st, buf) 33 | if err != nil { 34 | log.Debug(err.Error()) 35 | return nil, err 36 | } 37 | 38 | return &EntryBuffer{ 39 | structure: st, 40 | r: r, 41 | w: w, 42 | buf: buf, 43 | }, nil 44 | } 45 | 46 | // Structure gives the underlying structure this buffer is using 47 | func (b *EntryBuffer) Structure() *dataset.Structure { 48 | return b.structure 49 | } 50 | 51 | // ReadEntry reads one "row" from the buffer 52 | func (b *EntryBuffer) ReadEntry() (Entry, error) { 53 | return b.r.ReadEntry() 54 | } 55 | 56 | // WriteEntry writes one "row" to the buffer 57 | func (b *EntryBuffer) WriteEntry(e Entry) error { 58 | return b.w.WriteEntry(e) 59 | } 60 | 61 | // Close closes the writer portion of the buffer, which will affect 62 | // underlying contents. 
63 | func (b *EntryBuffer) Close() error { 64 | return b.w.Close() 65 | } 66 | 67 | // Bytes gives the raw contents of the underlying buffer 68 | func (b *EntryBuffer) Bytes() []byte { 69 | return b.buf.Bytes() 70 | } 71 | -------------------------------------------------------------------------------- /testdata/datasets/airport-codes.json: -------------------------------------------------------------------------------- 1 | { 2 | "qri": "ds:0", 3 | "meta": { 4 | "qri": "md:0", 5 | "homeURL": "http://www.ourairports.com/", 6 | "license": { 7 | "type":"PDDL-1.0" 8 | }, 9 | "title": "Airport Codes", 10 | "citations": [ 11 | { 12 | "name": "Our Airports", 13 | "url": "http://ourairports.com/data/" 14 | } 15 | ] 16 | }, 17 | "commit": { 18 | "title": "initial commit" 19 | }, 20 | "structure": { 21 | "format": "csv", 22 | "qri": "st:0", 23 | "formatConfig": { 24 | "headerRow": true 25 | }, 26 | "errCount": 5, 27 | "schema": { 28 | "type": "array", 29 | "items": { 30 | "type": "array", 31 | "items": [ 32 | { 33 | "title": "ident", 34 | "type": "string" 35 | }, 36 | { 37 | "title": "type", 38 | "type": "string" 39 | }, 40 | { 41 | "title": "name", 42 | "type": "string" 43 | }, 44 | { 45 | "title": "latitude_deg", 46 | "type": "number" 47 | }, 48 | { 49 | "title": "longitude_deg", 50 | "type": "number" 51 | }, 52 | { 53 | "title": "elevation_ft", 54 | "type": "integer" 55 | }, 56 | { 57 | "title": "continent", 58 | "type": "string" 59 | }, 60 | { 61 | "title": "iso_country", 62 | "type": "string" 63 | }, 64 | { 65 | "title": "iso_region", 66 | "type": "string" 67 | }, 68 | { 69 | "title": "municipality", 70 | "type": "string" 71 | }, 72 | { 73 | "title": "gps_code", 74 | "type": "string" 75 | }, 76 | { 77 | "title": "iata_code", 78 | "type": "string" 79 | }, 80 | { 81 | "title": "local_code", 82 | "type": "string" 83 | } 84 | ] 85 | } 86 | } 87 | } 88 | } -------------------------------------------------------------------------------- 
/dsio/testdata/cbor/flourinated_compounds_in_fast_food_packaging/input.dataset.json: -------------------------------------------------------------------------------- 1 | { 2 | "commit" : { 3 | "qri" : "cm:0", 4 | "title" : "initial commit", 5 | "timestamp": "2017-05-01T01:00:00.000Z" 6 | }, 7 | "meta" : { 8 | "title" : "Fluorinated Compounds in U.S. Fast Food Packaging", 9 | "description" : "Paper samples, paper extracts (known), paper extracts (unknown). \n\nThis dataset is associated with the following publication:\nSchaider, L., S. Balan, A. Blum, D. Andrews, M. Strynar, M. Dickinson, D. Lunderberg, J. Lang, and G. Peaslee. Fluorinated Compounds in U.S. Fast Food Packaging. Environmental Science \u0026amp; Technology Letters. American Chemical Society, Washington, DC, USA, 4(3): 105\u0026ndash;111, (2017)." 10 | }, 11 | "structure": { 12 | "qri": "st:0", 13 | "format": "cbor", 14 | "schema": { 15 | "type": "array", 16 | "items": { 17 | "type": "array", 18 | "items": [ 19 | { 20 | "type": "string", 21 | "title": "sample" 22 | }, 23 | { 24 | "type": "string", 25 | "title": "comments" 26 | }, 27 | { 28 | "type": "integer", 29 | "title": "length_cm", 30 | "description": "length of sample in cm" 31 | }, 32 | { 33 | "type": "integer", 34 | "title": "width_cm", 35 | "description": "width of sample in cm" 36 | }, 37 | { 38 | "type": "integer", 39 | "title": "area_cm", 40 | "description": "area of sample in cm2" 41 | }, 42 | { 43 | "type": "number", 44 | "title": "vial_tare_g", 45 | "description": "vial tare in grams" 46 | }, 47 | { 48 | "type": "number", 49 | "title": "vial_with_paper_g", 50 | "description": "mass of vial tare with paper in grams" 51 | }, 52 | { 53 | "type": "number", 54 | "title": "mass_g", 55 | "description": "mass of paper in grams" 56 | } 57 | ] 58 | } 59 | } 60 | } 61 | } -------------------------------------------------------------------------------- /validate/testdata/flourinated_compounds_in_fast_food_packaging/input.dataset.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "commit" : { 3 | "qri" : "cm:0", 4 | "title" : "initial commit", 5 | "timestamp": "2017-05-01T01:00:00.000Z" 6 | }, 7 | "meta" : { 8 | "title" : "Fluorinated Compounds in U.S. Fast Food Packaging", 9 | "description" : "Paper samples, paper extracts (known), paper extracts (unknown). \n\nThis dataset is associated with the following publication:\nSchaider, L., S. Balan, A. Blum, D. Andrews, M. Strynar, M. Dickinson, D. Lunderberg, J. Lang, and G. Peaslee. Fluorinated Compounds in U.S. Fast Food Packaging. Environmental Science \u0026amp; Technology Letters. American Chemical Society, Washington, DC, USA, 4(3): 105\u0026ndash;111, (2017)." 10 | }, 11 | "structure": { 12 | "qri": "st:0", 13 | "format": "cbor", 14 | "schema": { 15 | "type": "array", 16 | "items": { 17 | "type": "array", 18 | "items": [ 19 | { 20 | "type": "string", 21 | "title": "sample" 22 | }, 23 | { 24 | "type": "string", 25 | "title": "comments" 26 | }, 27 | { 28 | "type": "integer", 29 | "title": "length_cm", 30 | "description": "length of sample in cm" 31 | }, 32 | { 33 | "type": "integer", 34 | "title": "width_cm", 35 | "description": "width of sample in cm" 36 | }, 37 | { 38 | "type": "integer", 39 | "title": "area_cm", 40 | "description": "area of sample in cm2" 41 | }, 42 | { 43 | "type": "number", 44 | "title": "vial_tare_g", 45 | "description": "vial tare in grams" 46 | }, 47 | { 48 | "type": "number", 49 | "title": "vial_with_paper_g", 50 | "description": "mass of vial tare with paper in grams" 51 | }, 52 | { 53 | "type": "number", 54 | "title": "mass_g", 55 | "description": "mass of paper in grams" 56 | } 57 | ] 58 | } 59 | } 60 | } 61 | } -------------------------------------------------------------------------------- /vals/compare.go: -------------------------------------------------------------------------------- 1 | package vals 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "reflect" 7 | ) 8 | 9 | 
// Equal checks if two Values are the same 10 | func Equal(a, b Value) bool { 11 | if a.Type() != b.Type() { 12 | return false 13 | } 14 | switch a.Type() { 15 | case TypeObject, TypeArray: 16 | return reflect.DeepEqual(a, b) 17 | case TypeNumber: 18 | return a.Number() == b.Number() 19 | case TypeInteger: 20 | return a.Integer() == b.Integer() 21 | case TypeBoolean: 22 | return a.Boolean() == b.Boolean() 23 | case TypeNull: 24 | return a.IsNull() == b.IsNull() 25 | case TypeString: 26 | return a.String() == b.String() 27 | } 28 | return false 29 | } 30 | 31 | // CompareTypeBytes compares two byte slices with a known type 32 | // real on the real, this is a bit of a work in progress 33 | // TODO - up tests 34 | func CompareTypeBytes(a, b []byte, t Type) (int, error) { 35 | if len(a) == 0 && len(b) > 0 { 36 | return -1, nil 37 | } else if len(b) == 0 && len(a) > 0 { 38 | return 1, nil 39 | } else if len(b) == 0 && len(a) == 0 { 40 | return 0, nil 41 | } 42 | 43 | switch t { 44 | case TypeString: 45 | return bytes.Compare(a, b), nil 46 | case TypeInteger: 47 | return CompareIntegerBytes(a, b) 48 | case TypeNumber: 49 | return CompareNumberBytes(a, b) 50 | default: 51 | // TODO - other types 52 | return 0, fmt.Errorf("invalid type comparison") 53 | } 54 | } 55 | 56 | // CompareIntegerBytes compares two byte slices of interger data 57 | func CompareIntegerBytes(a, b []byte) (int, error) { 58 | at, err := ParseInteger(a) 59 | if err != nil { 60 | return 0, err 61 | } 62 | bt, err := ParseInteger(b) 63 | if err != nil { 64 | return 0, err 65 | } 66 | if at > bt { 67 | return 1, nil 68 | } else if at == bt { 69 | return 0, nil 70 | } 71 | return -1, nil 72 | } 73 | 74 | // CompareNumberBytes compares two byte slices of float data 75 | func CompareNumberBytes(a, b []byte) (int, error) { 76 | at, err := ParseNumber(a) 77 | if err != nil { 78 | return 0, err 79 | } 80 | bt, err := ParseNumber(b) 81 | if err != nil { 82 | return 0, err 83 | } 84 | if at > bt { 85 | return 1, 
nil 86 | } else if at == bt { 87 | return 0, nil 88 | } 89 | return -1, nil 90 | } 91 | -------------------------------------------------------------------------------- /kind_test.go: -------------------------------------------------------------------------------- 1 | package dataset 2 | 3 | import ( 4 | "encoding/json" 5 | "testing" 6 | ) 7 | 8 | func TestKindValid(t *testing.T) { 9 | cases := []struct { 10 | Kind Kind 11 | err string 12 | }{ 13 | {"", "invalid kind: ''. kind must be in the form [type]:[version]"}, 14 | {"ds:0", ""}, 15 | {"vz:0", ""}, 16 | {"st:0", ""}, 17 | {"as:0", ""}, 18 | {"ps:0", ""}, 19 | {"ps:0", ""}, 20 | } 21 | 22 | for i, c := range cases { 23 | err := c.Kind.Valid() 24 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) { 25 | t.Errorf("case %d error mismatch. expected: '%s', got: '%s'", i, c.err, err) 26 | continue 27 | } 28 | } 29 | } 30 | 31 | func TestKindDatatype(t *testing.T) { 32 | cases := []struct { 33 | Kind Kind 34 | expect string 35 | }{ 36 | {"ds:0", "ds"}, 37 | {"vz:0", "vz"}, 38 | {"st:0", "st"}, 39 | {"as:0", "as"}, 40 | {"ps:0", "ps"}, 41 | } 42 | 43 | for i, c := range cases { 44 | got := c.Kind.Type() 45 | if c.expect != got { 46 | t.Errorf("case %d error mismatch. expected: '%s', got: '%s'", i, c.expect, got) 47 | continue 48 | } 49 | } 50 | } 51 | 52 | func TestKindVersion(t *testing.T) { 53 | cases := []struct { 54 | Kind Kind 55 | expect string 56 | }{ 57 | {"st:2", "2"}, 58 | {"ds:23", "23"}, 59 | } 60 | 61 | for i, c := range cases { 62 | got := c.Kind.Version() 63 | if c.expect != got { 64 | t.Errorf("case %d response mismatch. expected: '%s', got: '%s'", i, c.expect, got) 65 | continue 66 | } 67 | } 68 | } 69 | 70 | func TestKindUnmarshalJSON(t *testing.T) { 71 | cases := []struct { 72 | input string 73 | expect Kind 74 | err string 75 | }{ 76 | {`"st:2"`, Kind("st:2"), ""}, 77 | {`""`, Kind(""), "invalid kind: ''. 
kind must be in the form [type]:[version]"}, 78 | } 79 | 80 | for i, c := range cases { 81 | got := Kind("") 82 | err := json.Unmarshal([]byte(c.input), &got) 83 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) { 84 | t.Errorf("case %d error mismatch. expected: '%s', got: '%s'", i, c.err, err) 85 | continue 86 | } 87 | 88 | if got != c.expect { 89 | t.Errorf("case %d response mismatch. expected: '%s', got: '%s'", i, c.expect, got) 90 | } 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /preview/preview_test.go: -------------------------------------------------------------------------------- 1 | package preview 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | "testing" 8 | 9 | "github.com/qri-io/dataset" 10 | "github.com/qri-io/dataset/dstest" 11 | ) 12 | 13 | func TestCreate(t *testing.T) { 14 | 15 | ctx := context.Background() 16 | 17 | _, err := Create(ctx, &dataset.Dataset{}) 18 | 19 | if err == nil { 20 | t.Fatal(fmt.Errorf("expected empty dataset to error")) 21 | } 22 | 23 | tc, err := dstest.NewTestCaseFromDir("testdata/earthquakes") 24 | if err != nil { 25 | t.Fatal(err) 26 | } 27 | 28 | got, err := Create(ctx, tc.Input) 29 | if err != nil { 30 | t.Fatal(err) 31 | } 32 | 33 | rawBody, ok := got.Body.(json.RawMessage) 34 | if !ok { 35 | t.Fatal("expected preview body to assert to json.RawMessage") 36 | } 37 | 38 | body := [][]interface{}{} 39 | 40 | if err := json.Unmarshal(rawBody, &body); err != nil { 41 | t.Fatal(err) 42 | } 43 | got.Body = body 44 | 45 | if len(body) != 100 { 46 | t.Errorf("error: body length mismatch, expected 100 got %d", len(body)) 47 | } 48 | if got.BodyFile() == nil { 49 | t.Errorf("expected creating a preview to leave existing BodyFile intact, is missing") 50 | // TODO (b5) - confirm body file contents are unmodified 51 | } 52 | if got.Readme.ScriptFile() == nil { 53 | t.Errorf("expected creating a preview to leave existing Readme.ScriptFile 
intact, is missing") 54 | // TODO (b5) - confirm actual readme scriptfile is unmodified 55 | } 56 | 57 | // TODO (b5) - required adjustments for accurate comparison due to JSON serialization 58 | // issues. either solve the serialization issues or add options to dstest.CompareDatasets 59 | got.Body = []interface{}{} 60 | 61 | expect := dstest.LoadGoldenFile(t, "testdata/earthquakes/golden.dataset.json") 62 | 63 | if diff := dstest.CompareDatasets(expect, got); diff != "" { 64 | t.Errorf("result mismatch. (-want +got):\n%s", diff) 65 | dstest.UpdateGoldenFileIfEnvVarSet("testdata/earthquakes/golden.dataset.json", got) 66 | } 67 | 68 | // make sure you can create a preview of a dataset without a body file 69 | tc.Input.SetBodyFile(nil) 70 | 71 | got, err = Create(ctx, tc.Input) 72 | if err != nil { 73 | t.Fatalf("unexpected error creating a preview of a dataset without a body: %s", err) 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /dstest/priv_key.go: -------------------------------------------------------------------------------- 1 | package dstest 2 | 3 | import ( 4 | "encoding/base64" 5 | "fmt" 6 | 7 | crypto "github.com/libp2p/go-libp2p-core/crypto" 8 | ) 9 | 10 | var ( 11 | // PrivKey is a predefined private key for use in tests 12 | PrivKey crypto.PrivKey 13 | // PrivKeyPeerID is the base58-encoded multihash of PrivKey.PublicKey 14 | PrivKeyPeerID = "QmZePf5LeXow3RW5U1AgEiNbW46YnRGhZ7HPvm1UmPFPwt" 15 | ) 16 | 17 | func init() { 18 | testPk := 
[]byte(`CAASpgkwggSiAgEAAoIBAQC/7Q7fILQ8hc9g07a4HAiDKE4FahzL2eO8OlB1K99Ad4L1zc2dCg+gDVuGwdbOC29IngMA7O3UXijycckOSChgFyW3PafXoBF8Zg9MRBDIBo0lXRhW4TrVytm4Etzp4pQMyTeRYyWR8e2hGXeHArXM1R/A/SjzZUbjJYHhgvEE4OZy7WpcYcW6K3qqBGOU5GDMPuCcJWac2NgXzw6JeNsZuTimfVCJHupqG/dLPMnBOypR22dO7yJIaQ3d0PFLxiDG84X9YupF914RzJlopfdcuipI+6gFAgBw3vi6gbECEzcohjKf/4nqBOEvCDD6SXfl5F/MxoHurbGBYB2CJp+FAgMBAAECggEAaVOxe6Y5A5XzrxHBDtzjlwcBels3nm/fWScvjH4dMQXlavwcwPgKhy2NczDhr4X69oEw6Msd4hQiqJrlWd8juUg6vIsrl1wS/JAOCS65fuyJfV3Pw64rWbTPMwO3FOvxj+rFghZFQgjg/i45uHA2UUkM+h504M5Nzs6Arr/rgV7uPGR5e5OBw3lfiS9ZaA7QZiOq7sMy1L0qD49YO1ojqWu3b7UaMaBQx1Dty7b5IVOSYG+Y3U/dLjhTj4Hg1VtCHWRm3nMOE9cVpMJRhRzKhkq6gnZmni8obz2BBDF02X34oQLcHC/Wn8F3E8RiBjZDI66g+iZeCCUXvYz0vxWAQQKBgQDEJu6flyHPvyBPAC4EOxZAw0zh6SF/r8VgjbKO3n/8d+kZJeVmYnbsLodIEEyXQnr35o2CLqhCvR2kstsRSfRz79nMIt6aPWuwYkXNHQGE8rnCxxyJmxV4S63GczLk7SIn4KmqPlCI08AU0TXJS3zwh7O6e6kBljjPt1mnMgvr3QKBgQD6fAkdI0FRZSXwzygx4uSg47Co6X6ESZ9FDf6ph63lvSK5/eue/ugX6p/olMYq5CHXbLpgM4EJYdRfrH6pwqtBwUJhlh1xI6C48nonnw+oh8YPlFCDLxNG4tq6JVo071qH6CFXCIank3ThZeW5a3ZSe5pBZ8h4bUZ9H8pJL4C7yQKBgFb8SN/+/qCJSoOeOcnohhLMSSD56MAeK7KIxAF1jF5isr1TP+rqiYBtldKQX9bIRY3/8QslM7r88NNj+aAuIrjzSausXvkZedMrkXbHgS/7EAPflrkzTA8fyH10AsLgoj/68mKr5bz34nuY13hgAJUOKNbvFeC9RI5g6eIqYH0FAoGAVqFTXZp12rrK1nAvDKHWRLa6wJCQyxvTU8S1UNi2EgDJ492oAgNTLgJdb8kUiH0CH0lhZCgr9py5IKW94OSM6l72oF2UrS6PRafHC7D9b2IV5Al9lwFO/3MyBrMocapeeyaTcVBnkclz4Qim3OwHrhtFjF1ifhP9DwVRpuIg+dECgYANwlHxLe//tr6BM31PUUrOxP5Y/cj+ydxqM/z6papZFkK6Mvi/vMQQNQkh95GH9zqyC5Z/yLxur4ry1eNYty/9FnuZRAkEmlUSZ/DobhU0Pmj8Hep6JsTuMutref6vCk2n02jc9qYmJuD7iXkdXDSawbEG6f5C4MUkJ38z1t1OjA==`) 19 | data, err := base64.StdEncoding.DecodeString(string(testPk)) 20 | if err != nil { 21 | panic(err) 22 | } 23 | PrivKey, err = crypto.UnmarshalPrivateKey(data) 24 | if err != nil { 25 | panic(fmt.Errorf("error unmarshaling private key: %s", err.Error())) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /readme.md: 
-------------------------------------------------------------------------------- 1 | # dataset 2 | 3 | [![Qri](https://img.shields.io/badge/made%20by-qri-magenta.svg?style=flat-square)](https://qri.io) 4 | [![GoDoc](https://godoc.org/github.com/qri-io/dataset?status.svg)](http://godoc.org/github.com/qri-io/dataset) 5 | [![License](https://img.shields.io/github/license/qri-io/dataset.svg?style=flat-square)](./LICENSE) 6 | [![Codecov](https://img.shields.io/codecov/c/github/qri-io/dataset.svg?style=flat-square)](https://codecov.io/gh/qri-io/dataset) 7 | [![CI](https://img.shields.io/circleci/project/github/qri-io/dataset.svg?style=flat-square)](https://circleci.com/gh/qri-io/dataset) 8 | [![Go Report Card](https://goreportcard.com/badge/github.com/qri-io/dataset)](https://goreportcard.com/report/github.com/qri-io/dataset) 9 | 10 | Dataset contains the qri ("query") dataset document definition. This package contains the base definition, as well as a number of 11 | subpackages that build from this base to add functionality as necessary Datasets take inspiration from HTML documents, deliniating semantic purpose to predefined tags of the document, but instead of orienting around presentational markup, dataset documents emphasize interoperability and composition. The principle encoding format for a dataset document is JSON. 12 | 13 | ### Subpackage Overview 14 | 15 | * **compression**: defines supported types of compression for interpreting a dataset 16 | * **detect**: dataset structure & schema inference 17 | * **dsfs**: "datasets on a content-addressed file system" tools to work with datasets stored with the [cafs](https://github.com/qri-io/qri) interface: `github.com/qri-io/qfs/cafs` 18 | * **dsgraph**: expressing relationships between and within datasets as graphs 19 | * **dsio**: `io` primitives for working with dataset bodies as readers, writers, buffers, oriented around row-like "entries". 
20 | * **dstest**: utility functions for working with tests that need datasets 21 | * **dsutil**: utility functions that avoid dataset bloat 22 | * **generate**: io primitives for generating data 23 | * **use_generate**: small package that uses generate to create test data 24 | * **validate**: dataset validation & checking functions 25 | * **vals**: data type mappings & definitions 26 | 27 | ## Getting Involved 28 | 29 | We would love involvement from more people! If you notice any errors or would 30 | like to submit changes, please see our 31 | [Contributing Guidelines](./.github/CONTRIBUTING.md). -------------------------------------------------------------------------------- /generate/value.go: -------------------------------------------------------------------------------- 1 | package generate 2 | 3 | import ( 4 | "math" 5 | "math/rand" 6 | ) 7 | 8 | // ValueGenerator is a state machine for producing values 9 | type ValueGenerator struct { 10 | Rand *rand.Rand // random number generator 11 | MaxStringLength int 12 | } 13 | 14 | // Value creates a random value of a random type 15 | func (g *ValueGenerator) Value() interface{} { 16 | i := g.Rand.Intn(40) 17 | if i == 0 { 18 | return nil 19 | } else if i > 0 && i < 10 { 20 | return g.Int() 21 | } else if i > 10 && i < 20 { 22 | return g.String() 23 | } else if i > 20 && i < 30 { 24 | return g.Float() 25 | } else if i > 30 && i < 40 { 26 | return g.Bool() 27 | } 28 | 29 | return nil 30 | } 31 | 32 | // Type creates a value to match a string type. 
type names match the 33 | // JSON-schema specification 34 | func (g *ValueGenerator) Type(t string) interface{} { 35 | switch t { 36 | case "string": 37 | return g.String() 38 | case "boolean": 39 | return g.Bool() 40 | case "number": 41 | return g.Float() 42 | case "integer": 43 | return g.Int() 44 | case "object": 45 | return g.Object() 46 | case "array": 47 | return g.Array() 48 | case "null": 49 | return nil 50 | default: 51 | return g.Value() 52 | } 53 | } 54 | 55 | var alphaNumericRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789") 56 | 57 | // String yields a random string 58 | func (g *ValueGenerator) String() string { 59 | runes := make([]rune, g.Rand.Intn(g.MaxStringLength)) 60 | for i := range runes { 61 | runes[i] = alphaNumericRunes[g.Rand.Intn(len(alphaNumericRunes))] 62 | } 63 | return string(runes) 64 | } 65 | 66 | // Float yields a random floating point number 67 | func (g *ValueGenerator) Float() float64 { 68 | return g.Rand.NormFloat64() 69 | } 70 | 71 | // Int yields a random integer 72 | func (g *ValueGenerator) Int() int { 73 | return g.Rand.Intn(math.MaxInt64) 74 | } 75 | 76 | // Bool yields a random coin flip 77 | func (g *ValueGenerator) Bool() bool { 78 | return g.Rand.Intn(1)%2 == 0 79 | } 80 | 81 | // Object creates an empty object 82 | // TODO (b5) - populate with random values 83 | func (g *ValueGenerator) Object() map[string]interface{} { 84 | return map[string]interface{}{} 85 | } 86 | 87 | // Array creates an empty array 88 | // TODO (b5) - populate with random values 89 | func (g *ValueGenerator) Array() []interface{} { 90 | return []interface{}{} 91 | } 92 | -------------------------------------------------------------------------------- /stats.go: -------------------------------------------------------------------------------- 1 | package dataset 2 | 3 | import "encoding/json" 4 | 5 | // Stats is a component that contains statistical metadata about the body of a 6 | // dataset 7 | type Stats struct { 
8 | Path string `json:"path,omitempty"` 9 | Qri string `json:"qri,omitempty"` 10 | Stats interface{} `json:"stats,omitempty"` 11 | } 12 | 13 | // NewStatsRef creates an empty struct with it's path set 14 | func NewStatsRef(path string) *Stats { 15 | return &Stats{Path: path} 16 | } 17 | 18 | // DropDerivedValues resets all set-on-save fields to their default values 19 | func (sa *Stats) DropDerivedValues() { 20 | sa.Qri = "" 21 | sa.Path = "" 22 | } 23 | 24 | // IsEmpty checks to see if stats has any fields other than Path set 25 | func (sa *Stats) IsEmpty() bool { 26 | return sa.Stats == nil 27 | } 28 | 29 | // Assign collapses all properties of a group of Stats components onto one 30 | func (sa *Stats) Assign(sas ...*Stats) { 31 | for _, s := range sas { 32 | if s == nil { 33 | continue 34 | } 35 | 36 | if s.Stats != nil { 37 | sa.Stats = s.Stats 38 | } 39 | if s.Path != "" { 40 | sa.Path = s.Path 41 | } 42 | if s.Qri != "" { 43 | sa.Qri = s.Qri 44 | } 45 | } 46 | } 47 | 48 | // _stats is a private struct for marshaling into & out of. 
fields must remain sorted in lexicographical order
"2001-01-01T01:01:01.000000001Z", 8 | "title": "I'm a commit" 9 | }, 10 | "dataPath": "/map/QmcCcPTqmckdXLBwPQXxfyW2BbFcUT6gqv9oGeWDkrNTyD", 11 | "meta": { 12 | "qri": "md:0", 13 | "title": "dataset with all submodels example" 14 | }, 15 | "qri": "ds:0", 16 | "structure": { 17 | "checksum": "QmcCcPTqmckdXLBwPQXxfyW2BbFcUT6gqv9oGeWDkrNTyD", 18 | "entries": 6, 19 | "errCount": 1, 20 | "format": "csv", 21 | "formatConfig": { 22 | "headerRow": true 23 | }, 24 | "length": 155, 25 | "qri": "st:0", 26 | "schema": { 27 | "items": { 28 | "items": [ 29 | { 30 | "title": "title", 31 | "type": "string" 32 | }, 33 | { 34 | "title": "duration", 35 | "type": "integer" 36 | } 37 | ], 38 | "type": "array" 39 | }, 40 | "type": "array" 41 | } 42 | }, 43 | "transform": { 44 | "data": "select * from foo", 45 | "qri": "tf:0", 46 | "resources": { 47 | "foo": "/not/a/real/path" 48 | }, 49 | "structure": { 50 | "errCount": 0, 51 | "format": "csv", 52 | "formatConfig": { 53 | "headerRow": true 54 | }, 55 | "qri": "st:0", 56 | "schema": { 57 | "items": { 58 | "items": [ 59 | { 60 | "title": "title", 61 | "type": "string" 62 | }, 63 | { 64 | "title": "duration", 65 | "type": "integer" 66 | } 67 | ], 68 | "type": "array" 69 | }, 70 | "type": "array" 71 | } 72 | }, 73 | "syntax": "sql" 74 | }, 75 | "visconfig": { 76 | "format": "foo", 77 | "qri": "vc:0", 78 | "visualizations": { 79 | "colors": { 80 | "background": "#000000", 81 | "bars": "#ffffff" 82 | }, 83 | "type": "bar" 84 | } 85 | } 86 | } -------------------------------------------------------------------------------- /validate/data.go: -------------------------------------------------------------------------------- 1 | package validate 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "fmt" 7 | 8 | "github.com/qri-io/dataset" 9 | "github.com/qri-io/dataset/dsio" 10 | "github.com/qri-io/jsonschema" 11 | ) 12 | 13 | const batchSize = 5000 14 | 15 | func flushBatch(ctx context.Context, buf *dsio.EntryBuffer, st *dataset.Structure, jsch 
*jsonschema.Schema, errs *[]jsonschema.KeyError) error { 16 | if len(buf.Bytes()) == 0 { 17 | return nil 18 | } 19 | 20 | if e := buf.Close(); e != nil { 21 | return fmt.Errorf("error closing buffer: %s", e.Error()) 22 | } 23 | 24 | var doc interface{} 25 | if err := json.Unmarshal(buf.Bytes(), &doc); err != nil { 26 | return fmt.Errorf("error parsing JSON bytes: %s", err.Error()) 27 | } 28 | validationState := jsch.Validate(ctx, doc) 29 | *errs = append(*errs, *validationState.Errs...) 30 | 31 | return nil 32 | } 33 | 34 | // EntryReader consumes a reader & returns any validation errors present 35 | // TODO - refactor this to wrap a reader & return a struct that gives an 36 | // error or nil on each entry read. 37 | func EntryReader(r dsio.EntryReader) ([]jsonschema.KeyError, error) { 38 | ctx := context.Background() 39 | st := r.Structure() 40 | 41 | jsch, err := st.JSONSchema() 42 | if err != nil { 43 | return nil, err 44 | } 45 | 46 | valErrors := []jsonschema.KeyError{} 47 | 48 | buf, err := dsio.NewEntryBuffer(&dataset.Structure{ 49 | Format: "json", 50 | Schema: st.Schema, 51 | }) 52 | if err != nil { 53 | return nil, fmt.Errorf("error allocating data buffer: %s", err.Error()) 54 | } 55 | 56 | err = dsio.EachEntry(r, func(i int, ent dsio.Entry, err error) error { 57 | if err != nil { 58 | return fmt.Errorf("error reading row %d: %s", i, err.Error()) 59 | } 60 | 61 | if i%batchSize == 0 { 62 | flushErr := flushBatch(ctx, buf, st, jsch, &valErrors) 63 | if flushErr != nil { 64 | return flushErr 65 | } 66 | var bufErr error 67 | buf, bufErr = dsio.NewEntryBuffer(&dataset.Structure{ 68 | Format: "json", 69 | Schema: st.Schema, 70 | }) 71 | if bufErr != nil { 72 | return fmt.Errorf("error allocating data buffer: %s", bufErr.Error()) 73 | } 74 | } 75 | 76 | err = buf.WriteEntry(ent) 77 | if err != nil { 78 | return fmt.Errorf("error writing row %d: %s", i, err.Error()) 79 | } 80 | 81 | return nil 82 | }) 83 | 84 | if err != nil { 85 | return nil, fmt.Errorf("error 
reading values: %s", err.Error()) 86 | } 87 | 88 | if err := flushBatch(ctx, buf, st, jsch, &valErrors); err != nil { 89 | return nil, err 90 | } 91 | 92 | return valErrors, nil 93 | } 94 | -------------------------------------------------------------------------------- /dsio/README.md: -------------------------------------------------------------------------------- 1 | ## Performance 2 | 3 | 2018-12-04 4 | 5 | go test github.com/qri-io/dataset/dsio -bench=. 6 | 7 | BenchmarkCBORWriterArrays-2 3000 431290 ns/op 8 | BenchmarkCBORWriterObjects-2 2000 698920 ns/op 9 | BenchmarkCBORReader-2 1000 1764549 ns/op 10 | BenchmarkCSVWriterArrays-2 1000 1548509 ns/op 11 | BenchmarkCSVWriterObjects-2 1000 1458219 ns/op 12 | BenchmarkCSVReader-2 1000 2008097 ns/op 13 | BenchmarkJSONWriterArrays-2 1000 1556416 ns/op 14 | BenchmarkJSONWriterObjects-2 1000 1562488 ns/op 15 | BenchmarkJSONReader-2 500 2984057 ns/op 16 | 17 | 2018-04-17 18 | 19 | go test github.com/qri-io/dataset/dsio -bench=. 20 | 21 | BenchmarkCBORWriterArrays-2 3000 478424 ns/op 22 | BenchmarkCBORWriterObjects-2 2000 584435 ns/op 23 | BenchmarkCBORReader-2 300 5081171 ns/op 24 | BenchmarkCSVWriterArrays-2 1000 1369984 ns/op 25 | BenchmarkCSVWriterObjects-2 1000 1406440 ns/op 26 | BenchmarkCSVReader-2 1000 1463376 ns/op 27 | BenchmarkJSONWriterArrays-2 1000 1377027 ns/op 28 | BenchmarkJSONWriterObjects-2 1000 1558887 ns/op 29 | BenchmarkJSONReader-2 500 2607946 ns/op 30 | 31 | 2018-03-29 32 | 33 | go test github.com/qri-io/dataset/dsio -bench=. 
34 | 35 | BenchmarkCBORWriterArrays-2 3000 423851 ns/op 36 | BenchmarkCBORWriterObjects-2 2000 572609 ns/op 37 | BenchmarkCBORReader-2 300 5024830 ns/op 38 | BenchmarkCSVWriterArrays-2 1000 1448891 ns/op 39 | BenchmarkCSVWriterObjects-2 1000 1457973 ns/op 40 | BenchmarkCSVReader-2 1000 1454932 ns/op 41 | BenchmarkJSONWriterArrays-2 1000 1423156 ns/op 42 | BenchmarkJSONWriterObjects-2 1000 1620801 ns/op 43 | BenchmarkJSONReader-2 300 5286851 ns/op 44 | 45 | ## Fuzz testing 46 | 47 | From: [https://medium.com/@dgryski/go-fuzz-github-com-arolek-ase-3c74d5a3150c](http://https://medium.com/@dgryski/go-fuzz-github-com-arolek-ase-3c74d5a3150c) 48 | 49 | How to fuzz test: 50 | 51 | go install github.com/qri-io/dataset/use_generate 52 | cd $GOPATH 53 | mkdir out 54 | bin/use_generate 55 | cp $GOPATH/out/* workdir/corpus/. 56 | 57 | go get github.com/dvyukov/go-fuzz/go-fuzz 58 | go get github.com/dvyukov/go-fuzz/go-fuzz-build 59 | go install github.com/dvyukov/go-fuzz/go-fuzz 60 | go install github.com/dvyukov/go-fuzz/go-fuzz-build 61 | 62 | go-fuzz-build github.com/qri-io/dataset/dsio 63 | go-fuzz -bin=dsio-fuzz.zip -workdir=workdir 64 | -------------------------------------------------------------------------------- /dsio/identity.go: -------------------------------------------------------------------------------- 1 | package dsio 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | 7 | "github.com/qri-io/dataset" 8 | ) 9 | 10 | // NewIdentityReader creates an EntryReader from native go types, passed in 11 | // data must be of type []interface{} or map[string]interface{} 12 | func NewIdentityReader(st *dataset.Structure, data interface{}) (*IdentityReader, error) { 13 | r := &IdentityReader{st: st} 14 | 15 | if md, ok := data.(map[string]interface{}); ok { 16 | r.entries = r.iterateMap(md) 17 | } else if sd, ok := data.([]interface{}); ok { 18 | r.entries = r.iterateSlice(sd) 19 | } else { 20 | return nil, fmt.Errorf("cannot create entry reader from type %T", data) 21 | } 22 | 23 | 
return r, nil 24 | } 25 | 26 | // IdentityReader is a dsio.EntryReader that works with native go types 27 | type IdentityReader struct { 28 | st *dataset.Structure 29 | done bool 30 | entries chan Entry 31 | } 32 | 33 | var _ EntryReader = (*IdentityReader)(nil) 34 | 35 | // Structure gives the structure being read 36 | func (r *IdentityReader) Structure() *dataset.Structure { 37 | return r.st 38 | } 39 | 40 | // ReadEntry reads one row of structured data from the reader 41 | func (r *IdentityReader) ReadEntry() (Entry, error) { 42 | if r.done { 43 | return Entry{}, io.EOF 44 | } 45 | 46 | return <-r.entries, nil 47 | } 48 | 49 | // Close finalizes the reader 50 | func (r *IdentityReader) Close() error { 51 | if !r.done { 52 | // drain channel to prevent leaking goroutine 53 | for !r.done { 54 | <-r.entries 55 | } 56 | } 57 | return nil 58 | } 59 | 60 | func (r *IdentityReader) iterateMap(data map[string]interface{}) chan Entry { 61 | res := make(chan Entry) 62 | 63 | go func() { 64 | for key, val := range data { 65 | res <- Entry{Key: key, Value: val} 66 | } 67 | r.done = true 68 | }() 69 | 70 | return res 71 | } 72 | 73 | func (r *IdentityReader) iterateSlice(data []interface{}) chan Entry { 74 | res := make(chan Entry) 75 | 76 | go func() { 77 | for i, val := range data { 78 | res <- Entry{Index: i, Value: val} 79 | } 80 | r.done = true 81 | }() 82 | 83 | return res 84 | } 85 | 86 | // IdentityWriter is a dsio.EntryWriter that works with native go types 87 | type IdentityWriter struct { 88 | st *dataset.Structure 89 | } 90 | 91 | // Structure gives the structure being written 92 | func (w *IdentityWriter) Structure() *dataset.Structure { 93 | return w.st 94 | } 95 | 96 | // WriteEntry writes one "row" of structured data to the Writer 97 | func (w *IdentityWriter) WriteEntry(e Entry) error { 98 | return nil 99 | } 100 | 101 | // Close finalizes the writer, indicating all entries 102 | // have been written 103 | func (w *IdentityWriter) Close() error { 104 | 
return nil 105 | } 106 | -------------------------------------------------------------------------------- /kind.go: -------------------------------------------------------------------------------- 1 | package dataset 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | ) 7 | 8 | // CurrentSpecVersion is the current verion of the dataset spec 9 | const CurrentSpecVersion = "0" 10 | 11 | const ( 12 | // KindDataset is the current kind for datasets 13 | KindDataset = Kind("ds:" + CurrentSpecVersion) 14 | // KindBody is the current kind for body components 15 | KindBody = Kind("bd:" + CurrentSpecVersion) 16 | // KindMeta is the current kind for metadata components 17 | KindMeta = Kind("md:" + CurrentSpecVersion) 18 | // KindStructure is the current kind for structure components 19 | KindStructure = Kind("st:" + CurrentSpecVersion) 20 | // KindTransform is the current kind for transform components 21 | KindTransform = Kind("tf:" + CurrentSpecVersion) 22 | // KindCommit is the current kind for commit components 23 | KindCommit = Kind("cm:" + CurrentSpecVersion) 24 | // KindViz is the current kind for viz components 25 | KindViz = Kind("vz:" + CurrentSpecVersion) 26 | // KindReadme is the current kind for readme components 27 | KindReadme = Kind("rm:" + CurrentSpecVersion) 28 | // KindStats is the current kind for stats components 29 | KindStats = Kind("sa:" + CurrentSpecVersion) 30 | ) 31 | 32 | // Kind is a short identifier for all types of qri dataset objects 33 | // Kind does three things: 34 | // 1. Distinguish qri datasets from other formats 35 | // 2. Distinguish different types (Dataset/Structure/Transform/etc.) 36 | // 3. 
Distinguish between versions of the dataset spec 37 | // Kind is a string in the format 2_letter_prefix + ':' + version 38 | type Kind string 39 | 40 | // String implements the stringer interface 41 | func (k Kind) String() string { 42 | return string(k) 43 | } 44 | 45 | // Valid checks to see if a kind string is valid 46 | func (k Kind) Valid() error { 47 | if len(k) < 4 { 48 | return fmt.Errorf("invalid kind: '%s'. kind must be in the form [type]:[version]", k.String()) 49 | } 50 | return nil 51 | } 52 | 53 | // Type returns the type identifier 54 | func (k Kind) Type() string { 55 | return k.String()[:2] 56 | } 57 | 58 | // Version returns the version portion of the kind identifier 59 | func (k Kind) Version() string { 60 | return k.String()[3:] 61 | } 62 | 63 | // UnmarshalJSON implements the JSON.Unmarshaler interface, 64 | // rejecting any strings that are not a valid kind 65 | func (k *Kind) UnmarshalJSON(data []byte) error { 66 | var _k string 67 | if err := json.Unmarshal(data, &_k); err != nil { 68 | return err 69 | } 70 | *k = Kind(_k) 71 | return k.Valid() 72 | } 73 | 74 | // ComponentTypePrefix prefixes a string with a two letter component type 75 | // identifier & a colon. Example: 76 | // ComponentTypePrefix(KindDataset, "hello") == "ds:hello" 77 | func ComponentTypePrefix(k Kind, str string) string { 78 | return fmt.Sprintf("%s:%s", k.Type(), str) 79 | } 80 | -------------------------------------------------------------------------------- /generate/dsgen/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/csv" 5 | "encoding/json" 6 | "flag" 7 | "fmt" 8 | "io/ioutil" 9 | "os" 10 | "strings" 11 | 12 | "github.com/qri-io/dataset" 13 | "github.com/qri-io/dataset/dsio" 14 | "github.com/qri-io/dataset/generate" 15 | ) 16 | 17 | const help = ` 18 | dsgen generates random CSV data for given tabular structure & prints to stdout. 
19 | Use "fixed" to generate 1000byte rows for a fixed 4 column schema. 20 | 21 | Usage: 22 | dsgen [structure.json] --rows [num_rows] 23 | dsgen fixed --rows [num_rows] 24 | ` 25 | 26 | var rows int 27 | 28 | func init() { 29 | flag.IntVar(&rows, "rows", 1000, "number of entries (rows) to generate") 30 | } 31 | 32 | func main() { 33 | flag.Parse() 34 | args := flag.Args() 35 | 36 | if len(args) < 1 { 37 | fmt.Println(help) 38 | os.Exit(1) 39 | } 40 | if args[0] == "fixed" { 41 | if err := writeFixedFile(rows, 0); err != nil { 42 | fmt.Println(err) 43 | os.Exit(1) 44 | } 45 | } else { 46 | if err := generateFile(args[0], rows); err != nil { 47 | fmt.Println(err) 48 | os.Exit(1) 49 | } 50 | } 51 | } 52 | 53 | func generateFile(structurePath string, lines int) error { 54 | data, err := ioutil.ReadFile(structurePath) 55 | if err != nil { 56 | return err 57 | } 58 | st := &dataset.Structure{} 59 | if err := json.Unmarshal(data, st); err != nil { 60 | return err 61 | } 62 | 63 | gen, err := generate.NewTabularGenerator(st) 64 | if err != nil { 65 | return err 66 | } 67 | 68 | w, err := dsio.NewCSVWriter(st, os.Stdout) 69 | if err != nil { 70 | return err 71 | } 72 | 73 | for i := 0; i < lines; i++ { 74 | ent, err := gen.ReadEntry() 75 | if err != nil { 76 | return err 77 | } 78 | w.WriteEntry(ent) 79 | } 80 | w.Close() 81 | gen.Close() 82 | return nil 83 | } 84 | 85 | func writeFixedFile(lines, diffStart int) error { 86 | filler := strings.Repeat("0", 908) 87 | w := csv.NewWriter(os.Stdout) 88 | w.Write([]string{"uuid", "ingest", "occurred", "raw_data"}) 89 | var uuid, ingest, occurred, rawData string 90 | for i := 0; i < lines; i++ { 91 | if diffStart > 0 && i > diffStart { 92 | // write a "diff" line 93 | uuid = fmt.Sprintf("%d-%d-BA882B47-B26A-4E29-BFB4-XXXXXXXXXXXX", i, i) 94 | ingest = fmt.Sprintf("%d%d-01-01 00:00:01.000 UTC", i, i) 95 | occurred = fmt.Sprintf("2000-%d%d-01 00:00:02.000 UTC", i, i) 96 | rawData = fmt.Sprintf("%d%d%s", i, i, filler) 97 | } else { 
98 | // write a normal line 99 | uuid = fmt.Sprintf("%d-BA882B47-B26A-4E29-BFB4-XXXXXXXXXXXX", i) 100 | ingest = fmt.Sprintf("%d-01-01 00:00:01.000 UTC", i) 101 | occurred = fmt.Sprintf("2000-%d-01 00:00:02.000 UTC", i) 102 | rawData = fmt.Sprintf("%d%s", i, filler) 103 | } 104 | w.Write([]string{uuid, ingest, occurred, rawData}) 105 | } 106 | 107 | w.Flush() 108 | return nil 109 | } 110 | -------------------------------------------------------------------------------- /compression/compression_test.go: -------------------------------------------------------------------------------- 1 | package compression 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "strings" 7 | "testing" 8 | ) 9 | 10 | func TestParseFormat(t *testing.T) { 11 | good := []string{ 12 | "gz", "gzip", "zstd", 13 | } 14 | 15 | for _, s := range good { 16 | f, err := ParseFormat(s) 17 | if err != nil { 18 | t.Errorf("unexpected error for format %q: %s", s, err) 19 | } 20 | if _, ok := SupportedFormats[f]; !ok { 21 | t.Errorf("expected %q to be a supported format", s) 22 | } 23 | } 24 | 25 | bad := []string{ 26 | "", "tar", 27 | } 28 | for _, s := range bad { 29 | if _, err := ParseFormat(s); err == nil { 30 | t.Errorf("expected format to error: %s, got nil", s) 31 | } 32 | } 33 | } 34 | 35 | func TestNew(t *testing.T) { 36 | if _, err := Compressor("invalid", &bytes.Buffer{}); err == nil { 37 | t.Error("expected error constructing with invalid compression format string") 38 | } 39 | 40 | if _, err := Decompressor("invalid", &bytes.Buffer{}); err == nil { 41 | t.Error("expected error constructing with invalid decompression format string") 42 | } 43 | 44 | SupportedFormats[Format("invalid")] = struct{}{} 45 | defer delete(SupportedFormats, Format("invalid")) 46 | 47 | if _, err := Compressor("invalid", &bytes.Buffer{}); err == nil { 48 | t.Error("expected error constructing with compression format without backing compressor") 49 | } 50 | 51 | if _, err := Decompressor("invalid", &bytes.Buffer{}); err == nil 
{ 52 | t.Error("expected error constructing with decompression format without backing decompressor") 53 | } 54 | } 55 | 56 | func TestCompressionCycle(t *testing.T) { 57 | for f := range SupportedFormats { 58 | t.Run(string(f), func(t *testing.T) { 59 | plainText := "I am a string destined to go through a compression spin cycle" 60 | 61 | buf := &bytes.Buffer{} 62 | comp, err := Compressor(f.String(), buf) 63 | if err != nil { 64 | t.Fatal(err) 65 | } 66 | 67 | if copied, err := io.Copy(comp, strings.NewReader(plainText)); err != nil { 68 | t.Fatal(err) 69 | } else if copied != int64(len([]byte(plainText))) { 70 | t.Errorf("copy byte length mismatch. want: %d got: %d", len(plainText), copied) 71 | } 72 | 73 | if err := comp.Close(); err != nil { 74 | t.Fatal(err) 75 | } 76 | 77 | if buf.String() == plainText { 78 | t.Errorf("buf contents should be compressed, unequal to plain text") 79 | } 80 | 81 | t.Log(buf.String()) 82 | 83 | decomp, err := Decompressor(f.String(), buf) 84 | if err != nil { 85 | t.Fatal(err) 86 | } 87 | defer decomp.Close() 88 | 89 | result := &bytes.Buffer{} 90 | if _, err := io.Copy(result, decomp); err != nil { 91 | t.Fatal(err) 92 | } 93 | 94 | if result.String() != plainText { 95 | t.Errorf("compression round trip result mismatch.\nwant: %s\ngot: %s", plainText, result.String()) 96 | } 97 | }) 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /stepfile/stepfile_test.go: -------------------------------------------------------------------------------- 1 | package stepfile 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "io/ioutil" 7 | "os" 8 | "path/filepath" 9 | "testing" 10 | 11 | "github.com/google/go-cmp/cmp" 12 | "github.com/qri-io/dataset" 13 | ) 14 | 15 | func TestRead(t *testing.T) { 16 | cases := []struct { 17 | inputFilename string 18 | expectFilename string 19 | }{ 20 | {"steps.txt", "steps.json"}, 21 | } 22 | 23 | for _, c := range cases { 24 | t.Run(c.inputFilename, func(t 
*testing.T) { 25 | in := filepath.Join("./testdata", c.inputFilename) 26 | expect := []*dataset.TransformStep{} 27 | f, err := os.Open(filepath.Join("./testdata", c.expectFilename)) 28 | if err != nil { 29 | t.Fatal(err) 30 | } 31 | if err := json.NewDecoder(f).Decode(&expect); err != nil { 32 | t.Fatal(err) 33 | } 34 | f.Close() 35 | 36 | got, err := ReadFile(in) 37 | if err != nil { 38 | t.Fatal(err) 39 | } 40 | 41 | if diff := cmp.Diff(expect, got); diff != "" { 42 | t.Errorf("result mismatch (-want +got):\n%s", diff) 43 | } 44 | }) 45 | } 46 | 47 | t.Run("errors", func(t *testing.T) { 48 | if _, err := ReadFile("unknown"); err == nil { 49 | t.Error("expected error reading unknown file") 50 | } 51 | }) 52 | } 53 | 54 | func TestWrite(t *testing.T) { 55 | cases := []struct { 56 | inputFilename string 57 | expectFilename string 58 | }{ 59 | {"steps.json", "steps.txt"}, 60 | } 61 | 62 | for _, c := range cases { 63 | t.Run(c.inputFilename, func(t *testing.T) { 64 | data, err := ioutil.ReadFile(filepath.Join("./testdata", c.expectFilename)) 65 | if err != nil { 66 | t.Fatal(err) 67 | } 68 | expect := string(data) 69 | 70 | input := []*dataset.TransformStep{} 71 | f, err := os.Open(filepath.Join("./testdata", c.inputFilename)) 72 | if err != nil { 73 | t.Fatal(err) 74 | } 75 | if err := json.NewDecoder(f).Decode(&input); err != nil { 76 | t.Fatal(err) 77 | } 78 | f.Close() 79 | 80 | buf := &bytes.Buffer{} 81 | if err := Write(input, buf); err != nil { 82 | t.Fatal(err) 83 | } 84 | 85 | if diff := cmp.Diff(expect, buf.String()); diff != "" { 86 | t.Errorf("result mismatch (-want +got):\n%s", diff) 87 | } 88 | }) 89 | } 90 | 91 | t.Run("write from a reader", func(t *testing.T) { 92 | steps := []*dataset.TransformStep{ 93 | {Script: bytes.NewBuffer([]byte("oh hai"))}, 94 | {Script: []byte("my friend")}, 95 | } 96 | buf := &bytes.Buffer{} 97 | if err := Write(steps, buf); err != nil { 98 | t.Error(err) 99 | } 100 | expect := "oh hai\n---\nmy friend" 101 | if diff := 
cmp.Diff(expect, buf.String()); diff != "" { 102 | t.Errorf("result mismatch. (-want +got):\n %s", diff) 103 | } 104 | }) 105 | 106 | t.Run("bad scripts", func(t *testing.T) { 107 | steps := []*dataset.TransformStep{ 108 | {Script: 2}, 109 | } 110 | buf := &bytes.Buffer{} 111 | if err := Write(steps, buf); err == nil { 112 | t.Error("expected error, got none") 113 | } 114 | }) 115 | } 116 | -------------------------------------------------------------------------------- /generate/tabular.go: -------------------------------------------------------------------------------- 1 | // Package generate is for generating random data from given structures 2 | package generate 3 | 4 | import ( 5 | "math/rand" 6 | "time" 7 | 8 | "github.com/qri-io/dataset" 9 | "github.com/qri-io/dataset/dsio" 10 | "github.com/qri-io/dataset/tabular" 11 | ) 12 | 13 | // Config stores settings for the generate package. 14 | type Config struct { 15 | random *rand.Rand 16 | maxLen int 17 | useRandomType bool 18 | } 19 | 20 | // DefaultConfig returns the default configuration for a Generator. 21 | func DefaultConfig() *Config { 22 | return &Config{ 23 | random: rand.New(rand.NewSource(time.Now().UnixNano())), 24 | maxLen: 64, 25 | useRandomType: false, 26 | } 27 | } 28 | 29 | // AssignSeed sets a specific random seed to be used. 30 | func AssignSeed(cfg *Config) { 31 | cfg.random = rand.New(rand.NewSource(4)) 32 | } 33 | 34 | // AssignMaxLen sets a maximum length for generated values. 35 | func AssignMaxLen(cfg *Config) { 36 | cfg.maxLen = 8 37 | } 38 | 39 | // AssignUseRandomType causes generator to generate random types of values. 
40 | func AssignUseRandomType(cfg *Config) { 41 | cfg.useRandomType = true 42 | } 43 | 44 | // TabularGenerator is a dsio.EntryReader that creates a new entry on each call 45 | // to ReadEntry 46 | type TabularGenerator struct { 47 | cols tabular.Columns 48 | structure *dataset.Structure 49 | gen *ValueGenerator 50 | // when generating array entries 51 | count int 52 | // only two possible structures for now are "array" or "object" 53 | schemaIsArray bool 54 | } 55 | 56 | // assert at compile time that Generator is a dsio.EntryReader 57 | var _ dsio.EntryReader = (*TabularGenerator)(nil) 58 | 59 | // NewTabularGenerator creates a tablular data generator with the given 60 | // configuration options 61 | func NewTabularGenerator(st *dataset.Structure, options ...func(*Config)) (*TabularGenerator, error) { 62 | cfg := DefaultConfig() 63 | for _, opt := range options { 64 | opt(cfg) 65 | } 66 | 67 | cols, _, err := tabular.ColumnsFromJSONSchema(st.Schema) 68 | if err != nil { 69 | return nil, err 70 | } 71 | 72 | gen := &ValueGenerator{ 73 | Rand: cfg.random, 74 | MaxStringLength: cfg.maxLen, 75 | } 76 | 77 | return &TabularGenerator{ 78 | structure: st, 79 | cols: cols, 80 | gen: gen, 81 | schemaIsArray: true, 82 | }, nil 83 | } 84 | 85 | // ReadEntry implements the dsio.EntryReader interface 86 | func (g *TabularGenerator) ReadEntry() (dsio.Entry, error) { 87 | row := make([]interface{}, len(g.cols)) 88 | for i, col := range g.cols { 89 | row[i] = g.gen.Type([]string(*col.Type)[0]) 90 | } 91 | index := g.count 92 | g.count++ 93 | return dsio.Entry{Index: index, Value: row}, nil 94 | } 95 | 96 | // Structure implements the dsio.EntryReader interface 97 | func (g TabularGenerator) Structure() *dataset.Structure { 98 | return g.structure 99 | } 100 | 101 | // Close finalizes the generator 102 | func (g TabularGenerator) Close() error { 103 | return nil 104 | } 105 | -------------------------------------------------------------------------------- 
/preview/testdata/earthquakes/readme.md: -------------------------------------------------------------------------------- 1 | # USGS Earthquakes Jan 12th 2 | 3 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Maecenas maximus erat ut rhoncus blandit. Duis aliquet vulputate leo eu volutpat. Praesent in mollis metus, non convallis lectus. Vestibulum malesuada mauris quis nisl auctor pellentesque. Duis lacinia nec justo in viverra. Quisque quis aliquet ante. Donec semper scelerisque laoreet. Praesent dapibus interdum mi, sit amet lacinia odio malesuada vitae. Proin eu erat quis nisi tristique mollis. Donec sed eleifend augue, at convallis ex. 4 | 5 | Integer at bibendum nibh. Mauris sit amet justo nisi. Duis aliquam ex sit amet urna elementum, nec venenatis diam dapibus. Donec pellentesque pretium est, eget vehicula libero fringilla id. Curabitur quam massa, interdum vel interdum sed, elementum et velit. Mauris ac consequat ante. Mauris porttitor ex vitae placerat congue. Nullam porta aliquam enim ac congue. 6 | 7 | Aenean non lacus a quam facilisis viverra. Duis mattis leo ac leo maximus dapibus. Suspendisse pulvinar elit non orci elementum ultricies et nec nunc. Maecenas bibendum sapien massa, eu vestibulum metus lacinia at. Ut laoreet nisi id magna iaculis placerat. Pellentesque scelerisque sit amet mauris ut porta. Aliquam interdum, nisi in dapibus ullamcorper, enim magna hendrerit elit, et mollis ex risus in ante. In suscipit varius metus, at posuere tellus lacinia at. Mauris nisi nibh, egestas et sollicitudin id, venenatis in erat. Donec eu lobortis magna, a rutrum mauris. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Ut scelerisque sed ipsum eu eleifend. 8 | 9 | In et porta erat, commodo volutpat neque. Integer augue ipsum, maximus a enim ac, hendrerit lobortis lorem. Nam nec dolor eget dui ornare mollis vitae at nisi. 
Nunc quis massa nec nulla vehicula posuere. Morbi sed dictum libero. Vivamus feugiat enim vel augue faucibus, vitae consequat dolor molestie. Pellentesque eu pharetra neque. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Proin sodales arcu neque, ac dignissim nisi maximus at. Morbi blandit gravida sapien, et facilisis nulla dapibus nec. Morbi id odio quis neque cursus sollicitudin. Aliquam ut elementum ante, sed rutrum sapien. 10 | 11 | Nulla ligula felis, vulputate vel tristique eu, euismod non est. Aliquam cursus, eros at scelerisque imperdiet, est nunc hendrerit nunc, vel elementum dui velit id ex. Fusce posuere mollis lorem, nec rhoncus nisi laoreet sed. Curabitur gravida ante vitae risus feugiat posuere. Aenean id euismod nunc. Curabitur eget iaculis odio, id vehicula quam. Morbi at urna nec arcu bibendum malesuada quis ac odio. Donec libero massa, viverra eget dapibus id, auctor a risus. Nam semper nisl erat, nec mollis lectus cursus ac. Aliquam pulvinar sapien sapien, ac vestibulum velit aliquet vitae. Maecenas vitae porta arcu. Maecenas tempus quam eget felis convallis, et sagittis urna suscipit. Proin at risus libero. In fermentum nisl ac felis gravida posuere. Etiam vestibulum diam placerat, vehicula orci et, convallis lacus. 
-------------------------------------------------------------------------------- /dstest/testdata/complete/input.dataset.json: -------------------------------------------------------------------------------- 1 | { 2 | "qri": "ds:0", 3 | "commit": { 4 | "qri": "cm:0", 5 | "title": "I'm a commit" 6 | }, 7 | "meta": { 8 | "qri": "md:0", 9 | "title": "dataset with all submodels example" 10 | }, 11 | "transform": { 12 | "qri": "tf:0", 13 | "syntax": "sql", 14 | "data": "select * from foo", 15 | "structure": { 16 | "qri": "st:0", 17 | "format": "csv", 18 | "formatConfig": { 19 | "headerRow": true 20 | }, 21 | "schema": { 22 | "type": "array", 23 | "items": { 24 | "type": "array", 25 | "items": [ 26 | { 27 | "title": "title", 28 | "type": "string" 29 | }, 30 | { 31 | "title": "duration", 32 | "type": "integer" 33 | } 34 | ] 35 | } 36 | } 37 | }, 38 | "resources": { 39 | "foo": "/not/a/real/path" 40 | } 41 | }, 42 | "abstractTransform": { 43 | "qri": "tf:0", 44 | "data": "select * from a", 45 | "structure": { 46 | "qri": "st:0", 47 | "format": "csv", 48 | "formatConfig": { 49 | "headerRow": true 50 | }, 51 | "schema": { 52 | "type": "array", 53 | "items": { 54 | "type": "array", 55 | "items": [ 56 | { 57 | "title": "a", 58 | "type": "string" 59 | }, 60 | { 61 | "title": "b", 62 | "type": "integer" 63 | } 64 | ] 65 | } 66 | } 67 | }, 68 | "resources": { 69 | "a": "/fake/path/to/abstract/dataset/" 70 | } 71 | }, 72 | "abstract": { 73 | "qri": "ds:0", 74 | "structure": { 75 | "qri": "st:0", 76 | "format": "csv", 77 | "formatConfig": { 78 | "headerRow": true 79 | }, 80 | "schema": { 81 | "type": "array", 82 | "items": { 83 | "type": "array", 84 | "items": [ 85 | { 86 | "title": "a", 87 | "type": "string" 88 | }, 89 | { 90 | "title": "b", 91 | "type": "integer" 92 | } 93 | ] 94 | } 95 | } 96 | } 97 | }, 98 | "structure": { 99 | "qri": "st:0", 100 | "format": "csv", 101 | "formatConfig": { 102 | "headerRow": true 103 | }, 104 | "schema": { 105 | "type": "array", 106 | "items": { 
// Package compression presents a uniform interface for a set of compression
// readers & writers in various formats
package compression

import (
	"fmt"
	"io"

	"github.com/klauspost/compress/gzip"
	"github.com/klauspost/compress/zstd"
)

const (
	// FmtNone is a sentinel for no compression
	FmtNone Format = ""
	// FmtZStandard compression https://facebook.github.io/zstd/
	FmtZStandard Format = "zst"
	// FmtGZip GNU zip compression https://www.gnu.org/software/gzip/
	FmtGZip Format = "gzip"
)

// Format represents a type of byte compression
type Format string

// String implements the stringer interface
func (s Format) String() string {
	return string(s)
}

// SupportedFormats indexes supported formats in a map for lookups
var SupportedFormats = map[Format]struct{}{
	FmtZStandard: {},
	FmtGZip:      {},
}

// ParseFormat interprets a string into a supported compression format.
// It errors when provided the empty string (the "no compression" format)
// or any other unrecognized value.
func ParseFormat(s string) (f Format, err error) {
	f, ok := map[string]Format{
		"gzip": FmtGZip,
		"gz":   FmtGZip,
		"zst":  FmtZStandard,
		"zstd": FmtZStandard, // not a common file ending, but "zstd" is the shorthand name for the library
	}[s]

	if !ok {
		return f, fmt.Errorf("invalid compression format %q", s)
	}

	// defensive check: every entry in the alias map above is currently in
	// SupportedFormats, so this only fires if the two fall out of sync
	if _, ok := SupportedFormats[f]; !ok {
		return FmtNone, fmt.Errorf("unsupported compression format: %q", s)
	}

	return f, nil
}

// Compressor wraps a given writer with a specified compression format.
// Callers must Close the writer to fully flush the compressor
func Compressor(compressionFormat string, w io.Writer) (io.WriteCloser, error) {
	f, err := ParseFormat(compressionFormat)
	if err != nil {
		return nil, err
	}

	switch f {
	case FmtZStandard:
		return zstd.NewWriter(w)
	case FmtGZip:
		return gzip.NewWriter(w), nil
	}

	// unreachable while ParseFormat and SupportedFormats stay in sync
	return nil, fmt.Errorf("no available compressor for %q format", f)
}

// Decompressor wraps a reader of compressed data with a decompressor.
// Callers must .Close() the reader
func Decompressor(compressionFormat string, r io.Reader) (io.ReadCloser, error) {
	f, err := ParseFormat(compressionFormat)
	if err != nil {
		return nil, err
	}

	switch f {
	case FmtZStandard:
		rdr, err := zstd.NewReader(r)
		if err != nil {
			return nil, err
		}
		return zstdReadCloserShim{rdr}, nil
	case FmtGZip:
		return gzip.NewReader(r)
	}

	// unreachable while ParseFormat and SupportedFormats stay in sync
	return nil, fmt.Errorf("no available decompressor for %q format", f)
}

// zstdReadCloserShim is a small struct to compensate for zstd's decoder
// Close() method, which returns no error and therefore breaks the
// io.ReadCloser interface. The shim wraps Close in an error-returning
// method whose error will never occur
type zstdReadCloserShim struct {
	*zstd.Decoder
}

// Close releases decoder resources, always returning a nil error
func (d zstdReadCloserShim) Close() error {
	d.Decoder.Close()
	return nil
}
"title": "method_code", 94 | "type": "integer" 95 | }, 96 | { 97 | "title": "method_name", 98 | "type": "string" 99 | }, 100 | { 101 | "title": "local_site_name", 102 | "type": "string" 103 | }, 104 | { 105 | "title": "address", 106 | "type": "string" 107 | }, 108 | { 109 | "title": "state_name", 110 | "type": "string" 111 | }, 112 | { 113 | "title": "county_name", 114 | "type": "string" 115 | }, 116 | { 117 | "title": "city_name", 118 | "type": "string" 119 | }, 120 | { 121 | "title": "cbsa_name", 122 | "type": "string" 123 | }, 124 | { 125 | "title": "date_of_last_change", 126 | "type": "string" 127 | } 128 | ] 129 | } 130 | } 131 | } -------------------------------------------------------------------------------- /detect/testdata/daily_wind_2011.csv: -------------------------------------------------------------------------------- 1 | "State Code","County Code","Site Num","Parameter Code","POC","Latitude","Longitude","Datum","Parameter Name","Sample Duration","Pollutant Standard","Date Local","Units of Measure","Event Type","Observation Count","Observation Percent","Arithmetic Mean","1st Max Value","1st Max Hour","AQI","Method Code","Method Name","Local Site Name","Address","State Name","County Name","City Name","CBSA Name","Date of Last Change" 2 | "01","073","0023","61103",1,33.553056,-86.815,"WGS84","Wind Speed - Resultant","1 HOUR","","2011-01-01","Knots","None",24,100.0,3.25,6.6,1,"","061","Instrumental - Met One Sonic Anemometer Model 50.5","North Birmingham","NO. B'HAM,SOU R.R., 3009 28TH ST. NO.","Alabama","Jefferson","Birmingham","Birmingham-Hoover, AL","2016-04-22" 3 | "01","073","0023","61103",1,33.553056,-86.815,"WGS84","Wind Speed - Resultant","1 HOUR","","2011-01-02","Knots","None",24,100.0,2.033333,3.6,13,"","061","Instrumental - Met One Sonic Anemometer Model 50.5","North Birmingham","NO. B'HAM,SOU R.R., 3009 28TH ST. 
NO.","Alabama","Jefferson","Birmingham","Birmingham-Hoover, AL","2016-04-22" 4 | "01","073","0023","61103",1,33.553056,-86.815,"WGS84","Wind Speed - Resultant","1 HOUR","","2011-01-03","Knots","None",24,100.0,0.991667,2.1,11,"","061","Instrumental - Met One Sonic Anemometer Model 50.5","North Birmingham","NO. B'HAM,SOU R.R., 3009 28TH ST. NO.","Alabama","Jefferson","Birmingham","Birmingham-Hoover, AL","2016-04-22" 5 | "01","073","0023","61103",1,33.553056,-86.815,"WGS84","Wind Speed - Resultant","1 HOUR","","2011-01-04","Knots","None",24,100.0,1.091667,2.3,11,"","061","Instrumental - Met One Sonic Anemometer Model 50.5","North Birmingham","NO. B'HAM,SOU R.R., 3009 28TH ST. NO.","Alabama","Jefferson","Birmingham","Birmingham-Hoover, AL","2016-04-22" 6 | "01","073","0023","61103",1,33.553056,-86.815,"WGS84","Wind Speed - Resultant","1 HOUR","","2011-01-05","Knots","None",24,100.0,1.5875,3.2,20,"","061","Instrumental - Met One Sonic Anemometer Model 50.5","North Birmingham","NO. B'HAM,SOU R.R., 3009 28TH ST. NO.","Alabama","Jefferson","Birmingham","Birmingham-Hoover, AL","2016-04-22" 7 | "01","073","0023","61103",1,33.553056,-86.815,"WGS84","Wind Speed - Resultant","1 HOUR","","2011-01-06","Knots","None",24,100.0,2.508333,4,13,"","061","Instrumental - Met One Sonic Anemometer Model 50.5","North Birmingham","NO. B'HAM,SOU R.R., 3009 28TH ST. NO.","Alabama","Jefferson","Birmingham","Birmingham-Hoover, AL","2016-04-22" 8 | "01","073","0023","61103",1,33.553056,-86.815,"WGS84","Wind Speed - Resultant","1 HOUR","","2011-01-07","Knots","None",24,100.0,3.991667,5.7,12,"","061","Instrumental - Met One Sonic Anemometer Model 50.5","North Birmingham","NO. B'HAM,SOU R.R., 3009 28TH ST. 
NO.","Alabama","Jefferson","Birmingham","Birmingham-Hoover, AL","2016-04-22" 9 | "01","073","0023","61103",1,33.553056,-86.815,"WGS84","Wind Speed - Resultant","1 HOUR","","2011-01-08","Knots","None",24,100.0,5.3,5.3,0,"","061","Instrumental - Met One Sonic Anemometer Model 50.5","North Birmingham","NO. B'HAM,SOU R.R., 3009 28TH ST. NO.","Alabama","Jefferson","Birmingham","Birmingham-Hoover, AL","2016-04-22" 10 | "01","073","0023","61103",1,33.553056,-86.815,"WGS84","Wind Speed - Resultant","1 HOUR","","2011-01-09","Knots","None",24,100.0,5.3,5.3,0,"","061","Instrumental - Met One Sonic Anemometer Model 50.5","North Birmingham","NO. B'HAM,SOU R.R., 3009 28TH ST. NO.","Alabama","Jefferson","Birmingham","Birmingham-Hoover, AL","2016-04-22" -------------------------------------------------------------------------------- /dsio/ndjson_test.go: -------------------------------------------------------------------------------- 1 | package dsio 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "strings" 8 | "testing" 9 | 10 | "github.com/google/go-cmp/cmp" 11 | "github.com/qri-io/dataset" 12 | "github.com/qri-io/dataset/compression" 13 | ) 14 | 15 | func TestNDJSONReadWrite(t *testing.T) { 16 | data := `["a","b","c"] 17 | "apples" 18 | true 19 | 35 20 | null 21 | {} 22 | ` 23 | 24 | st := &dataset.Structure{ 25 | Format: dataset.NDJSONDataFormat.String(), 26 | Schema: dataset.BaseSchemaArray, 27 | } 28 | 29 | rdr, err := NewEntryReader(st, strings.NewReader(data)) 30 | if err != nil { 31 | t.Fatal(err) 32 | } 33 | 34 | buf := &bytes.Buffer{} 35 | wr, err := NewEntryWriter(st, buf) 36 | 37 | if err := Copy(rdr, wr); err != nil { 38 | t.Fatal(err) 39 | } 40 | rdr.Close() 41 | wr.Close() 42 | 43 | if diff := cmp.Diff(data, buf.String()); diff != "" { 44 | t.Errorf("result mismatch (-want +got):\n%s", diff) 45 | } 46 | } 47 | 48 | func TestNDJSONCompression(t *testing.T) { 49 | invalidCompressionSt := &dataset.Structure{Format: "ndjson", Compression: "invalid", Schema: 
dataset.BaseSchemaArray} 50 | if _, err := NewJSONReader(invalidCompressionSt, nil); err == nil { 51 | t.Errorf("constructing reader with invalid compression should error") 52 | } 53 | if _, err := NewJSONWriter(invalidCompressionSt, nil); err == nil { 54 | t.Errorf("constructing writer with invalid compression should error") 55 | } 56 | 57 | data := `["a","b","c"] 58 | "apples" 59 | true 60 | 35 61 | null 62 | {} 63 | ` 64 | 65 | compressed := &bytes.Buffer{} 66 | compressor, _ := compression.Compressor("zst", compressed) 67 | io.Copy(compressor, strings.NewReader(data)) 68 | compressor.Close() 69 | 70 | st := &dataset.Structure{ 71 | Format: "ndjson", 72 | Compression: "zst", 73 | Schema: dataset.BaseSchemaArray, 74 | } 75 | 76 | rdr, err := NewNDJSONReader(st, compressed) 77 | if err != nil { 78 | t.Fatal(err) 79 | } 80 | 81 | compressed2 := &bytes.Buffer{} 82 | wr, err := NewNDJSONWriter(st, compressed2) 83 | if err != nil { 84 | t.Fatal(err) 85 | } 86 | 87 | if err := Copy(rdr, wr); err != nil { 88 | t.Fatal(err) 89 | } 90 | rdr.Close() 91 | wr.Close() 92 | 93 | if diff := cmp.Diff(compressed.Bytes(), compressed2.Bytes()); diff != "" { 94 | t.Errorf("result mismatch expect (-want +got):\n%s", diff) 95 | } 96 | } 97 | 98 | func TestNDJSONReaderSizeOverflow(t *testing.T) { 99 | // run a test with one 24,000-character long string to ensure the reader 100 | // doesn't choke on a long line of JSON 101 | st := &dataset.Structure{ 102 | Format: "ndjson", 103 | Schema: dataset.BaseSchemaArray, 104 | } 105 | data := fmt.Sprintf(`"hi" 106 | false 107 | %q 108 | null 109 | "bye" 110 | `, strings.Repeat("long", 1024*6)) 111 | 112 | rdr, err := NewNDJSONReader(st, strings.NewReader(data)) 113 | if err != nil { 114 | t.Fatal(err) 115 | } 116 | 117 | vals, err := ReadAll(rdr) 118 | if err != nil { 119 | t.Error(err) 120 | } 121 | 122 | if err := rdr.Close(); err != nil { 123 | t.Error(err) 124 | } 125 | 126 | expect := []interface{}{ 127 | "hi", 128 | false, 129 | 
strings.Repeat("long", 1024*6), 130 | nil, 131 | "bye", 132 | } 133 | 134 | if diff := cmp.Diff(expect, vals); diff != "" { 135 | t.Errorf("result mismatch (-want +got):\n%s", diff) 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /preview/preview.go: -------------------------------------------------------------------------------- 1 | package preview 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "encoding/json" 7 | "fmt" 8 | "io" 9 | "io/ioutil" 10 | 11 | logger "github.com/ipfs/go-log" 12 | "github.com/qri-io/dataset" 13 | "github.com/qri-io/dataset/dsio" 14 | "github.com/qri-io/qfs" 15 | ) 16 | 17 | var ( 18 | log = logger.Logger("preview") 19 | ) 20 | 21 | const ( 22 | // MaxNumDatasetRowsInPreview is the highest number of rows a dataset preview 23 | // can contain 24 | MaxNumDatasetRowsInPreview = 100 25 | // MaxStatsBytes is the maximum number of bytes reserved in a preview for stats 26 | // values. 27 | // TODO(b5): this value is not currently honored, requires implementing 28 | // dataset.Stats.Abbreviate 29 | MaxStatsBytes = 10000 30 | // MaxReadmePreviewBytes determines the maximum amount of bytes a readme 31 | // preview can be. 
three bytes less than 1000 to make room for an elipsis 32 | MaxReadmePreviewBytes = 997 33 | ) 34 | 35 | // Create generates a preview for a dataset version 36 | // It expects the passed in dataset to have any relevant script files already 37 | // loaded 38 | // Preview currently includes: 39 | // - body: 100 rows 40 | // - readme: first 997 bytes 41 | // - meta: all 42 | // - commit: all 43 | // - structure: all 44 | // - stats: all 45 | // - viz: all 46 | // - transform: all 47 | func Create(ctx context.Context, ds *dataset.Dataset) (*dataset.Dataset, error) { 48 | 49 | if ds == nil { 50 | log.Debug("Create: nil dataset") 51 | return nil, fmt.Errorf("nil dataset") 52 | } 53 | if ds.IsEmpty() { 54 | log.Debug("Create: empty dataset") 55 | return nil, fmt.Errorf("empty dataset") 56 | } 57 | 58 | p := &dataset.Dataset{} 59 | p.Assign(ds) 60 | 61 | if ds.Readme != nil && ds.Readme.ScriptFile() != nil { 62 | buf := &bytes.Buffer{} 63 | f := ds.Readme.ScriptFile() 64 | tr := io.TeeReader(f, buf) 65 | 66 | content, err := ioutil.ReadAll(io.LimitReader(tr, MaxReadmePreviewBytes)) 67 | if err != nil { 68 | log.Debugw("Reading Readme", "err", err.Error()) 69 | return nil, err 70 | } 71 | if len(content) >= MaxReadmePreviewBytes { 72 | content = append(content, []byte("...")...) 
73 | } 74 | ds.Readme.Text = string(content) 75 | 76 | ds.Readme.SetScriptFile(qfs.NewMemfileReader(f.FullPath(), io.MultiReader(buf, f))) 77 | } 78 | 79 | if ds.BodyFile() != nil { 80 | st := &dataset.Structure{ 81 | Format: "json", 82 | Schema: ds.Structure.Schema, 83 | } 84 | 85 | buf := &bytes.Buffer{} 86 | f := ds.BodyFile() 87 | tr := io.TeeReader(f, buf) 88 | teedFile := qfs.NewMemfileReader(f.FullPath(), tr) 89 | size := -1 90 | if sf, ok := f.(qfs.SizeFile); ok { 91 | size = int(sf.Size()) 92 | } 93 | 94 | data, err := dsio.ConvertFile(teedFile, ds.Structure, st, MaxNumDatasetRowsInPreview, 0, false) 95 | if err != nil { 96 | log.Debugw("converting body file", "err", err.Error()) 97 | return nil, err 98 | } 99 | 100 | ds.Body = json.RawMessage(data) 101 | ds.SetBodyFile(qfs.NewMemfileReaderSize(f.FullPath(), io.MultiReader(buf, f), int64(size))) 102 | } 103 | 104 | // Note: stats can get arbitrarily large, potentially bloating the size 105 | // of previews. Add a method for bounding the final size of stats to a 106 | // constant byte size 107 | if ds.Stats != nil && !ds.Stats.IsEmpty() { 108 | p.Stats = ds.Stats 109 | } 110 | 111 | return ds, nil 112 | } 113 | -------------------------------------------------------------------------------- /vals/compare_test.go: -------------------------------------------------------------------------------- 1 | package vals 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestEqual(t *testing.T) { 8 | cases := []struct { 9 | a, b Value 10 | expect bool 11 | }{ 12 | {Array{Number(1)}, Array{Number(1)}, true}, 13 | {Array{Number(1)}, Array{Number(2)}, false}, 14 | {Object{"a": String("a")}, Object{"a": String("a")}, true}, 15 | {Object{"a": String("a")}, Object{"a": String("b")}, false}, 16 | {String("a"), String("a"), true}, 17 | {String("a"), String("b"), false}, 18 | {Boolean(true), Boolean(true), true}, 19 | {Boolean(true), Boolean(false), false}, 20 | {Integer(1), Integer(1), true}, 21 | {Integer(1), Integer(2), 
false}, 22 | {Number(1.1), Number(1.1), true}, 23 | {Number(1.1), Number(1.11), false}, 24 | } 25 | 26 | for i, c := range cases { 27 | got := Equal(c.a, c.b) 28 | if got != c.expect { 29 | t.Errorf("case: %d. %v == %v != %t", i, c.a, c.b, c.expect) 30 | } 31 | } 32 | } 33 | 34 | func TestCompareTypeBytes(t *testing.T) { 35 | cases := []struct { 36 | a, b string 37 | t Type 38 | expect int 39 | err string 40 | }{ 41 | {"0", "0", TypeUnknown, 0, "invalid type comparison"}, 42 | {"", "", TypeString, 0, ""}, 43 | {"", "foo", TypeString, -1, ""}, 44 | {"foo", "", TypeString, 1, ""}, 45 | {"foo", "bar", TypeString, 1, ""}, 46 | {"bar", "foo", TypeString, -1, ""}, 47 | {"0", "0", TypeNumber, 0, ""}, 48 | {"0", "0", TypeInteger, 0, ""}, 49 | } 50 | 51 | for i, c := range cases { 52 | got, err := CompareTypeBytes([]byte(c.a), []byte(c.b), c.t) 53 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) { 54 | t.Errorf("case %d error mismatch. expected: %s, got: %s", i, c.err, err) 55 | continue 56 | } 57 | if got != c.expect { 58 | t.Errorf("case %d response mismatch: %d != %d", i, c.expect, got) 59 | continue 60 | } 61 | } 62 | } 63 | 64 | func TestCompareIntegerBytes(t *testing.T) { 65 | cases := []struct { 66 | a, b string 67 | expect int 68 | err string 69 | }{ 70 | {"0", "", 0, "strconv.ParseInt: parsing \"\": invalid syntax"}, 71 | {"", "0", 0, "strconv.ParseInt: parsing \"\": invalid syntax"}, 72 | {"0", "0", 0, ""}, 73 | {"-1", "0", -1, ""}, 74 | {"0", "-1", 1, ""}, 75 | } 76 | 77 | for i, c := range cases { 78 | got, err := CompareIntegerBytes([]byte(c.a), []byte(c.b)) 79 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) { 80 | t.Errorf("case %d error mismatch. 
expected: %s, got: %s", i, c.err, err) 81 | continue 82 | } 83 | if got != c.expect { 84 | t.Errorf("case %d response mismatch: %d != %d", i, c.expect, got) 85 | continue 86 | } 87 | } 88 | } 89 | 90 | func TestCompareNumberBytes(t *testing.T) { 91 | cases := []struct { 92 | a, b string 93 | expect int 94 | err string 95 | }{ 96 | {"0", "", 0, "strconv.ParseFloat: parsing \"\": invalid syntax"}, 97 | {"", "0", 0, "strconv.ParseFloat: parsing \"\": invalid syntax"}, 98 | {"0", "0", 0, ""}, 99 | {"-1", "0", -1, ""}, 100 | {"0", "-1", 1, ""}, 101 | } 102 | 103 | for i, c := range cases { 104 | got, err := CompareNumberBytes([]byte(c.a), []byte(c.b)) 105 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) { 106 | t.Errorf("case %d error mismatch. expected: %s, got: %s", i, c.err, err) 107 | continue 108 | } 109 | if got != c.expect { 110 | t.Errorf("case %d response mismatch: %d != %d", i, c.expect, got) 111 | continue 112 | } 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /vals/coding_test.go: -------------------------------------------------------------------------------- 1 | package vals 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "testing" 7 | ) 8 | 9 | var ( 10 | array0 = &Array{String("a"), Boolean(false), Null(true), Integer(2), Number(23.5)} 11 | object0 = &Object{"city": String("toronto"), "pop": Integer(40000000), "avg_age": Number(55.5), "in_usa": Boolean(false)} 12 | array1 = &Array{*array0, *array0} 13 | array2 = &Array{*object0, *object0} 14 | ) 15 | 16 | func TestConvertDecoded(t *testing.T) { 17 | cases := []struct { 18 | in interface{} 19 | expect Value 20 | err string 21 | }{ 22 | {map[string]interface{}{}, &Object{}, ""}, 23 | {map[string]interface{}{ 24 | "a": 0, 25 | "b": float64(0), 26 | "c": nil, 27 | "d": true, 28 | "e": "foo", 29 | "f": []interface{}{}, 30 | "g": map[string]interface{}{}, 31 | "h": uint8(0), 32 | "i": uint16(0), 33 | "j": uint64(0), 34 | "k": 
int32(0), 35 | "l": int64(0), 36 | "m": map[interface{}]interface{}{}, 37 | }, &Object{ 38 | "a": Integer(0), 39 | "b": Number(0), 40 | "c": Null(true), 41 | "d": Boolean(true), 42 | "e": String("foo"), 43 | "f": &Array{}, 44 | "g": &Object{}, 45 | "h": Integer(0), 46 | "i": Integer(0), 47 | "j": Integer(0), 48 | "k": Integer(0), 49 | "l": Integer(0), 50 | "m": &Object{}, 51 | }, ""}, 52 | } 53 | 54 | for i, c := range cases { 55 | got, err := ConvertDecoded(c.in) 56 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) { 57 | t.Errorf("case %d error mismatch. expected: %s, got: %s", i, c.err, err) 58 | continue 59 | } 60 | 61 | if !Equal(c.expect, got) { 62 | t.Errorf("case %d result mismatch. epxected: %#v, got: %#v", i, c.expect, got) 63 | continue 64 | } 65 | } 66 | } 67 | 68 | func TestUnmarshalJSON(t *testing.T) { 69 | cases := []struct { 70 | input string 71 | expect Value 72 | err string 73 | }{ 74 | {`"foo"`, String("foo"), ""}, 75 | {`123`, Integer(123), ""}, 76 | {`123.45`, Number(123.45), ""}, 77 | {`{ "city" : "toronto", "pop" : 40000000, "avg_age" : 55.5 , "in_usa" : false }`, *object0, ""}, 78 | {`["a", false, null, 2, 23.5]`, *array0, ""}, 79 | {`[null, null, null]`, Array{Null(true), Null(true), Null(true)}, ""}, 80 | {`[["a", false, null, 2, 23.5],["a", false, null, 2, 23.5]]`, *array1, ""}, 81 | {`[{ "city" : "toronto", "pop" : 40000000, "avg_age" : 55.5 , "in_usa" : false },{ "city" : "toronto", "pop" : 40000000, "avg_age" : 55.5 , "in_usa" : false }]`, *array2, ""}, 82 | } 83 | for i, c := range cases { 84 | got, err := UnmarshalJSON([]byte(c.input)) 85 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) { 86 | t.Errorf("case %d error mismatch. expected: '%s', got: '%s'", i, c.err, err) 87 | continue 88 | } 89 | 90 | if !Equal(c.expect, got) { 91 | t.Errorf("case %d result mismatch. 
expected: %#v, got: %#v", i, c.expect, got) 92 | continue 93 | } 94 | } 95 | } 96 | 97 | func TestMarshalJSON(t *testing.T) { 98 | d := Array{ 99 | Object{"foo": Boolean(false)}, 100 | Boolean(true), 101 | Integer(12), 102 | Null(true), 103 | Number(123.456), 104 | Array{String("foo"), String("bar")}, 105 | } 106 | 107 | b, err := json.Marshal(d) 108 | if err != nil { 109 | t.Errorf("unexpected error marshaling to JSON: %s", err.Error()) 110 | return 111 | } 112 | 113 | expect := `[{"foo":false},true,12,null,123.456,["foo","bar"]]` 114 | if !bytes.Equal([]byte(expect), b) { 115 | t.Errorf("byte mismatch. expected: %s, got: %s", expect, string(b)) 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /dstest/compare.go: -------------------------------------------------------------------------------- 1 | package dstest 2 | 3 | import ( 4 | "github.com/google/go-cmp/cmp" 5 | "github.com/google/go-cmp/cmp/cmpopts" 6 | "github.com/qri-io/dataset" 7 | ) 8 | 9 | // CompareDatasets checks two given datasets for equality, returng a diff string 10 | // describing the difference between each dataset/ return will be the empty 11 | // string 12 | // if datasets are equal 13 | // CompareDatasets defaults to a strict compraison of all exported fields 14 | // operates on copies of passed-in datasets to keep this function free of side 15 | // effects 16 | func CompareDatasets(expect, got *dataset.Dataset, opts ...CompareOpts) string { 17 | cfg := &CompareConfig{} 18 | for _, opt := range opts { 19 | opt.Apply(cfg) 20 | } 21 | 22 | a := &dataset.Dataset{} 23 | a.Assign(expect) 24 | 25 | b := &dataset.Dataset{} 26 | b.Assign(got) 27 | 28 | if cfg.dropTransients { 29 | a.DropTransientValues() 30 | a.DropTransientValues() 31 | } 32 | 33 | return cmp.Diff(a, b, cmpopts.IgnoreUnexported( 34 | dataset.Dataset{}, 35 | dataset.Commit{}, 36 | dataset.Meta{}, 37 | dataset.Transform{}, 38 | dataset.Readme{}, 39 | dataset.Viz{}, 40 | )) 41 | } 42 
| 43 | // CompareConfig defines configuration parameters, which are unexported, but 44 | // settable via CompareOpt's supplied ot a Compare function 45 | type CompareConfig struct { 46 | dropTransients bool 47 | } 48 | 49 | // CompareOpts adusts component comparison functions 50 | type CompareOpts interface { 51 | Apply(cfg *CompareConfig) 52 | } 53 | 54 | // OptDropTransientValues drops transients on both dataset before making the 55 | // comparison, allowing things like dataset name & 56 | type OptDropTransientValues int 57 | 58 | // Apply sets unexported configuration 59 | func (OptDropTransientValues) Apply(cfg *CompareConfig) { 60 | cfg.dropTransients = true 61 | } 62 | 63 | // CompareCommits is CompareDatasets, but for commit components 64 | func CompareCommits(expect, got *dataset.Commit, opts ...CompareOpts) string { 65 | cfg := &CompareConfig{} 66 | for _, opt := range opts { 67 | opt.Apply(cfg) 68 | } 69 | 70 | a := &dataset.Commit{} 71 | a.Assign(expect) 72 | 73 | b := &dataset.Commit{} 74 | b.Assign(got) 75 | 76 | if cfg.dropTransients { 77 | a.DropTransientValues() 78 | a.DropTransientValues() 79 | } 80 | 81 | return cmp.Diff(a, b, cmpopts.IgnoreUnexported( 82 | dataset.Commit{}, 83 | )) 84 | } 85 | 86 | // CompareMetas is CompareDatasets, but for meta components 87 | func CompareMetas(expect, got *dataset.Meta, opts ...CompareOpts) string { 88 | cfg := &CompareConfig{} 89 | for _, opt := range opts { 90 | opt.Apply(cfg) 91 | } 92 | 93 | a := &dataset.Meta{} 94 | a.Assign(expect) 95 | 96 | b := &dataset.Meta{} 97 | b.Assign(got) 98 | 99 | if cfg.dropTransients { 100 | a.DropTransientValues() 101 | a.DropTransientValues() 102 | } 103 | 104 | return cmp.Diff(a, b, cmpopts.IgnoreUnexported( 105 | dataset.Meta{}, 106 | )) 107 | } 108 | 109 | // CompareStructures is CompareDatasets, but for structure components 110 | func CompareStructures(expect, got *dataset.Structure, opts ...CompareOpts) string { 111 | cfg := &CompareConfig{} 112 | for _, opt := range 
opts { 113 | opt.Apply(cfg) 114 | } 115 | 116 | a := &dataset.Structure{} 117 | a.Assign(expect) 118 | 119 | b := &dataset.Structure{} 120 | b.Assign(got) 121 | 122 | if cfg.dropTransients { 123 | a.DropTransientValues() 124 | a.DropTransientValues() 125 | } 126 | 127 | return cmp.Diff(a, b, cmpopts.IgnoreUnexported( 128 | dataset.Structure{}, 129 | )) 130 | } 131 | -------------------------------------------------------------------------------- /dsio/ndjson.go: -------------------------------------------------------------------------------- 1 | package dsio 2 | 3 | import ( 4 | "bufio" 5 | "encoding/json" 6 | "fmt" 7 | "io" 8 | 9 | "github.com/qri-io/dataset" 10 | ) 11 | 12 | // NDJSONReader implements the EntryReader interface for the JSON data format 13 | type NDJSONReader struct { 14 | entriesRead int 15 | st *dataset.Structure 16 | buf *bufio.Reader 17 | close func() error // close func from wrapped reader 18 | prevSize int // when buffer is extended, remember how much of the old buffer to discard 19 | } 20 | 21 | var _ EntryReader = (*NDJSONReader)(nil) 22 | 23 | // NewNDJSONReader creates a reader from a structure and read source 24 | func NewNDJSONReader(st *dataset.Structure, r io.Reader) (*NDJSONReader, error) { 25 | if st.Schema == nil { 26 | err := fmt.Errorf("schema required for NDJSON reader") 27 | log.Debug(err.Error()) 28 | return nil, err 29 | } 30 | 31 | tlt, err := GetTopLevelType(st) 32 | if err != nil { 33 | return nil, err 34 | } 35 | if tlt != "array" { 36 | return nil, fmt.Errorf("NDJSON top level type must be 'array'") 37 | } 38 | 39 | r, close, err := maybeWrapDecompressor(st, r) 40 | if err != nil { 41 | return nil, err 42 | } 43 | 44 | ndjr := &NDJSONReader{ 45 | st: st, 46 | buf: bufio.NewReader(r), 47 | close: close, 48 | } 49 | return ndjr, nil 50 | } 51 | 52 | // Structure gives this writer's structure 53 | func (r *NDJSONReader) Structure() *dataset.Structure { 54 | return r.st 55 | } 56 | 57 | // ReadEntry reads one JSON record 
from the reader 58 | func (r *NDJSONReader) ReadEntry() (Entry, error) { 59 | line, err := r.buf.ReadBytes('\n') 60 | if err != nil { 61 | return Entry{}, err 62 | } 63 | 64 | var v interface{} 65 | if err := json.Unmarshal(line, &v); err != nil { 66 | return Entry{}, err 67 | } 68 | 69 | ent := Entry{ 70 | Index: r.entriesRead, 71 | Value: v, 72 | } 73 | 74 | r.entriesRead++ 75 | return ent, nil 76 | } 77 | 78 | // Close finalizes the reader 79 | func (r *NDJSONReader) Close() error { 80 | if r.close != nil { 81 | return r.close() 82 | } 83 | return nil 84 | } 85 | 86 | // NDJSONWriter implements the EntryWriter interface for 87 | // Newline-Deliminted-JSON-formatted data 88 | type NDJSONWriter struct { 89 | rowsWritten int 90 | st *dataset.Structure 91 | wr io.Writer 92 | enc *json.Encoder 93 | close func() error // close func from wrapped writer 94 | } 95 | 96 | var _ EntryWriter = (*NDJSONWriter)(nil) 97 | 98 | // NewNDJSONWriter creates a Writer from a structure and write destination 99 | func NewNDJSONWriter(st *dataset.Structure, w io.Writer) (*NDJSONWriter, error) { 100 | if st.Schema == nil { 101 | err := fmt.Errorf("schema required for NDJSON writer") 102 | log.Debug(err.Error()) 103 | return nil, err 104 | } 105 | 106 | w, close, err := maybeWrapCompressor(st, w) 107 | if err != nil { 108 | return nil, err 109 | } 110 | 111 | jw := &NDJSONWriter{ 112 | st: st, 113 | wr: w, 114 | enc: json.NewEncoder(w), 115 | close: close, 116 | } 117 | 118 | return jw, nil 119 | } 120 | 121 | // Structure gives this writer's structure 122 | func (w *NDJSONWriter) Structure() *dataset.Structure { 123 | return w.st 124 | } 125 | 126 | // WriteEntry writes one JSON entry to the writer 127 | func (w *NDJSONWriter) WriteEntry(ent Entry) error { 128 | return w.enc.Encode(ent.Value) 129 | } 130 | 131 | // Close finalizes the writer 132 | func (w *NDJSONWriter) Close() error { 133 | if w.close != nil { 134 | return w.close() 135 | } 136 | return nil 137 | } 138 | 
-------------------------------------------------------------------------------- /testdata/datasets/complete.json: -------------------------------------------------------------------------------- 1 | { 2 | "qri": "ds:0", 3 | "meta": { 4 | "title": "dataset with all submodels example", 5 | "description": "foo", 6 | "accessURL": "foo", 7 | "downloadURL": "foo", 8 | "accrualPeriodicity": "1W", 9 | "version": "0", 10 | "readme": "foo", 11 | "queryString": "foo", 12 | "previous": "foo", 13 | "qri": "md:0", 14 | "identifier": "foo", 15 | "iconImage": "foo", 16 | "length": 2503, 17 | "image": "foo", 18 | "keywords": [ 19 | "a", 20 | "b", 21 | "foo" 22 | ], 23 | "language": [ 24 | "english" 25 | ], 26 | "theme": [ 27 | "foo" 28 | ], 29 | "author": { 30 | "email": "foo" 31 | }, 32 | "data": "foo", 33 | "contributors": [ 34 | { 35 | "email": "foo" 36 | } 37 | ] 38 | }, 39 | "commit": { 40 | "qri": "cm:0", 41 | "timestamp": "2017-12-21T04:13:22.534Z", 42 | "message": "I'm a commit" 43 | }, 44 | "transform": { 45 | "qri": "tf:0", 46 | "syntax": "sql", 47 | "data": "select * from foo", 48 | "structure": { 49 | "qri": "st:0", 50 | "format": "csv", 51 | "formatConfig": { 52 | "headerRow": true 53 | }, 54 | "schema": { 55 | "type": "array", 56 | "items": { 57 | "type":"array", 58 | "items": [ 59 | { 60 | "title": "title", 61 | "type": "string" 62 | }, 63 | { 64 | "title": "duration", 65 | "type": "integer" 66 | } 67 | 68 | ] 69 | } 70 | } 71 | }, 72 | "resources": { 73 | "foo": {"path": "/not/a/real/path"} 74 | } 75 | }, 76 | "abstractTransform": { 77 | "qri": "tf:0", 78 | "data": "select * from a", 79 | "structure": { 80 | "qri": "st:0", 81 | "format": "csv", 82 | "formatConfig": { 83 | "headerRow": true 84 | }, 85 | "schema": { 86 | "type": "array", 87 | "items": { 88 | "type": "array", 89 | "items": [ 90 | { 91 | "title": "a", 92 | "type": "string" 93 | }, 94 | { 95 | "title": "b", 96 | "type": "integer" 97 | } 98 | ] 99 | } 100 | } 101 | }, 102 | "resources": { 103 | "a": 
"/fake/path/to/abstract/dataset/" 104 | } 105 | }, 106 | "abstract": { 107 | "qri": "ds:0", 108 | "structure": { 109 | "qri": "st:0", 110 | "format": "csv", 111 | "formatConfig": { 112 | "headerRow": true 113 | }, 114 | "schema": { 115 | "type": "array", 116 | "items": { 117 | "type": "array", 118 | "items": [ 119 | { 120 | "type": "string" 121 | }, 122 | { 123 | "type": "integer" 124 | } 125 | ] 126 | } 127 | } 128 | } 129 | }, 130 | "structure": { 131 | "qri": "st:0", 132 | "format": "csv", 133 | "formatConfig": { 134 | "headerRow": true 135 | }, 136 | "schema": { 137 | "type": "array", 138 | "items": { 139 | "type": "array", 140 | "items": [ 141 | { 142 | "title": "title", 143 | "type": "string" 144 | }, 145 | { 146 | "title": "duration", 147 | "type": "integer" 148 | } 149 | ] 150 | } 151 | } 152 | } 153 | } -------------------------------------------------------------------------------- /data_format.go: -------------------------------------------------------------------------------- 1 | package dataset 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | ) 7 | 8 | // ErrUnknownDataFormat is the expected error for 9 | // when a data format is missing or unknown 10 | var ErrUnknownDataFormat = fmt.Errorf("Unknown Data Format") 11 | 12 | // DataFormat represents different types of data formats. 
13 | // formats specified here have some degree of support within 14 | // the dataset packages 15 | type DataFormat int 16 | 17 | const ( 18 | // UnknownDataFormat is the default dataformat, meaning 19 | // that a data format should always be specified when 20 | // using the DataFormat type 21 | UnknownDataFormat DataFormat = iota 22 | // CSVDataFormat specifies comma separated value-formatted data 23 | CSVDataFormat 24 | // JSONDataFormat specifies Javascript Object Notation-formatted data 25 | JSONDataFormat 26 | // NDJSONDataFormat newline-delimited JSON files 27 | // https://github.com/ndjson/ndjson-spec 28 | NDJSONDataFormat 29 | // CBORDataFormat specifies RFC 7049 Concise Binary Object Representation 30 | // read more at cbor.io 31 | CBORDataFormat 32 | // XMLDataFormat specifies eXtensible Markup Language-formatted data 33 | // currently not supported. 34 | XMLDataFormat 35 | // XLSXDataFormat specifies microsoft excel formatted data 36 | XLSXDataFormat 37 | ) 38 | 39 | // SupportedDataFormats gives a slice of data formats that are 40 | // expected to work with this dataset package. As we work through 41 | // support for different formats, the last step of providing full 42 | // support to a format will be an addition to this slice 43 | func SupportedDataFormats() []DataFormat { 44 | return []DataFormat{ 45 | CBORDataFormat, 46 | JSONDataFormat, 47 | CSVDataFormat, 48 | XLSXDataFormat, 49 | NDJSONDataFormat, 50 | } 51 | } 52 | 53 | // String implements stringer interface for DataFormat 54 | func (f DataFormat) String() string { 55 | s, ok := map[DataFormat]string{ 56 | UnknownDataFormat: "", 57 | CSVDataFormat: "csv", 58 | JSONDataFormat: "json", 59 | XMLDataFormat: "xml", 60 | XLSXDataFormat: "xlsx", 61 | CBORDataFormat: "cbor", 62 | NDJSONDataFormat: "ndjson", 63 | }[f] 64 | 65 | if !ok { 66 | return "" 67 | } 68 | 69 | return s 70 | } 71 | 72 | // ParseDataFormatString takes a string representation of a data format 73 | // TODO (b5): trim "." 
prefix, remove prefixed map keys 74 | func ParseDataFormatString(s string) (df DataFormat, err error) { 75 | df, ok := map[string]DataFormat{ 76 | "": UnknownDataFormat, 77 | ".csv": CSVDataFormat, 78 | "csv": CSVDataFormat, 79 | ".json": JSONDataFormat, 80 | "json": JSONDataFormat, 81 | ".xml": XMLDataFormat, 82 | "xml": XMLDataFormat, 83 | ".xlsx": XLSXDataFormat, 84 | "xlsx": XLSXDataFormat, 85 | "cbor": CBORDataFormat, 86 | ".cbor": CBORDataFormat, 87 | ".ndjson": NDJSONDataFormat, 88 | "ndjson": NDJSONDataFormat, 89 | ".jsonl": NDJSONDataFormat, 90 | "jsonl": NDJSONDataFormat, 91 | }[s] 92 | if !ok { 93 | err = fmt.Errorf("invalid data format: `%s`", s) 94 | df = UnknownDataFormat 95 | } 96 | 97 | return 98 | } 99 | 100 | // MarshalJSON satisfies the json.Marshaler interface 101 | func (f DataFormat) MarshalJSON() ([]byte, error) { 102 | if f == UnknownDataFormat { 103 | return nil, ErrUnknownDataFormat 104 | } 105 | return []byte(fmt.Sprintf(`"%s"`, f.String())), nil 106 | } 107 | 108 | // UnmarshalJSON satisfies the json.Unmarshaler interface 109 | func (f *DataFormat) UnmarshalJSON(data []byte) error { 110 | var s string 111 | if err := json.Unmarshal(data, &s); err != nil { 112 | return fmt.Errorf("Data Format type should be a string, got %s", data) 113 | } 114 | 115 | df, err := ParseDataFormatString(s) 116 | if err != nil { 117 | return err 118 | } 119 | 120 | *f = df 121 | return nil 122 | } 123 | -------------------------------------------------------------------------------- /preview/testdata/earthquakes/input.dataset.json: -------------------------------------------------------------------------------- 1 | { 2 | "qri":"ds:0", 3 | "meta": { 4 | "description": "List of earthquakes recorded by the USGS from Jan 12th", 5 | "keywords": [ 6 | "earthquakes", 7 | "usgs", 8 | "geology" 9 | ], 10 | "path": "/ipfs/QmaTUEjSnonrVpDA47e9yBiK9yb1hQA6rAJkvrF2WotJnR", 11 | "qri": "md:0", 12 | "theme": [ 13 | "geology" 14 | ], 15 | "title": "USGS Earthquakes" 16 | }, 
17 | "structure": { 18 | "checksum": "QmWsTpdYkiKThJh4uB8dXqScMYFrFLbYxTr5XkV36XR6Ed", 19 | "depth": 2, 20 | "entries": 240, 21 | "errCount": 410, 22 | "format": "csv", 23 | "formatConfig": { 24 | "headerRow": true, 25 | "lazyQuotes": true 26 | }, 27 | "length": 44883, 28 | "path": "/ipfs/QmXXGgzEfyYBBGFdbMM1uKQ97kgqdjjoDdk8S1AaaqWrC2", 29 | "qri": "st:0", 30 | "schema": { 31 | "items": { 32 | "items": [ 33 | { 34 | "title": "time", 35 | "type": "string" 36 | }, 37 | { 38 | "title": "latitude", 39 | "type": "number" 40 | }, 41 | { 42 | "title": "longitude", 43 | "type": "number" 44 | }, 45 | { 46 | "title": "depth", 47 | "type": "number" 48 | }, 49 | { 50 | "title": "mag", 51 | "type": "number" 52 | }, 53 | { 54 | "title": "mag_type", 55 | "type": "string" 56 | }, 57 | { 58 | "title": "nst", 59 | "type": "integer" 60 | }, 61 | { 62 | "title": "gap", 63 | "type": "integer" 64 | }, 65 | { 66 | "title": "dmin", 67 | "type": "number" 68 | }, 69 | { 70 | "title": "rms", 71 | "type": "number" 72 | }, 73 | { 74 | "title": "net", 75 | "type": "string" 76 | }, 77 | { 78 | "title": "id", 79 | "type": "string" 80 | }, 81 | { 82 | "title": "updated", 83 | "type": "string" 84 | }, 85 | { 86 | "title": "place", 87 | "type": "string" 88 | }, 89 | { 90 | "title": "type", 91 | "type": "string" 92 | }, 93 | { 94 | "title": "horizontal_error", 95 | "type": "number" 96 | }, 97 | { 98 | "title": "depth_error", 99 | "type": "number" 100 | }, 101 | { 102 | "title": "mag_error", 103 | "type": "number" 104 | }, 105 | { 106 | "title": "mag_nst", 107 | "type": "integer" 108 | }, 109 | { 110 | "title": "status", 111 | "type": "string" 112 | }, 113 | { 114 | "title": "location_source", 115 | "type": "string" 116 | }, 117 | { 118 | "title": "mag_source", 119 | "type": "string" 120 | } 121 | ], 122 | "type": "array" 123 | }, 124 | "type": "array" 125 | } 126 | }, 127 | "commit": { 128 | "author": { 129 | "id": "QmeitTcgUPiw1PyUDdaCbMcNotx84yR8EwJVjTv7MRmELA" 130 | }, 131 | "message": "meta 
added\nreadme added", 132 | "path": "/ipfs/QmWH51TF5EcAjF4NyffgHF1hfd4VL8Xq8ctgK7rR2xdwdy", 133 | "qri": "cm:0", 134 | "signature": "MpfKssuFkcLpvkVMSzbJxsxDMXhLSnnfeEWs9usvduttdzSt8sAM0DD1UmEwRsDziE9oYe3GbGqu5eWqs9KYqmMbkMZU8cPrIQ4JYRUfPxQYDQh8cQhe65hTU30UM29+KR1DfhdWZzlNIu6NkIX4YHUMCNIJUk5HFU90BzYpB5agvp5ZxqDzHDVgDYflqalchkyl2jQ/OdWPQgj1BYoP7O5QGgq3ZhPnloq440y0QEengwix30nIdOiGFEH3lFNtFeHYktSAaOQY5/X8co3ttv6iI2XwyE9jvPWDtx1o8bNpgmlVFUP4oteb9iRDT7e6byRLQC7k3WHZ/JiXL7mAAw==", 135 | "timestamp": "2021-01-25T16:33:26.588501Z", 136 | "title": "Add meta \u0026 readme" 137 | }, 138 | "readme": { 139 | "qri":"rm:0" 140 | }, 141 | "stats": { 142 | "qri":"sa:0", 143 | "stats": {"todo": "add real stats"} 144 | } 145 | } -------------------------------------------------------------------------------- /vals/coding.go: -------------------------------------------------------------------------------- 1 | package vals 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | ) 7 | 8 | // ConvertDecoded converts an interface that has been decoded into standard go types to a Value 9 | func ConvertDecoded(d interface{}) (Value, error) { 10 | var err error 11 | if d == nil { 12 | return Null(true), nil 13 | } 14 | switch v := d.(type) { 15 | case uint8: 16 | return Integer(v), nil 17 | case uint16: 18 | return Integer(v), nil 19 | case uint32: 20 | return Integer(v), nil 21 | case uint64: 22 | return Integer(v), nil 23 | case float64: 24 | return Number(v), nil 25 | case int: 26 | return Integer(v), nil 27 | case int32: 28 | return Integer(int(v)), nil 29 | case int64: 30 | return Integer(int(v)), nil 31 | case string: 32 | return String(v), nil 33 | case bool: 34 | return Boolean(v), nil 35 | case []interface{}: 36 | arr := make(Array, len(v)) 37 | for i, val := range v { 38 | arr[i], err = ConvertDecoded(val) 39 | if err != nil { 40 | return arr, err 41 | } 42 | } 43 | return &arr, nil 44 | case map[string]interface{}: 45 | obj := make(Object, len(v)) 46 | for key, val := range v { 47 | 
obj[key], err = ConvertDecoded(val) 48 | if err != nil { 49 | return obj, err 50 | } 51 | } 52 | return &obj, nil 53 | case map[interface{}]interface{}: 54 | obj := make(Object, len(v)) 55 | for keyi, val := range v { 56 | key, ok := keyi.(string) 57 | if !ok { 58 | return nil, fmt.Errorf("only strings may be used as keys. got %#v", keyi) 59 | } 60 | obj[key], err = ConvertDecoded(val) 61 | if err != nil { 62 | return obj, err 63 | } 64 | } 65 | return &obj, nil 66 | default: 67 | return nil, fmt.Errorf("unrecognized decoded type: %#v", v) 68 | } 69 | } 70 | 71 | // UnmarshalJSON turns a slice of JSON bytes into a Value 72 | func UnmarshalJSON(data []byte) (v Value, err error) { 73 | switch ParseType(data) { 74 | case TypeObject: 75 | return unmarshalObject(data) 76 | case TypeArray: 77 | return unmarshalArray(data) 78 | case TypeString: 79 | s := String("") 80 | v = &s 81 | case TypeInteger: 82 | i := Integer(0) 83 | v = &i 84 | case TypeNumber: 85 | n := Number(0) 86 | v = &n 87 | case TypeBoolean: 88 | b := Boolean(false) 89 | v = &b 90 | case TypeNull: 91 | n := Null(true) 92 | v = &n 93 | } 94 | 95 | err = json.Unmarshal(data, v) 96 | return 97 | } 98 | 99 | type decodeObj map[string]json.RawMessage 100 | 101 | func unmarshalObject(data []byte) (Value, error) { 102 | do := decodeObj{} 103 | if err := json.Unmarshal(data, &do); err != nil { 104 | return nil, err 105 | } 106 | 107 | obj := make(Object, len(do)) 108 | for key, rm := range do { 109 | val, err := UnmarshalJSON([]byte(rm)) 110 | if err != nil { 111 | return nil, err 112 | } 113 | switch t := val.(type) { 114 | case *String: 115 | obj[key] = *t 116 | case *Number: 117 | obj[key] = *t 118 | case *Integer: 119 | obj[key] = *t 120 | case *Null: 121 | obj[key] = *t 122 | case Object: 123 | obj[key] = t 124 | case Array: 125 | obj[key] = t 126 | case *Boolean: 127 | obj[key] = *t 128 | } 129 | } 130 | 131 | return obj, nil 132 | } 133 | 134 | type decodeArray []json.RawMessage 135 | 136 | func 
unmarshalArray(data []byte) (Value, error) { 137 | da := decodeArray{} 138 | if err := json.Unmarshal(data, &da); err != nil { 139 | return nil, err 140 | } 141 | 142 | arr := make(Array, len(da)) 143 | for i, rm := range da { 144 | val, err := UnmarshalJSON([]byte(rm)) 145 | if err != nil { 146 | return nil, err 147 | } 148 | switch t := val.(type) { 149 | case *String: 150 | arr[i] = *t 151 | case *Number: 152 | arr[i] = *t 153 | case *Integer: 154 | arr[i] = *t 155 | case *Null: 156 | arr[i] = *t 157 | case Object: 158 | arr[i] = t 159 | case Array: 160 | arr[i] = t 161 | case *Boolean: 162 | arr[i] = *t 163 | } 164 | } 165 | 166 | return arr, nil 167 | } 168 | -------------------------------------------------------------------------------- /data_format_test.go: -------------------------------------------------------------------------------- 1 | package dataset 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | ) 7 | 8 | func TestSupportedDataFormats(t *testing.T) { 9 | expect := []DataFormat{ 10 | CBORDataFormat, 11 | JSONDataFormat, 12 | CSVDataFormat, 13 | XLSXDataFormat, 14 | NDJSONDataFormat, 15 | } 16 | 17 | for i, f := range SupportedDataFormats() { 18 | if expect[i] != f { 19 | t.Errorf("index %d mismatch. expected: %s got: %s", i, expect, f) 20 | } 21 | } 22 | } 23 | 24 | func TestDataFormatString(t *testing.T) { 25 | cases := []struct { 26 | f DataFormat 27 | expect string 28 | }{ 29 | {UnknownDataFormat, ""}, 30 | {CSVDataFormat, "csv"}, 31 | {JSONDataFormat, "json"}, 32 | {XMLDataFormat, "xml"}, 33 | {XLSXDataFormat, "xlsx"}, 34 | {CBORDataFormat, "cbor"}, 35 | {NDJSONDataFormat, "ndjson"}, 36 | } 37 | 38 | for i, c := range cases { 39 | if got := c.f.String(); got != c.expect { 40 | t.Errorf("case %d mismatch. 
expected: %q, got: %q", i, c.expect, got)
			continue
		}
	}
}

func TestParseDataFormatString(t *testing.T) {
	cases := []struct {
		in     string
		expect DataFormat
		err    string
	}{
		{"", UnknownDataFormat, ""},
		{".csv", CSVDataFormat, ""},
		{"csv", CSVDataFormat, ""},
		{".json", JSONDataFormat, ""},
		{"json", JSONDataFormat, ""},
		{".xml", XMLDataFormat, ""},
		{"xml", XMLDataFormat, ""},
		{".xlsx", XLSXDataFormat, ""},
		{"xlsx", XLSXDataFormat, ""},
		{"cbor", CBORDataFormat, ""},
		{".cbor", CBORDataFormat, ""},
		{".ndjson", NDJSONDataFormat, ""},
		{"ndjson", NDJSONDataFormat, ""},
		{".jsonl", NDJSONDataFormat, ""},
		{"jsonl", NDJSONDataFormat, ""},
	}

	for i, c := range cases {
		got, err := ParseDataFormatString(c.in)
		if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
			// fix: the message compared against c.expect (a DataFormat);
			// the expected value in an error mismatch is c.err
			t.Errorf("case %d error mismatch '%s' != '%s'", i, c.err, err)
			continue
		}
		if got != c.expect {
			t.Errorf("case %d response mismatch. expected: %s got: %s", i, c.expect, got)
			continue
		}
	}
}

func TestDataFormatMarshalJSON(t *testing.T) {
	cases := []struct {
		format DataFormat
		expect []byte
		err    string
	}{
		{CSVDataFormat, []byte(`"csv"`), ""},
		{JSONDataFormat, []byte(`"json"`), ""},
		{XMLDataFormat, []byte(`"xml"`), ""},
		{XLSXDataFormat, []byte(`"xlsx"`), ""},
		{CBORDataFormat, []byte(`"cbor"`), ""},
		{NDJSONDataFormat, []byte(`"ndjson"`), ""},
	}
	for i, c := range cases {
		got, err := c.format.MarshalJSON()
		if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
			t.Errorf("case %d error mismatch. expected: %s, got: %s", i, c.err, err)
			continue
		}
		if !bytes.Equal(got, c.expect) {
			t.Errorf(`case %d response mismatch.
expected: %s, got: %s`, i, string(c.expect), string(got)) 103 | continue 104 | } 105 | } 106 | } 107 | 108 | func TestDataFormatUnmarshalJSON(t *testing.T) { 109 | cases := []struct { 110 | data []byte 111 | expect DataFormat 112 | err string 113 | }{ 114 | {[]byte(`"csv"`), CSVDataFormat, ""}, 115 | {[]byte(`"json"`), JSONDataFormat, ""}, 116 | {[]byte(`"xml"`), XMLDataFormat, ""}, 117 | {[]byte(`"xlsx"`), XLSXDataFormat, ""}, 118 | {[]byte(`"cbor"`), CBORDataFormat, ""}, 119 | {[]byte(`"ndjson"`), NDJSONDataFormat, ""}, 120 | } 121 | 122 | for i, c := range cases { 123 | a := DataFormat(0) 124 | got := &a 125 | err := got.UnmarshalJSON(c.data) 126 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) { 127 | t.Errorf("case %d error mismatch. expected: %s, got: %s", i, c.err, err) 128 | continue 129 | } 130 | if *got != c.expect { 131 | t.Errorf(`case %d response mismatch. expected: %s, got: %s`, i, c.expect, *got) 132 | continue 133 | } 134 | 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /validate/dataset.go: -------------------------------------------------------------------------------- 1 | package validate 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/qri-io/dataset" 7 | "github.com/qri-io/jsonschema" 8 | ) 9 | 10 | // Dataset checks that a dataset is valid for use 11 | // returning the first error encountered, nil if valid 12 | func Dataset(ds *dataset.Dataset) error { 13 | if ds == nil { 14 | return nil 15 | } 16 | 17 | if ds.Commit == nil { 18 | err := fmt.Errorf("commit is required") 19 | log.Debug(err.Error()) 20 | return err 21 | } else if err := Commit(ds.Commit); err != nil { 22 | err := fmt.Errorf("commit: %s", err.Error()) 23 | log.Debug(err.Error()) 24 | return err 25 | } 26 | if ds.Structure != nil { 27 | if err := Structure(ds.Structure); err != nil { 28 | return fmt.Errorf("structure: %s", err.Error()) 29 | } 30 | } 31 | 32 | return nil 33 | } 34 | 35 | // Commit checks 
that a dataset Commit is valid for use 36 | // returning the first error encountered, nil if valid 37 | func Commit(cm *dataset.Commit) error { 38 | if cm == nil { 39 | return nil 40 | } 41 | 42 | if cm.Title == "" { 43 | // return fmt.Errorf("title is required") 44 | 45 | } else if len(cm.Title) > 100 { 46 | return fmt.Errorf("title is too long. %d length exceeds 100 character limit", len(cm.Title)) 47 | } 48 | 49 | return nil 50 | } 51 | 52 | // Structure checks that a dataset structure is valid for use 53 | // returning the first error encountered, nil if valid 54 | func Structure(s *dataset.Structure) error { 55 | if s == nil { 56 | return nil 57 | } 58 | 59 | df := s.DataFormat() 60 | if df == dataset.UnknownDataFormat { 61 | return fmt.Errorf("format is required") 62 | } else if df == dataset.CSVDataFormat { 63 | if s.Schema == nil { 64 | return fmt.Errorf("csv data format requires a schema") 65 | } 66 | } 67 | 68 | if err := Schema(s.Schema); err != nil { 69 | return fmt.Errorf("schema: %s", err.Error()) 70 | } 71 | 72 | return nil 73 | } 74 | 75 | // csvMetaSchema is a jsonschema for validating CSV schema definitions 76 | var csvMetaSchema = jsonschema.Must(`{ 77 | "type": "object", 78 | "properties": { 79 | "type": { 80 | "const": "array" 81 | }, 82 | "items": { 83 | "type": "object", 84 | "properties": { 85 | "type": { 86 | "const": "array" 87 | }, 88 | "items": { 89 | "type": "array", 90 | "items": { 91 | "type": "object", 92 | "minItems": 1, 93 | "properties": { 94 | "title": { 95 | "type": "string" 96 | }, 97 | "type": true 98 | } 99 | } 100 | } 101 | } 102 | } 103 | } 104 | }`) 105 | 106 | // jsonMetaSchema is a jsonschema for validating JSON schema definitions 107 | // var jsonMetaSchema = jsonschema.Must(``) 108 | 109 | // Schema checks that a dataset schema is valid for use 110 | // returning the first error encountered, nil if valid 111 | func Schema(sch map[string]interface{}) error { 112 | if sch == nil { 113 | return fmt.Errorf("schema is 
required") 114 | } 115 | 116 | // TODO (b5): Um, like, finish this 117 | 118 | // if len(s.Fields) == 0 { 119 | // return fmt.Errorf("fields are required") 120 | // } else if err := Fields(s.Fields); err != nil { 121 | // return fmt.Errorf("fields: %s", err.Error()) 122 | // } 123 | 124 | return nil 125 | } 126 | 127 | // Fields checks that a slice of dataset fields is valid for use 128 | // returning the first error encountered, nil if valid 129 | // func Fields(fields []*dataset.Field) error { 130 | // if fields == nil { 131 | // return nil 132 | // } 133 | 134 | // checkedFieldNames := map[string]bool{} 135 | // for _, field := range fields { 136 | // if err := ValidName(field.Name); err != nil { 137 | // return err 138 | // } 139 | // seen := checkedFieldNames[field.Name] 140 | // if seen { 141 | // return fmt.Errorf("error: cannot use the same name, '%s' more than once", field.Name) 142 | // } 143 | // checkedFieldNames[field.Name] = true 144 | // } 145 | // return nil 146 | // } 147 | -------------------------------------------------------------------------------- /dstest/dstest_test.go: -------------------------------------------------------------------------------- 1 | package dstest 2 | 3 | import ( 4 | "bytes" 5 | "io/ioutil" 6 | "os" 7 | "testing" 8 | 9 | "github.com/qri-io/dataset" 10 | ) 11 | 12 | func TestDatasetChecksum(t *testing.T) { 13 | expect := "085e607818aae2920e0e4b57c321c3b58e17b85d" 14 | sum := DatasetChecksum(&dataset.Dataset{}) 15 | if sum != expect { 16 | t.Errorf("empty pod hash mismatch. 
expected: %s, got: %s", expect, sum)
	}
}

func TestLoadTestCases(t *testing.T) {
	tcs, err := LoadTestCases("testdata")
	if err != nil {
		t.Error(err)
	}
	if len(tcs) == 0 {
		t.Errorf("expected at least one test case to load")
	}
}

func TestBodyFilepath(t *testing.T) {
	fp, err := BodyFilepath("testdata/complete")
	if err != nil {
		t.Error(err.Error())
		return
	}
	if fp != "testdata/complete/body.csv" {
		t.Errorf("%s != %s", "testdata/complete/body.csv", fp)
	}
}

func TestReadInputTransformScript(t *testing.T) {
	if _, _, err := ReadInputTransformScript("bad_dir"); err != os.ErrNotExist {
		t.Error("expected os.ErrNotExist on bad tf script read")
	}
}

func TestReadInputReadmeScript(t *testing.T) {
	if _, _, err := ReadInputReadmeScript("bad_dir"); err != os.ErrNotExist {
		t.Error("expected os.ErrNotExist on bad tf script read")
	}
	_, _, err := ReadInputReadmeScript("testdata/complete")
	if err != nil {
		// fix: t.Fatal does not interpret format verbs (go vet flags the
		// old `t.Fatal("...%w", err)`); use Fatalf with %v
		t.Fatalf("could not open 'readme.md' file: %v", err)
	}
}

func TestNewTestCaseFromDir(t *testing.T) {
	var err error
	if _, err = NewTestCaseFromDir("testdata"); err == nil {
		t.Errorf("expected error")
		return
	}

	tc, err := NewTestCaseFromDir("testdata/complete")
	if err != nil {
		t.Errorf("error reading test dir: %s", err.Error())
		return
	}

	name := "complete"
	if tc.Name != name {
		t.Errorf("expected name to equal: %s. got: %s", name, tc.Name)
	}

	fn := "body.csv"
	if tc.BodyFilename != fn {
		t.Errorf("expected BodyFilename to equal: %s.
got: %s", fn, tc.BodyFilename) 78 | } 79 | 80 | data := []byte(`city,pop,avg_age,in_usa 81 | toronto,40000000,55.5,false 82 | new york,8500000,44.4,true 83 | chicago,300000,44.4,true 84 | chatham,35000,65.25,true 85 | raleigh,250000,50.65,true 86 | `) 87 | if !bytes.Equal(tc.Body, data) { 88 | t.Errorf("data mismatch") 89 | } 90 | 91 | mf := tc.BodyFile() 92 | if mf.FileName() != tc.BodyFilename { 93 | t.Errorf("filename mismatch: %s != %s", mf.FileName(), tc.BodyFilename) 94 | } 95 | 96 | if ts, ok := tc.TransformScriptFile(); !ok { 97 | t.Errorf("expected tranform script to load") 98 | } else { 99 | if ts.FileName() != "transform.star" { 100 | t.Errorf("expected TransformScript filename to be transform.star") 101 | } 102 | } 103 | tc.TransformScript = nil 104 | if _, ok := tc.TransformScriptFile(); ok { 105 | t.Error("shouldn't generate TransformScript File if bytes are nil") 106 | } 107 | 108 | if vz, ok := tc.VizScriptFile(); !ok { 109 | t.Errorf("expected viz script to load") 110 | } else { 111 | if vz.FileName() != "template.html" { 112 | t.Errorf("expected VizScript filename to be template.html") 113 | } 114 | } 115 | tc.VizScript = nil 116 | if _, ok := tc.VizScriptFile(); ok { 117 | t.Error("shouldn't generate VizScript File if bytes are nil") 118 | } 119 | 120 | if rm, ok := tc.ReadmeScriptFile(); !ok { 121 | t.Errorf("expected readme script to load") 122 | } else { 123 | if rm.FileName() != "readme.md" { 124 | t.Errorf("expected ReadmeScript filename to be template.html") 125 | } 126 | } 127 | tc.ReadmeScript = nil 128 | if _, ok := tc.ReadmeScriptFile(); ok { 129 | t.Error("shouldn't generate ReadmeScript File if bytes are nil") 130 | } 131 | 132 | mfdata, err := ioutil.ReadAll(mf) 133 | if err != nil { 134 | t.Errorf("error reading file: %s", err.Error()) 135 | } 136 | 137 | if !bytes.Equal(mfdata, data) { 138 | t.Errorf("memfile data mismatch") 139 | } 140 | 141 | rendered, err := tc.RenderedFile() 142 | if err != nil { 143 | t.Errorf("reading %s: 
%s", RenderedFilename, err)
144 | 	}
145 | 	if rendered == nil {
146 | 		t.Error("expected rendered to not equal nil")
147 | 	}
148 | }
149 | 
--------------------------------------------------------------------------------
/detect/determineFields_test.go:
--------------------------------------------------------------------------------
1 | package detect
2 | 
3 | import (
4 | 	"bytes"
5 | 	"testing"
6 | 
7 | 	"github.com/google/go-cmp/cmp"
8 | 	"github.com/qri-io/dataset"
9 | )
10 | 
11 | var egCorruptCsvData = []byte(`
12 | """fhkajslfnakjlcdnajcl ashklj asdhcjklads ch,,,\dagfd
13 | `)
14 | 
15 | var egNaicsCsvData = []byte(`
16 | STATE,FIRM,PAYR_N,PAYRFL_N,STATEDSCR,NAICSDSCR,entrsizedscr
17 | 00,--,74883.53,5621697325,United States,Total,01: Total
18 | 00,--,35806.37,241347624,United States,Total,02: 0-4`)
19 | 
20 | var egNoHeaderData1 = []byte(`
21 | example,false,other,stuff
22 | ex,true,text,col
23 | `)
24 | 
25 | var egNoHeaderData2 = []byte(`
26 | this,example,has,a,number,column,1
27 | this,example,has,a,number,column,2
28 | this,example,has,a,number,column,3`)
29 | 
30 | var egNoHeaderData3 = []byte(`
31 | one, 1, three
32 | one, 2, three`)
33 | 
34 | var egNoHeaderData4 = []byte(`one,two,3
35 | four,five,6`)
36 | 
37 | var egNonDeterministicHeader = []byte(`
38 | not,possible,to,tell,if,this,csv,data,has,a,header
39 | not,possible,to,tell,if,this,csv,data,has,a,header
40 | not,possible,to,tell,if,this,csv,data,has,a,header
41 | not,possible,to,tell,if,this,csv,data,has,a,header
42 | `)
43 | 
44 | func TestDetermineCSVSchema(t *testing.T) {
45 | 
46 | 	runTestCase(t, "noHeaderData1", egNoHeaderData1,
47 | 		map[string]interface{}{
48 | 			"items": map[string]interface{}{
49 | 				"items": []interface{}{
50 | 					map[string]interface{}{
51 | 						"title": "field_1",
52 | 						"type":  "string",
53 | 					},
54 | 					map[string]interface{}{
55 | 						"title": "field_2",
56 | 						"type":  "boolean",
57 | 					}, map[string]interface{}{
58 | 						"title": "field_3",
59 | 						"type":  "string",
60 | 					}, map[string]interface{}{
61 | 						"title": "field_4",
62 | 						"type":  "string",
63 | 					},
64 | 				},
65 | 				"type": "array",
66 | 			},
67 | 			"type": "array",
68 | 		})
69 | 
70 | 	runTestCase(t, "noHeaderData2", egNoHeaderData2,
71 | 		map[string]interface{}{
72 | 			"items": map[string]interface{}{
73 | 				"items": []interface{}{
74 | 					map[string]interface{}{
75 | 						"title": "field_1",
76 | 						"type":  "string",
77 | 					},
78 | 					map[string]interface{}{
79 | 						"title": "field_2",
80 | 						"type":  "string",
81 | 					}, map[string]interface{}{
82 | 						"title": "field_3",
83 | 						"type":  "string",
84 | 					}, map[string]interface{}{
85 | 						"title": "field_4",
86 | 						"type":  "string",
87 | 					}, map[string]interface{}{
88 | 						"title": "field_5",
89 | 						"type":  "string",
90 | 					}, map[string]interface{}{
91 | 						"title": "field_6",
92 | 						"type":  "string",
93 | 					}, map[string]interface{}{
94 | 						"title": "field_7",
95 | 						"type":  "integer",
96 | 					},
97 | 				},
98 | 				"type": "array",
99 | 			},
100 | 			"type": "array",
101 | 		})
102 | 
103 | 	runTestCase(t, "noHeaderData3", egNoHeaderData3,
104 | 		map[string]interface{}{
105 | 			"items": map[string]interface{}{
106 | 				"items": []interface{}{
107 | 					map[string]interface{}{
108 | 						"title": "field_1",
109 | 						"type":  "string",
110 | 					},
111 | 					map[string]interface{}{
112 | 						"title": "field_2",
113 | 						"type":  "integer",
114 | 					}, map[string]interface{}{
115 | 						"title": "field_3",
116 | 						"type":  "string",
117 | 					},
118 | 				},
119 | 				"type": "array",
120 | 			},
121 | 			"type": "array",
122 | 		})
123 | 
124 | 	runTestCase(t, "noHeaderData4", egNoHeaderData4,
125 | 		map[string]interface{}{
126 | 			"items": map[string]interface{}{
127 | 				"items": []interface{}{
128 | 					map[string]interface{}{
129 | 						"title": "field_1",
130 | 						"type":  "string",
131 | 					},
132 | 					map[string]interface{}{
133 | 						"title": "field_2",
134 | 						"type":  "string",
135 | 					}, map[string]interface{}{
136 | 						"title": "field_3",
137 | 						"type":  "integer",
138 | 					},
139 | 				},
140 | 				"type": "array",
141 | 			},
142 | 			"type": "array",
143 | 		})
144 | }
145 | 
146 | // runTestCase detects a CSV schema for input and fails the test if it differs from expect.
147 | func runTestCase(t *testing.T, description string, input []byte, expect map[string]interface{}) {
148 | 	st := dataset.Structure{Format: "csv"}
149 | 	reader := bytes.NewReader(input)
150 | 	schema, _, err := CSVSchema(&st, reader)
151 | 	if err != nil {
152 | 		t.Fatal(err)
153 | 	}
154 | 	if diff := cmp.Diff(expect, schema); diff != "" {
155 | 		t.Errorf("mismatch for \"%s\" (-want +got):\n%s\n", description, diff)
156 | 	}
157 | }
158 | 
--------------------------------------------------------------------------------
/dsio/xlsx_test.go:
--------------------------------------------------------------------------------
1 | package dsio
2 | 
3 | import (
4 | 	"bytes"
5 | 	"os"
6 | 	"testing"
7 | 
8 | 	"github.com/qri-io/dataset"
9 | 	"github.com/qri-io/dataset/dstest"
10 | )
11 | 
12 | var xlsxStruct = &dataset.Structure{
13 | 	Format: "xlsx",
14 | 	FormatConfig: map[string]interface{}{
15 | 		"sheetName": "Sheet1",
16 | 	},
17 | 	Schema: map[string]interface{}{
18 | 		"type": "array",
19 | 		"items": map[string]interface{}{
20 | 			"type": "array",
21 | 			"items": []interface{}{
22 | 				map[string]interface{}{"title": "col_a", "type": "string"},
23 | 				map[string]interface{}{"title": "col_b", "type": "number"},
24 | 				map[string]interface{}{"title": "col_c", "type": "integer"},
25 | 				map[string]interface{}{"title": "col_d", "type": "boolean"},
26 | 				map[string]interface{}{"title": "col_e", "type": "object"},
27 | 				map[string]interface{}{"title": "col_f", "type": "array"},
28 | 				map[string]interface{}{"title": "col_g", "type": "null"},
29 | 			},
30 | 		},
31 | 	},
32 | }
33 | 
34 | func TestXLSXReader(t *testing.T) {
35 | 	f, err := os.Open("testdata/xlsx/simple/body.xlsx")
36 | 	if err != nil {
37 | 		t.Fatal(err.Error())
38 | 	}
39 | 
40 | 	rdr, err := NewEntryReader(xlsxStruct, f)
41 | 	if err != nil {
42 | 		t.Errorf("error allocating EntryReader: %s", err.Error())
43 | 		return
44 | 	}
45 | 	count := 0
46 | 	for {
47 | 		ent, err := rdr.ReadEntry()
48 | 		if err != nil {
49 | 			if err.Error() == "EOF" { // end-of-stream is signaled by message, not io.EOF identity
50 | 				break
51 | 			}
52 | 			t.Errorf("unexpected error: %s", err.Error())
53 | 			return
54 | 		}
55 | 
56 | 		if arr, ok := ent.Value.([]interface{}); ok {
57 | 			if len(arr) != 2 { // NOTE(review): fixture rows are 2 cells wide despite the 7-col schema above — confirm
58 | 				t.Errorf("invalid row length for row %d. expected %d, got %d", count, 2, len(arr))
59 | 				continue
60 | 			}
61 | 		} else {
62 | 			t.Errorf("expected value to be []interface{}. got: %#v", ent.Value)
63 | 			continue
64 | 		}
65 | 
66 | 		count++
67 | 	}
68 | 	if count != 4 {
69 | 		t.Errorf("expected: %d rows, got: %d", 4, count)
70 | 	}
71 | }
72 | 
73 | func TestColIndexToLetters(t *testing.T) {
74 | 	cases := []struct {
75 | 		in     int
76 | 		expect string
77 | 	}{
78 | 		{0, "A"},
79 | 		{25, "Z"},
80 | 		{26, "AA"},
81 | 	}
82 | 	for i, c := range cases {
83 | 		got := ColIndexToLetters(c.in)
84 | 		if got != c.expect {
85 | 			t.Errorf("case %d expected: %s, got: %s", i, c.expect, got)
86 | 		}
87 | 	}
88 | }
89 | 
90 | func TestXLSXWriter(t *testing.T) {
91 | 	rows := []Entry{
92 | 		// TODO - vary up test input
93 | 		{Value: []interface{}{"a", float64(12), 23, nil}},
94 | 		{Value: []interface{}{"a", float64(12), 23, []interface{}{"foo", "bar"}}},
95 | 		{Value: []interface{}{"a", float64(12), 23, map[string]interface{}{"foo": "bar"}}},
96 | 		{Value: []interface{}{"a", float64(12), int64(23), false}},
97 | 		{Value: []interface{}{"a", float64(12), 23, false}},
98 | 	}
99 | 
100 | 	buf := &bytes.Buffer{}
101 | 	rw, err := NewEntryWriter(xlsxStruct, buf)
102 | 	if err != nil {
103 | 		t.Errorf("error allocating EntryWriter: %s", err.Error())
104 | 		return
105 | 	}
106 | 	st := rw.Structure()
107 | 	if diff := dstest.CompareStructures(st, xlsxStruct); diff != "" {
108 | 		t.Errorf("structure mismatch: %s", diff)
109 | 		return
110 | 	}
111 | 
112 | 	for i, row := range rows {
113 | 		if err := rw.WriteEntry(row); err != nil {
114 | 			t.Errorf("row %d write error: %s", i, err.Error())
115 | 		}
116 | 	}
117 | 
118 | 	if err := rw.Close(); err != nil {
119 | 		t.Errorf("close writer error: %s", err.Error())
120 | 		return
121 | 	}
122 | }
123 | 
124 | func TestXLSXCompression(t *testing.T) {
125 | 	if _, err := NewXLSXReader(&dataset.Structure{Format: "xlsx", Compression: "gzip"}, nil); err == nil {
126 | 		t.Error("expected xlsx to fail when using compression")
127 | 	}
128 | 	if _, err := NewXLSXWriter(&dataset.Structure{Format: "xlsx", Compression: "gzip"}, nil); err == nil {
129 | 		t.Error("expected xlsx to fail when using compression")
130 | 	}
131 | }
132 | 
133 | /*
134 | TODO(dustmop): Disabled, testdata/movies/data.xlsx doesn't exist
135 | 
136 | func BenchmarkXLSXReader(b *testing.B) {
137 | 	st := &dataset.Structure{Format: "xlsx", Schema: tabular.BaseTabularSchema}
138 | 
139 | 	for n := 0; n < b.N; n++ {
140 | 		file, err := os.Open("testdata/movies/data.xlsx")
141 | 		if err != nil {
142 | 			b.Errorf("unexpected error: %s", err.Error())
143 | 		}
144 | 		r, err := NewXLSXReader(st, file)
145 | 		if err != nil {
146 | 			b.Fatalf("unexpected error: %s", err.Error())
147 | 		}
148 | 		for {
149 | 			_, err = r.ReadEntry()
150 | 			if err != nil {
151 | 				break
152 | 			}
153 | 		}
154 | 	}
155 | }
156 | */
157 | 
--------------------------------------------------------------------------------