├── .gitignore
├── detect
├── testdata
│ ├── invalid.cbor
│ ├── array.json
│ ├── object.json
│ ├── cbor_array.cbor
│ ├── cbor_object.cbor
│ ├── cbor_array.structure.json
│ ├── cbor_object.structure.json
│ ├── sitemap_array.structure.json
│ ├── sitemap_object.structure.json
│ ├── hours.csv
│ ├── hours-with-header.csv
│ ├── spelling.structure.json
│ ├── hours.structure.json
│ ├── hours-with-header.structure.json
│ ├── police.structure.json
│ ├── spelling.csv
│ ├── daily_wind_2011.structure.json
│ └── daily_wind_2011.csv
├── xlsx.go
├── cbor.go
├── json_test.go
├── json.go
└── determineFields_test.go
├── testdata
├── vizs
│ ├── invalidJSON.json
│ ├── visconfig2.json
│ ├── visconfig3.json
│ └── visconfig1.json
├── readmes
│ ├── invalidJSON.json
│ ├── readmeconfig2.json
│ ├── readmeconfig3.json
│ └── readmeconfig1.json
├── zip
│ └── exported.zip
├── metadata
│ ├── hours.json
│ ├── airport-codes.json
│ └── continent-codes.json
├── datasets
│ ├── hours.json
│ ├── continent-codes.json
│ ├── airport-codes.json
│ └── complete.json
└── structures
│ ├── continent-codes.json
│ ├── hours.json
│ └── airport-codes.json
├── dsdiff
├── exp0.txt
├── got0.txt
├── Makefile
├── README.md
├── testdata
│ ├── structureJsonSchemaOrig.json
│ ├── structureJsonSchemaNew.json
│ ├── newStructure.json
│ ├── orig.json
│ ├── newData.json
│ ├── newDescription.json
│ ├── newTitle.json
│ ├── newVisConfig.json
│ └── newTransform.json
└── LICENSE
├── dstest
├── testdata
│ └── complete
│ │ ├── transform.star
│ │ ├── template.html
│ │ ├── body.csv
│ │ ├── rendered.html
│ │ ├── readme.md
│ │ ├── expect.dataset.json
│ │ └── input.dataset.json
├── template.go
├── golden.go
├── priv_key.go
├── compare.go
└── dstest_test.go
├── stepfile
├── testdata
│ ├── steps.txt
│ └── steps.json
├── stepfile.go
└── stepfile_test.go
├── dsio
├── testdata
│ ├── movies
│ │ └── body.cbor
│ ├── cbor
│ │ ├── array
│ │ │ └── body.cbor
│ │ ├── city
│ │ │ └── body.cbor
│ │ ├── movies
│ │ │ └── body.cbor
│ │ ├── object
│ │ │ └── body.cbor
│ │ ├── sitemap
│ │ │ └── body.cbor
│ │ ├── craigslist
│ │ │ └── body.cbor
│ │ ├── links_array
│ │ │ └── body.cbor
│ │ ├── links_object
│ │ │ └── body.cbor
│ │ ├── sitemap_object
│ │ │ └── body.cbor
│ │ └── flourinated_compounds_in_fast_food_packaging
│ │ │ ├── body.cbor
│ │ │ └── input.dataset.json
│ ├── xlsx
│ │ ├── simple
│ │ │ └── body.xlsx
│ │ └── obj_cell
│ │ │ └── body.xlsx
│ ├── json
│ │ ├── city
│ │ │ ├── input.dataset.json
│ │ │ └── body.json
│ │ ├── array
│ │ │ └── body.json
│ │ ├── object
│ │ │ └── body.json
│ │ ├── links_array
│ │ │ └── body.json
│ │ └── links_object
│ │ │ └── body.json
│ └── csv
│ │ ├── cities_unique
│ │ └── cities_unique.csv
│ │ ├── cities
│ │ ├── data.csv
│ │ └── input.dataset.json
│ │ ├── movies
│ │ ├── input.dataset.json
│ │ └── body.csv
│ │ ├── movies_sorted_duration_desc
│ │ └── body.csv
│ │ ├── movies_sorted_movie_title
│ │ └── body.csv
│ │ ├── movies_sorted_movie_title_desc
│ │ └── body.csv
│ │ └── movies_sorted_duration_movie_title
│ │ └── body.csv
├── replacecr
│ ├── replace_cr_test.go
│ └── replace_cr.go
├── tracked_reader_test.go
├── fuzz.go
├── tracked_reader.go
├── entry.go
├── entry_test.go
├── streams.go
├── entry_buffer_test.go
├── entry_buffer.go
├── README.md
├── identity.go
├── ndjson_test.go
├── ndjson.go
└── xlsx_test.go
├── dsviz
├── testdata
│ ├── custom
│ │ ├── body.json
│ │ ├── template.html
│ │ ├── rendered.html
│ │ └── input.dataset.json
│ └── default
│ │ ├── body.json
│ │ ├── input.dataset.json
│ │ ├── rendered.html
│ │ └── template.html
└── doc.go
├── validate
├── testdata
│ ├── city
│ │ ├── input.dataset.json
│ │ └── body.json
│ ├── flourinated_compounds_in_fast_food_packaging
│ │ ├── body.cbor
│ │ └── input.dataset.json
│ └── movies
│ │ ├── input.dataset.json
│ │ └── body.csv
├── csv_test.go
├── validate_test.go
├── validate.go
├── csv.go
├── data_test.go
├── testdata_test.go
├── data.go
└── dataset.go
├── codecov.yml
├── Makefile
├── vals
├── object_value.go
├── object_value_test.go
├── compare.go
├── compare_test.go
├── coding_test.go
└── coding.go
├── hash_test.go
├── .codeclimate.yml
├── LICENSE
├── go.mod
├── hash.go
├── .circleci
└── config.yml
├── generate
├── tabular_test.go
├── value.go
├── dsgen
│ └── main.go
└── tabular.go
├── kind_test.go
├── preview
├── preview_test.go
├── testdata
│ └── earthquakes
│ │ ├── readme.md
│ │ └── input.dataset.json
└── preview.go
├── readme.md
├── stats.go
├── kind.go
├── compression
├── compression_test.go
└── compression.go
├── data_format.go
└── data_format_test.go
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | coverage.txt
--------------------------------------------------------------------------------
/detect/testdata/invalid.cbor:
--------------------------------------------------------------------------------
1 | 9af4f5f6fb41
--------------------------------------------------------------------------------
/testdata/vizs/invalidJSON.json:
--------------------------------------------------------------------------------
1 | Invalid Json
--------------------------------------------------------------------------------
/detect/testdata/array.json:
--------------------------------------------------------------------------------
1 | [
2 | "foo"
3 | ]
--------------------------------------------------------------------------------
/testdata/readmes/invalidJSON.json:
--------------------------------------------------------------------------------
1 | Invalid Json
--------------------------------------------------------------------------------
/detect/testdata/object.json:
--------------------------------------------------------------------------------
1 | {
2 | "foo" : "bar"
3 | }
--------------------------------------------------------------------------------
/dsdiff/exp0.txt:
--------------------------------------------------------------------------------
1 | VisConfig: 1 change
2 | - modified format
--------------------------------------------------------------------------------
/dstest/testdata/complete/transform.star:
--------------------------------------------------------------------------------
1 | commit([1,2,3,4,5])
--------------------------------------------------------------------------------
/stepfile/testdata/steps.txt:
--------------------------------------------------------------------------------
1 | I am a step
2 | ---
3 | I am another step
4 | ---
5 | I am a third step
--------------------------------------------------------------------------------
/dsdiff/got0.txt:
--------------------------------------------------------------------------------
1 | {
2 | - "format": "abc",
3 | + "format": "new thing",
4 | "qri": ""
5 | }
6 |
--------------------------------------------------------------------------------
/testdata/zip/exported.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qri-io/dataset/HEAD/testdata/zip/exported.zip
--------------------------------------------------------------------------------
/testdata/vizs/visconfig2.json:
--------------------------------------------------------------------------------
1 | {
2 | "format": "bar",
3 | "qri": "vz:0",
4 | "scriptPath": "two"
5 | }
--------------------------------------------------------------------------------
/testdata/vizs/visconfig3.json:
--------------------------------------------------------------------------------
1 | {
2 | "format": "bar",
3 | "qri": "vz:0",
4 | "scriptPath": "three"
5 | }
--------------------------------------------------------------------------------
/detect/testdata/cbor_array.cbor:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qri-io/dataset/HEAD/detect/testdata/cbor_array.cbor
--------------------------------------------------------------------------------
/dsio/testdata/movies/body.cbor:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/movies/body.cbor
--------------------------------------------------------------------------------
/testdata/readmes/readmeconfig2.json:
--------------------------------------------------------------------------------
1 | {
2 | "format": "bar",
3 | "qri": "rm:0",
4 | "scriptPath": "two"
5 | }
--------------------------------------------------------------------------------
/detect/testdata/cbor_object.cbor:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qri-io/dataset/HEAD/detect/testdata/cbor_object.cbor
--------------------------------------------------------------------------------
/dsio/testdata/cbor/array/body.cbor:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/array/body.cbor
--------------------------------------------------------------------------------
/dsio/testdata/cbor/city/body.cbor:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/city/body.cbor
--------------------------------------------------------------------------------
/dsviz/testdata/custom/body.json:
--------------------------------------------------------------------------------
1 | [
2 | [2017, 7500000000],
3 | [2016, 7444000000],
4 | [2015, 7358000000]
5 | ]
--------------------------------------------------------------------------------
/dsviz/testdata/default/body.json:
--------------------------------------------------------------------------------
1 | [
2 | [2017, 7500000000],
3 | [2016, 7444000000],
4 | [2015, 7358000000]
5 | ]
--------------------------------------------------------------------------------
/testdata/readmes/readmeconfig3.json:
--------------------------------------------------------------------------------
1 | {
2 | "format": "bar",
3 | "qri": "rm:0",
4 | "scriptPath": "three"
5 | }
--------------------------------------------------------------------------------
/detect/testdata/cbor_array.structure.json:
--------------------------------------------------------------------------------
1 | {
2 | "format": "cbor",
3 | "schema": {
4 | "type": "array"
5 | }
6 | }
--------------------------------------------------------------------------------
/dsio/testdata/cbor/movies/body.cbor:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/movies/body.cbor
--------------------------------------------------------------------------------
/dsio/testdata/cbor/object/body.cbor:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/object/body.cbor
--------------------------------------------------------------------------------
/dsio/testdata/cbor/sitemap/body.cbor:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/sitemap/body.cbor
--------------------------------------------------------------------------------
/dsio/testdata/xlsx/simple/body.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/xlsx/simple/body.xlsx
--------------------------------------------------------------------------------
/detect/testdata/cbor_object.structure.json:
--------------------------------------------------------------------------------
1 | {
2 | "format": "cbor",
3 | "schema": {
4 | "type": "object"
5 | }
6 | }
--------------------------------------------------------------------------------
/detect/testdata/sitemap_array.structure.json:
--------------------------------------------------------------------------------
1 | {
2 | "format": "json",
3 | "schema": {
4 | "type": "array"
5 | }
6 | }
--------------------------------------------------------------------------------
/detect/testdata/sitemap_object.structure.json:
--------------------------------------------------------------------------------
1 | {
2 | "format": "json",
3 | "schema": {
4 | "type": "object"
5 | }
6 | }
--------------------------------------------------------------------------------
/dsio/testdata/cbor/craigslist/body.cbor:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/craigslist/body.cbor
--------------------------------------------------------------------------------
/dsio/testdata/xlsx/obj_cell/body.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/xlsx/obj_cell/body.xlsx
--------------------------------------------------------------------------------
/dsio/testdata/cbor/links_array/body.cbor:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/links_array/body.cbor
--------------------------------------------------------------------------------
/dsio/testdata/cbor/links_object/body.cbor:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/links_object/body.cbor
--------------------------------------------------------------------------------
/dsio/testdata/cbor/sitemap_object/body.cbor:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/sitemap_object/body.cbor
--------------------------------------------------------------------------------
/testdata/vizs/visconfig1.json:
--------------------------------------------------------------------------------
1 | {
2 | "format": "foo",
3 | "qri": "vz:0",
4 | "scriptPath": "one",
5 | "renderedPath": "one"
6 | }
--------------------------------------------------------------------------------
/testdata/readmes/readmeconfig1.json:
--------------------------------------------------------------------------------
1 | {
2 | "format": "foo",
3 | "qri": "rm:0",
4 | "scriptPath": "one",
5 | "renderedPath": "one"
6 | }
--------------------------------------------------------------------------------
/dsio/testdata/json/city/input.dataset.json:
--------------------------------------------------------------------------------
1 | {
2 | "structure": {
3 | "format": "json",
4 | "schema": {
5 | "type": "object"
6 | }
7 | }
8 | }
--------------------------------------------------------------------------------
/validate/testdata/city/input.dataset.json:
--------------------------------------------------------------------------------
1 | {
2 | "structure": {
3 | "format": "json",
4 | "schema": {
5 | "type": "object"
6 | }
7 | }
8 | }
--------------------------------------------------------------------------------
/dsdiff/Makefile:
--------------------------------------------------------------------------------
1 | # Let's keep all our changelog commands the same across all our packages:
2 | update-changelog:
3 | conventional-changelog -p angular -i CHANGELOG.md -s
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | codecov:
2 | ci:
3 | - "ci/circle-ci"
4 | notify:
5 | require_ci_to_pass: no
6 | after_n_builds: 1
7 | coverage:
8 | range: "80...100"
9 | comment: off
--------------------------------------------------------------------------------
/dsio/testdata/cbor/flourinated_compounds_in_fast_food_packaging/body.cbor:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qri-io/dataset/HEAD/dsio/testdata/cbor/flourinated_compounds_in_fast_food_packaging/body.cbor
--------------------------------------------------------------------------------
/dstest/testdata/complete/template.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | {{ .Meta.Title }}
5 |
6 |
7 | {{ .Meta.Title }}
8 |
9 |
--------------------------------------------------------------------------------
/validate/testdata/flourinated_compounds_in_fast_food_packaging/body.cbor:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qri-io/dataset/HEAD/validate/testdata/flourinated_compounds_in_fast_food_packaging/body.cbor
--------------------------------------------------------------------------------
/stepfile/testdata/steps.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "script": "I am a step"
4 | },
5 | {
6 | "script": "I am another step"
7 | },
8 | {
9 | "script": "I am a third step"
10 | }
11 | ]
--------------------------------------------------------------------------------
/dstest/testdata/complete/body.csv:
--------------------------------------------------------------------------------
1 | city,pop,avg_age,in_usa
2 | toronto,40000000,55.5,false
3 | new york,8500000,44.4,true
4 | chicago,300000,44.4,true
5 | chatham,35000,65.25,true
6 | raleigh,250000,50.65,true
7 |
--------------------------------------------------------------------------------
/testdata/metadata/hours.json:
--------------------------------------------------------------------------------
1 | {
2 | "qri": "md:0",
3 | "title": "hours",
4 | "accessURL": "https://example.com/not/a/url",
5 | "downloadURL": "https://example.com/not/a/url",
6 | "readmeURL": "/ipfs/notahash"
7 | }
--------------------------------------------------------------------------------
/dsio/testdata/csv/cities_unique/cities_unique.csv:
--------------------------------------------------------------------------------
1 | city,pop,avg_age,in_usa
2 | toronto,40000000,55.5,false
3 | new york,8500000,44.4,true
4 | chicago,300000,44.4,true
5 | chatham,35000,65.25,true
6 | raleigh,250000,50.65,true
7 |
--------------------------------------------------------------------------------
/dstest/testdata/complete/rendered.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | dataset with all submodels example
5 |
6 |
7 | dataset with all submodels example
8 |
9 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Let's keep all our changelog commands the same across all our packages:
2 | update-changelog:
3 | conventional-changelog -p angular -i CHANGELOG.md -s
4 |
5 | test:
6 | go test ./... -v --coverprofile=coverage.txt --covermode=atomic
--------------------------------------------------------------------------------
/dsio/testdata/csv/cities/data.csv:
--------------------------------------------------------------------------------
1 | city,pop,avg_age,in_usa
2 | toronto,40000000,55.5,false
3 | toronto,40000000,55.5,false
4 | new york,8500000,44.4,true
5 | chicago,300000,44.4,true
6 | chatham,35000,65.25,true
7 | raleigh,250000,50.65,true
8 |
--------------------------------------------------------------------------------
/testdata/metadata/airport-codes.json:
--------------------------------------------------------------------------------
1 | {
2 | "qri": "md:0",
3 | "homeURL": "http://www.ourairports.com/",
4 | "license": {
5 | "type":"PDDL-1.0"
6 | },
7 | "title": "Airport Codes",
8 | "citations": [
9 | {
10 | "name" : "Our Airports",
11 | "url" : "http://ourairports.com/data/"
12 | }
13 | ]
14 | }
--------------------------------------------------------------------------------
/testdata/datasets/hours.json:
--------------------------------------------------------------------------------
1 | {
2 | "qri": "ds:0",
3 | "meta": {
4 | "qri": "md:0",
5 | "title": "hours",
6 | "accessURL": "https://example.com/not/a/url",
7 | "downloadURL": "https://example.com/not/a/url",
8 | "readmeURL": "/ipfs/notahash"
9 | },
10 | "bodyPath": "/ipfs/QmS1dVa1xemo7gQzJgjimj1WwnVBF3TwRTGsyKa1uEBWbJ"
11 | }
--------------------------------------------------------------------------------
/detect/xlsx.go:
--------------------------------------------------------------------------------
1 | package detect
2 |
3 | import (
4 | "io"
5 |
6 | "github.com/qri-io/dataset"
7 | )
8 |
// XLSXSchema determines any schema information for an excel spreadsheet.
//
// TODO (b5): currently unimplemented — both arguments are ignored. The
// function always returns the package-level base array schema, a zero
// bytes-read count, and a nil error, so callers get a usable (if generic)
// structure until real xlsx inspection is written.
func XLSXSchema(r *dataset.Structure, data io.Reader) (schema map[string]interface{}, n int, err error) {
	return dataset.BaseSchemaArray, 0, nil
}
14 |
--------------------------------------------------------------------------------
/testdata/structures/continent-codes.json:
--------------------------------------------------------------------------------
1 | {
2 | "format": "csv",
3 | "schema": {
4 | "type": "array",
5 | "items": {
6 | "type": "array",
7 | "items": [
8 | {
9 | "title": "Code",
10 | "type": "string"
11 | },
12 | {
13 | "title": "Name",
14 | "type": "string"
15 | }
16 | ]
17 | }
18 | }
19 | }
--------------------------------------------------------------------------------
/detect/testdata/hours.csv:
--------------------------------------------------------------------------------
1 | 11 Jan 16 12:00 EST, 1.0, dev, working on some cool stuff
2 | 11 Jan 16 12:00 EST, 1.0, dev, working
3 | 11 Jan 16 12:00 EST, 1.0, dev, other stuff
4 | 11 Jan 16 12:00 EST, 1.0, dev, moar work
5 | 11 Jan 16 12:00 EST, 1.0, dev, youtube work?
6 | 11 Jan 16 12:00 EST, 1.0, dev, is this really work?
7 | 11 Jan 16 12:00 EST, 1.0, dev, werd
8 | 11 Jan 16 12:00 EST, 1.0, dev, making more work
--------------------------------------------------------------------------------
/testdata/metadata/continent-codes.json:
--------------------------------------------------------------------------------
1 | {
2 | "title": "Continent Codes",
3 | "qri": "md:0",
4 | "description": "list of continents with corresponding two letter codes",
5 | "license": {
6 | "type": "odc-pddl",
7 | "url": "http://opendatacommons.org/licenses/pddl/"
8 | },
9 | "keywords": [
10 | "Continents",
11 | "Two letter code",
12 | "Continent codes",
13 | "Continent code list"
14 | ]
15 | }
--------------------------------------------------------------------------------
/vals/object_value.go:
--------------------------------------------------------------------------------
1 | package vals
2 |
// ObjectValue is a special value that represents a value in the context of a
// parent object. It wraps a value, adding a property "Key" that holds the
// value's key in the parent object. Because Value is embedded, all Value
// methods are promoted from the wrapped value, so an ObjectValue can be used
// anywhere a plain Value is expected.
type ObjectValue struct {
	Key string
	Value
}
9 |
10 | // NewObjectValue allocates a new Object Value
11 | func NewObjectValue(key string, v Value) Value {
12 | return ObjectValue{key, v}
13 | }
14 |
--------------------------------------------------------------------------------
/detect/testdata/hours-with-header.csv:
--------------------------------------------------------------------------------
1 | timestamp,hours,category,comments
2 | 11 Jan 16 12:00 EST, 1.0, dev, working on some cool stuff
3 | 11 Jan 16 12:00 EST, 1.0, dev, working
4 | 11 Jan 16 12:00 EST, 1.0, dev, other stuff
5 | 11 Jan 16 12:00 EST, 1.0, dev, moar work
6 | 11 Jan 16 12:00 EST, 1.0, dev, youtube work?
7 | 11 Jan 16 12:00 EST, 1.0, dev, is this really work?
8 | 11 Jan 16 12:00 EST, 1.0, dev, werd
9 | 11 Jan 16 12:00 EST, 1.0, dev, making more work
--------------------------------------------------------------------------------
/detect/testdata/spelling.structure.json:
--------------------------------------------------------------------------------
1 | {
2 | "format": "csv",
3 | "formatConfig" : {
4 | "headerRow" : true,
5 | "lazyQuotes": true
6 | },
7 | "schema": {
8 | "type": "array",
9 | "items": {
10 | "type":"array",
11 | "items": [
12 | {
13 | "title": "state",
14 | "type": "string"
15 | },
16 | {
17 | "title": "search_interest",
18 | "type": "number"
19 | }
20 | ]
21 | }
22 | }
23 | }
--------------------------------------------------------------------------------
/testdata/datasets/continent-codes.json:
--------------------------------------------------------------------------------
1 | {
2 | "qri": "ds:0",
3 | "meta": {
4 | "title": "Continent Codes",
5 | "qri": "md:0",
6 | "description": "list of continents with corresponding two letter codes",
7 | "license": {
8 | "type": "odc-pddl",
9 | "url": "http://opendatacommons.org/licenses/pddl/"
10 | },
11 | "keywords": [
12 | "Continents",
13 | "Two letter code",
14 | "Continent codes",
15 | "Continent code list"
16 | ]
17 | }
18 | }
--------------------------------------------------------------------------------
/dsio/testdata/json/city/body.json:
--------------------------------------------------------------------------------
1 | [
2 | { "city" : "toronto", "pop" : 40000000, "avg_age" : 55.5 , "in_usa" : false },
3 | { "city" : "toronto", "pop" : 40000000, "avg_age" : 55.5 , "in_usa" : false },
4 | { "city" : "new york", "pop" : 8500000, "avg_age": 44.4, "in_usa" : true },
5 | { "city" : "chicago", "pop" : 300000, "avg_age" : 44.4 , "in_usa" : true },
6 | { "city" : "chatham", "pop" : 35000, "avg_age" : 65.25 , "in_usa" : false },
7 | { "city" : "raleigh", "pop" : 250000, "avg_age" : 50.65 , "in_usa" : true }
8 | ]
--------------------------------------------------------------------------------
/validate/testdata/city/body.json:
--------------------------------------------------------------------------------
1 | [
2 | { "city" : "toronto", "pop" : 40000000, "avg_age" : 55.5 , "in_usa" : false },
3 | { "city" : "toronto", "pop" : 40000000, "avg_age" : 55.5 , "in_usa" : false },
4 | { "city" : "new york", "pop" : 8500000, "avg_age": 44.4, "in_usa" : true },
5 | { "city" : "chicago", "pop" : 300000, "avg_age" : 44.4 , "in_usa" : true },
6 | { "city" : "chatham", "pop" : 35000, "avg_age" : 65.25 , "in_usa" : false },
7 | { "city" : "raleigh", "pop" : 250000, "avg_age" : 50.65 , "in_usa" : true }
8 | ]
--------------------------------------------------------------------------------
/vals/object_value_test.go:
--------------------------------------------------------------------------------
1 | package vals
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestNewObjectValue(t *testing.T) {
8 | v := NewObjectValue("foo", String(""))
9 | if v.Type() != TypeString {
10 | t.Errorf("type mismatch. expected: %s. got: %s", TypeString, v.Type())
11 | }
12 |
13 | if ov, ok := v.(ObjectValue); ok {
14 | if ov.Key != "foo" {
15 | t.Errorf("key mismatch. expected: %s, got: %s", "foo", ov.Key)
16 | }
17 | } else {
18 |
19 | t.Errorf("expected ObjectValue type")
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/dstest/testdata/complete/readme.md:
--------------------------------------------------------------------------------
1 | # Lorem ipsum dolor
2 |
3 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Maecenas maximus erat ut rhoncus blandit. Duis aliquet vulputate leo eu volutpat. Praesent in mollis metus, non convallis lectus. Vestibulum malesuada mauris quis nisl auctor pellentesque. Duis lacinia nec justo in viverra. Quisque quis aliquet ante. Donec semper scelerisque laoreet. Praesent dapibus interdum mi, sit amet lacinia odio malesuada vitae. Proin eu erat quis nisi tristique mollis. Donec sed eleifend augue, at convallis ex.
4 |
--------------------------------------------------------------------------------
/dsviz/testdata/custom/template.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | {{ ds.meta.title }}
5 |
6 |
7 | {{ ds.meta.title }}
8 | First Row:
9 |
10 | {{- range bodyEntries 0 1 }}
11 |
12 | {{ range . }}| {{ . }} | {{ end }}
13 |
14 | {{- end }}
15 |
16 | Full Body:
17 |
18 | {{- range allBodyEntries }}
19 |
20 | {{ range . }}| {{ . }} | {{ end }}
21 |
22 | {{- end }}
23 |
24 |
25 |
--------------------------------------------------------------------------------
/dsviz/testdata/custom/rendered.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | World Population
5 |
6 |
7 | World Population
8 | First Row:
9 |
10 |
11 | | 2017 | 7500000000 |
12 |
13 |
14 | Full Body:
15 |
16 |
17 | | 2017 | 7500000000 |
18 |
19 |
20 | | 2016 | 7444000000 |
21 |
22 |
23 | | 2015 | 7358000000 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/dsio/testdata/csv/movies/input.dataset.json:
--------------------------------------------------------------------------------
1 | {
2 | "qri": "ds:0",
3 | "meta": {
4 | "title": "example movie data"
5 | },
6 | "structure": {
7 | "format": "csv",
8 | "formatConfig": {
9 | "headerRow": true
10 | },
11 | "schema": {
12 | "type": "array",
13 | "items": {
14 | "type": "array",
15 | "items": [
16 | {
17 | "title": "movie_title",
18 | "type": "string"
19 | },
20 | {
21 | "title": "duration",
22 | "type": "integer"
23 | }
24 | ]
25 | }
26 | }
27 | }
28 | }
--------------------------------------------------------------------------------
/validate/testdata/movies/input.dataset.json:
--------------------------------------------------------------------------------
1 | {
2 | "qri": "ds:0",
3 | "meta": {
4 | "title": "example movie data"
5 | },
6 | "structure": {
7 | "format": "csv",
8 | "formatConfig": {
9 | "headerRow": true
10 | },
11 | "schema": {
12 | "type": "array",
13 | "items": {
14 | "type": "array",
15 | "items": [
16 | {
17 | "title": "movie_title",
18 | "type": "string"
19 | },
20 | {
21 | "title": "duration",
22 | "type": "integer"
23 | }
24 | ]
25 | }
26 | }
27 | }
28 | }
--------------------------------------------------------------------------------
/hash_test.go:
--------------------------------------------------------------------------------
1 | package dataset
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestHashBytes(t *testing.T) {
8 | cases := []struct {
9 | in []byte
10 | out string
11 | err error
12 | }{
13 | {[]byte(""), "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n", nil},
14 | }
15 |
16 | for i, c := range cases {
17 | got, err := HashBytes(c.in)
18 | if err != c.err {
19 | t.Errorf("case %d error mismatch. expected: %s got: %s", i, c.err, err)
20 | continue
21 | }
22 |
23 | if got != c.out {
24 | t.Errorf("case %d result mismatch. expected: %s got: %s", i, c.out, got)
25 | continue
26 | }
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/testdata/structures/hours.json:
--------------------------------------------------------------------------------
1 | {
2 | "format": "csv",
3 | "formatOptions": null,
4 | "depth" : 2,
5 | "schema": {
6 | "type": "array",
7 | "items": {
8 | "type": "array",
9 | "items": [
10 | {
11 | "title": "field_1",
12 | "type": "string"
13 | },
14 | {
15 | "title": "field_2",
16 | "type": "number"
17 | },
18 | {
19 | "title": "field_3",
20 | "type": "string"
21 | },
22 | {
23 | "title": "field_4",
24 | "type": "string"
25 | }
26 | ]
27 | }
28 | }
29 | }
--------------------------------------------------------------------------------
/dstest/template.go:
--------------------------------------------------------------------------------
1 | package dstest
2 |
3 | import (
4 | "bytes"
5 | "testing"
6 | "text/template"
7 | )
8 |
// Template executes & returns a template string, failing the test with a
// fatal error if the template fails to parse or to execute against data.
func Template(t *testing.T, tmplStr string, data interface{}) string {
	t.Helper()

	tmpl, err := template.New("tmpl").Parse(tmplStr)
	if err != nil {
		t.Fatalf("error parsing dstest template: %s", err)
	}

	var buf bytes.Buffer
	if err := tmpl.Execute(&buf, data); err != nil {
		t.Fatalf("error executing dstest template: %s", err)
	}
	return buf.String()
}
25 |
--------------------------------------------------------------------------------
/dsviz/testdata/custom/input.dataset.json:
--------------------------------------------------------------------------------
1 | {
2 | "peername" : "steve",
3 | "name" : "world_pop",
4 | "meta": {
5 | "title": "World Population",
6 | "description": "a dataset showing the population of the world"
7 | },
8 | "structure" : {
9 | "format": "json",
10 | "schema": {
11 | "type": "array",
12 | "items": {
13 | "type": "array",
14 | "items": [
15 | { "title": "year", "type": "integer" },
16 | { "title": "population", "type": "integer" }
17 | ]
18 | }
19 | }
20 | },
21 | "viz":{
22 | "format": "html",
23 | "scriptPath": "bar"
24 | }
25 | }
--------------------------------------------------------------------------------
/detect/testdata/hours.structure.json:
--------------------------------------------------------------------------------
1 | {
2 | "format": "csv",
3 | "formatConfig": {
4 | "lazyQuotes" : true
5 | },
6 | "schema": {
7 | "type": "array",
8 | "items": {
9 | "type": "array",
10 | "items": [
11 | {
12 | "title": "field_1",
13 | "type": "string"
14 | },
15 | {
16 | "title": "field_2",
17 | "type": "number"
18 | },
19 | {
20 | "title": "field_3",
21 | "type": "string"
22 | },
23 | {
24 | "title": "field_4",
25 | "type": "string"
26 | }
27 | ]
28 | }
29 | }
30 | }
--------------------------------------------------------------------------------
/dsio/replacecr/replace_cr_test.go:
--------------------------------------------------------------------------------
1 | package replacecr
2 |
3 | import (
4 | "bytes"
5 | "testing"
6 | )
7 |
8 | func TestReader(t *testing.T) {
9 | input := []byte("foo\r\rbar\r\nbaz\r\r")
10 | expect := []byte("foo\r\n\r\nbar\r\nbaz\r\n\r\n")
11 |
12 | got := make([]byte, 19)
13 | n, err := Reader(bytes.NewReader(input)).Read(got)
14 | if err != nil && err.Error() != "EOF" {
15 | t.Errorf("unexpected error: %s", err.Error())
16 | }
17 | if n != 19 {
18 | t.Errorf("length error. expected: %d, got: %d", 19, n)
19 | }
20 | if !bytes.Equal(expect, got) {
21 | t.Errorf("byte mismatch. expected:\n%v\ngot:\n%v", expect, got)
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/dsio/tracked_reader_test.go:
--------------------------------------------------------------------------------
1 | package dsio
2 |
3 | import (
4 | "strings"
5 | "testing"
6 | )
7 |
8 | func TestTrackedReader(t *testing.T) {
9 | r := strings.NewReader("0123456789")
10 | tr := NewTrackedReader(r)
11 |
12 | buf := make([]byte, 4)
13 | tr.Read(buf)
14 | if tr.BytesRead() != 4 {
15 | t.Errorf("expected bytes read to equal 4, got: %d", tr.BytesRead())
16 | }
17 | tr.Read(buf)
18 | if tr.BytesRead() != 8 {
19 | t.Errorf("expected bytes read to equal 4, got: %d", tr.BytesRead())
20 | }
21 | tr.Read(buf)
22 | if tr.BytesRead() != 10 {
23 | t.Errorf("expected bytes read to equal 4, got: %d", tr.BytesRead())
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/validate/testdata/movies/body.csv:
--------------------------------------------------------------------------------
1 | movie_title,duration
2 | Avatar ,
3 | Pirates of the Caribbean: At World's End ,
4 | Spectre ,148
5 | The Dark Knight Rises ,164
6 | Star Wars: Episode VII - The Force Awakens ,0
7 | John Carter ,132
8 | Spider-Man 3 ,156
9 | Tangled ,100
10 | Avengers: Age of Ultron ,141
11 | Harry Potter and the Half-Blood Prince ,153
12 | Batman v Superman: Dawn of Justice ,183
13 | Superman Returns ,169
14 | Quantum of Solace ,106
15 | Pirates of the Caribbean: Dead Man's Chest ,151
16 | The Lone Ranger ,150
17 | Man of Steel ,143
18 | The Chronicles of Narnia: Prince Caspian ,150
19 | The Avengers ,173
20 | Pirates of the Caribbean: On Stranger Tides ,136
21 |
--------------------------------------------------------------------------------
/dsio/testdata/csv/movies/body.csv:
--------------------------------------------------------------------------------
1 | movie_title,duration
2 | Avatar ,178
3 | Pirates of the Caribbean: At World's End ,169
4 | Spectre ,148
5 | The Dark Knight Rises ,164
6 | Star Wars: Episode VII - The Force Awakens ,
7 | John Carter ,132
8 | Spider-Man 3 ,156
9 | Tangled ,100
10 | Avengers: Age of Ultron ,141
11 | Harry Potter and the Half-Blood Prince ,153
12 | Batman v Superman: Dawn of Justice ,183
13 | Superman Returns ,169
14 | Quantum of Solace ,106
15 | Pirates of the Caribbean: Dead Man's Chest ,151
16 | The Lone Ranger ,150
17 | Man of Steel ,143
18 | The Chronicles of Narnia: Prince Caspian ,150
19 | The Avengers ,173
20 | Pirates of the Caribbean: On Stranger Tides ,136
21 |
--------------------------------------------------------------------------------
/detect/testdata/hours-with-header.structure.json:
--------------------------------------------------------------------------------
1 | {
2 | "format": "csv",
3 | "formatConfig" : {
4 | "headerRow" : true,
5 | "lazyQuotes" : true
6 | },
7 | "schema": {
8 | "type": "array",
9 | "items": {
10 | "type": "array",
11 | "items": [
12 | {
13 | "title": "timestamp",
14 | "type": "string"
15 | },
16 | {
17 | "title": "hours",
18 | "type": "number"
19 | },
20 | {
21 | "title": "category",
22 | "type": "string"
23 | },
24 | {
25 | "title": "comments",
26 | "type": "string"
27 | }
28 | ]
29 | }
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/dsio/fuzz.go:
--------------------------------------------------------------------------------
1 | package dsio
2 |
import (
	"bytes"
	"fmt"
	"io"

	"github.com/qri-io/dataset"
)
9 |
10 | // Fuzz is the entry-point for go-fuzz. Return 1 for a successful parse and 0 for failures.
11 | func Fuzz(data []byte) int {
12 | r := bytes.NewReader(data)
13 | st := &dataset.Structure{Format: dataset.JSONDataFormat.String(), Schema: dataset.BaseSchemaObject}
14 | reader, err := NewJSONReader(st, r)
15 | if err != nil {
16 | return 0
17 | }
18 | for {
19 | _, err = reader.ReadEntry()
20 | if err != nil {
21 | if err.Error() == "EOF" {
22 | break
23 | }
24 | fmt.Printf("Error: %s\n", err.Error())
25 | return 0
26 | }
27 | }
28 | return 1
29 | }
30 |
--------------------------------------------------------------------------------
/dsio/testdata/csv/movies_sorted_duration_desc/body.csv:
--------------------------------------------------------------------------------
1 | movie_title,duration
2 | Batman v Superman: Dawn of Justice ,183
3 | Avatar ,178
4 | The Avengers ,173
5 | Superman Returns ,169
6 | Pirates of the Caribbean: At World's End ,169
7 | The Dark Knight Rises ,164
8 | Spider-Man 3 ,156
9 | Harry Potter and the Half-Blood Prince ,153
10 | Pirates of the Caribbean: Dead Man's Chest ,151
11 | The Chronicles of Narnia: Prince Caspian ,150
12 | The Lone Ranger ,150
13 | Spectre ,148
14 | Man of Steel ,143
15 | Avengers: Age of Ultron ,141
16 | Pirates of the Caribbean: On Stranger Tides ,136
17 | John Carter ,132
18 | Quantum of Solace ,106
19 | Tangled ,100
20 | Star Wars: Episode VII - The Force Awakens ,
21 |
--------------------------------------------------------------------------------
/dsio/testdata/csv/movies_sorted_movie_title/body.csv:
--------------------------------------------------------------------------------
1 | movie_title,duration
2 | Avatar ,178
3 | Avengers: Age of Ultron ,141
4 | Batman v Superman: Dawn of Justice ,183
5 | Harry Potter and the Half-Blood Prince ,153
6 | John Carter ,132
7 | Man of Steel ,143
8 | Pirates of the Caribbean: At World's End ,169
9 | Pirates of the Caribbean: Dead Man's Chest ,151
10 | Pirates of the Caribbean: On Stranger Tides ,136
11 | Quantum of Solace ,106
12 | Spectre ,148
13 | Spider-Man 3 ,156
14 | Star Wars: Episode VII - The Force Awakens ,
15 | Superman Returns ,169
16 | Tangled ,100
17 | The Avengers ,173
18 | The Chronicles of Narnia: Prince Caspian ,150
19 | The Dark Knight Rises ,164
20 | The Lone Ranger ,150
21 |
--------------------------------------------------------------------------------
/.codeclimate.yml:
--------------------------------------------------------------------------------
1 | ratings:
2 | paths:
3 | - "**/*.go"
4 |
5 | engines:
6 | fixme:
7 | enabled: true
8 | config:
9 | strings:
10 | - TODO
11 | golint:
12 | enabled: true
13 | govet:
14 | enabled: true
15 | gofmt:
16 | enabled: true
17 |
18 | version: "2"
19 | checks:
20 | argument-count:
21 | enabled: false
22 | complex-logic:
23 | enabled: false
24 | file-lines:
25 | enabled: false
26 | method-complexity:
27 | enabled: false
28 | method-count:
29 | enabled: false
30 | method-lines:
31 | enabled: false
32 | nested-control-flow:
33 | enabled: false
34 | return-statements:
35 | enabled: false
36 | similar-code:
37 | enabled: false
--------------------------------------------------------------------------------
/dsio/testdata/csv/movies_sorted_movie_title_desc/body.csv:
--------------------------------------------------------------------------------
1 | movie_title,duration
2 | The Lone Ranger ,150
3 | The Dark Knight Rises ,164
4 | The Chronicles of Narnia: Prince Caspian ,150
5 | The Avengers ,173
6 | Tangled ,100
7 | Superman Returns ,169
8 | Star Wars: Episode VII - The Force Awakens ,
9 | Spider-Man 3 ,156
10 | Spectre ,148
11 | Quantum of Solace ,106
12 | Pirates of the Caribbean: On Stranger Tides ,136
13 | Pirates of the Caribbean: Dead Man's Chest ,151
14 | Pirates of the Caribbean: At World's End ,169
15 | Man of Steel ,143
16 | John Carter ,132
17 | Harry Potter and the Half-Blood Prince ,153
18 | Batman v Superman: Dawn of Justice ,183
19 | Avengers: Age of Ultron ,141
20 | Avatar ,178
21 |
--------------------------------------------------------------------------------
/dsio/testdata/csv/movies_sorted_duration_movie_title/body.csv:
--------------------------------------------------------------------------------
1 | movie_title,duration
2 | Star Wars: Episode VII - The Force Awakens ,
3 | Tangled ,100
4 | Quantum of Solace ,106
5 | John Carter ,132
6 | Pirates of the Caribbean: On Stranger Tides ,136
7 | Avengers: Age of Ultron ,141
8 | Man of Steel ,143
9 | Spectre ,148
10 | The Chronicles of Narnia: Prince Caspian ,150
11 | The Lone Ranger ,150
12 | Pirates of the Caribbean: Dead Man's Chest ,151
13 | Harry Potter and the Half-Blood Prince ,153
14 | Spider-Man 3 ,156
15 | The Dark Knight Rises ,164
16 | Pirates of the Caribbean: At World's End ,169
17 | Superman Returns ,169
18 | The Avengers ,173
19 | Avatar ,178
20 | Batman v Superman: Dawn of Justice ,183
21 |
--------------------------------------------------------------------------------
/validate/csv_test.go:
--------------------------------------------------------------------------------
1 | package validate
2 |
3 | import (
4 | "strings"
5 | "testing"
6 | )
7 |
8 | func TestCheckCsvRowLengths(t *testing.T) {
9 | cases := []struct {
10 | input string
11 | err string
12 | }{
13 | {rawText1, ""},
14 | {rawText2, ""},
15 | {rawText2b, ""},
16 | {rawText3, ""}, //Note: since there are no commas this should pass
17 | {rawText4, "error: inconsistent column length on line 4 of length 2 (rather than 1). ensure all csv columns same length"},
18 | }
19 |
20 | for i, c := range cases {
21 | r := strings.NewReader(c.input)
22 | err := CheckCsvRowLengths(r)
23 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
24 | t.Errorf("case [%d] error mismatch. expected: '%s', got: '%s'", i, c.err, err)
25 | }
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/dsdiff/README.md:
--------------------------------------------------------------------------------
1 | [](https://qri.io)
2 | [](http://godoc.org/github.com/qri-io/dsdiff)
3 | [](./LICENSE)
4 | [](https://codecov.io/gh/qri-io/dsdiff)
5 | [](https://circleci.com/gh/qri-io/dsdiff)
6 | [](https://goreportcard.com/report/github.com/qri-io/dsdiff)
7 |
8 | # dsdiff
9 |
10 | Utility for Diffing Datasets, currently a very basic placeholder
11 |
--------------------------------------------------------------------------------
/dsdiff/testdata/structureJsonSchemaOrig.json:
--------------------------------------------------------------------------------
1 | {
2 | "kind": "qri:ds:0",
3 | "BodyPath": "abc",
4 | "path": "123",
5 | "format": "csv",
6 | "formatConfig": {
7 | "headerRow": true
8 | },
9 | "structure": {
10 | "schema": {
11 | "title": "Person",
12 | "type": "object",
13 | "properties": {
14 | "firstName": {
15 | "type": "string"
16 | },
17 | "lastName": {
18 | "type": "string"
19 | },
20 | "age": {
21 | "description": "Age in years",
22 | "type": "integer",
23 | "minimum": 0
24 | },
25 | "friends": {
26 | "type" : "array",
27 | "items" : { "title" : "REFERENCE", "$ref" : "#" }
28 | }
29 | },
30 | "required": ["firstName", "lastName"]
31 | }
32 | }
33 | }
--------------------------------------------------------------------------------
/dsviz/testdata/default/input.dataset.json:
--------------------------------------------------------------------------------
1 | {
2 | "path" : "/ipfs/QmSH2WNg8x3ckC8GYTZDY6kVtxfMo2RNJSMgcc2Ewb7iiJ",
3 | "peername" : "steve",
4 | "name" : "world_pop",
5 | "commit" : {
6 | "timestamp": "2019-03-20T20:02:24.689938Z"
7 | },
8 | "meta": {
9 | "title": "World Population",
10 | "description": "a dataset showing the population of the world"
11 | },
12 | "structure" : {
13 | "format": "json",
14 | "length" : 234567,
15 | "entries" : 234,
16 | "schema": {
17 | "type": "array",
18 | "items": {
19 | "type": "array",
20 | "items": [
21 | { "title": "year", "type": "integer" },
22 | { "title": "population", "type": "integer" }
23 | ]
24 | }
25 | }
26 | },
27 | "viz":{
28 | "format": "html",
29 | "scriptPath": "bar"
30 | }
31 | }
--------------------------------------------------------------------------------
/dsio/testdata/csv/cities/input.dataset.json:
--------------------------------------------------------------------------------
1 | {
2 | "qri": "ds:0",
3 | "structure": {
4 | "title": "example city data",
5 | "structure": {
6 | "format": "csv",
7 | "formatConfig": {
8 | "headerRow": true
9 | },
10 | "schema": {
11 | "type": "array",
12 | "items": {
13 | "type": "array",
14 | "items": [
15 | {
16 | "title": "city",
17 | "type": "string"
18 | },
19 | {
20 | "title": "pop",
21 | "type": "integer"
22 | },
23 | {
24 | "title": "avg_age",
25 | "type": "number"
26 | },
27 | {
28 | "title": "in_usa",
29 | "type": "boolean"
30 | }
31 | ]
32 | }
33 | }
34 | }
35 | }
36 | }
--------------------------------------------------------------------------------
/validate/validate_test.go:
--------------------------------------------------------------------------------
1 | package validate
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestValidName(t *testing.T) {
8 | cases := []struct {
9 | name string
10 | err string
11 | }{
12 | {"", "error: name cannot be empty"},
13 | {"9", "error: illegal name '9', names must start with a letter and consist of only a-z,0-9, and _. max length 144 characters"},
14 | {"_", "error: illegal name '_', names must start with a letter and consist of only a-z,0-9, and _. max length 144 characters"},
15 | {"_foo", "error: illegal name '_foo', names must start with a letter and consist of only a-z,0-9, and _. max length 144 characters"},
16 | }
17 |
18 | for i, c := range cases {
19 | err := ValidName(c.name)
20 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
21 | t.Errorf("case %d error mismatch. expected: %s, got: %s", i, c.err, err)
22 | continue
23 | }
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/dsdiff/testdata/structureJsonSchemaNew.json:
--------------------------------------------------------------------------------
1 | {
2 | "kind": "qri:ds:0",
3 | "BodyPath": "abc",
4 | "path": "123",
5 | "format": "csv",
6 | "formatConfig": {
7 | "headerRow": true
8 | },
9 | "structure": {
10 | "schema": {
11 | "title": "Person",
12 | "type": "object",
13 | "properties": {
14 | "firstName": {
15 | "type": "string"
16 | },
17 | "middleName": {
18 | "type": "string"
19 | },
20 | "lastName": {
21 | "type": "string"
22 | },
23 | "age": {
24 | "description": "Age in years",
25 | "type": "integer",
26 | "minimum": 0
27 | },
28 | "friends": {
29 | "type" : "array",
30 | "items" : { "title" : "REFERENCE", "$ref" : "#" }
31 | }
32 | },
33 | "required": ["firstName", "lastName"]
34 | }
35 | }
36 | }
--------------------------------------------------------------------------------
/dsviz/testdata/default/rendered.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
9 | World Population
10 |
11 |
12 |
13 |
14 | World Population
15 |
16 |
17 | a dataset showing the population of the world
18 |
19 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/validate/validate.go:
--------------------------------------------------------------------------------
1 | package validate
2 |
3 | import (
4 | "fmt"
5 | "regexp"
6 |
7 | logger "github.com/ipfs/go-log"
8 | )
9 |
10 | var (
11 | alphaNumericRegex = regexp.MustCompile(`^[a-zA-Z]\w{0,143}$`)
12 | log = logger.Logger("validate")
13 | )
14 |
15 | // ValidName checks for a valid variable name
16 | // names must:
17 | // * start with a letter
18 | // * consist of only alpha-numeric characters and/or underscores
19 | // * have a total length of no more than 144 characters
20 | func ValidName(name string) error {
21 | if name == "" {
22 | err := fmt.Errorf("error: name cannot be empty")
23 | log.Debug(err.Error())
24 | return err
25 | }
26 | if alphaNumericRegex.FindString(name) == "" {
27 | err := fmt.Errorf("error: illegal name '%s', names must start with a letter and consist of only a-z,0-9, and _. max length 144 characters", name)
28 | log.Debug(err.Error())
29 | return err
30 | }
31 | return nil
32 | }
33 |
--------------------------------------------------------------------------------
/dsio/tracked_reader.go:
--------------------------------------------------------------------------------
1 | package dsio
2 |
3 | import "io"
4 |
5 | // TrackedReader wraps a reader, keeping an internal count of the bytes read
6 | type TrackedReader struct {
7 | read int
8 | r io.Reader
9 | }
10 |
11 | // NewTrackedReader creates a new tracked reader
12 | func NewTrackedReader(r io.Reader) *TrackedReader {
13 | return &TrackedReader{r: r}
14 | }
15 |
16 | // Read implements the io.Reader interface
17 | func (tr *TrackedReader) Read(p []byte) (n int, err error) {
18 | n, err = tr.r.Read(p)
19 | tr.read += n
20 | return
21 | }
22 |
23 | // BytesRead gives the total number of bytes read from the underlying reader
24 | func (tr *TrackedReader) BytesRead() int {
25 | return tr.read
26 | }
27 |
28 | // Close implements the io.Closer interface, closes the underlying reader if
29 | // it's an io.Closer
30 | func (tr *TrackedReader) Close() error {
31 | if cl, ok := tr.r.(io.Closer); ok {
32 | return cl.Close()
33 | }
34 | return nil
35 | }
36 |
--------------------------------------------------------------------------------
/detect/testdata/police.structure.json:
--------------------------------------------------------------------------------
1 | {
2 | "format": "csv",
3 | "schema": {
4 | "type": "array",
5 | "items": {
6 | "type": "array",
7 | "items": [
8 | {
9 | "title": "city",
10 | "type": "string"
11 | },
12 | {
13 | "title": "police_force_size",
14 | "type": "integer"
15 | },
16 | {
17 | "title": "all",
18 | "type": "number"
19 | },
20 | {
21 | "title": "white",
22 | "type": "number"
23 | },
24 | {
25 | "title": "non_white",
26 | "type": "number"
27 | },
28 | {
29 | "title": "black",
30 | "type": "number"
31 | },
32 | {
33 | "title": "hispanic",
34 | "type": "number"
35 | },
36 | {
37 | "title": "asian",
38 | "type": "number"
39 | }
40 | ]
41 | }
42 | }
43 | }
--------------------------------------------------------------------------------
/validate/csv.go:
--------------------------------------------------------------------------------
1 | package validate
2 |
3 | import (
4 | "encoding/csv"
5 | "fmt"
6 | "io"
7 | )
8 |
9 | // CheckCsvRowLengths ensures that csv input has
10 | // the same number of columns in every row and otherwise
11 | // returns an error
12 | func CheckCsvRowLengths(r io.Reader) error {
13 | csvReader := csv.NewReader(r)
14 | csvReader.FieldsPerRecord = -1
15 | csvReader.TrimLeadingSpace = true
16 | //csvReader.LazyQuotes = true
17 | firstRow, err := csvReader.Read()
18 | rowLen := len(firstRow)
19 | if err != nil {
20 | return fmt.Errorf("error reading first row of csv: %s", err.Error())
21 | }
22 | for i := 1; ; i++ {
23 | record, err := csvReader.Read()
24 | if err == io.EOF {
25 | return nil
26 | }
27 | if err != nil {
28 | return err
29 | }
30 | if len(record) != rowLen {
31 | return fmt.Errorf("error: inconsistent column length on line %d of length %d (rather than %d). ensure all csv columns same length", i, len(record), rowLen)
32 | }
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/dsio/testdata/json/array/body.json:
--------------------------------------------------------------------------------
1 | [
2 | false,
3 | true,
4 | null,
5 | 1234567890,
6 | -1234567890e3,
7 | "this is a very long string to make sure the bytes.Scanner needs to load more data at least once during the course of scanning. So now I'm just gonna paste the word puppy like 30x. puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy ",
8 | "oh hello there",
9 | {
10 | "key": "oh hello there"
11 | },
12 | [
13 | "key",
14 | "oh hello there"
15 | ],
16 | {
17 | "objects": {
18 | "within": {
19 | "objects": {
20 | "that": {
21 | "haz": [
22 | "array"
23 | ]
24 | }
25 | }
26 | }
27 | }
28 | }
29 | ]
--------------------------------------------------------------------------------
/dsio/testdata/json/object/body.json:
--------------------------------------------------------------------------------
1 | {
2 | "a": false,
3 | "b": true,
4 | "c": null,
5 | "g": "this is a very long string to make sure the bytes.Scanner needs to load more data at least once during the course of scanning. So now I'm just gonna paste the word puppy like 30x. puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy puppy ",
6 | "d": "oh hello there",
7 | "e": {
8 | "key": "oh hello there"
9 | },
10 | "f": [
11 | "key",
12 | "oh hello there"
13 | ],
14 | "l": {
15 | "objects": {
16 | "within": {
17 | "objects": {
18 | "that": {
19 | "haz": [
20 | "array"
21 | ]
22 | }
23 | }
24 | }
25 | }
26 | },
27 | "m": 1234567890,
28 | "n": -1234567890
29 | }
--------------------------------------------------------------------------------
/dsdiff/testdata/newStructure.json:
--------------------------------------------------------------------------------
1 | {
2 | "kind": "qri:ds:0",
3 | "BodyPath": "abc",
4 | "path": "123",
5 | "structure": {
6 | "checksum": "QmRSLr53cRGhkwx1L3uGNCY7QGvup3XGy9Jcaaa",
7 | "entries": 35,
8 | "format": "csv",
9 | "formatConfig": {
10 | "headerRow": true
11 | },
12 | "length": 1582,
13 | "qri": "st:0",
14 | "schema": {
15 | "items": {
16 | "items": [
17 | {
18 | "title": "ranking",
19 | "type": "integer"
20 | },
21 | {
22 | "title": "prob_of_automation",
23 | "type": "number"
24 | },
25 | {
26 | "title": "soc_code",
27 | "type": "string"
28 | },
29 | {
30 | "title": "job_title",
31 | "type": "string"
32 | }
33 | ],
34 | "type": "array"
35 | },
36 | "type": "array"
37 | }
38 | },
39 | "meta": {
40 | "title": "abc",
41 | "description": "I am a dataset"
42 | },
43 | "visConfig": {
44 | "format": "abc"
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2017 Qri, Inc.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
--------------------------------------------------------------------------------
/dsdiff/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Qri.io
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/qri-io/dataset
2 |
3 | go 1.15
4 |
5 | require (
6 | github.com/360EntSecGroup-Skylar/excelize v1.4.1
7 | github.com/axiomhq/hyperloglog v0.0.0-20191112132149-a4c4c47bc57f
8 | github.com/dgryski/go-sip13 v0.0.0-20200911182023-62edffca9245 // indirect
9 | github.com/dgryski/go-topk v0.0.0-20191119021947-593b4f2374c9
10 | github.com/google/go-cmp v0.5.5
11 | github.com/ipfs/go-log v1.0.5
12 | github.com/jinzhu/copier v0.0.0-20190924061706-b57f9002281a
13 | github.com/k0kubun/colorstring v0.0.0-20150214042306-9440f1994b88 // indirect
14 | github.com/klauspost/compress v1.13.0
15 | github.com/libp2p/go-libp2p-core v0.8.5
16 | github.com/mr-tron/base58 v1.2.0
17 | github.com/multiformats/go-multihash v0.0.15
18 | github.com/qri-io/compare v0.1.0
19 | github.com/qri-io/jsonschema v0.2.2-0.20210618085106-a515144d7449
20 | github.com/qri-io/qfs v0.6.1-0.20210629014446-45bdcdb57434
21 | github.com/qri-io/varName v0.1.0
22 | github.com/sergi/go-diff v1.1.0 // indirect
23 | github.com/ugorji/go/codec v1.1.7
24 | github.com/yudai/gojsondiff v1.0.0
25 | github.com/yudai/golcs v0.0.0-20170316035057-ecda9a501e82 // indirect
26 | github.com/yudai/pp v2.0.1+incompatible // indirect
27 | )
28 |
--------------------------------------------------------------------------------
/detect/cbor.go:
--------------------------------------------------------------------------------
1 | package detect
2 |
3 | import (
4 | "bufio"
5 | "fmt"
6 | "io"
7 |
8 | "github.com/qri-io/dataset"
9 | )
10 |
// Sentinel byte values for CBOR top-level type detection.
// CBOR encodes the major type of an item in its first byte: arrays occupy
// 0x80-0x9f (0x9f marking an indefinite-length array), maps occupy
// 0xa0-0xbf (0xbf marking an indefinite-length map), and tags begin at 0xc0.
const (
	cborBdIndefiniteArray byte = 0x9f // start marker of an indefinite-length array
	cborBdIndefiniteMap        = 0xbf // start marker of an indefinite-length map
	cborBaseArray              = 0x80 // first byte value of the array major type
	cborBaseMap                = 0xa0 // first byte value of the map major type
	cborBaseTag                = 0xc0 // first byte value of the tag major type
)
18 |
19 | // CBORSchema determines the field names and types of an io.Reader of CBOR-formatted data, returning a json schema
20 | func CBORSchema(resource *dataset.Structure, data io.Reader) (schema map[string]interface{}, n int, err error) {
21 | rd := bufio.NewReader(data)
22 | bd, err := rd.ReadByte()
23 | n++
24 | if err != nil && err != io.EOF {
25 | log.Debugf(err.Error())
26 | err = fmt.Errorf("error reading data: %s", err.Error())
27 | return
28 | }
29 |
30 | switch {
31 | case bd >= cborBaseArray && bd < cborBaseMap, bd == cborBdIndefiniteArray:
32 | return dataset.BaseSchemaArray, n, nil
33 | case bd >= cborBaseMap && bd < cborBaseTag, bd == cborBdIndefiniteMap:
34 | return dataset.BaseSchemaObject, n, nil
35 | default:
36 | err = fmt.Errorf("invalid top-level type for CBOR data. cbor datasets must begin with either an array or map")
37 | log.Debugf(err.Error())
38 | return
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/dsdiff/testdata/orig.json:
--------------------------------------------------------------------------------
1 | {
2 | "kind": "qri:ds:0",
3 | "bodyPath": "abc",
4 | "path": "123",
5 |
6 | "structure": {
7 | "checksum": "QmRSLr53cRGhkwx1L3uGNCY7QGvup3XGy9Jcud9",
8 | "entries": 33,
9 | "format": "csv",
10 | "formatConfig": {
11 | "headerRow": true
12 | },
13 | "length": 1582,
14 | "qri": "st:0",
15 | "Commit": {
16 | "title": "abc"
17 | },
18 | "PreviousPath": "",
19 | "schema": {
20 | "items": {
21 | "items": [
22 | {
23 | "title": "rank",
24 | "type": "integer"
25 | },
26 | {
27 | "title": "probability_of_automation",
28 | "type": "number"
29 | },
30 | {
31 | "title": "soc_code",
32 | "type": "string"
33 | },
34 | {
35 | "title": "job_title",
36 | "type": "string"
37 | }
38 | ],
39 | "type": "array"
40 | },
41 | "type": "array"
42 | }
43 | },
44 | "meta": {
45 | "title": "abc",
46 | "description": "I am a dataset"
47 | },
48 | "transform": {
49 | "syntax": "python",
50 | "data": "abc",
51 | "config": {
52 | "option": "value"
53 | }
54 | },
55 | "viz": {
56 | "format": "abc"
57 | }
58 | }
59 |
60 |
--------------------------------------------------------------------------------
/dsio/entry.go:
--------------------------------------------------------------------------------
1 | package dsio
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | )
7 |
// Entry is a "row" of a dataset: one value, plus where it sits in the
// top-level structure (numeric position, and a key when the top level is a map)
type Entry struct {
	// Index represents this entry's numeric position in a dataset
	// this index may not necessarily refer to the overall position within the dataset
	// as things like offsets affect where the index begins
	Index int
	// Key is a string key for this entry
	// only present when the top level structure is a map
	Key string
	// Value is information contained within the row
	Value interface{}
}
20 |
// DataIteratorFunc is a function for each "row" of a resource's raw data.
// It receives the row number, the Entry, and any read error; returning a
// non-nil error stops iteration (an io.EOF return stops it without error,
// see EachEntry)
type DataIteratorFunc func(int, Entry, error) error
23 |
24 | // EachEntry calls fn on each row of a given EntryReader
25 | func EachEntry(rr EntryReader, fn DataIteratorFunc) error {
26 | num := 0
27 | for {
28 | row, err := rr.ReadEntry()
29 | if err != nil {
30 | if err.Error() == io.EOF.Error() {
31 | return nil
32 | }
33 | err := fmt.Errorf("error reading row %d: %s", num, err.Error())
34 | log.Debug(err.Error())
35 | return err
36 | }
37 |
38 | if err := fn(num, row, err); err != nil {
39 | if err.Error() == io.EOF.Error() {
40 | return nil
41 | }
42 | log.Debug(err.Error())
43 | return err
44 | }
45 | num++
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/dsdiff/testdata/newData.json:
--------------------------------------------------------------------------------
1 | {
2 | "kind": "qri:ds:0",
3 | "bodyPath": "abcdefg",
4 | "path": "123",
5 |
6 | "structure": {
7 | "checksum": "QmRSLr53cRGhkwx1L3uGNCY7QGvup3XGy9Jcud9",
8 | "entries": 33,
9 | "format": "csv",
10 | "formatConfig": {
11 | "headerRow": true
12 | },
13 | "length": 1582,
14 | "qri": "st:0",
15 | "Commit": {
16 | "title": "abc"
17 | },
18 | "PreviousPath": "",
19 | "schema": {
20 | "items": {
21 | "items": [
22 | {
23 | "title": "rank",
24 | "type": "integer"
25 | },
26 | {
27 | "title": "probability_of_automation",
28 | "type": "number"
29 | },
30 | {
31 | "title": "soc_code",
32 | "type": "string"
33 | },
34 | {
35 | "title": "job_title",
36 | "type": "string"
37 | }
38 | ],
39 | "type": "array"
40 | },
41 | "type": "array"
42 | }
43 | },
44 | "meta": {
45 | "title": "abc",
46 | "description": "I am a dataset"
47 | },
48 | "transform": {
49 | "syntax": "python",
50 | "data": "abc",
51 | "config": {
52 | "option": "value"
53 | }
54 | },
55 | "viz": {
56 | "format": "abc"
57 | }
58 | }
59 |
60 |
--------------------------------------------------------------------------------
/dsdiff/testdata/newDescription.json:
--------------------------------------------------------------------------------
1 | {
2 | "kind": "qri:ds:0",
3 | "BodyPath": "abc",
4 | "path": "123",
5 |
6 | "structure": {
7 | "checksum": "QmRSLr53cRGhkwx1L3uGNCY7QGvup3XGy9Jcud9",
8 | "entries": 33,
9 | "format": "csv",
10 | "formatConfig": {
11 | "headerRow": true
12 | },
13 | "length": 1582,
14 | "qri": "st:0",
15 | "Commit": {
16 | "title": "abc"
17 | },
18 | "PreviousPath": "",
19 | "schema": {
20 | "items": {
21 | "items": [
22 | {
23 | "title": "rank",
24 | "type": "integer"
25 | },
26 | {
27 | "title": "probability_of_automation",
28 | "type": "number"
29 | },
30 | {
31 | "title": "soc_code",
32 | "type": "string"
33 | },
34 | {
35 | "title": "job_title",
36 | "type": "string"
37 | }
38 | ],
39 | "type": "array"
40 | },
41 | "type": "array"
42 | }
43 | },
44 | "meta": {
45 | "title": "abc",
46 | "description": "I am a new description"
47 | },
48 | "transform": {
49 | "syntax": "python",
50 | "data": "abc",
51 | "config": {
52 | "option": "value"
53 | }
54 | },
55 | "viz": {
56 | "format": "abc"
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/hash.go:
--------------------------------------------------------------------------------
1 | package dataset
2 |
3 | import (
4 | "crypto/sha256"
5 | "encoding/json"
6 | "fmt"
7 |
8 | "github.com/mr-tron/base58/base58"
9 | "github.com/multiformats/go-multihash"
10 | )
11 |
12 | // JSONHash calculates the hash of a json.Marshaler
13 | // It's important to note that this is *NOT* the same as an IPFS hash,
14 | // These hash functions should be used for other things like
15 | // checksumming, in-memory content-addressing, etc.
16 | func JSONHash(m json.Marshaler) (hash string, err error) {
17 | // marshal to cannoncical JSON representation
18 | data, err := m.MarshalJSON()
19 | if err != nil {
20 | return
21 | }
22 | return HashBytes(data)
23 | }
24 |
25 | // HashBytes generates the base-58 encoded SHA-256 hash of a byte slice
26 | // It's important to note that this is *NOT* the same as an IPFS hash,
27 | // These hash functions should be used for other things like
28 | // checksumming, in-memory content-addressing, etc.
29 | func HashBytes(data []byte) (hash string, err error) {
30 | h := sha256.New()
31 |
32 | if _, err = h.Write(data); err != nil {
33 | return
34 | }
35 |
36 | mhBuf, err := multihash.Encode(h.Sum(nil), multihash.SHA2_256)
37 | if err != nil {
38 | err = fmt.Errorf("error allocating multihash buffer: %s", err.Error())
39 | return
40 | }
41 |
42 | hash = base58.Encode(mhBuf)
43 | return
44 | }
45 |
--------------------------------------------------------------------------------
/dsdiff/testdata/newTitle.json:
--------------------------------------------------------------------------------
1 | {
2 | "kind": "qri:ds:0",
3 | "BodyPath": "abc",
4 | "path": "123",
5 | "structure": {
6 | "checksum": "QmRSLr53cRGhkwx1L3uGNCY7QGvup3XGy9Jcud9",
7 | "entries": 33,
8 | "format": "csv",
9 | "formatConfig": {
10 | "headerRow": true
11 | },
12 | "length": 1582,
13 | "qri": "st:0",
14 | "transform": {
15 | "syntax": "python",
16 | "data": "abc",
17 | "config": {
18 | "option": "value"
19 | }
20 | },
21 | "Commit": {
22 | "title": "abc"
23 | },
24 | "PreviousPath": "",
25 | "schema": {
26 | "items": {
27 | "items": [
28 | {
29 | "title": "rank",
30 | "type": "integer"
31 | },
32 | {
33 | "title": "probability_of_automation",
34 | "type": "number"
35 | },
36 | {
37 | "title": "soc_code",
38 | "type": "string"
39 | },
40 | {
41 | "title": "job_title",
42 | "type": "string"
43 | }
44 | ],
45 | "type": "array"
46 | },
47 | "type": "array"
48 | }
49 | },
50 | "meta": {
51 | "title": "data data data",
52 | "description": "I am a dataset"
53 | },
54 | "visConfig": {
55 | "format": "abc"
56 | }
57 | }
58 |
59 |
--------------------------------------------------------------------------------
/dsdiff/testdata/newVisConfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "kind": "qri:ds:0",
3 | "BodyPath": "abc",
4 | "path": "123",
5 |
6 | "structure": {
7 | "checksum": "QmRSLr53cRGhkwx1L3uGNCY7QGvup3XGy9Jcud9",
8 | "entries": 33,
9 | "format": "csv",
10 | "formatConfig": {
11 | "headerRow": true
12 | },
13 | "length": 1582,
14 | "qri": "st:0",
15 | "Commit": {
16 | "title": "abc"
17 | },
18 | "PreviousPath": "",
19 | "schema": {
20 | "items": {
21 | "items": [
22 | {
23 | "title": "rank",
24 | "type": "integer"
25 | },
26 | {
27 | "title": "probability_of_automation",
28 | "type": "number"
29 | },
30 | {
31 | "title": "soc_code",
32 | "type": "string"
33 | },
34 | {
35 | "title": "job_title",
36 | "type": "string"
37 | }
38 | ],
39 | "type": "array"
40 | },
41 | "type": "array"
42 | }
43 | },
44 | "meta": {
45 | "title": "abc",
46 | "description": "I am a dataset"
47 | },
48 | "transform": {
49 | "syntax": "python",
50 | "data": "abc",
51 | "config": {
52 | "option": "value"
53 | }
54 | },
55 | "viz": {
56 | "format": "new thing"
57 | }
58 | }
59 |
60 |
61 |
62 |
--------------------------------------------------------------------------------
/dsviz/testdata/default/template.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | {{- block "stylesheet" . }}
5 |
10 | {{ end -}}
11 | {{ title }}
12 |
13 |
14 |
15 | {{ block "header" . -}}
16 |
19 | {{ end -}}
20 | {{- block "summary" . -}}
21 |
22 | {{ ds.meta.description }}
23 |
24 | {{ end -}}
25 | {{- block "stats" . }}{{ if ds.stats -}}
26 |
stats
27 | {{ end }}{{ end -}}
28 | {{- block "citations" . -}}
29 |
38 | {{- end }}
39 |
40 |
41 |
--------------------------------------------------------------------------------
/dsdiff/testdata/newTransform.json:
--------------------------------------------------------------------------------
1 | {
2 | "kind": "qri:ds:0",
3 | "BodyPath": "abc",
4 | "path": "123",
5 |
6 | "structure": {
7 | "checksum": "QmRSLr53cRGhkwx1L3uGNCY7QGvup3XGy9Jcud9",
8 | "entries": 33,
9 | "format": "csv",
10 | "formatConfig": {
11 | "headerRow": true
12 | },
13 | "length": 1582,
14 | "qri": "st:0",
15 | "Commit": {
16 | "title": "abc"
17 | },
18 | "PreviousPath": "",
19 | "schema": {
20 | "items": {
21 | "items": [
22 | {
23 | "title": "rank",
24 | "type": "integer"
25 | },
26 | {
27 | "title": "probability_of_automation",
28 | "type": "number"
29 | },
30 | {
31 | "title": "soc_code",
32 | "type": "string"
33 | },
34 | {
35 | "title": "job_title",
36 | "type": "string"
37 | }
38 | ],
39 | "type": "array"
40 | },
41 | "type": "array"
42 | }
43 | },
44 | "meta": {
45 | "title": "abc",
46 | "description": "I am a dataset"
47 | },
48 | "transform": {
49 | "appVersion": "0.1.0",
50 | "syntax": "sql",
51 | "data": "xyz",
52 | "config": {
53 | "option": "new_value"
54 | }
55 | },
56 | "viz": {
57 | "format": "abc"
58 | }
59 | }
60 |
61 |
--------------------------------------------------------------------------------
/detect/json_test.go:
--------------------------------------------------------------------------------
1 | package detect
2 |
3 | import (
4 | "io"
5 | "strings"
6 | "testing"
7 |
8 | "github.com/google/go-cmp/cmp"
9 | "github.com/qri-io/dataset"
10 | )
11 |
12 | func TestJSONSchema(t *testing.T) {
13 |
14 | pr, _ := io.Pipe()
15 | pr.Close()
16 | _, _, err := JSONSchema(&dataset.Structure{}, pr)
17 | if err == nil {
18 | t.Error("expected error when reading bad reader")
19 | return
20 | }
21 |
22 | cases := []struct {
23 | st *dataset.Structure
24 | data string
25 | expect map[string]interface{}
26 | err string
27 | }{
28 | {&dataset.Structure{}, "", nil, "invalid json data"},
29 | {&dataset.Structure{}, "f", nil, "invalid json data"},
30 | {&dataset.Structure{}, "{", dataset.BaseSchemaObject, ""},
31 | {&dataset.Structure{}, "[", dataset.BaseSchemaArray, ""},
32 | {&dataset.Structure{}, strings.Repeat(" ", 250) + "[", dataset.BaseSchemaArray, ""},
33 | }
34 |
35 | for i, c := range cases {
36 | rdr := strings.NewReader(c.data)
37 |
38 | got, _, err := JSONSchema(c.st, rdr)
39 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
40 | t.Errorf("case %d error mismatch. expected: '%s', got: '%s'", i, c.err, err)
41 | return
42 | }
43 |
44 | if diff := cmp.Diff(c.expect, got); diff != "" {
45 | t.Errorf("case %d returned schema mismatch (-want +got):\n%s", i, diff)
46 | }
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/dsio/testdata/json/links_array/body.json:
--------------------------------------------------------------------------------
1 | [
2 | "http://datatogether.org",
3 | "https://datatogether.org/css/style.css",
4 | "https://datatogether.org/img/favicon.ico",
5 | "https://datatogether.org",
6 | "https://datatogether.org/public-record",
7 | "https://datatogether.org/activities",
8 | "https://datatogether.org/activities/harvesting",
9 | "https://datatogether.org/activities/monitoring",
10 | "https://datatogether.org/activities/storing",
11 | "https://datatogether.org/activities/rescuing",
12 | "http://2017.code4lib.org",
13 | "https://datatogether.org/presentations/Code4Lib%202017%20-%20Golden%20Age%20for%20Libraries%20-%20Storing%20Data%20Together.pdf",
14 | "https://datatogether.org/presentations/Code4Lib%202017%20-%20Golden%20Age%20for%20Libraries%20-%20Storing%20Data%20Together.key",
15 | "http://www.esipfed.org/meetings/upcoming-meetings/esip-summer-meeting-2017",
16 | "https://datatogether.org/presentations/Data%20Together%20-%20ESIP%20Summer%20Meeting%20July%202017.pdf",
17 | "https://datatogether.org/presentations/Data%20Together%20-%20ESIP%20Summer%20Meeting%20July%202017.key",
18 | "https://archive.org/details/ndsr-dc-2017",
19 | "https://datatogether.org/presentations/Data%20Together%20-%20NDSR%20-%20swadeshi.pdf",
20 | "https://datatogether.org/presentations/Data%20Together%20-%20NDSR%20-%20swadeshi.key",
21 | "https://github.com/datatogether"
22 | ]
--------------------------------------------------------------------------------
/dsio/entry_test.go:
--------------------------------------------------------------------------------
1 | package dsio
2 |
3 | import (
4 | "os"
5 | "path/filepath"
6 | "testing"
7 |
8 | "github.com/qri-io/dataset"
9 | "github.com/qri-io/dataset/dstest"
10 | )
11 |
12 | func TestEachEntry(t *testing.T) {
13 | tc, err := dstest.NewTestCaseFromDir("testdata/json/city")
14 | if err != nil {
15 | t.Errorf("error reading test case: %s", err.Error())
16 | return
17 | }
18 |
19 | st := &dataset.Structure{
20 | Format: "json",
21 | Schema: dataset.BaseSchemaArray,
22 | }
23 | r, err := NewEntryReader(st, tc.BodyFile())
24 | if err != nil {
25 | t.Errorf("error allocating RowReader: %s", err.Error())
26 | return
27 | }
28 |
29 | err = EachEntry(r, func(i int, ent Entry, err error) error {
30 | if err != nil {
31 | return err
32 | }
33 |
34 | // if len(expect[i]) != len(data) {
35 | // return fmt.Errorf("data length mismatch. expected %d, got: %d", len(expect[i]), len(data))
36 | // }
37 |
38 | // for j, cell := range data {
39 | // if !bytes.Equal(expect[i][j], cell) {
40 | // return fmt.Errorf("result mismatch. row: %d, cell: %d. %s != %s", i, j, string(expect[i][j]), string(cell))
41 | // }
42 | // }
43 |
44 | return nil
45 | })
46 |
47 | if err != nil {
48 | t.Errorf("eachrow error: %s", err.Error())
49 | return
50 | }
51 | }
52 |
53 | func testdataFile(base string) string {
54 | return filepath.Join(os.Getenv("GOPATH"), "/src/github.com/qri-io/dataset/testdata/", base)
55 | }
56 |
--------------------------------------------------------------------------------
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 | jobs:
3 | build:
4 | working_directory: /go/src/github.com/qri-io/dataset
5 | docker:
6 | - image: circleci/golang:latest
7 | environment:
8 | TEST_RESULTS: /tmp/test-results
9 | GO111MODULE: "on"
10 | GOPROXY: "https://proxy.golang.org"
11 | steps:
12 | - checkout
13 | - run: mkdir -p $TEST_RESULTS
14 | - restore_cache:
15 | key: dependency-cache-{{ checksum "go.sum" }}
16 | - run:
17 | name: Get CI Deps
18 | command: go get github.com/jstemmer/go-junit-report golang.org/x/lint/golint
19 | - run:
20 | name: Lint
21 | command: golint -set_exit_status ./...
22 | - run:
23 | name: Go Vet
24 | command: go vet ./...
25 | - run:
26 | name: Run Tests
27 | command: |
28 | trap "go-junit-report <${TEST_RESULTS}/go-test.out > ${TEST_RESULTS}/go-test-report.xml" EXIT
29 | make test | tee ${TEST_RESULTS}/go-test.out
30 | - save_cache:
31 | key: dependency-cache-{{ checksum "go.sum" }}
32 | paths:
33 | - /go/pkg/mod
34 | - run:
35 | name: Publish coverage info to codecov.io
36 | command: bash <(curl -s https://codecov.io/bash)
37 | - store_artifacts:
38 | path: /tmp/test-results
39 | destination: raw-test-output
40 | - store_test_results:
41 | path: /tmp/test-results
42 |
--------------------------------------------------------------------------------
/dsio/testdata/json/links_object/body.json:
--------------------------------------------------------------------------------
1 | {
2 | "a": "http://datatogether.org",
3 | "b": "https://datatogether.org/css/style.css",
4 | "c": "https://datatogether.org/img/favicon.ico",
5 | "d": "https://datatogether.org",
6 | "e": "https://datatogether.org/public-record",
7 | "f": "https://datatogether.org/activities",
8 | "g": "https://datatogether.org/activities/harvesting",
9 | "h": "https://datatogether.org/activities/monitoring",
10 | "i": "https://datatogether.org/activities/storing",
11 | "j": "https://datatogether.org/activities/rescuing",
12 | "k": "http://2017.code4lib.org",
13 | "l": "https://datatogether.org/presentations/Code4Lib%202017%20-%20Golden%20Age%20for%20Libraries%20-%20Storing%20Data%20Together.pdf",
14 | "m": "https://datatogether.org/presentations/Code4Lib%202017%20-%20Golden%20Age%20for%20Libraries%20-%20Storing%20Data%20Together.key",
15 | "n": "http://www.esipfed.org/meetings/upcoming-meetings/esip-summer-meeting-2017",
16 | "o": "https://datatogether.org/presentations/Data%20Together%20-%20ESIP%20Summer%20Meeting%20July%202017.pdf",
17 | "p": "https://datatogether.org/presentations/Data%20Together%20-%20ESIP%20Summer%20Meeting%20July%202017.key",
18 | "q": "https://archive.org/details/ndsr-dc-2017",
19 | "r": "https://datatogether.org/presentations/Data%20Together%20-%20NDSR%20-%20swadeshi.pdf",
20 | "s": "https://datatogether.org/presentations/Data%20Together%20-%20NDSR%20-%20swadeshi.key",
21 | "t": "https://github.com/datatogether"
22 | }
--------------------------------------------------------------------------------
/validate/data_test.go:
--------------------------------------------------------------------------------
1 | package validate
2 |
3 | import (
4 | "fmt"
5 | "testing"
6 |
7 | "github.com/qri-io/dataset/dsio"
8 | "github.com/qri-io/dataset/dstest"
9 | )
10 |
11 | func TestEntryReader(t *testing.T) {
12 | cases := []struct {
13 | name string
14 | err string
15 | errors []string
16 | }{
17 | {"craigslist", "", nil},
18 | {"movies", "", []string{
19 | `/0/1: "" type should be integer, got string`,
20 | `/1/1: "" type should be integer, got string`,
21 | }},
22 | }
23 |
24 | for _, c := range cases {
25 | tc, err := dstest.NewTestCaseFromDir(fmt.Sprintf("testdata/%s", c.name))
26 | if err != nil {
27 | t.Errorf("%s: error loading %s", c.name, err.Error())
28 | continue
29 | }
30 |
31 | r, err := dsio.NewEntryReader(tc.Input.Structure, tc.BodyFile())
32 | if err != nil {
33 | t.Errorf("%s: error creating entry reader: %s", c.name, err.Error())
34 | continue
35 | }
36 |
37 | errors, err := EntryReader(r)
38 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
39 | t.Errorf("%s error mismatch. expected: %s, got: %s", c.name, c.err, err)
40 | continue
41 | }
42 |
43 | if len(errors) != len(c.errors) {
44 | t.Errorf("%s: error length mismatch. expected: %d, got: %d", c.name, len(c.errors), len(errors))
45 | continue
46 | }
47 |
48 | for j, e := range errors {
49 | if e.Error() != c.errors[j] {
50 | t.Errorf("%s: validation error %d mismatch. expected: %s, got: %s", c.name, j, c.errors[j], e.Error())
51 | continue
52 | }
53 | }
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/dsio/streams.go:
--------------------------------------------------------------------------------
1 | package dsio
2 |
3 | import (
4 | "fmt"
5 | "io"
6 |
7 | "github.com/qri-io/dataset"
8 | )
9 |
// PagedReader wraps a reader, starting reads from offset, and only reads limit number of entries
type PagedReader struct {
	// Reader is the underlying EntryReader being paged over
	Reader EntryReader
	// Limit is the number of entries still allowed to be read; ReadEntry
	// returns io.EOF once it hits zero. a negative Limit never reaches
	// zero and so reads until the underlying reader is exhausted
	Limit int
	// Offset is the number of leading entries to skip before the first
	// entry is returned; it is consumed by the first ReadEntry call
	Offset int
}

// compile-time assertion that PagedReader implements EntryReader
var _ EntryReader = (*PagedReader)(nil)
18 |
// Structure returns the dataset structure of the wrapped reader; paging
// does not alter the structure
func (r *PagedReader) Structure() *dataset.Structure {
	return r.Reader.Structure()
}
23 |
24 | // ReadEntry returns an entry, taking offset and limit into account
25 | func (r *PagedReader) ReadEntry() (Entry, error) {
26 | for r.Offset > 0 {
27 | _, err := r.Reader.ReadEntry()
28 | if err != nil {
29 | return Entry{}, err
30 | }
31 | r.Offset--
32 | }
33 | if r.Limit == 0 {
34 | return Entry{}, io.EOF
35 | }
36 | r.Limit--
37 | return r.Reader.ReadEntry()
38 | }
39 |
// Close closes the wrapped reader, indicating no more entries
// will be read
func (r *PagedReader) Close() error {
	return r.Reader.Close()
}
45 |
46 | // Copy reads all entries from the reader and writes them to the writer
47 | func Copy(reader EntryReader, writer EntryWriter) error {
48 | for {
49 | val, err := reader.ReadEntry()
50 | if err != nil {
51 | if err == io.EOF {
52 | break
53 | }
54 | return fmt.Errorf("row iteration error: %s", err.Error())
55 | }
56 | if err := writer.WriteEntry(val); err != nil {
57 | return fmt.Errorf("error writing value to buffer: %s", err.Error())
58 | }
59 | }
60 | return nil
61 | }
62 |
--------------------------------------------------------------------------------
/dsviz/doc.go:
--------------------------------------------------------------------------------
/*Package dsviz renders the viz component of a dataset, returning a qfs.File of
data

HTML rendering uses go's html/template package to generate html documents from
an input dataset. Its API has been adjusted to use lowerCamelCase instead of
UpperCamelCase naming conventions

outline: html viz templates
	HTML templates expose a number of helper template functions, along
	with a dataset document at ds. Exposing the dataset document as "ds"
	matches our conventions for referring to a dataset elsewhere, and allows
	access to all defined parts of a dataset.
	HTML visualization is built atop the
	[go template syntax](https://golang.org/pkg/text/template/#hdr-Functions)
	types:
		{{ ds }}
			the dataset being visualized, ds can have a number of components like
			commit, meta, transform, body, all of which have helpful fields for
			visualization. Details of the dataset document are outlined in the
			dataset document definition
	functions:
		{{ allBodyEntries }}
			load the full dataset body
		{{ bodyEntries offset limit }}
			get body entries within an offset/limit range. passing offset: 0,
			limit: -1 returns the entire body
		{{ filesize }}
			convert byte count to kb/mb/etc string
		{{ title }}
			give the title of a dataset
		{{ isType $val "type" }}
			return true or false if the type of $val matches the given type string
			possible type values are "string", "object", "array", "boolean", "number"
*/
package dsviz
36 |
--------------------------------------------------------------------------------
/testdata/structures/airport-codes.json:
--------------------------------------------------------------------------------
1 | {
2 | "errCount": 5,
3 | "format": "csv",
4 | "formatConfig": {
5 | "headerRow": true
6 | },
7 | "qri": "st:0",
8 | "mediatype": "text/csv",
9 | "readme": "readme.md",
10 | "schema": {
11 | "type": "array",
12 | "items": {
13 | "type": "array",
14 | "items": [
15 | {
16 | "title": "ident",
17 | "type": "string"
18 | },
19 | {
20 | "title": "type",
21 | "type": "string"
22 | },
23 | {
24 | "title": "name",
25 | "type": "string"
26 | },
27 | {
28 | "title": "latitude_deg",
29 | "type": "number"
30 | },
31 | {
32 | "title": "longitude_deg",
33 | "type": "number"
34 | },
35 | {
36 | "title": "elevation_ft",
37 | "type": "integer"
38 | },
39 | {
40 | "title": "continent",
41 | "type": "string"
42 | },
43 | {
44 | "title": "iso_country",
45 | "type": "string"
46 | },
47 | {
48 | "title": "iso_region",
49 | "type": "string"
50 | },
51 | {
52 | "title": "municipality",
53 | "type": "string"
54 | },
55 | {
56 | "title": "gps_code",
57 | "type": "string"
58 | },
59 | {
60 | "title": "iata_code",
61 | "type": "string"
62 | },
63 | {
64 | "title": "local_code",
65 | "type": "string"
66 | }
67 | ]
68 | }
69 | }
70 | }
--------------------------------------------------------------------------------
/dsio/entry_buffer_test.go:
--------------------------------------------------------------------------------
1 | package dsio
2 |
3 | import (
4 | "encoding/json"
5 | "testing"
6 |
7 | "github.com/qri-io/dataset"
8 | "github.com/qri-io/dataset/dstest"
9 | )
10 |
11 | func TestEntryBuffer(t *testing.T) {
12 | tc, err := dstest.NewTestCaseFromDir("testdata/csv/movies")
13 | if err != nil {
14 | t.Errorf("error loading test case: %s", err.Error())
15 | return
16 | }
17 |
18 | ds := tc.Input
19 |
20 | outst := &dataset.Structure{
21 | Format: "json",
22 | Schema: ds.Structure.Schema,
23 | }
24 |
25 | rbuf, err := NewEntryBuffer(outst)
26 | if err != nil {
27 | t.Errorf("error allocating EntryBuffer: %s", err.Error())
28 | return
29 | }
30 |
31 | rr, err := NewEntryReader(ds.Structure, tc.BodyFile())
32 | if err != nil {
33 | t.Errorf("error allocating RowReader: %s", err.Error())
34 | return
35 | }
36 |
37 | if err = EachEntry(rr, func(i int, val Entry, err error) error {
38 | if err != nil {
39 | return err
40 | }
41 | return rbuf.WriteEntry(val)
42 | }); err != nil {
43 | t.Errorf("error writing rows: %s", err.Error())
44 | return
45 | }
46 |
47 | bst := rbuf.Structure()
48 | if diff := dstest.CompareStructures(outst, bst); diff != "" {
49 | t.Errorf("buffer structure mismatch (-wnt +got):\n%s", diff)
50 | return
51 | }
52 |
53 | if err := rbuf.Close(); err != nil {
54 | t.Errorf("error closing buffer: %s", err.Error())
55 | return
56 | }
57 |
58 | out := []interface{}{}
59 | if err := json.Unmarshal(rbuf.Bytes(), &out); err != nil {
60 | t.Errorf("error unmarshaling encoded bytes: %s", err.Error())
61 | return
62 | }
63 |
64 | if _, err = json.Marshal(out); err != nil {
65 | t.Errorf("error marshaling json data: %s", err.Error())
66 | return
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/generate/tabular_test.go:
--------------------------------------------------------------------------------
1 | package generate
2 |
3 | import (
4 | "fmt"
5 | "testing"
6 |
7 | "github.com/google/go-cmp/cmp"
8 | "github.com/qri-io/dataset"
9 | "github.com/qri-io/dataset/dsio"
10 | )
11 |
// Compile time check that Generator satisfies the EntryReader interface.
var _ dsio.EntryReader = (*TabularGenerator)(nil)
14 |
15 | func TestGeneratorForBaseSchemaArray(t *testing.T) {
16 | cases := []struct {
17 | index int
18 | key string
19 | value interface{}
20 | }{
21 | {0, "", []interface{}{"gltBH"}},
22 | {1, "", []interface{}{"VJQV"}},
23 | {2, "", []interface{}{"dv8A"}},
24 | }
25 |
26 | st := &dataset.Structure{Format: "json", Schema: map[string]interface{}{
27 | "type": "array",
28 | "items": map[string]interface{}{
29 | "type": "array",
30 | "items": []interface{}{
31 | map[string]interface{}{"type": "string", "title": "col_one,"},
32 | },
33 | },
34 | }}
35 |
36 | g, err := NewTabularGenerator(st, AssignSeed, AssignMaxLen)
37 | if err != nil {
38 | t.Fatal(err)
39 | }
40 | defer g.Close()
41 |
42 | if diff := cmp.Diff(st, g.Structure()); diff != "" {
43 | t.Errorf("expected returned structure to match input. (-want +got)P:\n%s", diff)
44 | }
45 |
46 | for i, c := range cases {
47 | t.Run(fmt.Sprintf("%d", c.index), func(t *testing.T) {
48 | e, _ := g.ReadEntry()
49 | if e.Index != c.index {
50 | t.Errorf("case %d index mismatch. expected: %d. got: %d", i, c.index, e.Index)
51 | }
52 | if e.Key != c.key {
53 | t.Errorf("case %d key mismatch. expected: %s. got: %s", i, c.key, e.Key)
54 | }
55 | if diff := cmp.Diff(c.value, e.Value); diff != "" {
56 | t.Errorf("case result mismatch. (-want +got):\n%s", diff)
57 | }
58 | })
59 | }
60 |
61 | }
62 |
--------------------------------------------------------------------------------
/detect/json.go:
--------------------------------------------------------------------------------
1 | package detect
2 |
3 | import (
4 | "fmt"
5 | "io"
6 |
7 | "github.com/qri-io/dataset"
8 | )
9 |
10 | // JSONSchema determines the field names and types of an io.Reader of JSON-formatted data, returning a json schema
11 | // This is currently a suuuuuuuuper simple interpretation that spits out a generic schema that'll work. In the future
12 | // we can do all sorts of stuff here to make better inferences about the shape of a dataset, but for now, this'll work,
13 | // and we'll instead focus on making it easier for users to provide hand-built schemas
14 | func JSONSchema(resource *dataset.Structure, data io.Reader) (schema map[string]interface{}, n int, err error) {
15 | var (
16 | count = 0
17 | buf = make([]byte, 100)
18 | )
19 |
20 | for {
21 | count, err = data.Read(buf)
22 | n += count
23 | if err != nil {
24 | if err == io.EOF {
25 | // possible that data length is less than 100 bytes,
26 | // if we've read more than 0 bytes, we should check it
27 | if count > 0 {
28 | err = nil
29 | } else {
30 | err = fmt.Errorf("invalid json data")
31 | return
32 | }
33 | } else {
34 | log.Debugf(err.Error())
35 | err = fmt.Errorf("error reading data: %s", err.Error())
36 | return
37 | }
38 | }
39 |
40 | for _, b := range buf {
41 | switch b {
42 | case '[':
43 | return dataset.BaseSchemaArray, n, nil
44 | case '{':
45 | return dataset.BaseSchemaObject, n, nil
46 | case ' ', '\t', '\n', '\r':
47 | continue
48 | default:
49 | err = fmt.Errorf("invalid json data")
50 | return
51 | }
52 | }
53 | }
54 | }
55 |
// NDJSONSchema returns an array identity schema. newline-delimited json
// bodies are always treated as arrays, so the reader is never consumed
// and n is always 0
func NDJSONSchema(resource *dataset.Structure, data io.Reader) (schema map[string]interface{}, n int, err error) {
	return dataset.BaseSchemaArray, 0, nil
}
60 |
--------------------------------------------------------------------------------
/detect/testdata/spelling.csv:
--------------------------------------------------------------------------------
1 | state,search interest
2 | Idaho (United States),100
3 | Wyoming (United States),96.91
4 | South Dakota (United States),91.89
5 | Iowa (United States),90.46
6 | Utah (United States),88.18
7 | Nebraska (United States),86.05
8 | New Mexico (United States),86
9 | Montana (United States),85.58
10 | Missouri (United States),83.44
11 | West Virginia (United States),83.23
12 | Arkansas (United States),82.55
13 | Georgia (United States),81.51
14 | North Dakota (United States),81.5
15 | Mississippi (United States),80.09
16 | Wisconsin (United States),79.32
17 | Maine (United States),78.89
18 | Indiana (United States),78.11
19 | South Carolina (United States),76.54
20 | Michigan (United States),75.09
21 | Alabama (United States),74.38
22 | Nevada (United States),74.29
23 | Ohio (United States),73.83
24 | Kansas (United States),73.36
25 | Colorado (United States),73.35
26 | Arizona (United States),73.13
27 | Louisiana (United States),72.47
28 | Tennessee (United States),71.93
29 | North Carolina (United States),70.81
30 | Illinois (United States),70.73
31 | Minnesota (United States),70.64
32 | Oklahoma (United States),70.43
33 | Kentucky (United States),69.95
34 | Texas (United States),67.34
35 | Pennsylvania (United States),64.82
36 | New Hampshire (United States),63.37
37 | Delaware (United States),63.02
38 | California (United States),62.64
39 | Rhode Island (United States),61.53
40 | Florida (United States),59.74
41 | Alaska (United States),59.53
42 | Washington (United States),59.47
43 | Vermont (United States),58.59
44 | Hawaii (United States),56.26
45 | Connecticut (United States),56.04
46 | New Jersey (United States),55.17
47 | Maryland (United States),54.56
48 | District of Columbia (United States),52.75
49 | Massachusetts (United States),52.53
50 | New York (United States),50.75
51 | Virginia (United States),33.39
52 | Oregon (United States),28.42
--------------------------------------------------------------------------------
/dstest/golden.go:
--------------------------------------------------------------------------------
1 | package dstest
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "io/ioutil"
7 | "os"
8 | "testing"
9 |
10 | "github.com/qri-io/dataset"
11 | )
12 |
// UpdateGoldenFileEnvVarname is the environment variable that dstest checks
// before writing golden files: when set to any non-empty value, failing
// golden-file comparisons overwrite the golden file with the "got" dataset
const UpdateGoldenFileEnvVarname = "QRI_UPDATE_GOLDEN_FILES"
16 |
// CompareGoldenDatasetAndUpdateIfEnvVarSet is a convenience wrapper for the
// common case of loading a golden file, comparing it to a dataset, and
// overwriting the golden file when the comparison fails and the "update"
// environment variable is set
func CompareGoldenDatasetAndUpdateIfEnvVarSet(t *testing.T, goldenFilepath string, got *dataset.Dataset) {
	t.Helper()
	expect := LoadGoldenFile(t, goldenFilepath)
	if diff := CompareDatasets(expect, got); diff != "" {
		t.Errorf("dataset golden file mismatch (-want +got):\n%s", diff)
		UpdateGoldenFileIfEnvVarSet(goldenFilepath, got)
	}
}
28 |
29 | // LoadGoldenFile loads a dataset from a JSON file
30 | func LoadGoldenFile(t *testing.T, filename string) *dataset.Dataset {
31 | t.Helper()
32 | data, err := ioutil.ReadFile(filename)
33 | if err != nil {
34 | t.Fatalf("opening JSON golden file: %s", err)
35 | }
36 |
37 | ds := &dataset.Dataset{}
38 | if err := json.Unmarshal(data, ds); err != nil {
39 | t.Fatalf("unmarshaling JSON golden file: %s", err)
40 | }
41 |
42 | return ds
43 | }
44 |
// UpdateGoldenFileIfEnvVarSet overwrites the given filename with the
// indented-JSON serialization of got, but only when the
// QRI_UPDATE_GOLDEN_FILES environment variable (UpdateGoldenFileEnvVarname)
// is set to a non-empty value. Marshal or write failures panic, halting the
// test run immediately.
func UpdateGoldenFileIfEnvVarSet(filename string, got *dataset.Dataset) {
	if os.Getenv(UpdateGoldenFileEnvVarname) != "" {
		fmt.Printf("updating golden file: %q\n", filename)
		data, err := json.MarshalIndent(got, "", " ")
		if err != nil {
			panic(err)
		}
		if err := ioutil.WriteFile(filename, data, 0644); err != nil {
			panic(err)
		}
	}
}
59 |
--------------------------------------------------------------------------------
/validate/testdata_test.go:
--------------------------------------------------------------------------------
1 | package validate
2 |
3 | import (
4 | "github.com/qri-io/dataset"
5 | )
6 |
// emptyRawText covers the degenerate zero-byte input case
var emptyRawText = ``

// rawText1 has lazy quotes: a mix of quoted and unquoted fields, with
// leading spaces before the numeric "age" values
var rawText1 = `first_name,last_name,username,age
"Rob","Pike",rob, 100
Ken,Thompson,ken, 75.5
"Robert","Griesemer","gri", 100`

// namesStructure describes the CSV fixtures above: a header row followed by
// rows of [first_name string, last_name string, username string, age integer]
var namesStructure = &dataset.Structure{
	Format: "csv",
	FormatConfig: map[string]interface{}{
		"headerRow": true,
	},
	Schema: map[string]interface{}{
		"type": "array",
		"items": map[string]interface{}{
			"type": "array",
			"items": []interface{}{
				map[string]interface{}{"title": "first_name", "type": "string"},
				map[string]interface{}{"title": "last_name", "type": "string"},
				map[string]interface{}{"title": "username", "type": "string"},
				map[string]interface{}{"title": "age", "type": "integer"},
			},
		},
	},
}
33 |
// rawText2 has non-numeric quotes throughout, and a comma inside a quoted
// field on the last line
var rawText2 = `"first_name","last_name","username","age"
"Rob","Pike","rob", 22
"Robert","Griesemer","gri", 100
"abc","def,ghi","jkl",1000`

// rawText2b is the same as rawText2 but with spaces after the separating
// commas in the last line
var rawText2b = `"first_name","last_name","username","age"
"Rob","Pike","rob", 22
"Robert","Griesemer","gri", 100
"abc", "def,ghi", "jkl", 1000`

// rawText2c has an error in the last row's "age" column: "_" is not a number
var rawText2c = `first_name,last_name,username,age
"Rob","Pike","rob",22
"Robert","Griesemer","gri",100
"abc","def,ghi","jkl",_`
51 |
// rawText3 is nothing but blank lines.
// NOTE: technically this is valid csv and we should be catching this at an earlier filter
var rawText3 = `



`

// rawText4 mixes blank lines with pipe-prefixed content and a comma-bearing
// line, exercising inputs that are not meaningful CSV rows
var rawText4 = `


|
| Last Name, First




`
73 |
--------------------------------------------------------------------------------
/dsio/replacecr/replace_cr.go:
--------------------------------------------------------------------------------
1 | // Package replacecr defines a wrapper for replacing solo carriage return characters (\r)
2 | // with carriage-return + line feed (\r\n)
3 | package replacecr
4 |
5 | import (
6 | "bufio"
7 | "io"
8 | )
9 |
10 | // Reader wraps an io.Reader. on every call of Read. it looks for
11 | // for instances of lonely \r replacing them with \r\n before returning to the end consumer
12 | // lots of files in the wild will come without "proper" line breaks, which irritates go's
13 | // standard csv package. This'll fix by wrapping the reader passed to csv.NewReader:
14 | // rdr, err := csv.NewReader(replacecr.Reader(r))
15 | // because Reader adds '\n' characters, the number of bytes reported from the underlying
16 | // reader can/will differ from what the underlyng reader would return
17 | // if read from directly. This can cause issues with checksums and byte counts.
18 | // Use with caution.
19 | func Reader(data io.Reader) io.Reader {
20 | return crlfReplaceReader{
21 | rdr: bufio.NewReader(data),
22 | }
23 | }
24 |
25 | // ReaderWithSize instaties a reader with a given buffer size
26 | func ReaderWithSize(data io.Reader, size int) io.Reader {
27 | return crlfReplaceReader{
28 | rdr: bufio.NewReaderSize(data, size),
29 | }
30 | }
31 |
32 | // crlfReplaceReader wraps a reader
33 | type crlfReplaceReader struct {
34 | rdr *bufio.Reader
35 | }
36 |
37 | // Read implements io.Reader for crlfReplaceReader
38 | func (c crlfReplaceReader) Read(p []byte) (n int, err error) {
39 | lenP := len(p)
40 | if lenP == 0 {
41 | return
42 | }
43 |
44 | for {
45 | if n == lenP {
46 | return
47 | }
48 |
49 | p[n], err = c.rdr.ReadByte()
50 | if err != nil {
51 | return
52 | }
53 |
54 | // any time we encounter \r & still have space, check to see if \n follows
55 | // ff next char is not \n, add it in manually
56 | if p[n] == '\r' && n < lenP-1 {
57 | if pk, err := c.rdr.Peek(1); (err == nil && pk[0] != '\n') || (err != nil && err.Error() == "EOF") {
58 | n++
59 | p[n] = '\n'
60 | }
61 | }
62 |
63 | n++
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/stepfile/stepfile.go:
--------------------------------------------------------------------------------
1 | // Package stepfile provides utilities for reading and writing an ordered set of
2 | // transform steps to and from a flat file representation
3 | //
4 | // A stepfile file consists of one or more steps of input text separated by
5 | // "---" lines.
6 | //
7 | // Example:
8 | //
9 | // "step"
10 | // ---
11 | // "another step"
12 | // ---
13 | // "and another step"
14 | package stepfile
15 |
16 | import (
17 | "fmt"
18 | "io"
19 | "io/ioutil"
20 | "os"
21 | "strings"
22 |
23 | "github.com/qri-io/dataset"
24 | )
25 |
26 | // ReadFile opens a stepfile and returns steps
27 | func ReadFile(filename string) (steps []*dataset.TransformStep, err error) {
28 | f, err := os.Open(filename)
29 | if err != nil {
30 | return nil, err
31 | }
32 | defer f.Close()
33 | return Read(f)
34 | }
35 |
36 | // Read consumes a reader into steps
37 | func Read(r io.Reader) (steps []*dataset.TransformStep, err error) {
38 | data, err := ioutil.ReadAll(r)
39 | if err != nil {
40 | return nil, err
41 | }
42 |
43 | for _, chunk := range strings.Split(string(data), "\n---\n") {
44 | steps = append(steps, &dataset.TransformStep{
45 | Script: chunk,
46 | })
47 | }
48 | return steps, nil
49 | }
50 |
51 | // Write prints transform steps as a stepfile to a writer
52 | func Write(steps []*dataset.TransformStep, w io.Writer) error {
53 | for i, step := range steps {
54 | if err := writeStepScript(step, w); err != nil {
55 | return err
56 | }
57 | if i != len(steps)-1 {
58 | w.Write([]byte("\n---\n"))
59 | }
60 | }
61 | return nil
62 | }
63 |
64 | func writeStepScript(s *dataset.TransformStep, w io.Writer) error {
65 | if r, ok := s.Script.(io.Reader); ok {
66 | if closer, ok := s.Script.(io.Closer); ok {
67 | defer closer.Close()
68 | }
69 | _, err := io.Copy(w, r)
70 | return err
71 | }
72 |
73 | switch v := s.Script.(type) {
74 | case string:
75 | _, err := w.Write([]byte(v))
76 | return err
77 | case []byte:
78 | _, err := w.Write(v)
79 | return err
80 | }
81 | return fmt.Errorf("unrecognized script type: %T", s.Script)
82 | }
83 |
--------------------------------------------------------------------------------
/dsio/entry_buffer.go:
--------------------------------------------------------------------------------
1 | package dsio
2 |
3 | import (
4 | "bytes"
5 |
6 | "github.com/qri-io/dataset"
7 | )
8 |
// EntryBuffer mimics the behaviour of bytes.Buffer, but with structured data.
// Read and Write are replaced with ReadEntry and WriteEntry. It's worth noting
// that different data formats have idiosyncrasies that affect the behavior
// of buffers and their output. For example, EntryBuffer won't write things like
// CSV header rows or enclosing JSON arrays until after the writer's
// Close method has been called.
type EntryBuffer struct {
	structure *dataset.Structure // structure both reader & writer were created from
	r EntryReader                // reads entries back out of buf
	w EntryWriter                // serializes entries into buf
	buf *bytes.Buffer            // raw serialized bytes shared by r & w
}
21 |
22 | // NewEntryBuffer allocates a buffer, buffers should always be created with
23 | // NewEntryBuffer, which will error if the provided structure is invalid for
24 | // reading / writing
25 | func NewEntryBuffer(st *dataset.Structure) (*EntryBuffer, error) {
26 | buf := &bytes.Buffer{}
27 | r, err := NewEntryReader(st, buf)
28 | if err != nil {
29 | log.Debug(err.Error())
30 | return nil, err
31 | }
32 | w, err := NewEntryWriter(st, buf)
33 | if err != nil {
34 | log.Debug(err.Error())
35 | return nil, err
36 | }
37 |
38 | return &EntryBuffer{
39 | structure: st,
40 | r: r,
41 | w: w,
42 | buf: buf,
43 | }, nil
44 | }
45 |
// Structure gives the underlying structure this buffer is using
func (b *EntryBuffer) Structure() *dataset.Structure {
	return b.structure
}

// ReadEntry reads one "row" from the buffer
func (b *EntryBuffer) ReadEntry() (Entry, error) {
	return b.r.ReadEntry()
}

// WriteEntry writes one "row" to the buffer
func (b *EntryBuffer) WriteEntry(e Entry) error {
	return b.w.WriteEntry(e)
}

// Close closes the writer portion of the buffer, which will affect
// underlying contents (per the type docs, format-dependent trailers like
// enclosing JSON arrays are only written on Close)
func (b *EntryBuffer) Close() error {
	return b.w.Close()
}

// Bytes gives the raw contents of the underlying buffer
func (b *EntryBuffer) Bytes() []byte {
	return b.buf.Bytes()
}
71 |
--------------------------------------------------------------------------------
/testdata/datasets/airport-codes.json:
--------------------------------------------------------------------------------
1 | {
2 | "qri": "ds:0",
3 | "meta": {
4 | "qri": "md:0",
5 | "homeURL": "http://www.ourairports.com/",
6 | "license": {
7 | "type":"PDDL-1.0"
8 | },
9 | "title": "Airport Codes",
10 | "citations": [
11 | {
12 | "name": "Our Airports",
13 | "url": "http://ourairports.com/data/"
14 | }
15 | ]
16 | },
17 | "commit": {
18 | "title": "initial commit"
19 | },
20 | "structure": {
21 | "format": "csv",
22 | "qri": "st:0",
23 | "formatConfig": {
24 | "headerRow": true
25 | },
26 | "errCount": 5,
27 | "schema": {
28 | "type": "array",
29 | "items": {
30 | "type": "array",
31 | "items": [
32 | {
33 | "title": "ident",
34 | "type": "string"
35 | },
36 | {
37 | "title": "type",
38 | "type": "string"
39 | },
40 | {
41 | "title": "name",
42 | "type": "string"
43 | },
44 | {
45 | "title": "latitude_deg",
46 | "type": "number"
47 | },
48 | {
49 | "title": "longitude_deg",
50 | "type": "number"
51 | },
52 | {
53 | "title": "elevation_ft",
54 | "type": "integer"
55 | },
56 | {
57 | "title": "continent",
58 | "type": "string"
59 | },
60 | {
61 | "title": "iso_country",
62 | "type": "string"
63 | },
64 | {
65 | "title": "iso_region",
66 | "type": "string"
67 | },
68 | {
69 | "title": "municipality",
70 | "type": "string"
71 | },
72 | {
73 | "title": "gps_code",
74 | "type": "string"
75 | },
76 | {
77 | "title": "iata_code",
78 | "type": "string"
79 | },
80 | {
81 | "title": "local_code",
82 | "type": "string"
83 | }
84 | ]
85 | }
86 | }
87 | }
88 | }
--------------------------------------------------------------------------------
/dsio/testdata/cbor/flourinated_compounds_in_fast_food_packaging/input.dataset.json:
--------------------------------------------------------------------------------
1 | {
2 | "commit" : {
3 | "qri" : "cm:0",
4 | "title" : "initial commit",
5 | "timestamp": "2017-05-01T01:00:00.000Z"
6 | },
7 | "meta" : {
8 | "title" : "Fluorinated Compounds in U.S. Fast Food Packaging",
9 | "description" : "Paper samples, paper extracts (known), paper extracts (unknown). \n\nThis dataset is associated with the following publication:\nSchaider, L., S. Balan, A. Blum, D. Andrews, M. Strynar, M. Dickinson, D. Lunderberg, J. Lang, and G. Peaslee. Fluorinated Compounds in U.S. Fast Food Packaging. Environmental Science \u0026amp; Technology Letters. American Chemical Society, Washington, DC, USA, 4(3): 105\u0026ndash;111, (2017)."
10 | },
11 | "structure": {
12 | "qri": "st:0",
13 | "format": "cbor",
14 | "schema": {
15 | "type": "array",
16 | "items": {
17 | "type": "array",
18 | "items": [
19 | {
20 | "type": "string",
21 | "title": "sample"
22 | },
23 | {
24 | "type": "string",
25 | "title": "comments"
26 | },
27 | {
28 | "type": "integer",
29 | "title": "length_cm",
30 | "description": "length of sample in cm"
31 | },
32 | {
33 | "type": "integer",
34 | "title": "width_cm",
35 | "description": "width of sample in cm"
36 | },
37 | {
38 | "type": "integer",
39 | "title": "area_cm",
40 | "description": "area of sample in cm2"
41 | },
42 | {
43 | "type": "number",
44 | "title": "vial_tare_g",
45 | "description": "vial tare in grams"
46 | },
47 | {
48 | "type": "number",
49 | "title": "vial_with_paper_g",
50 | "description": "mass of vial tare with paper in grams"
51 | },
52 | {
53 | "type": "number",
54 | "title": "mass_g",
55 | "description": "mass of paper in grams"
56 | }
57 | ]
58 | }
59 | }
60 | }
61 | }
--------------------------------------------------------------------------------
/validate/testdata/flourinated_compounds_in_fast_food_packaging/input.dataset.json:
--------------------------------------------------------------------------------
1 | {
2 | "commit" : {
3 | "qri" : "cm:0",
4 | "title" : "initial commit",
5 | "timestamp": "2017-05-01T01:00:00.000Z"
6 | },
7 | "meta" : {
8 | "title" : "Fluorinated Compounds in U.S. Fast Food Packaging",
9 | "description" : "Paper samples, paper extracts (known), paper extracts (unknown). \n\nThis dataset is associated with the following publication:\nSchaider, L., S. Balan, A. Blum, D. Andrews, M. Strynar, M. Dickinson, D. Lunderberg, J. Lang, and G. Peaslee. Fluorinated Compounds in U.S. Fast Food Packaging. Environmental Science \u0026amp; Technology Letters. American Chemical Society, Washington, DC, USA, 4(3): 105\u0026ndash;111, (2017)."
10 | },
11 | "structure": {
12 | "qri": "st:0",
13 | "format": "cbor",
14 | "schema": {
15 | "type": "array",
16 | "items": {
17 | "type": "array",
18 | "items": [
19 | {
20 | "type": "string",
21 | "title": "sample"
22 | },
23 | {
24 | "type": "string",
25 | "title": "comments"
26 | },
27 | {
28 | "type": "integer",
29 | "title": "length_cm",
30 | "description": "length of sample in cm"
31 | },
32 | {
33 | "type": "integer",
34 | "title": "width_cm",
35 | "description": "width of sample in cm"
36 | },
37 | {
38 | "type": "integer",
39 | "title": "area_cm",
40 | "description": "area of sample in cm2"
41 | },
42 | {
43 | "type": "number",
44 | "title": "vial_tare_g",
45 | "description": "vial tare in grams"
46 | },
47 | {
48 | "type": "number",
49 | "title": "vial_with_paper_g",
50 | "description": "mass of vial tare with paper in grams"
51 | },
52 | {
53 | "type": "number",
54 | "title": "mass_g",
55 | "description": "mass of paper in grams"
56 | }
57 | ]
58 | }
59 | }
60 | }
61 | }
--------------------------------------------------------------------------------
/vals/compare.go:
--------------------------------------------------------------------------------
1 | package vals
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "reflect"
7 | )
8 |
9 | // Equal checks if two Values are the same
10 | func Equal(a, b Value) bool {
11 | if a.Type() != b.Type() {
12 | return false
13 | }
14 | switch a.Type() {
15 | case TypeObject, TypeArray:
16 | return reflect.DeepEqual(a, b)
17 | case TypeNumber:
18 | return a.Number() == b.Number()
19 | case TypeInteger:
20 | return a.Integer() == b.Integer()
21 | case TypeBoolean:
22 | return a.Boolean() == b.Boolean()
23 | case TypeNull:
24 | return a.IsNull() == b.IsNull()
25 | case TypeString:
26 | return a.String() == b.String()
27 | }
28 | return false
29 | }
30 |
31 | // CompareTypeBytes compares two byte slices with a known type
32 | // real on the real, this is a bit of a work in progress
33 | // TODO - up tests
34 | func CompareTypeBytes(a, b []byte, t Type) (int, error) {
35 | if len(a) == 0 && len(b) > 0 {
36 | return -1, nil
37 | } else if len(b) == 0 && len(a) > 0 {
38 | return 1, nil
39 | } else if len(b) == 0 && len(a) == 0 {
40 | return 0, nil
41 | }
42 |
43 | switch t {
44 | case TypeString:
45 | return bytes.Compare(a, b), nil
46 | case TypeInteger:
47 | return CompareIntegerBytes(a, b)
48 | case TypeNumber:
49 | return CompareNumberBytes(a, b)
50 | default:
51 | // TODO - other types
52 | return 0, fmt.Errorf("invalid type comparison")
53 | }
54 | }
55 |
56 | // CompareIntegerBytes compares two byte slices of interger data
57 | func CompareIntegerBytes(a, b []byte) (int, error) {
58 | at, err := ParseInteger(a)
59 | if err != nil {
60 | return 0, err
61 | }
62 | bt, err := ParseInteger(b)
63 | if err != nil {
64 | return 0, err
65 | }
66 | if at > bt {
67 | return 1, nil
68 | } else if at == bt {
69 | return 0, nil
70 | }
71 | return -1, nil
72 | }
73 |
74 | // CompareNumberBytes compares two byte slices of float data
75 | func CompareNumberBytes(a, b []byte) (int, error) {
76 | at, err := ParseNumber(a)
77 | if err != nil {
78 | return 0, err
79 | }
80 | bt, err := ParseNumber(b)
81 | if err != nil {
82 | return 0, err
83 | }
84 | if at > bt {
85 | return 1, nil
86 | } else if at == bt {
87 | return 0, nil
88 | }
89 | return -1, nil
90 | }
91 |
--------------------------------------------------------------------------------
/kind_test.go:
--------------------------------------------------------------------------------
1 | package dataset
2 |
3 | import (
4 | "encoding/json"
5 | "testing"
6 | )
7 |
8 | func TestKindValid(t *testing.T) {
9 | cases := []struct {
10 | Kind Kind
11 | err string
12 | }{
13 | {"", "invalid kind: ''. kind must be in the form [type]:[version]"},
14 | {"ds:0", ""},
15 | {"vz:0", ""},
16 | {"st:0", ""},
17 | {"as:0", ""},
18 | {"ps:0", ""},
19 | {"ps:0", ""},
20 | }
21 |
22 | for i, c := range cases {
23 | err := c.Kind.Valid()
24 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
25 | t.Errorf("case %d error mismatch. expected: '%s', got: '%s'", i, c.err, err)
26 | continue
27 | }
28 | }
29 | }
30 |
31 | func TestKindDatatype(t *testing.T) {
32 | cases := []struct {
33 | Kind Kind
34 | expect string
35 | }{
36 | {"ds:0", "ds"},
37 | {"vz:0", "vz"},
38 | {"st:0", "st"},
39 | {"as:0", "as"},
40 | {"ps:0", "ps"},
41 | }
42 |
43 | for i, c := range cases {
44 | got := c.Kind.Type()
45 | if c.expect != got {
46 | t.Errorf("case %d error mismatch. expected: '%s', got: '%s'", i, c.expect, got)
47 | continue
48 | }
49 | }
50 | }
51 |
52 | func TestKindVersion(t *testing.T) {
53 | cases := []struct {
54 | Kind Kind
55 | expect string
56 | }{
57 | {"st:2", "2"},
58 | {"ds:23", "23"},
59 | }
60 |
61 | for i, c := range cases {
62 | got := c.Kind.Version()
63 | if c.expect != got {
64 | t.Errorf("case %d response mismatch. expected: '%s', got: '%s'", i, c.expect, got)
65 | continue
66 | }
67 | }
68 | }
69 |
70 | func TestKindUnmarshalJSON(t *testing.T) {
71 | cases := []struct {
72 | input string
73 | expect Kind
74 | err string
75 | }{
76 | {`"st:2"`, Kind("st:2"), ""},
77 | {`""`, Kind(""), "invalid kind: ''. kind must be in the form [type]:[version]"},
78 | }
79 |
80 | for i, c := range cases {
81 | got := Kind("")
82 | err := json.Unmarshal([]byte(c.input), &got)
83 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
84 | t.Errorf("case %d error mismatch. expected: '%s', got: '%s'", i, c.err, err)
85 | continue
86 | }
87 |
88 | if got != c.expect {
89 | t.Errorf("case %d response mismatch. expected: '%s', got: '%s'", i, c.expect, got)
90 | }
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/preview/preview_test.go:
--------------------------------------------------------------------------------
1 | package preview
2 |
3 | import (
4 | "context"
5 | "encoding/json"
6 | "fmt"
7 | "testing"
8 |
9 | "github.com/qri-io/dataset"
10 | "github.com/qri-io/dataset/dstest"
11 | )
12 |
// TestCreate is an integration test covering preview creation: error on an
// empty dataset, body truncation, preservation of body & readme files, and
// comparison against a golden file.
func TestCreate(t *testing.T) {

	ctx := context.Background()

	// an empty dataset has nothing to preview and must error
	_, err := Create(ctx, &dataset.Dataset{})

	if err == nil {
		t.Fatal(fmt.Errorf("expected empty dataset to error"))
	}

	tc, err := dstest.NewTestCaseFromDir("testdata/earthquakes")
	if err != nil {
		t.Fatal(err)
	}

	got, err := Create(ctx, tc.Input)
	if err != nil {
		t.Fatal(err)
	}

	rawBody, ok := got.Body.(json.RawMessage)
	if !ok {
		t.Fatal("expected preview body to assert to json.RawMessage")
	}

	body := [][]interface{}{}

	if err := json.Unmarshal(rawBody, &body); err != nil {
		t.Fatal(err)
	}
	got.Body = body

	// the earthquakes fixture is expected to preview to exactly 100 entries
	if len(body) != 100 {
		t.Errorf("error: body length mismatch, expected 100 got %d", len(body))
	}
	if got.BodyFile() == nil {
		t.Errorf("expected creating a preview to leave existing BodyFile intact, is missing")
		// TODO (b5) - confirm body file contents are unmodified
	}
	if got.Readme.ScriptFile() == nil {
		t.Errorf("expected creating a preview to leave existing Readme.ScriptFile intact, is missing")
		// TODO (b5) - confirm actual readme scriptfile is unmodified
	}

	// TODO (b5) - required adjustments for accurate comparison due to JSON serialization
	// issues. either solve the serialization issues or add options to dstest.CompareDatasets
	got.Body = []interface{}{}

	expect := dstest.LoadGoldenFile(t, "testdata/earthquakes/golden.dataset.json")

	if diff := dstest.CompareDatasets(expect, got); diff != "" {
		t.Errorf("result mismatch. (-want +got):\n%s", diff)
		dstest.UpdateGoldenFileIfEnvVarSet("testdata/earthquakes/golden.dataset.json", got)
	}

	// make sure you can create a preview of a dataset without a body file
	tc.Input.SetBodyFile(nil)

	got, err = Create(ctx, tc.Input)
	if err != nil {
		t.Fatalf("unexpected error creating a preview of a dataset without a body: %s", err)
	}
}
76 |
--------------------------------------------------------------------------------
/dstest/priv_key.go:
--------------------------------------------------------------------------------
1 | package dstest
2 |
3 | import (
4 | "encoding/base64"
5 | "fmt"
6 |
7 | crypto "github.com/libp2p/go-libp2p-core/crypto"
8 | )
9 |
var (
	// PrivKey is a predefined private key for use in tests
	PrivKey crypto.PrivKey
	// PrivKeyPeerID is the base58-encoded multihash of PrivKey.PublicKey
	PrivKeyPeerID = "QmZePf5LeXow3RW5U1AgEiNbW46YnRGhZ7HPvm1UmPFPwt"
)

// init decodes the hard-coded base64 key material into PrivKey. A failure
// here means the embedded literal is corrupt, so panicking is appropriate:
// no test relying on this package could meaningfully proceed.
func init() {
	// testPk is a base64-encoded serialized private key, deliberately
	// committed for deterministic test identities (not a secret)
	testPk := []byte(`CAASpgkwggSiAgEAAoIBAQC/7Q7fILQ8hc9g07a4HAiDKE4FahzL2eO8OlB1K99Ad4L1zc2dCg+gDVuGwdbOC29IngMA7O3UXijycckOSChgFyW3PafXoBF8Zg9MRBDIBo0lXRhW4TrVytm4Etzp4pQMyTeRYyWR8e2hGXeHArXM1R/A/SjzZUbjJYHhgvEE4OZy7WpcYcW6K3qqBGOU5GDMPuCcJWac2NgXzw6JeNsZuTimfVCJHupqG/dLPMnBOypR22dO7yJIaQ3d0PFLxiDG84X9YupF914RzJlopfdcuipI+6gFAgBw3vi6gbECEzcohjKf/4nqBOEvCDD6SXfl5F/MxoHurbGBYB2CJp+FAgMBAAECggEAaVOxe6Y5A5XzrxHBDtzjlwcBels3nm/fWScvjH4dMQXlavwcwPgKhy2NczDhr4X69oEw6Msd4hQiqJrlWd8juUg6vIsrl1wS/JAOCS65fuyJfV3Pw64rWbTPMwO3FOvxj+rFghZFQgjg/i45uHA2UUkM+h504M5Nzs6Arr/rgV7uPGR5e5OBw3lfiS9ZaA7QZiOq7sMy1L0qD49YO1ojqWu3b7UaMaBQx1Dty7b5IVOSYG+Y3U/dLjhTj4Hg1VtCHWRm3nMOE9cVpMJRhRzKhkq6gnZmni8obz2BBDF02X34oQLcHC/Wn8F3E8RiBjZDI66g+iZeCCUXvYz0vxWAQQKBgQDEJu6flyHPvyBPAC4EOxZAw0zh6SF/r8VgjbKO3n/8d+kZJeVmYnbsLodIEEyXQnr35o2CLqhCvR2kstsRSfRz79nMIt6aPWuwYkXNHQGE8rnCxxyJmxV4S63GczLk7SIn4KmqPlCI08AU0TXJS3zwh7O6e6kBljjPt1mnMgvr3QKBgQD6fAkdI0FRZSXwzygx4uSg47Co6X6ESZ9FDf6ph63lvSK5/eue/ugX6p/olMYq5CHXbLpgM4EJYdRfrH6pwqtBwUJhlh1xI6C48nonnw+oh8YPlFCDLxNG4tq6JVo071qH6CFXCIank3ThZeW5a3ZSe5pBZ8h4bUZ9H8pJL4C7yQKBgFb8SN/+/qCJSoOeOcnohhLMSSD56MAeK7KIxAF1jF5isr1TP+rqiYBtldKQX9bIRY3/8QslM7r88NNj+aAuIrjzSausXvkZedMrkXbHgS/7EAPflrkzTA8fyH10AsLgoj/68mKr5bz34nuY13hgAJUOKNbvFeC9RI5g6eIqYH0FAoGAVqFTXZp12rrK1nAvDKHWRLa6wJCQyxvTU8S1UNi2EgDJ492oAgNTLgJdb8kUiH0CH0lhZCgr9py5IKW94OSM6l72oF2UrS6PRafHC7D9b2IV5Al9lwFO/3MyBrMocapeeyaTcVBnkclz4Qim3OwHrhtFjF1ifhP9DwVRpuIg+dECgYANwlHxLe//tr6BM31PUUrOxP5Y/cj+ydxqM/z6papZFkK6Mvi/vMQQNQkh95GH9zqyC5Z/yLxur4ry1eNYty/9FnuZRAkEmlUSZ/DobhU0Pmj8Hep6JsTuMutref6vCk2n02jc9qYmJuD7iXkdXDSawbEG6f5C4MUkJ38z1t1OjA==`)
	data, err := base64.StdEncoding.DecodeString(string(testPk))
	if err != nil {
		panic(err)
	}
	PrivKey, err = crypto.UnmarshalPrivateKey(data)
	if err != nil {
		panic(fmt.Errorf("error unmarshaling private key: %s", err.Error()))
	}
}
28 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # dataset
2 |
3 | [](https://qri.io)
4 | [](http://godoc.org/github.com/qri-io/dataset)
5 | [](./LICENSE)
6 | [](https://codecov.io/gh/qri-io/dataset)
7 | [](https://circleci.com/gh/qri-io/dataset)
8 | [](https://goreportcard.com/report/github.com/qri-io/dataset)
9 |
10 | Dataset contains the qri ("query") dataset document definition. This package contains the base definition, as well as a number of
subpackages that build from this base to add functionality as necessary. Datasets take inspiration from HTML documents, delineating semantic purpose to predefined tags of the document, but instead of orienting around presentational markup, dataset documents emphasize interoperability and composition. The principal encoding format for a dataset document is JSON.
12 |
13 | ### Subpackage Overview
14 |
15 | * **compression**: defines supported types of compression for interpreting a dataset
16 | * **detect**: dataset structure & schema inference
17 | * **dsfs**: "datasets on a content-addressed file system" tools to work with datasets stored with the [cafs](https://github.com/qri-io/qri) interface: `github.com/qri-io/qfs/cafs`
18 | * **dsgraph**: expressing relationships between and within datasets as graphs
19 | * **dsio**: `io` primitives for working with dataset bodies as readers, writers, buffers, oriented around row-like "entries".
20 | * **dstest**: utility functions for working with tests that need datasets
21 | * **dsutil**: utility functions that avoid dataset bloat
22 | * **generate**: io primitives for generating data
23 | * **use_generate**: small package that uses generate to create test data
24 | * **validate**: dataset validation & checking functions
25 | * **vals**: data type mappings & definitions
26 |
27 | ## Getting Involved
28 |
29 | We would love involvement from more people! If you notice any errors or would
30 | like to submit changes, please see our
31 | [Contributing Guidelines](./.github/CONTRIBUTING.md).
--------------------------------------------------------------------------------
/generate/value.go:
--------------------------------------------------------------------------------
1 | package generate
2 |
3 | import (
4 | "math"
5 | "math/rand"
6 | )
7 |
8 | // ValueGenerator is a state machine for producing values
9 | type ValueGenerator struct {
10 | Rand *rand.Rand // random number generator
11 | MaxStringLength int
12 | }
13 |
14 | // Value creates a random value of a random type
15 | func (g *ValueGenerator) Value() interface{} {
16 | i := g.Rand.Intn(40)
17 | if i == 0 {
18 | return nil
19 | } else if i > 0 && i < 10 {
20 | return g.Int()
21 | } else if i > 10 && i < 20 {
22 | return g.String()
23 | } else if i > 20 && i < 30 {
24 | return g.Float()
25 | } else if i > 30 && i < 40 {
26 | return g.Bool()
27 | }
28 |
29 | return nil
30 | }
31 |
32 | // Type creates a value to match a string type. type names match the
33 | // JSON-schema specification
34 | func (g *ValueGenerator) Type(t string) interface{} {
35 | switch t {
36 | case "string":
37 | return g.String()
38 | case "boolean":
39 | return g.Bool()
40 | case "number":
41 | return g.Float()
42 | case "integer":
43 | return g.Int()
44 | case "object":
45 | return g.Object()
46 | case "array":
47 | return g.Array()
48 | case "null":
49 | return nil
50 | default:
51 | return g.Value()
52 | }
53 | }
54 |
55 | var alphaNumericRunes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
56 |
57 | // String yields a random string
58 | func (g *ValueGenerator) String() string {
59 | runes := make([]rune, g.Rand.Intn(g.MaxStringLength))
60 | for i := range runes {
61 | runes[i] = alphaNumericRunes[g.Rand.Intn(len(alphaNumericRunes))]
62 | }
63 | return string(runes)
64 | }
65 |
66 | // Float yields a random floating point number
67 | func (g *ValueGenerator) Float() float64 {
68 | return g.Rand.NormFloat64()
69 | }
70 |
71 | // Int yields a random integer
72 | func (g *ValueGenerator) Int() int {
73 | return g.Rand.Intn(math.MaxInt64)
74 | }
75 |
76 | // Bool yields a random coin flip
77 | func (g *ValueGenerator) Bool() bool {
78 | return g.Rand.Intn(1)%2 == 0
79 | }
80 |
81 | // Object creates an empty object
82 | // TODO (b5) - populate with random values
83 | func (g *ValueGenerator) Object() map[string]interface{} {
84 | return map[string]interface{}{}
85 | }
86 |
87 | // Array creates an empty array
88 | // TODO (b5) - populate with random values
89 | func (g *ValueGenerator) Array() []interface{} {
90 | return []interface{}{}
91 | }
92 |
--------------------------------------------------------------------------------
/stats.go:
--------------------------------------------------------------------------------
1 | package dataset
2 |
3 | import "encoding/json"
4 |
// Stats is a component that contains statistical metadata about the body of a
// dataset
type Stats struct {
	// Path is this component's address, cleared by DropDerivedValues
	Path string `json:"path,omitempty"`
	// Qri is this component's kind identifier, cleared by DropDerivedValues
	Qri string `json:"qri,omitempty"`
	// Stats holds the calculated statistics payload itself
	Stats interface{} `json:"stats,omitempty"`
}
12 |
// NewStatsRef creates an empty struct with its path set
func NewStatsRef(path string) *Stats {
	return &Stats{Path: path}
}
17 |
// DropDerivedValues resets all set-on-save fields (Qri & Path) to their
// default values
func (sa *Stats) DropDerivedValues() {
	sa.Qri = ""
	sa.Path = ""
}
23 |
// IsEmpty checks to see if stats has any fields other than Path set
// NOTE(review): only the Stats field is actually consulted here; a Stats
// value with just Qri set still reads as empty — confirm this is intended
func (sa *Stats) IsEmpty() bool {
	return sa.Stats == nil
}
28 |
29 | // Assign collapses all properties of a group of Stats components onto one
30 | func (sa *Stats) Assign(sas ...*Stats) {
31 | for _, s := range sas {
32 | if s == nil {
33 | continue
34 | }
35 |
36 | if s.Stats != nil {
37 | sa.Stats = s.Stats
38 | }
39 | if s.Path != "" {
40 | sa.Path = s.Path
41 | }
42 | if s.Qri != "" {
43 | sa.Qri = s.Qri
44 | }
45 | }
46 | }
47 |
48 | // _stats is a private struct for marshaling into & out of.
49 | // fields must remain sorted in lexographical order
50 | type _stats Stats
51 |
52 | // MarshalJSON satisfies the json.Marshaler interface
53 | func (sa Stats) MarshalJSON() ([]byte, error) {
54 | // if we're dealing with an empty object that has a path specified, marshal to
55 | // a string instead
56 | if sa.Path != "" && sa.IsEmpty() {
57 | return json.Marshal(sa.Path)
58 | }
59 | return sa.MarshalJSONObject()
60 | }
61 |
62 | // MarshalJSONObject always marshals to a json Object, even if Stats is empty or
63 | // a reference
64 | func (sa Stats) MarshalJSONObject() ([]byte, error) {
65 | kind := sa.Qri
66 | if kind == "" {
67 | kind = KindStats.String()
68 | }
69 |
70 | return json.Marshal(&_stats{
71 | Stats: sa.Stats,
72 | Path: sa.Path,
73 | Qri: kind,
74 | })
75 | }
76 |
77 | // UnmarshalJSON satisfies the json.Unmarshaler interface
78 | func (sa *Stats) UnmarshalJSON(data []byte) error {
79 | var s string
80 | if err := json.Unmarshal(data, &s); err == nil {
81 | *sa = Stats{Path: s}
82 | return nil
83 | }
84 |
85 | _sa := _stats{}
86 | if err := json.Unmarshal(data, &_sa); err != nil {
87 | return err
88 | }
89 |
90 | *sa = Stats(_sa)
91 | return nil
92 | }
93 |
--------------------------------------------------------------------------------
/dstest/testdata/complete/expect.dataset.json:
--------------------------------------------------------------------------------
1 | {
2 | "abstract": "/map/Qmb3n8FvgDbLoU9d7e3vo1UAyVkwV1RnqXUqPKC3Rj2Ej7",
3 | "abstractTransform": "/map/QmemJQrK7PTQvD3n8gmo9JhyaByyLmETiNR1Y8wS7hv4sP",
4 | "commit": {
5 | "qri": "cm:0",
6 | "signature": "8WVfbCKYc4rpugq5ZKYoWzX6wFQ6odffwe2UDAR1G1ktjQihiRx8EADNmxZDgh8LkuWSQLMKJ5xzndFVbW5AcnfeLkJ9GCut62QWmWapb5TWU2GeBxRZnmDhJKpDjTf5fvExUZk7F7viSbVGUfXWmKPZwieLVfowkJMGee8WLQo7hY3rK42dPjMfqP91AQgQsLCPFFFwGN94FExeQ5FcdP2ecLNpyxTbDNbQWeov6oUiHDTXFQ95T28WkJQDQvp5DwnS3WeBEF2TzxGq165KjLHLq3GJm5s767MzgWdZibKcRZpXX9k2S2DeMdRh1AhTXJEdXXj5TtS37ANeJ9f1QL4Eb6XAue",
7 | "timestamp": "2001-01-01T01:01:01.000000001Z",
8 | "title": "I'm a commit"
9 | },
10 | "dataPath": "/map/QmcCcPTqmckdXLBwPQXxfyW2BbFcUT6gqv9oGeWDkrNTyD",
11 | "meta": {
12 | "qri": "md:0",
13 | "title": "dataset with all submodels example"
14 | },
15 | "qri": "ds:0",
16 | "structure": {
17 | "checksum": "QmcCcPTqmckdXLBwPQXxfyW2BbFcUT6gqv9oGeWDkrNTyD",
18 | "entries": 6,
19 | "errCount": 1,
20 | "format": "csv",
21 | "formatConfig": {
22 | "headerRow": true
23 | },
24 | "length": 155,
25 | "qri": "st:0",
26 | "schema": {
27 | "items": {
28 | "items": [
29 | {
30 | "title": "title",
31 | "type": "string"
32 | },
33 | {
34 | "title": "duration",
35 | "type": "integer"
36 | }
37 | ],
38 | "type": "array"
39 | },
40 | "type": "array"
41 | }
42 | },
43 | "transform": {
44 | "data": "select * from foo",
45 | "qri": "tf:0",
46 | "resources": {
47 | "foo": "/not/a/real/path"
48 | },
49 | "structure": {
50 | "errCount": 0,
51 | "format": "csv",
52 | "formatConfig": {
53 | "headerRow": true
54 | },
55 | "qri": "st:0",
56 | "schema": {
57 | "items": {
58 | "items": [
59 | {
60 | "title": "title",
61 | "type": "string"
62 | },
63 | {
64 | "title": "duration",
65 | "type": "integer"
66 | }
67 | ],
68 | "type": "array"
69 | },
70 | "type": "array"
71 | }
72 | },
73 | "syntax": "sql"
74 | },
75 | "visconfig": {
76 | "format": "foo",
77 | "qri": "vc:0",
78 | "visualizations": {
79 | "colors": {
80 | "background": "#000000",
81 | "bars": "#ffffff"
82 | },
83 | "type": "bar"
84 | }
85 | }
86 | }
--------------------------------------------------------------------------------
/validate/data.go:
--------------------------------------------------------------------------------
1 | package validate
2 |
3 | import (
4 | "context"
5 | "encoding/json"
6 | "fmt"
7 |
8 | "github.com/qri-io/dataset"
9 | "github.com/qri-io/dataset/dsio"
10 | "github.com/qri-io/jsonschema"
11 | )
12 |
// batchSize is the number of entries buffered before each validation pass,
// bounding how much re-encoded JSON is held in memory at once
const batchSize = 5000

// flushBatch finalizes the accumulated entry buffer, parses its JSON
// contents, validates them against jsch, and appends any validation errors
// to errs. A buffer with no bytes written is skipped entirely.
// NOTE(review): the st parameter is unused in this body — confirm whether it
// can be dropped from the signature (its only caller is EntryReader below).
func flushBatch(ctx context.Context, buf *dsio.EntryBuffer, st *dataset.Structure, jsch *jsonschema.Schema, errs *[]jsonschema.KeyError) error {
	if len(buf.Bytes()) == 0 {
		return nil
	}

	// Close finalizes the buffer so Bytes() returns a complete JSON document
	if e := buf.Close(); e != nil {
		return fmt.Errorf("error closing buffer: %s", e.Error())
	}

	var doc interface{}
	if err := json.Unmarshal(buf.Bytes(), &doc); err != nil {
		return fmt.Errorf("error parsing JSON bytes: %s", err.Error())
	}
	validationState := jsch.Validate(ctx, doc)
	*errs = append(*errs, *validationState.Errs...)

	return nil
}
33 |
// EntryReader consumes a reader & returns any validation errors present.
// Entries are re-encoded to JSON in batches of batchSize and each batch is
// validated against the reader's structure schema in a single pass.
// TODO - refactor this to wrap a reader & return a struct that gives an
// error or nil on each entry read.
func EntryReader(r dsio.EntryReader) ([]jsonschema.KeyError, error) {
	ctx := context.Background()
	st := r.Structure()

	// compile the structure's schema once up front
	jsch, err := st.JSONSchema()
	if err != nil {
		return nil, err
	}

	valErrors := []jsonschema.KeyError{}

	// entries are buffered as JSON regardless of the source format so a
	// whole batch can be unmarshaled & validated at once
	buf, err := dsio.NewEntryBuffer(&dataset.Structure{
		Format: "json",
		Schema: st.Schema,
	})
	if err != nil {
		return nil, fmt.Errorf("error allocating data buffer: %s", err.Error())
	}

	err = dsio.EachEntry(r, func(i int, ent dsio.Entry, err error) error {
		if err != nil {
			return fmt.Errorf("error reading row %d: %s", i, err.Error())
		}

		// flush every batchSize entries, before entry i is written.
		// at i == 0 the buffer is empty and flushBatch is a no-op
		if i%batchSize == 0 {
			flushErr := flushBatch(ctx, buf, st, jsch, &valErrors)
			if flushErr != nil {
				return flushErr
			}
			// the flushed buffer has been closed; allocate a fresh one for
			// the next batch (note: assigns to the captured outer buf)
			var bufErr error
			buf, bufErr = dsio.NewEntryBuffer(&dataset.Structure{
				Format: "json",
				Schema: st.Schema,
			})
			if bufErr != nil {
				return fmt.Errorf("error allocating data buffer: %s", bufErr.Error())
			}
		}

		err = buf.WriteEntry(ent)
		if err != nil {
			return fmt.Errorf("error writing row %d: %s", i, err.Error())
		}

		return nil
	})

	if err != nil {
		return nil, fmt.Errorf("error reading values: %s", err.Error())
	}

	// validate whatever remains in the final, partially-filled batch
	if err := flushBatch(ctx, buf, st, jsch, &valErrors); err != nil {
		return nil, err
	}

	return valErrors, nil
}
94 |
--------------------------------------------------------------------------------
/dsio/README.md:
--------------------------------------------------------------------------------
1 | ## Performance
2 |
3 | 2018-12-04
4 |
5 | go test github.com/qri-io/dataset/dsio -bench=.
6 |
7 | BenchmarkCBORWriterArrays-2 3000 431290 ns/op
8 | BenchmarkCBORWriterObjects-2 2000 698920 ns/op
9 | BenchmarkCBORReader-2 1000 1764549 ns/op
10 | BenchmarkCSVWriterArrays-2 1000 1548509 ns/op
11 | BenchmarkCSVWriterObjects-2 1000 1458219 ns/op
12 | BenchmarkCSVReader-2 1000 2008097 ns/op
13 | BenchmarkJSONWriterArrays-2 1000 1556416 ns/op
14 | BenchmarkJSONWriterObjects-2 1000 1562488 ns/op
15 | BenchmarkJSONReader-2 500 2984057 ns/op
16 |
17 | 2018-04-17
18 |
19 | go test github.com/qri-io/dataset/dsio -bench=.
20 |
21 | BenchmarkCBORWriterArrays-2 3000 478424 ns/op
22 | BenchmarkCBORWriterObjects-2 2000 584435 ns/op
23 | BenchmarkCBORReader-2 300 5081171 ns/op
24 | BenchmarkCSVWriterArrays-2 1000 1369984 ns/op
25 | BenchmarkCSVWriterObjects-2 1000 1406440 ns/op
26 | BenchmarkCSVReader-2 1000 1463376 ns/op
27 | BenchmarkJSONWriterArrays-2 1000 1377027 ns/op
28 | BenchmarkJSONWriterObjects-2 1000 1558887 ns/op
29 | BenchmarkJSONReader-2 500 2607946 ns/op
30 |
31 | 2018-03-29
32 |
33 | go test github.com/qri-io/dataset/dsio -bench=.
34 |
35 | BenchmarkCBORWriterArrays-2 3000 423851 ns/op
36 | BenchmarkCBORWriterObjects-2 2000 572609 ns/op
37 | BenchmarkCBORReader-2 300 5024830 ns/op
38 | BenchmarkCSVWriterArrays-2 1000 1448891 ns/op
39 | BenchmarkCSVWriterObjects-2 1000 1457973 ns/op
40 | BenchmarkCSVReader-2 1000 1454932 ns/op
41 | BenchmarkJSONWriterArrays-2 1000 1423156 ns/op
42 | BenchmarkJSONWriterObjects-2 1000 1620801 ns/op
43 | BenchmarkJSONReader-2 300 5286851 ns/op
44 |
45 | ## Fuzz testing
46 |
From: [https://medium.com/@dgryski/go-fuzz-github-com-arolek-ase-3c74d5a3150c](https://medium.com/@dgryski/go-fuzz-github-com-arolek-ase-3c74d5a3150c)
48 |
49 | How to fuzz test:
50 |
51 | go install github.com/qri-io/dataset/use_generate
52 | cd $GOPATH
53 | mkdir out
54 | bin/use_generate
55 | cp $GOPATH/out/* workdir/corpus/.
56 |
57 | go get github.com/dvyukov/go-fuzz/go-fuzz
58 | go get github.com/dvyukov/go-fuzz/go-fuzz-build
59 | go install github.com/dvyukov/go-fuzz/go-fuzz
60 | go install github.com/dvyukov/go-fuzz/go-fuzz-build
61 |
62 | go-fuzz-build github.com/qri-io/dataset/dsio
63 | go-fuzz -bin=dsio-fuzz.zip -workdir=workdir
64 |
--------------------------------------------------------------------------------
/dsio/identity.go:
--------------------------------------------------------------------------------
1 | package dsio
2 |
3 | import (
4 | "fmt"
5 | "io"
6 |
7 | "github.com/qri-io/dataset"
8 | )
9 |
10 | // NewIdentityReader creates an EntryReader from native go types, passed in
11 | // data must be of type []interface{} or map[string]interface{}
12 | func NewIdentityReader(st *dataset.Structure, data interface{}) (*IdentityReader, error) {
13 | r := &IdentityReader{st: st}
14 |
15 | if md, ok := data.(map[string]interface{}); ok {
16 | r.entries = r.iterateMap(md)
17 | } else if sd, ok := data.([]interface{}); ok {
18 | r.entries = r.iterateSlice(sd)
19 | } else {
20 | return nil, fmt.Errorf("cannot create entry reader from type %T", data)
21 | }
22 |
23 | return r, nil
24 | }
25 |
26 | // IdentityReader is a dsio.EntryReader that works with native go types
27 | type IdentityReader struct {
28 | st *dataset.Structure
29 | done bool
30 | entries chan Entry
31 | }
32 |
33 | var _ EntryReader = (*IdentityReader)(nil)
34 |
35 | // Structure gives the structure being read
36 | func (r *IdentityReader) Structure() *dataset.Structure {
37 | return r.st
38 | }
39 |
40 | // ReadEntry reads one row of structured data from the reader
41 | func (r *IdentityReader) ReadEntry() (Entry, error) {
42 | if r.done {
43 | return Entry{}, io.EOF
44 | }
45 |
46 | return <-r.entries, nil
47 | }
48 |
49 | // Close finalizes the reader
50 | func (r *IdentityReader) Close() error {
51 | if !r.done {
52 | // drain channel to prevent leaking goroutine
53 | for !r.done {
54 | <-r.entries
55 | }
56 | }
57 | return nil
58 | }
59 |
60 | func (r *IdentityReader) iterateMap(data map[string]interface{}) chan Entry {
61 | res := make(chan Entry)
62 |
63 | go func() {
64 | for key, val := range data {
65 | res <- Entry{Key: key, Value: val}
66 | }
67 | r.done = true
68 | }()
69 |
70 | return res
71 | }
72 |
73 | func (r *IdentityReader) iterateSlice(data []interface{}) chan Entry {
74 | res := make(chan Entry)
75 |
76 | go func() {
77 | for i, val := range data {
78 | res <- Entry{Index: i, Value: val}
79 | }
80 | r.done = true
81 | }()
82 |
83 | return res
84 | }
85 |
// IdentityWriter is a dsio.EntryWriter that works with native go types.
// It is a no-op sink: entries written to it are discarded.
type IdentityWriter struct {
	st *dataset.Structure
}

// Structure gives the structure being written
func (w *IdentityWriter) Structure() *dataset.Structure {
	return w.st
}

// WriteEntry writes one "row" of structured data to the Writer.
// The entry is discarded; this always succeeds.
func (w *IdentityWriter) WriteEntry(e Entry) error {
	return nil
}

// Close finalizes the writer, indicating all entries
// have been written. Always succeeds: there is no underlying resource.
func (w *IdentityWriter) Close() error {
	return nil
}
106 |
--------------------------------------------------------------------------------
/kind.go:
--------------------------------------------------------------------------------
1 | package dataset
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | )
7 |
// CurrentSpecVersion is the current version of the dataset spec
const CurrentSpecVersion = "0"

const (
	// KindDataset is the current kind for datasets
	KindDataset = Kind("ds:" + CurrentSpecVersion)
	// KindBody is the current kind for body components
	KindBody = Kind("bd:" + CurrentSpecVersion)
	// KindMeta is the current kind for metadata components
	KindMeta = Kind("md:" + CurrentSpecVersion)
	// KindStructure is the current kind for structure components
	KindStructure = Kind("st:" + CurrentSpecVersion)
	// KindTransform is the current kind for transform components
	KindTransform = Kind("tf:" + CurrentSpecVersion)
	// KindCommit is the current kind for commit components
	KindCommit = Kind("cm:" + CurrentSpecVersion)
	// KindViz is the current kind for viz components
	KindViz = Kind("vz:" + CurrentSpecVersion)
	// KindReadme is the current kind for readme components
	KindReadme = Kind("rm:" + CurrentSpecVersion)
	// KindStats is the current kind for stats components
	KindStats = Kind("sa:" + CurrentSpecVersion)
)

// Kind is a short identifier for all types of qri dataset objects.
// A Kind serves three purposes:
//  1. distinguish qri datasets from other formats
//  2. distinguish different component types (Dataset/Structure/Transform/etc.)
//  3. distinguish between versions of the dataset spec
// Kinds are strings in the format 2_letter_prefix + ':' + version
type Kind string

// String implements the stringer interface
func (k Kind) String() string {
	return string(k)
}

// Valid checks to see if a kind string is valid
func (k Kind) Valid() error {
	if len(k) >= 4 {
		return nil
	}
	return fmt.Errorf("invalid kind: '%s'. kind must be in the form [type]:[version]", k.String())
}

// Type returns the two-letter type identifier
func (k Kind) Type() string {
	return string(k)[:2]
}

// Version returns the version portion of the kind identifier
func (k Kind) Version() string {
	return string(k)[3:]
}

// UnmarshalJSON implements the JSON.Unmarshaler interface,
// rejecting any strings that are not a valid kind
func (k *Kind) UnmarshalJSON(data []byte) error {
	var s string
	if err := json.Unmarshal(data, &s); err != nil {
		return err
	}
	*k = Kind(s)
	return k.Valid()
}

// ComponentTypePrefix prefixes a string with a two letter component type
// identifier & a colon. Example:
//   ComponentTypePrefix(KindDataset, "hello") == "ds:hello"
func ComponentTypePrefix(k Kind, str string) string {
	return k.Type() + ":" + str
}
80 |
--------------------------------------------------------------------------------
/generate/dsgen/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/csv"
5 | "encoding/json"
6 | "flag"
7 | "fmt"
8 | "io/ioutil"
9 | "os"
10 | "strings"
11 |
12 | "github.com/qri-io/dataset"
13 | "github.com/qri-io/dataset/dsio"
14 | "github.com/qri-io/dataset/generate"
15 | )
16 |
// help is the usage text printed when dsgen is invoked without arguments
const help = `
dsgen generates random CSV data for given tabular structure & prints to stdout.
Use "fixed" to generate 1000byte rows for a fixed 4 column schema.

Usage:
	dsgen [structure.json] --rows [num_rows]
	dsgen fixed --rows [num_rows]
`

// rows is the number of entries to generate, set by the -rows flag
var rows int

// init registers command-line flags before main calls flag.Parse
func init() {
	flag.IntVar(&rows, "rows", 1000, "number of entries (rows) to generate")
}
31 |
32 | func main() {
33 | flag.Parse()
34 | args := flag.Args()
35 |
36 | if len(args) < 1 {
37 | fmt.Println(help)
38 | os.Exit(1)
39 | }
40 | if args[0] == "fixed" {
41 | if err := writeFixedFile(rows, 0); err != nil {
42 | fmt.Println(err)
43 | os.Exit(1)
44 | }
45 | } else {
46 | if err := generateFile(args[0], rows); err != nil {
47 | fmt.Println(err)
48 | os.Exit(1)
49 | }
50 | }
51 | }
52 |
53 | func generateFile(structurePath string, lines int) error {
54 | data, err := ioutil.ReadFile(structurePath)
55 | if err != nil {
56 | return err
57 | }
58 | st := &dataset.Structure{}
59 | if err := json.Unmarshal(data, st); err != nil {
60 | return err
61 | }
62 |
63 | gen, err := generate.NewTabularGenerator(st)
64 | if err != nil {
65 | return err
66 | }
67 |
68 | w, err := dsio.NewCSVWriter(st, os.Stdout)
69 | if err != nil {
70 | return err
71 | }
72 |
73 | for i := 0; i < lines; i++ {
74 | ent, err := gen.ReadEntry()
75 | if err != nil {
76 | return err
77 | }
78 | w.WriteEntry(ent)
79 | }
80 | w.Close()
81 | gen.Close()
82 | return nil
83 | }
84 |
// writeFixedFile writes a CSV header plus `lines` rows of ~1000 bytes each
// to stdout using a fixed four-column schema (uuid, ingest, occurred,
// raw_data). When diffStart > 0, rows after index diffStart get altered
// values, producing a "diff" relative to output generated with diffStart 0.
// Previously this always returned nil; csv.Writer defers write errors, so
// any failure is now surfaced via w.Error after the final Flush.
func writeFixedFile(lines, diffStart int) error {
	// pad the raw_data column to a fixed width
	filler := strings.Repeat("0", 908)
	w := csv.NewWriter(os.Stdout)
	w.Write([]string{"uuid", "ingest", "occurred", "raw_data"})
	var uuid, ingest, occurred, rawData string
	for i := 0; i < lines; i++ {
		if diffStart > 0 && i > diffStart {
			// write a "diff" line
			uuid = fmt.Sprintf("%d-%d-BA882B47-B26A-4E29-BFB4-XXXXXXXXXXXX", i, i)
			ingest = fmt.Sprintf("%d%d-01-01 00:00:01.000 UTC", i, i)
			occurred = fmt.Sprintf("2000-%d%d-01 00:00:02.000 UTC", i, i)
			rawData = fmt.Sprintf("%d%d%s", i, i, filler)
		} else {
			// write a normal line
			uuid = fmt.Sprintf("%d-BA882B47-B26A-4E29-BFB4-XXXXXXXXXXXX", i)
			ingest = fmt.Sprintf("%d-01-01 00:00:01.000 UTC", i)
			occurred = fmt.Sprintf("2000-%d-01 00:00:02.000 UTC", i)
			rawData = fmt.Sprintf("%d%s", i, filler)
		}
		w.Write([]string{uuid, ingest, occurred, rawData})
	}

	w.Flush()
	// report any error encountered during Write or Flush
	return w.Error()
}
110 |
--------------------------------------------------------------------------------
/compression/compression_test.go:
--------------------------------------------------------------------------------
1 | package compression
2 |
3 | import (
4 | "bytes"
5 | "io"
6 | "strings"
7 | "testing"
8 | )
9 |
10 | func TestParseFormat(t *testing.T) {
11 | good := []string{
12 | "gz", "gzip", "zstd",
13 | }
14 |
15 | for _, s := range good {
16 | f, err := ParseFormat(s)
17 | if err != nil {
18 | t.Errorf("unexpected error for format %q: %s", s, err)
19 | }
20 | if _, ok := SupportedFormats[f]; !ok {
21 | t.Errorf("expected %q to be a supported format", s)
22 | }
23 | }
24 |
25 | bad := []string{
26 | "", "tar",
27 | }
28 | for _, s := range bad {
29 | if _, err := ParseFormat(s); err == nil {
30 | t.Errorf("expected format to error: %s, got nil", s)
31 | }
32 | }
33 | }
34 |
// TestNew exercises constructor error paths for both Compressor &
// Decompressor with format strings that cannot be parsed
func TestNew(t *testing.T) {
	if _, err := Compressor("invalid", &bytes.Buffer{}); err == nil {
		t.Error("expected error constructing with invalid compression format string")
	}

	if _, err := Decompressor("invalid", &bytes.Buffer{}); err == nil {
		t.Error("expected error constructing with invalid decompression format string")
	}

	// temporarily register "invalid" as a supported format, removing it when
	// the test finishes.
	// NOTE(review): ParseFormat still rejects "invalid" at its spelling
	// check, so these cases appear to error there rather than reaching the
	// "no backing implementation" branches — confirm intent.
	SupportedFormats[Format("invalid")] = struct{}{}
	defer delete(SupportedFormats, Format("invalid"))

	if _, err := Compressor("invalid", &bytes.Buffer{}); err == nil {
		t.Error("expected error constructing with compression format without backing compressor")
	}

	if _, err := Decompressor("invalid", &bytes.Buffer{}); err == nil {
		t.Error("expected error constructing with decompression format without backing decompressor")
	}
}
55 |
56 | func TestCompressionCycle(t *testing.T) {
57 | for f := range SupportedFormats {
58 | t.Run(string(f), func(t *testing.T) {
59 | plainText := "I am a string destined to go through a compression spin cycle"
60 |
61 | buf := &bytes.Buffer{}
62 | comp, err := Compressor(f.String(), buf)
63 | if err != nil {
64 | t.Fatal(err)
65 | }
66 |
67 | if copied, err := io.Copy(comp, strings.NewReader(plainText)); err != nil {
68 | t.Fatal(err)
69 | } else if copied != int64(len([]byte(plainText))) {
70 | t.Errorf("copy byte length mismatch. want: %d got: %d", len(plainText), copied)
71 | }
72 |
73 | if err := comp.Close(); err != nil {
74 | t.Fatal(err)
75 | }
76 |
77 | if buf.String() == plainText {
78 | t.Errorf("buf contents should be compressed, unequal to plain text")
79 | }
80 |
81 | t.Log(buf.String())
82 |
83 | decomp, err := Decompressor(f.String(), buf)
84 | if err != nil {
85 | t.Fatal(err)
86 | }
87 | defer decomp.Close()
88 |
89 | result := &bytes.Buffer{}
90 | if _, err := io.Copy(result, decomp); err != nil {
91 | t.Fatal(err)
92 | }
93 |
94 | if result.String() != plainText {
95 | t.Errorf("compression round trip result mismatch.\nwant: %s\ngot: %s", plainText, result.String())
96 | }
97 | })
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/stepfile/stepfile_test.go:
--------------------------------------------------------------------------------
1 | package stepfile
2 |
3 | import (
4 | "bytes"
5 | "encoding/json"
6 | "io/ioutil"
7 | "os"
8 | "path/filepath"
9 | "testing"
10 |
11 | "github.com/google/go-cmp/cmp"
12 | "github.com/qri-io/dataset"
13 | )
14 |
15 | func TestRead(t *testing.T) {
16 | cases := []struct {
17 | inputFilename string
18 | expectFilename string
19 | }{
20 | {"steps.txt", "steps.json"},
21 | }
22 |
23 | for _, c := range cases {
24 | t.Run(c.inputFilename, func(t *testing.T) {
25 | in := filepath.Join("./testdata", c.inputFilename)
26 | expect := []*dataset.TransformStep{}
27 | f, err := os.Open(filepath.Join("./testdata", c.expectFilename))
28 | if err != nil {
29 | t.Fatal(err)
30 | }
31 | if err := json.NewDecoder(f).Decode(&expect); err != nil {
32 | t.Fatal(err)
33 | }
34 | f.Close()
35 |
36 | got, err := ReadFile(in)
37 | if err != nil {
38 | t.Fatal(err)
39 | }
40 |
41 | if diff := cmp.Diff(expect, got); diff != "" {
42 | t.Errorf("result mismatch (-want +got):\n%s", diff)
43 | }
44 | })
45 | }
46 |
47 | t.Run("errors", func(t *testing.T) {
48 | if _, err := ReadFile("unknown"); err == nil {
49 | t.Error("expected error reading unknown file")
50 | }
51 | })
52 | }
53 |
// TestWrite round-trips: decode steps from a JSON fixture, serialize them
// with Write, and compare against the canonical step-file text. Also covers
// reader-backed scripts and unsupported script types.
func TestWrite(t *testing.T) {
	cases := []struct {
		inputFilename  string
		expectFilename string
	}{
		{"steps.json", "steps.txt"},
	}

	for _, c := range cases {
		t.Run(c.inputFilename, func(t *testing.T) {
			// the expected output is the raw text form of the step file
			data, err := ioutil.ReadFile(filepath.Join("./testdata", c.expectFilename))
			if err != nil {
				t.Fatal(err)
			}
			expect := string(data)

			// input steps come from the JSON encoding of the same steps
			input := []*dataset.TransformStep{}
			f, err := os.Open(filepath.Join("./testdata", c.inputFilename))
			if err != nil {
				t.Fatal(err)
			}
			if err := json.NewDecoder(f).Decode(&input); err != nil {
				t.Fatal(err)
			}
			f.Close()

			buf := &bytes.Buffer{}
			if err := Write(input, buf); err != nil {
				t.Fatal(err)
			}

			if diff := cmp.Diff(expect, buf.String()); diff != "" {
				t.Errorf("result mismatch (-want +got):\n%s", diff)
			}
		})
	}

	t.Run("write from a reader", func(t *testing.T) {
		// Script values may be an io.Reader or a byte slice
		steps := []*dataset.TransformStep{
			{Script: bytes.NewBuffer([]byte("oh hai"))},
			{Script: []byte("my friend")},
		}
		buf := &bytes.Buffer{}
		if err := Write(steps, buf); err != nil {
			t.Error(err)
		}
		expect := "oh hai\n---\nmy friend"
		if diff := cmp.Diff(expect, buf.String()); diff != "" {
			t.Errorf("result mismatch. (-want +got):\n %s", diff)
		}
	})

	t.Run("bad scripts", func(t *testing.T) {
		// an unsupported script type (here: an int) must error
		steps := []*dataset.TransformStep{
			{Script: 2},
		}
		buf := &bytes.Buffer{}
		if err := Write(steps, buf); err == nil {
			t.Error("expected error, got none")
		}
	})
}
116 |
--------------------------------------------------------------------------------
/generate/tabular.go:
--------------------------------------------------------------------------------
1 | // Package generate is for generating random data from given structures
2 | package generate
3 |
4 | import (
5 | "math/rand"
6 | "time"
7 |
8 | "github.com/qri-io/dataset"
9 | "github.com/qri-io/dataset/dsio"
10 | "github.com/qri-io/dataset/tabular"
11 | )
12 |
// Config stores settings for the generate package.
type Config struct {
	// random is the source used for all generated values
	random *rand.Rand
	// maxLen caps the size of generated values (e.g. string length)
	maxLen int
	// useRandomType requests values of random types (see AssignUseRandomType)
	useRandomType bool
}

// DefaultConfig returns the default configuration for a Generator:
// time-seeded randomness, 64-length values, non-random types.
func DefaultConfig() *Config {
	return &Config{
		random:        rand.New(rand.NewSource(time.Now().UnixNano())),
		maxLen:        8,
		useRandomType: false,
	}
}

// AssignSeed sets a specific random seed to be used.
// NOTE(review): the seed is fixed at 4, presumably so test output is
// deterministic — confirm.
func AssignSeed(cfg *Config) {
	cfg.random = rand.New(rand.NewSource(4))
}

// AssignMaxLen sets a maximum length for generated values.
func AssignMaxLen(cfg *Config) {
	cfg.maxLen = 8
}

// AssignUseRandomType causes generator to generate random types of values.
func AssignUseRandomType(cfg *Config) {
	cfg.useRandomType = true
}
43 |
// TabularGenerator is a dsio.EntryReader that creates a new entry on each call
// to ReadEntry
type TabularGenerator struct {
	cols      tabular.Columns
	structure *dataset.Structure
	gen       *ValueGenerator
	// count of entries generated so far; used as the Index
	// when generating array entries
	count int
	// only two possible structures for now are "array" or "object"
	schemaIsArray bool
}

// assert at compile time that Generator is a dsio.EntryReader
var _ dsio.EntryReader = (*TabularGenerator)(nil)

// NewTabularGenerator creates a tabular data generator with the given
// configuration options.
// NOTE(review): cfg.useRandomType is not forwarded to the ValueGenerator,
// and schemaIsArray is hard-coded true — object schemas appear unhandled
// here; confirm.
func NewTabularGenerator(st *dataset.Structure, options ...func(*Config)) (*TabularGenerator, error) {
	cfg := DefaultConfig()
	for _, opt := range options {
		opt(cfg)
	}

	// derive column definitions from the structure's JSON schema;
	// the second return value is discarded
	cols, _, err := tabular.ColumnsFromJSONSchema(st.Schema)
	if err != nil {
		return nil, err
	}

	gen := &ValueGenerator{
		Rand:            cfg.random,
		MaxStringLength: cfg.maxLen,
	}

	return &TabularGenerator{
		structure:     st,
		cols:          cols,
		gen:           gen,
		schemaIsArray: true,
	}, nil
}
84 |
85 | // ReadEntry implements the dsio.EntryReader interface
86 | func (g *TabularGenerator) ReadEntry() (dsio.Entry, error) {
87 | row := make([]interface{}, len(g.cols))
88 | for i, col := range g.cols {
89 | row[i] = g.gen.Type([]string(*col.Type)[0])
90 | }
91 | index := g.count
92 | g.count++
93 | return dsio.Entry{Index: index, Value: row}, nil
94 | }
95 |
96 | // Structure implements the dsio.EntryReader interface
97 | func (g TabularGenerator) Structure() *dataset.Structure {
98 | return g.structure
99 | }
100 |
101 | // Close finalizes the generator
102 | func (g TabularGenerator) Close() error {
103 | return nil
104 | }
105 |
--------------------------------------------------------------------------------
/preview/testdata/earthquakes/readme.md:
--------------------------------------------------------------------------------
1 | # USGS Earthquakes Jan 12th
2 |
3 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Maecenas maximus erat ut rhoncus blandit. Duis aliquet vulputate leo eu volutpat. Praesent in mollis metus, non convallis lectus. Vestibulum malesuada mauris quis nisl auctor pellentesque. Duis lacinia nec justo in viverra. Quisque quis aliquet ante. Donec semper scelerisque laoreet. Praesent dapibus interdum mi, sit amet lacinia odio malesuada vitae. Proin eu erat quis nisi tristique mollis. Donec sed eleifend augue, at convallis ex.
4 |
5 | Integer at bibendum nibh. Mauris sit amet justo nisi. Duis aliquam ex sit amet urna elementum, nec venenatis diam dapibus. Donec pellentesque pretium est, eget vehicula libero fringilla id. Curabitur quam massa, interdum vel interdum sed, elementum et velit. Mauris ac consequat ante. Mauris porttitor ex vitae placerat congue. Nullam porta aliquam enim ac congue.
6 |
7 | Aenean non lacus a quam facilisis viverra. Duis mattis leo ac leo maximus dapibus. Suspendisse pulvinar elit non orci elementum ultricies et nec nunc. Maecenas bibendum sapien massa, eu vestibulum metus lacinia at. Ut laoreet nisi id magna iaculis placerat. Pellentesque scelerisque sit amet mauris ut porta. Aliquam interdum, nisi in dapibus ullamcorper, enim magna hendrerit elit, et mollis ex risus in ante. In suscipit varius metus, at posuere tellus lacinia at. Mauris nisi nibh, egestas et sollicitudin id, venenatis in erat. Donec eu lobortis magna, a rutrum mauris. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Orci varius natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Ut scelerisque sed ipsum eu eleifend.
8 |
9 | In et porta erat, commodo volutpat neque. Integer augue ipsum, maximus a enim ac, hendrerit lobortis lorem. Nam nec dolor eget dui ornare mollis vitae at nisi. Nunc quis massa nec nulla vehicula posuere. Morbi sed dictum libero. Vivamus feugiat enim vel augue faucibus, vitae consequat dolor molestie. Pellentesque eu pharetra neque. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Proin sodales arcu neque, ac dignissim nisi maximus at. Morbi blandit gravida sapien, et facilisis nulla dapibus nec. Morbi id odio quis neque cursus sollicitudin. Aliquam ut elementum ante, sed rutrum sapien.
10 |
11 | Nulla ligula felis, vulputate vel tristique eu, euismod non est. Aliquam cursus, eros at scelerisque imperdiet, est nunc hendrerit nunc, vel elementum dui velit id ex. Fusce posuere mollis lorem, nec rhoncus nisi laoreet sed. Curabitur gravida ante vitae risus feugiat posuere. Aenean id euismod nunc. Curabitur eget iaculis odio, id vehicula quam. Morbi at urna nec arcu bibendum malesuada quis ac odio. Donec libero massa, viverra eget dapibus id, auctor a risus. Nam semper nisl erat, nec mollis lectus cursus ac. Aliquam pulvinar sapien sapien, ac vestibulum velit aliquet vitae. Maecenas vitae porta arcu. Maecenas tempus quam eget felis convallis, et sagittis urna suscipit. Proin at risus libero. In fermentum nisl ac felis gravida posuere. Etiam vestibulum diam placerat, vehicula orci et, convallis lacus.
--------------------------------------------------------------------------------
/dstest/testdata/complete/input.dataset.json:
--------------------------------------------------------------------------------
1 | {
2 | "qri": "ds:0",
3 | "commit": {
4 | "qri": "cm:0",
5 | "title": "I'm a commit"
6 | },
7 | "meta": {
8 | "qri": "md:0",
9 | "title": "dataset with all submodels example"
10 | },
11 | "transform": {
12 | "qri": "tf:0",
13 | "syntax": "sql",
14 | "data": "select * from foo",
15 | "structure": {
16 | "qri": "st:0",
17 | "format": "csv",
18 | "formatConfig": {
19 | "headerRow": true
20 | },
21 | "schema": {
22 | "type": "array",
23 | "items": {
24 | "type": "array",
25 | "items": [
26 | {
27 | "title": "title",
28 | "type": "string"
29 | },
30 | {
31 | "title": "duration",
32 | "type": "integer"
33 | }
34 | ]
35 | }
36 | }
37 | },
38 | "resources": {
39 | "foo": "/not/a/real/path"
40 | }
41 | },
42 | "abstractTransform": {
43 | "qri": "tf:0",
44 | "data": "select * from a",
45 | "structure": {
46 | "qri": "st:0",
47 | "format": "csv",
48 | "formatConfig": {
49 | "headerRow": true
50 | },
51 | "schema": {
52 | "type": "array",
53 | "items": {
54 | "type": "array",
55 | "items": [
56 | {
57 | "title": "a",
58 | "type": "string"
59 | },
60 | {
61 | "title": "b",
62 | "type": "integer"
63 | }
64 | ]
65 | }
66 | }
67 | },
68 | "resources": {
69 | "a": "/fake/path/to/abstract/dataset/"
70 | }
71 | },
72 | "abstract": {
73 | "qri": "ds:0",
74 | "structure": {
75 | "qri": "st:0",
76 | "format": "csv",
77 | "formatConfig": {
78 | "headerRow": true
79 | },
80 | "schema": {
81 | "type": "array",
82 | "items": {
83 | "type": "array",
84 | "items": [
85 | {
86 | "title": "a",
87 | "type": "string"
88 | },
89 | {
90 | "title": "b",
91 | "type": "integer"
92 | }
93 | ]
94 | }
95 | }
96 | }
97 | },
98 | "structure": {
99 | "qri": "st:0",
100 | "format": "csv",
101 | "formatConfig": {
102 | "headerRow": true
103 | },
104 | "schema": {
105 | "type": "array",
106 | "items": {
107 | "type": "array",
108 | "items": [
109 | {
110 | "title": "title",
111 | "type": "string"
112 | },
113 | {
114 | "title": "duration",
115 | "type": "integer"
116 | }
117 | ]
118 | }
119 | }
120 | },
121 | "visconfig":{
122 | "format": "foo",
123 | "qri": "vc:0",
124 | "visualizations": {
125 | "type": "bar",
126 | "colors": {
127 | "bars": "#ffffff",
128 | "background": "#000000"
129 | }
130 | }
131 | }
132 | }
--------------------------------------------------------------------------------
/compression/compression.go:
--------------------------------------------------------------------------------
1 | // Package compression presents a uniform interface for a set of compression
2 | // readers & writers in various formats
3 | package compression
4 |
5 | import (
6 | "fmt"
7 | "io"
8 |
9 | "github.com/klauspost/compress/gzip"
10 | "github.com/klauspost/compress/zstd"
11 | )
12 |
const (
	// FmtNone is a sentinel for no compression
	FmtNone Format = ""
	// FmtZStandard compression https://facebook.github.io/zstd/
	FmtZStandard Format = "zst"
	// FmtGZip GNU zip compression https://www.gnu.org/software/gzip/
	FmtGZip Format = "gzip"
)

// Format represents a type of byte compression
type Format string

// String implements the stringer interface
func (s Format) String() string {
	return string(s)
}

// SupportedFormats indexes supported formats in a map for lookups
var SupportedFormats = map[Format]struct{}{
	FmtZStandard: {},
	FmtGZip:      {},
}

// ParseFormat interprets a string into a supported compression format,
// erroring when provided the empty string ("no compression" format) or any
// unrecognized spelling
func ParseFormat(s string) (Format, error) {
	// a switch avoids allocating the spelling-lookup map on every call,
	// which the previous implementation did
	var f Format
	switch s {
	case "gzip", "gz":
		f = FmtGZip
	case "zst", "zstd": // "zstd" is the shorthand name for the library, not a common file ending
		f = FmtZStandard
	default:
		return f, fmt.Errorf("invalid compression format %q", s)
	}

	// honor runtime changes to the SupportedFormats registry
	if _, ok := SupportedFormats[f]; !ok {
		return FmtNone, fmt.Errorf("unsupported compression format: %q", s)
	}

	return f, nil
}
56 |
57 | // Compressor wraps a given writer with a specified comrpession format
58 | // callers must Close the writer to fully flush the compressor
59 | func Compressor(compressionFormat string, w io.Writer) (io.WriteCloser, error) {
60 | f, err := ParseFormat(compressionFormat)
61 | if err != nil {
62 | return nil, err
63 | }
64 |
65 | switch f {
66 | case FmtZStandard:
67 | return zstd.NewWriter(w)
68 | case FmtGZip:
69 | return gzip.NewWriter(w), nil
70 | }
71 |
72 | return nil, fmt.Errorf("no available compressor for %q format", f)
73 | }
74 |
75 | // Decompressor wraps a reader of compressed data with a decompressor
76 | // callers must .Close() the reader
77 | func Decompressor(compressionFormat string, r io.Reader) (io.ReadCloser, error) {
78 | f, err := ParseFormat(compressionFormat)
79 | if err != nil {
80 | return nil, err
81 | }
82 |
83 | switch f {
84 | case FmtZStandard:
85 | rdr, err := zstd.NewReader(r)
86 | if err != nil {
87 | return nil, err
88 | }
89 | return zstdReadCloserShim{rdr}, nil
90 | case FmtGZip:
91 | return gzip.NewReader(r)
92 | }
93 |
94 | return nil, fmt.Errorf("no available decompressor for %q format", f)
95 | }
96 |
// zstdReadCloserShim compensates for zstd's decoder Close() method, which
// returns no error. That signature breaks the io.ReadCloser interface, so we
// shim in a Close method that delegates and returns an error that will never
// occur
type zstdReadCloserShim struct {
	*zstd.Decoder
}

// Close releases the wrapped decoder and always returns nil
func (d zstdReadCloserShim) Close() error {
	d.Decoder.Close()
	return nil
}
108 |
--------------------------------------------------------------------------------
/detect/testdata/daily_wind_2011.structure.json:
--------------------------------------------------------------------------------
1 | {
2 | "format": "csv",
3 | "formatConfig": {
4 | "headerRow" : true,
5 | "lazyQuotes" : true
6 | },
7 | "schema": {
8 | "type": "array",
9 | "items": {
10 | "type": "array",
11 | "items": [
12 | {
13 | "title": "state_code",
14 | "type": "integer"
15 | },
16 | {
17 | "title": "county_code",
18 | "type": "integer"
19 | },
20 | {
21 | "title": "site_num",
22 | "type": "integer"
23 | },
24 | {
25 | "title": "parameter_code",
26 | "type": "integer"
27 | },
28 | {
29 | "title": "poc",
30 | "type": "integer"
31 | },
32 | {
33 | "title": "latitude",
34 | "type": "number"
35 | },
36 | {
37 | "title": "longitude",
38 | "type": "number"
39 | },
40 | {
41 | "title": "datum",
42 | "type": "string"
43 | },
44 | {
45 | "title": "parameter_name",
46 | "type": "string"
47 | },
48 | {
49 | "title": "sample_duration",
50 | "type": "string"
51 | },
52 | {
53 | "title": "pollutant_standard",
54 | "type": "string"
55 | },
56 | {
57 | "title": "date_local",
58 | "type": "string"
59 | },
60 | {
61 | "title": "units_of_measure",
62 | "type": "string"
63 | },
64 | {
65 | "title": "event_type",
66 | "type": "string"
67 | },
68 | {
69 | "title": "observation_count",
70 | "type": "integer"
71 | },
72 | {
73 | "title": "observation_percent",
74 | "type": "number"
75 | },
76 | {
77 | "title": "arithmetic_mean",
78 | "type": "number"
79 | },
80 | {
81 | "title": "st_max_value",
82 | "type": "number"
83 | },
84 | {
85 | "title": "st_max_hour",
86 | "type": "integer"
87 | },
88 | {
89 | "title": "aqi",
90 | "type": "string"
91 | },
92 | {
93 | "title": "method_code",
94 | "type": "integer"
95 | },
96 | {
97 | "title": "method_name",
98 | "type": "string"
99 | },
100 | {
101 | "title": "local_site_name",
102 | "type": "string"
103 | },
104 | {
105 | "title": "address",
106 | "type": "string"
107 | },
108 | {
109 | "title": "state_name",
110 | "type": "string"
111 | },
112 | {
113 | "title": "county_name",
114 | "type": "string"
115 | },
116 | {
117 | "title": "city_name",
118 | "type": "string"
119 | },
120 | {
121 | "title": "cbsa_name",
122 | "type": "string"
123 | },
124 | {
125 | "title": "date_of_last_change",
126 | "type": "string"
127 | }
128 | ]
129 | }
130 | }
131 | }
--------------------------------------------------------------------------------
/detect/testdata/daily_wind_2011.csv:
--------------------------------------------------------------------------------
1 | "State Code","County Code","Site Num","Parameter Code","POC","Latitude","Longitude","Datum","Parameter Name","Sample Duration","Pollutant Standard","Date Local","Units of Measure","Event Type","Observation Count","Observation Percent","Arithmetic Mean","1st Max Value","1st Max Hour","AQI","Method Code","Method Name","Local Site Name","Address","State Name","County Name","City Name","CBSA Name","Date of Last Change"
2 | "01","073","0023","61103",1,33.553056,-86.815,"WGS84","Wind Speed - Resultant","1 HOUR","","2011-01-01","Knots","None",24,100.0,3.25,6.6,1,"","061","Instrumental - Met One Sonic Anemometer Model 50.5","North Birmingham","NO. B'HAM,SOU R.R., 3009 28TH ST. NO.","Alabama","Jefferson","Birmingham","Birmingham-Hoover, AL","2016-04-22"
3 | "01","073","0023","61103",1,33.553056,-86.815,"WGS84","Wind Speed - Resultant","1 HOUR","","2011-01-02","Knots","None",24,100.0,2.033333,3.6,13,"","061","Instrumental - Met One Sonic Anemometer Model 50.5","North Birmingham","NO. B'HAM,SOU R.R., 3009 28TH ST. NO.","Alabama","Jefferson","Birmingham","Birmingham-Hoover, AL","2016-04-22"
4 | "01","073","0023","61103",1,33.553056,-86.815,"WGS84","Wind Speed - Resultant","1 HOUR","","2011-01-03","Knots","None",24,100.0,0.991667,2.1,11,"","061","Instrumental - Met One Sonic Anemometer Model 50.5","North Birmingham","NO. B'HAM,SOU R.R., 3009 28TH ST. NO.","Alabama","Jefferson","Birmingham","Birmingham-Hoover, AL","2016-04-22"
5 | "01","073","0023","61103",1,33.553056,-86.815,"WGS84","Wind Speed - Resultant","1 HOUR","","2011-01-04","Knots","None",24,100.0,1.091667,2.3,11,"","061","Instrumental - Met One Sonic Anemometer Model 50.5","North Birmingham","NO. B'HAM,SOU R.R., 3009 28TH ST. NO.","Alabama","Jefferson","Birmingham","Birmingham-Hoover, AL","2016-04-22"
6 | "01","073","0023","61103",1,33.553056,-86.815,"WGS84","Wind Speed - Resultant","1 HOUR","","2011-01-05","Knots","None",24,100.0,1.5875,3.2,20,"","061","Instrumental - Met One Sonic Anemometer Model 50.5","North Birmingham","NO. B'HAM,SOU R.R., 3009 28TH ST. NO.","Alabama","Jefferson","Birmingham","Birmingham-Hoover, AL","2016-04-22"
7 | "01","073","0023","61103",1,33.553056,-86.815,"WGS84","Wind Speed - Resultant","1 HOUR","","2011-01-06","Knots","None",24,100.0,2.508333,4,13,"","061","Instrumental - Met One Sonic Anemometer Model 50.5","North Birmingham","NO. B'HAM,SOU R.R., 3009 28TH ST. NO.","Alabama","Jefferson","Birmingham","Birmingham-Hoover, AL","2016-04-22"
8 | "01","073","0023","61103",1,33.553056,-86.815,"WGS84","Wind Speed - Resultant","1 HOUR","","2011-01-07","Knots","None",24,100.0,3.991667,5.7,12,"","061","Instrumental - Met One Sonic Anemometer Model 50.5","North Birmingham","NO. B'HAM,SOU R.R., 3009 28TH ST. NO.","Alabama","Jefferson","Birmingham","Birmingham-Hoover, AL","2016-04-22"
9 | "01","073","0023","61103",1,33.553056,-86.815,"WGS84","Wind Speed - Resultant","1 HOUR","","2011-01-08","Knots","None",24,100.0,5.3,5.3,0,"","061","Instrumental - Met One Sonic Anemometer Model 50.5","North Birmingham","NO. B'HAM,SOU R.R., 3009 28TH ST. NO.","Alabama","Jefferson","Birmingham","Birmingham-Hoover, AL","2016-04-22"
10 | "01","073","0023","61103",1,33.553056,-86.815,"WGS84","Wind Speed - Resultant","1 HOUR","","2011-01-09","Knots","None",24,100.0,5.3,5.3,0,"","061","Instrumental - Met One Sonic Anemometer Model 50.5","North Birmingham","NO. B'HAM,SOU R.R., 3009 28TH ST. NO.","Alabama","Jefferson","Birmingham","Birmingham-Hoover, AL","2016-04-22"
--------------------------------------------------------------------------------
/dsio/ndjson_test.go:
--------------------------------------------------------------------------------
1 | package dsio
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "io"
7 | "strings"
8 | "testing"
9 |
10 | "github.com/google/go-cmp/cmp"
11 | "github.com/qri-io/dataset"
12 | "github.com/qri-io/dataset/compression"
13 | )
14 |
15 | func TestNDJSONReadWrite(t *testing.T) {
16 | data := `["a","b","c"]
17 | "apples"
18 | true
19 | 35
20 | null
21 | {}
22 | `
23 |
24 | st := &dataset.Structure{
25 | Format: dataset.NDJSONDataFormat.String(),
26 | Schema: dataset.BaseSchemaArray,
27 | }
28 |
29 | rdr, err := NewEntryReader(st, strings.NewReader(data))
30 | if err != nil {
31 | t.Fatal(err)
32 | }
33 |
34 | buf := &bytes.Buffer{}
35 | wr, err := NewEntryWriter(st, buf)
36 |
37 | if err := Copy(rdr, wr); err != nil {
38 | t.Fatal(err)
39 | }
40 | rdr.Close()
41 | wr.Close()
42 |
43 | if diff := cmp.Diff(data, buf.String()); diff != "" {
44 | t.Errorf("result mismatch (-want +got):\n%s", diff)
45 | }
46 | }
47 |
48 | func TestNDJSONCompression(t *testing.T) {
49 | invalidCompressionSt := &dataset.Structure{Format: "ndjson", Compression: "invalid", Schema: dataset.BaseSchemaArray}
50 | if _, err := NewJSONReader(invalidCompressionSt, nil); err == nil {
51 | t.Errorf("constructing reader with invalid compression should error")
52 | }
53 | if _, err := NewJSONWriter(invalidCompressionSt, nil); err == nil {
54 | t.Errorf("constructing writer with invalid compression should error")
55 | }
56 |
57 | data := `["a","b","c"]
58 | "apples"
59 | true
60 | 35
61 | null
62 | {}
63 | `
64 |
65 | compressed := &bytes.Buffer{}
66 | compressor, _ := compression.Compressor("zst", compressed)
67 | io.Copy(compressor, strings.NewReader(data))
68 | compressor.Close()
69 |
70 | st := &dataset.Structure{
71 | Format: "ndjson",
72 | Compression: "zst",
73 | Schema: dataset.BaseSchemaArray,
74 | }
75 |
76 | rdr, err := NewNDJSONReader(st, compressed)
77 | if err != nil {
78 | t.Fatal(err)
79 | }
80 |
81 | compressed2 := &bytes.Buffer{}
82 | wr, err := NewNDJSONWriter(st, compressed2)
83 | if err != nil {
84 | t.Fatal(err)
85 | }
86 |
87 | if err := Copy(rdr, wr); err != nil {
88 | t.Fatal(err)
89 | }
90 | rdr.Close()
91 | wr.Close()
92 |
93 | if diff := cmp.Diff(compressed.Bytes(), compressed2.Bytes()); diff != "" {
94 | t.Errorf("result mismatch expect (-want +got):\n%s", diff)
95 | }
96 | }
97 |
98 | func TestNDJSONReaderSizeOverflow(t *testing.T) {
99 | // run a test with one 24,000-character long string to ensure the reader
100 | // doesn't choke on a long line of JSON
101 | st := &dataset.Structure{
102 | Format: "ndjson",
103 | Schema: dataset.BaseSchemaArray,
104 | }
105 | data := fmt.Sprintf(`"hi"
106 | false
107 | %q
108 | null
109 | "bye"
110 | `, strings.Repeat("long", 1024*6))
111 |
112 | rdr, err := NewNDJSONReader(st, strings.NewReader(data))
113 | if err != nil {
114 | t.Fatal(err)
115 | }
116 |
117 | vals, err := ReadAll(rdr)
118 | if err != nil {
119 | t.Error(err)
120 | }
121 |
122 | if err := rdr.Close(); err != nil {
123 | t.Error(err)
124 | }
125 |
126 | expect := []interface{}{
127 | "hi",
128 | false,
129 | strings.Repeat("long", 1024*6),
130 | nil,
131 | "bye",
132 | }
133 |
134 | if diff := cmp.Diff(expect, vals); diff != "" {
135 | t.Errorf("result mismatch (-want +got):\n%s", diff)
136 | }
137 | }
138 |
--------------------------------------------------------------------------------
/preview/preview.go:
--------------------------------------------------------------------------------
1 | package preview
2 |
3 | import (
4 | "bytes"
5 | "context"
6 | "encoding/json"
7 | "fmt"
8 | "io"
9 | "io/ioutil"
10 |
11 | logger "github.com/ipfs/go-log"
12 | "github.com/qri-io/dataset"
13 | "github.com/qri-io/dataset/dsio"
14 | "github.com/qri-io/qfs"
15 | )
16 |
var (
	// log is the package-level structured logger for preview
	log = logger.Logger("preview")
)

const (
	// MaxNumDatasetRowsInPreview is the highest number of rows a dataset preview
	// can contain
	MaxNumDatasetRowsInPreview = 100
	// MaxStatsBytes is the maximum number of bytes reserved in a preview for stats
	// values.
	// TODO(b5): this value is not currently honored, requires implementing
	// dataset.Stats.Abbreviate
	MaxStatsBytes = 10000
	// MaxReadmePreviewBytes determines the maximum amount of bytes a readme
	// preview can be. three bytes less than 1000 to make room for an ellipsis
	MaxReadmePreviewBytes = 997
)
34 |
// Create generates a preview for a dataset version
// It expects the passed in dataset to have any relevant script files already
// loaded
// Preview currently includes:
//  - body: 100 rows
//  - readme: first 997 bytes
//  - meta: all
//  - commit: all
//  - structure: all
//  - stats: all
//  - viz: all
//  - transform: all
func Create(ctx context.Context, ds *dataset.Dataset) (*dataset.Dataset, error) {

	if ds == nil {
		log.Debug("Create: nil dataset")
		return nil, fmt.Errorf("nil dataset")
	}
	if ds.IsEmpty() {
		log.Debug("Create: empty dataset")
		return nil, fmt.Errorf("empty dataset")
	}

	// NOTE(review): p is built as a copy but never returned; the function
	// mutates and returns ds directly, and the p.Stats assignment below is
	// discarded. Confirm whether Create was meant to return p instead of ds
	p := &dataset.Dataset{}
	p.Assign(ds)

	if ds.Readme != nil && ds.Readme.ScriptFile() != nil {
		// tee readme bytes so the script file can be re-read after previewing
		buf := &bytes.Buffer{}
		f := ds.Readme.ScriptFile()
		tr := io.TeeReader(f, buf)

		content, err := ioutil.ReadAll(io.LimitReader(tr, MaxReadmePreviewBytes))
		if err != nil {
			log.Debugw("Reading Readme", "err", err.Error())
			return nil, err
		}
		// at the byte limit the readme was likely truncated; mark it with an ellipsis
		if len(content) >= MaxReadmePreviewBytes {
			content = append(content, []byte("...")...)
		}
		ds.Readme.Text = string(content)

		// restore a readable script file: already-consumed bytes from buf, then the rest of f
		ds.Readme.SetScriptFile(qfs.NewMemfileReader(f.FullPath(), io.MultiReader(buf, f)))
	}

	if ds.BodyFile() != nil {
		// NOTE(review): assumes ds.Structure is non-nil whenever a body file is
		// present — confirm callers guarantee this, otherwise this dereference panics
		st := &dataset.Structure{
			Format: "json",
			Schema: ds.Structure.Schema,
		}

		// tee body bytes so the body file can be re-read after previewing
		buf := &bytes.Buffer{}
		f := ds.BodyFile()
		tr := io.TeeReader(f, buf)
		teedFile := qfs.NewMemfileReader(f.FullPath(), tr)
		size := -1
		if sf, ok := f.(qfs.SizeFile); ok {
			size = int(sf.Size())
		}

		// convert up to MaxNumDatasetRowsInPreview rows of the body to JSON
		data, err := dsio.ConvertFile(teedFile, ds.Structure, st, MaxNumDatasetRowsInPreview, 0, false)
		if err != nil {
			log.Debugw("converting body file", "err", err.Error())
			return nil, err
		}

		ds.Body = json.RawMessage(data)
		ds.SetBodyFile(qfs.NewMemfileReaderSize(f.FullPath(), io.MultiReader(buf, f), int64(size)))
	}

	// Note: stats can get arbitrarily large, potentially bloating the size
	// of previews. Add a method for bounding the final size of stats to a
	// constant byte size
	if ds.Stats != nil && !ds.Stats.IsEmpty() {
		p.Stats = ds.Stats
	}

	return ds, nil
}
113 |
--------------------------------------------------------------------------------
/vals/compare_test.go:
--------------------------------------------------------------------------------
1 | package vals
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestEqual(t *testing.T) {
8 | cases := []struct {
9 | a, b Value
10 | expect bool
11 | }{
12 | {Array{Number(1)}, Array{Number(1)}, true},
13 | {Array{Number(1)}, Array{Number(2)}, false},
14 | {Object{"a": String("a")}, Object{"a": String("a")}, true},
15 | {Object{"a": String("a")}, Object{"a": String("b")}, false},
16 | {String("a"), String("a"), true},
17 | {String("a"), String("b"), false},
18 | {Boolean(true), Boolean(true), true},
19 | {Boolean(true), Boolean(false), false},
20 | {Integer(1), Integer(1), true},
21 | {Integer(1), Integer(2), false},
22 | {Number(1.1), Number(1.1), true},
23 | {Number(1.1), Number(1.11), false},
24 | }
25 |
26 | for i, c := range cases {
27 | got := Equal(c.a, c.b)
28 | if got != c.expect {
29 | t.Errorf("case: %d. %v == %v != %t", i, c.a, c.b, c.expect)
30 | }
31 | }
32 | }
33 |
34 | func TestCompareTypeBytes(t *testing.T) {
35 | cases := []struct {
36 | a, b string
37 | t Type
38 | expect int
39 | err string
40 | }{
41 | {"0", "0", TypeUnknown, 0, "invalid type comparison"},
42 | {"", "", TypeString, 0, ""},
43 | {"", "foo", TypeString, -1, ""},
44 | {"foo", "", TypeString, 1, ""},
45 | {"foo", "bar", TypeString, 1, ""},
46 | {"bar", "foo", TypeString, -1, ""},
47 | {"0", "0", TypeNumber, 0, ""},
48 | {"0", "0", TypeInteger, 0, ""},
49 | }
50 |
51 | for i, c := range cases {
52 | got, err := CompareTypeBytes([]byte(c.a), []byte(c.b), c.t)
53 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
54 | t.Errorf("case %d error mismatch. expected: %s, got: %s", i, c.err, err)
55 | continue
56 | }
57 | if got != c.expect {
58 | t.Errorf("case %d response mismatch: %d != %d", i, c.expect, got)
59 | continue
60 | }
61 | }
62 | }
63 |
64 | func TestCompareIntegerBytes(t *testing.T) {
65 | cases := []struct {
66 | a, b string
67 | expect int
68 | err string
69 | }{
70 | {"0", "", 0, "strconv.ParseInt: parsing \"\": invalid syntax"},
71 | {"", "0", 0, "strconv.ParseInt: parsing \"\": invalid syntax"},
72 | {"0", "0", 0, ""},
73 | {"-1", "0", -1, ""},
74 | {"0", "-1", 1, ""},
75 | }
76 |
77 | for i, c := range cases {
78 | got, err := CompareIntegerBytes([]byte(c.a), []byte(c.b))
79 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
80 | t.Errorf("case %d error mismatch. expected: %s, got: %s", i, c.err, err)
81 | continue
82 | }
83 | if got != c.expect {
84 | t.Errorf("case %d response mismatch: %d != %d", i, c.expect, got)
85 | continue
86 | }
87 | }
88 | }
89 |
90 | func TestCompareNumberBytes(t *testing.T) {
91 | cases := []struct {
92 | a, b string
93 | expect int
94 | err string
95 | }{
96 | {"0", "", 0, "strconv.ParseFloat: parsing \"\": invalid syntax"},
97 | {"", "0", 0, "strconv.ParseFloat: parsing \"\": invalid syntax"},
98 | {"0", "0", 0, ""},
99 | {"-1", "0", -1, ""},
100 | {"0", "-1", 1, ""},
101 | }
102 |
103 | for i, c := range cases {
104 | got, err := CompareNumberBytes([]byte(c.a), []byte(c.b))
105 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
106 | t.Errorf("case %d error mismatch. expected: %s, got: %s", i, c.err, err)
107 | continue
108 | }
109 | if got != c.expect {
110 | t.Errorf("case %d response mismatch: %d != %d", i, c.expect, got)
111 | continue
112 | }
113 | }
114 | }
115 |
--------------------------------------------------------------------------------
/vals/coding_test.go:
--------------------------------------------------------------------------------
1 | package vals
2 |
3 | import (
4 | "bytes"
5 | "encoding/json"
6 | "testing"
7 | )
8 |
// shared fixtures covering each composite Value shape: a flat array, a flat
// object, an array of arrays, and an array of objects
var (
	array0 = &Array{String("a"), Boolean(false), Null(true), Integer(2), Number(23.5)}
	object0 = &Object{"city": String("toronto"), "pop": Integer(40000000), "avg_age": Number(55.5), "in_usa": Boolean(false)}
	array1 = &Array{*array0, *array0}
	array2 = &Array{*object0, *object0}
)
15 |
16 | func TestConvertDecoded(t *testing.T) {
17 | cases := []struct {
18 | in interface{}
19 | expect Value
20 | err string
21 | }{
22 | {map[string]interface{}{}, &Object{}, ""},
23 | {map[string]interface{}{
24 | "a": 0,
25 | "b": float64(0),
26 | "c": nil,
27 | "d": true,
28 | "e": "foo",
29 | "f": []interface{}{},
30 | "g": map[string]interface{}{},
31 | "h": uint8(0),
32 | "i": uint16(0),
33 | "j": uint64(0),
34 | "k": int32(0),
35 | "l": int64(0),
36 | "m": map[interface{}]interface{}{},
37 | }, &Object{
38 | "a": Integer(0),
39 | "b": Number(0),
40 | "c": Null(true),
41 | "d": Boolean(true),
42 | "e": String("foo"),
43 | "f": &Array{},
44 | "g": &Object{},
45 | "h": Integer(0),
46 | "i": Integer(0),
47 | "j": Integer(0),
48 | "k": Integer(0),
49 | "l": Integer(0),
50 | "m": &Object{},
51 | }, ""},
52 | }
53 |
54 | for i, c := range cases {
55 | got, err := ConvertDecoded(c.in)
56 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
57 | t.Errorf("case %d error mismatch. expected: %s, got: %s", i, c.err, err)
58 | continue
59 | }
60 |
61 | if !Equal(c.expect, got) {
62 | t.Errorf("case %d result mismatch. epxected: %#v, got: %#v", i, c.expect, got)
63 | continue
64 | }
65 | }
66 | }
67 |
68 | func TestUnmarshalJSON(t *testing.T) {
69 | cases := []struct {
70 | input string
71 | expect Value
72 | err string
73 | }{
74 | {`"foo"`, String("foo"), ""},
75 | {`123`, Integer(123), ""},
76 | {`123.45`, Number(123.45), ""},
77 | {`{ "city" : "toronto", "pop" : 40000000, "avg_age" : 55.5 , "in_usa" : false }`, *object0, ""},
78 | {`["a", false, null, 2, 23.5]`, *array0, ""},
79 | {`[null, null, null]`, Array{Null(true), Null(true), Null(true)}, ""},
80 | {`[["a", false, null, 2, 23.5],["a", false, null, 2, 23.5]]`, *array1, ""},
81 | {`[{ "city" : "toronto", "pop" : 40000000, "avg_age" : 55.5 , "in_usa" : false },{ "city" : "toronto", "pop" : 40000000, "avg_age" : 55.5 , "in_usa" : false }]`, *array2, ""},
82 | }
83 | for i, c := range cases {
84 | got, err := UnmarshalJSON([]byte(c.input))
85 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
86 | t.Errorf("case %d error mismatch. expected: '%s', got: '%s'", i, c.err, err)
87 | continue
88 | }
89 |
90 | if !Equal(c.expect, got) {
91 | t.Errorf("case %d result mismatch. expected: %#v, got: %#v", i, c.expect, got)
92 | continue
93 | }
94 | }
95 | }
96 |
97 | func TestMarshalJSON(t *testing.T) {
98 | d := Array{
99 | Object{"foo": Boolean(false)},
100 | Boolean(true),
101 | Integer(12),
102 | Null(true),
103 | Number(123.456),
104 | Array{String("foo"), String("bar")},
105 | }
106 |
107 | b, err := json.Marshal(d)
108 | if err != nil {
109 | t.Errorf("unexpected error marshaling to JSON: %s", err.Error())
110 | return
111 | }
112 |
113 | expect := `[{"foo":false},true,12,null,123.456,["foo","bar"]]`
114 | if !bytes.Equal([]byte(expect), b) {
115 | t.Errorf("byte mismatch. expected: %s, got: %s", expect, string(b))
116 | }
117 | }
118 |
--------------------------------------------------------------------------------
/dstest/compare.go:
--------------------------------------------------------------------------------
1 | package dstest
2 |
3 | import (
4 | "github.com/google/go-cmp/cmp"
5 | "github.com/google/go-cmp/cmp/cmpopts"
6 | "github.com/qri-io/dataset"
7 | )
8 |
9 | // CompareDatasets checks two given datasets for equality, returng a diff string
10 | // describing the difference between each dataset/ return will be the empty
11 | // string
12 | // if datasets are equal
13 | // CompareDatasets defaults to a strict compraison of all exported fields
14 | // operates on copies of passed-in datasets to keep this function free of side
15 | // effects
16 | func CompareDatasets(expect, got *dataset.Dataset, opts ...CompareOpts) string {
17 | cfg := &CompareConfig{}
18 | for _, opt := range opts {
19 | opt.Apply(cfg)
20 | }
21 |
22 | a := &dataset.Dataset{}
23 | a.Assign(expect)
24 |
25 | b := &dataset.Dataset{}
26 | b.Assign(got)
27 |
28 | if cfg.dropTransients {
29 | a.DropTransientValues()
30 | a.DropTransientValues()
31 | }
32 |
33 | return cmp.Diff(a, b, cmpopts.IgnoreUnexported(
34 | dataset.Dataset{},
35 | dataset.Commit{},
36 | dataset.Meta{},
37 | dataset.Transform{},
38 | dataset.Readme{},
39 | dataset.Viz{},
40 | ))
41 | }
42 |
// CompareConfig defines configuration parameters, which are unexported, but
// settable via CompareOpts supplied to a Compare function
type CompareConfig struct {
	// dropTransients requests DropTransientValues be called on copies of
	// both arguments before diffing
	dropTransients bool
}

// CompareOpts adjusts component comparison functions
type CompareOpts interface {
	Apply(cfg *CompareConfig)
}

// OptDropTransientValues drops transients on both datasets before making the
// comparison, allowing transient fields (e.g. dataset name) to differ without
// failing the comparison
type OptDropTransientValues int

// Apply sets unexported configuration
func (OptDropTransientValues) Apply(cfg *CompareConfig) {
	cfg.dropTransients = true
}
62 |
63 | // CompareCommits is CompareDatasets, but for commit components
64 | func CompareCommits(expect, got *dataset.Commit, opts ...CompareOpts) string {
65 | cfg := &CompareConfig{}
66 | for _, opt := range opts {
67 | opt.Apply(cfg)
68 | }
69 |
70 | a := &dataset.Commit{}
71 | a.Assign(expect)
72 |
73 | b := &dataset.Commit{}
74 | b.Assign(got)
75 |
76 | if cfg.dropTransients {
77 | a.DropTransientValues()
78 | a.DropTransientValues()
79 | }
80 |
81 | return cmp.Diff(a, b, cmpopts.IgnoreUnexported(
82 | dataset.Commit{},
83 | ))
84 | }
85 |
86 | // CompareMetas is CompareDatasets, but for meta components
87 | func CompareMetas(expect, got *dataset.Meta, opts ...CompareOpts) string {
88 | cfg := &CompareConfig{}
89 | for _, opt := range opts {
90 | opt.Apply(cfg)
91 | }
92 |
93 | a := &dataset.Meta{}
94 | a.Assign(expect)
95 |
96 | b := &dataset.Meta{}
97 | b.Assign(got)
98 |
99 | if cfg.dropTransients {
100 | a.DropTransientValues()
101 | a.DropTransientValues()
102 | }
103 |
104 | return cmp.Diff(a, b, cmpopts.IgnoreUnexported(
105 | dataset.Meta{},
106 | ))
107 | }
108 |
109 | // CompareStructures is CompareDatasets, but for structure components
110 | func CompareStructures(expect, got *dataset.Structure, opts ...CompareOpts) string {
111 | cfg := &CompareConfig{}
112 | for _, opt := range opts {
113 | opt.Apply(cfg)
114 | }
115 |
116 | a := &dataset.Structure{}
117 | a.Assign(expect)
118 |
119 | b := &dataset.Structure{}
120 | b.Assign(got)
121 |
122 | if cfg.dropTransients {
123 | a.DropTransientValues()
124 | a.DropTransientValues()
125 | }
126 |
127 | return cmp.Diff(a, b, cmpopts.IgnoreUnexported(
128 | dataset.Structure{},
129 | ))
130 | }
131 |
--------------------------------------------------------------------------------
/dsio/ndjson.go:
--------------------------------------------------------------------------------
1 | package dsio
2 |
3 | import (
4 | "bufio"
5 | "encoding/json"
6 | "fmt"
7 | "io"
8 |
9 | "github.com/qri-io/dataset"
10 | )
11 |
// NDJSONReader implements the EntryReader interface for the
// newline-delimited JSON data format
type NDJSONReader struct {
	entriesRead int // number of entries returned so far; used as each Entry's Index
	st *dataset.Structure
	buf *bufio.Reader
	close func() error // close func from wrapped reader
	prevSize int // when buffer is extended, remember how much of the old buffer to discard. NOTE(review): not referenced by any method in this file — confirm it's still needed
}

var _ EntryReader = (*NDJSONReader)(nil)
22 |
23 | // NewNDJSONReader creates a reader from a structure and read source
24 | func NewNDJSONReader(st *dataset.Structure, r io.Reader) (*NDJSONReader, error) {
25 | if st.Schema == nil {
26 | err := fmt.Errorf("schema required for NDJSON reader")
27 | log.Debug(err.Error())
28 | return nil, err
29 | }
30 |
31 | tlt, err := GetTopLevelType(st)
32 | if err != nil {
33 | return nil, err
34 | }
35 | if tlt != "array" {
36 | return nil, fmt.Errorf("NDJSON top level type must be 'array'")
37 | }
38 |
39 | r, close, err := maybeWrapDecompressor(st, r)
40 | if err != nil {
41 | return nil, err
42 | }
43 |
44 | ndjr := &NDJSONReader{
45 | st: st,
46 | buf: bufio.NewReader(r),
47 | close: close,
48 | }
49 | return ndjr, nil
50 | }
51 |
// Structure gives this reader's structure
func (r *NDJSONReader) Structure() *dataset.Structure {
	return r.st
}
56 |
57 | // ReadEntry reads one JSON record from the reader
58 | func (r *NDJSONReader) ReadEntry() (Entry, error) {
59 | line, err := r.buf.ReadBytes('\n')
60 | if err != nil {
61 | return Entry{}, err
62 | }
63 |
64 | var v interface{}
65 | if err := json.Unmarshal(line, &v); err != nil {
66 | return Entry{}, err
67 | }
68 |
69 | ent := Entry{
70 | Index: r.entriesRead,
71 | Value: v,
72 | }
73 |
74 | r.entriesRead++
75 | return ent, nil
76 | }
77 |
78 | // Close finalizes the reader
79 | func (r *NDJSONReader) Close() error {
80 | if r.close != nil {
81 | return r.close()
82 | }
83 | return nil
84 | }
85 |
// NDJSONWriter implements the EntryWriter interface for
// Newline-Delimited-JSON-formatted data
type NDJSONWriter struct {
	rowsWritten int // NOTE(review): never incremented in this file — confirm whether WriteEntry should update it
	st *dataset.Structure
	wr io.Writer
	enc *json.Encoder
	close func() error // close func from wrapped writer
}

var _ EntryWriter = (*NDJSONWriter)(nil)
97 |
98 | // NewNDJSONWriter creates a Writer from a structure and write destination
99 | func NewNDJSONWriter(st *dataset.Structure, w io.Writer) (*NDJSONWriter, error) {
100 | if st.Schema == nil {
101 | err := fmt.Errorf("schema required for NDJSON writer")
102 | log.Debug(err.Error())
103 | return nil, err
104 | }
105 |
106 | w, close, err := maybeWrapCompressor(st, w)
107 | if err != nil {
108 | return nil, err
109 | }
110 |
111 | jw := &NDJSONWriter{
112 | st: st,
113 | wr: w,
114 | enc: json.NewEncoder(w),
115 | close: close,
116 | }
117 |
118 | return jw, nil
119 | }
120 |
// Structure gives this writer's structure
func (w *NDJSONWriter) Structure() *dataset.Structure {
	return w.st
}

// WriteEntry writes one JSON entry to the writer as a single line
// (json.Encoder.Encode appends the trailing '\n')
func (w *NDJSONWriter) WriteEntry(ent Entry) error {
	return w.enc.Encode(ent.Value)
}
130 |
131 | // Close finalizes the writer
132 | func (w *NDJSONWriter) Close() error {
133 | if w.close != nil {
134 | return w.close()
135 | }
136 | return nil
137 | }
138 |
--------------------------------------------------------------------------------
/testdata/datasets/complete.json:
--------------------------------------------------------------------------------
1 | {
2 | "qri": "ds:0",
3 | "meta": {
4 | "title": "dataset with all submodels example",
5 | "description": "foo",
6 | "accessURL": "foo",
7 | "downloadURL": "foo",
8 | "accrualPeriodicity": "1W",
9 | "version": "0",
10 | "readme": "foo",
11 | "queryString": "foo",
12 | "previous": "foo",
13 | "qri": "md:0",
14 | "identifier": "foo",
15 | "iconImage": "foo",
16 | "length": 2503,
17 | "image": "foo",
18 | "keywords": [
19 | "a",
20 | "b",
21 | "foo"
22 | ],
23 | "language": [
24 | "english"
25 | ],
26 | "theme": [
27 | "foo"
28 | ],
29 | "author": {
30 | "email": "foo"
31 | },
32 | "data": "foo",
33 | "contributors": [
34 | {
35 | "email": "foo"
36 | }
37 | ]
38 | },
39 | "commit": {
40 | "qri": "cm:0",
41 | "timestamp": "2017-12-21T04:13:22.534Z",
42 | "message": "I'm a commit"
43 | },
44 | "transform": {
45 | "qri": "tf:0",
46 | "syntax": "sql",
47 | "data": "select * from foo",
48 | "structure": {
49 | "qri": "st:0",
50 | "format": "csv",
51 | "formatConfig": {
52 | "headerRow": true
53 | },
54 | "schema": {
55 | "type": "array",
56 | "items": {
57 | "type":"array",
58 | "items": [
59 | {
60 | "title": "title",
61 | "type": "string"
62 | },
63 | {
64 | "title": "duration",
65 | "type": "integer"
66 | }
67 |
68 | ]
69 | }
70 | }
71 | },
72 | "resources": {
73 | "foo": {"path": "/not/a/real/path"}
74 | }
75 | },
76 | "abstractTransform": {
77 | "qri": "tf:0",
78 | "data": "select * from a",
79 | "structure": {
80 | "qri": "st:0",
81 | "format": "csv",
82 | "formatConfig": {
83 | "headerRow": true
84 | },
85 | "schema": {
86 | "type": "array",
87 | "items": {
88 | "type": "array",
89 | "items": [
90 | {
91 | "title": "a",
92 | "type": "string"
93 | },
94 | {
95 | "title": "b",
96 | "type": "integer"
97 | }
98 | ]
99 | }
100 | }
101 | },
102 | "resources": {
103 | "a": "/fake/path/to/abstract/dataset/"
104 | }
105 | },
106 | "abstract": {
107 | "qri": "ds:0",
108 | "structure": {
109 | "qri": "st:0",
110 | "format": "csv",
111 | "formatConfig": {
112 | "headerRow": true
113 | },
114 | "schema": {
115 | "type": "array",
116 | "items": {
117 | "type": "array",
118 | "items": [
119 | {
120 | "type": "string"
121 | },
122 | {
123 | "type": "integer"
124 | }
125 | ]
126 | }
127 | }
128 | }
129 | },
130 | "structure": {
131 | "qri": "st:0",
132 | "format": "csv",
133 | "formatConfig": {
134 | "headerRow": true
135 | },
136 | "schema": {
137 | "type": "array",
138 | "items": {
139 | "type": "array",
140 | "items": [
141 | {
142 | "title": "title",
143 | "type": "string"
144 | },
145 | {
146 | "title": "duration",
147 | "type": "integer"
148 | }
149 | ]
150 | }
151 | }
152 | }
153 | }
--------------------------------------------------------------------------------
/data_format.go:
--------------------------------------------------------------------------------
1 | package dataset
2 |
import (
	"encoding/json"
	"fmt"
	"strings"
)
7 |
// ErrUnknownDataFormat is the expected error for
// when a data format is missing or unknown.
var ErrUnknownDataFormat = fmt.Errorf("Unknown Data Format")

// DataFormat represents different types of data formats.
// Formats specified here have some degree of support within
// the dataset packages.
type DataFormat int

const (
	// UnknownDataFormat is the default dataformat, meaning
	// that a data format should always be specified when
	// using the DataFormat type
	UnknownDataFormat DataFormat = iota
	// CSVDataFormat specifies comma separated value-formatted data
	CSVDataFormat
	// JSONDataFormat specifies Javascript Object Notation-formatted data
	JSONDataFormat
	// NDJSONDataFormat specifies newline-delimited JSON files
	// https://github.com/ndjson/ndjson-spec
	NDJSONDataFormat
	// CBORDataFormat specifies RFC 7049 Concise Binary Object Representation
	// read more at cbor.io
	CBORDataFormat
	// XMLDataFormat specifies eXtensible Markup Language-formatted data
	// currently not supported.
	XMLDataFormat
	// XLSXDataFormat specifies microsoft excel formatted data
	XLSXDataFormat
)
38 |
39 | // SupportedDataFormats gives a slice of data formats that are
40 | // expected to work with this dataset package. As we work through
41 | // support for different formats, the last step of providing full
42 | // support to a format will be an addition to this slice
43 | func SupportedDataFormats() []DataFormat {
44 | return []DataFormat{
45 | CBORDataFormat,
46 | JSONDataFormat,
47 | CSVDataFormat,
48 | XLSXDataFormat,
49 | NDJSONDataFormat,
50 | }
51 | }
52 |
53 | // String implements stringer interface for DataFormat
54 | func (f DataFormat) String() string {
55 | s, ok := map[DataFormat]string{
56 | UnknownDataFormat: "",
57 | CSVDataFormat: "csv",
58 | JSONDataFormat: "json",
59 | XMLDataFormat: "xml",
60 | XLSXDataFormat: "xlsx",
61 | CBORDataFormat: "cbor",
62 | NDJSONDataFormat: "ndjson",
63 | }[f]
64 |
65 | if !ok {
66 | return ""
67 | }
68 |
69 | return s
70 | }
71 |
72 | // ParseDataFormatString takes a string representation of a data format
73 | // TODO (b5): trim "." prefix, remove prefixed map keys
74 | func ParseDataFormatString(s string) (df DataFormat, err error) {
75 | df, ok := map[string]DataFormat{
76 | "": UnknownDataFormat,
77 | ".csv": CSVDataFormat,
78 | "csv": CSVDataFormat,
79 | ".json": JSONDataFormat,
80 | "json": JSONDataFormat,
81 | ".xml": XMLDataFormat,
82 | "xml": XMLDataFormat,
83 | ".xlsx": XLSXDataFormat,
84 | "xlsx": XLSXDataFormat,
85 | "cbor": CBORDataFormat,
86 | ".cbor": CBORDataFormat,
87 | ".ndjson": NDJSONDataFormat,
88 | "ndjson": NDJSONDataFormat,
89 | ".jsonl": NDJSONDataFormat,
90 | "jsonl": NDJSONDataFormat,
91 | }[s]
92 | if !ok {
93 | err = fmt.Errorf("invalid data format: `%s`", s)
94 | df = UnknownDataFormat
95 | }
96 |
97 | return
98 | }
99 |
100 | // MarshalJSON satisfies the json.Marshaler interface
101 | func (f DataFormat) MarshalJSON() ([]byte, error) {
102 | if f == UnknownDataFormat {
103 | return nil, ErrUnknownDataFormat
104 | }
105 | return []byte(fmt.Sprintf(`"%s"`, f.String())), nil
106 | }
107 |
108 | // UnmarshalJSON satisfies the json.Unmarshaler interface
109 | func (f *DataFormat) UnmarshalJSON(data []byte) error {
110 | var s string
111 | if err := json.Unmarshal(data, &s); err != nil {
112 | return fmt.Errorf("Data Format type should be a string, got %s", data)
113 | }
114 |
115 | df, err := ParseDataFormatString(s)
116 | if err != nil {
117 | return err
118 | }
119 |
120 | *f = df
121 | return nil
122 | }
123 |
--------------------------------------------------------------------------------
/preview/testdata/earthquakes/input.dataset.json:
--------------------------------------------------------------------------------
1 | {
2 | "qri":"ds:0",
3 | "meta": {
4 | "description": "List of earthquakes recorded by the USGS from Jan 12th",
5 | "keywords": [
6 | "earthquakes",
7 | "usgs",
8 | "geology"
9 | ],
10 | "path": "/ipfs/QmaTUEjSnonrVpDA47e9yBiK9yb1hQA6rAJkvrF2WotJnR",
11 | "qri": "md:0",
12 | "theme": [
13 | "geology"
14 | ],
15 | "title": "USGS Earthquakes"
16 | },
17 | "structure": {
18 | "checksum": "QmWsTpdYkiKThJh4uB8dXqScMYFrFLbYxTr5XkV36XR6Ed",
19 | "depth": 2,
20 | "entries": 240,
21 | "errCount": 410,
22 | "format": "csv",
23 | "formatConfig": {
24 | "headerRow": true,
25 | "lazyQuotes": true
26 | },
27 | "length": 44883,
28 | "path": "/ipfs/QmXXGgzEfyYBBGFdbMM1uKQ97kgqdjjoDdk8S1AaaqWrC2",
29 | "qri": "st:0",
30 | "schema": {
31 | "items": {
32 | "items": [
33 | {
34 | "title": "time",
35 | "type": "string"
36 | },
37 | {
38 | "title": "latitude",
39 | "type": "number"
40 | },
41 | {
42 | "title": "longitude",
43 | "type": "number"
44 | },
45 | {
46 | "title": "depth",
47 | "type": "number"
48 | },
49 | {
50 | "title": "mag",
51 | "type": "number"
52 | },
53 | {
54 | "title": "mag_type",
55 | "type": "string"
56 | },
57 | {
58 | "title": "nst",
59 | "type": "integer"
60 | },
61 | {
62 | "title": "gap",
63 | "type": "integer"
64 | },
65 | {
66 | "title": "dmin",
67 | "type": "number"
68 | },
69 | {
70 | "title": "rms",
71 | "type": "number"
72 | },
73 | {
74 | "title": "net",
75 | "type": "string"
76 | },
77 | {
78 | "title": "id",
79 | "type": "string"
80 | },
81 | {
82 | "title": "updated",
83 | "type": "string"
84 | },
85 | {
86 | "title": "place",
87 | "type": "string"
88 | },
89 | {
90 | "title": "type",
91 | "type": "string"
92 | },
93 | {
94 | "title": "horizontal_error",
95 | "type": "number"
96 | },
97 | {
98 | "title": "depth_error",
99 | "type": "number"
100 | },
101 | {
102 | "title": "mag_error",
103 | "type": "number"
104 | },
105 | {
106 | "title": "mag_nst",
107 | "type": "integer"
108 | },
109 | {
110 | "title": "status",
111 | "type": "string"
112 | },
113 | {
114 | "title": "location_source",
115 | "type": "string"
116 | },
117 | {
118 | "title": "mag_source",
119 | "type": "string"
120 | }
121 | ],
122 | "type": "array"
123 | },
124 | "type": "array"
125 | }
126 | },
127 | "commit": {
128 | "author": {
129 | "id": "QmeitTcgUPiw1PyUDdaCbMcNotx84yR8EwJVjTv7MRmELA"
130 | },
131 | "message": "meta added\nreadme added",
132 | "path": "/ipfs/QmWH51TF5EcAjF4NyffgHF1hfd4VL8Xq8ctgK7rR2xdwdy",
133 | "qri": "cm:0",
134 | "signature": "MpfKssuFkcLpvkVMSzbJxsxDMXhLSnnfeEWs9usvduttdzSt8sAM0DD1UmEwRsDziE9oYe3GbGqu5eWqs9KYqmMbkMZU8cPrIQ4JYRUfPxQYDQh8cQhe65hTU30UM29+KR1DfhdWZzlNIu6NkIX4YHUMCNIJUk5HFU90BzYpB5agvp5ZxqDzHDVgDYflqalchkyl2jQ/OdWPQgj1BYoP7O5QGgq3ZhPnloq440y0QEengwix30nIdOiGFEH3lFNtFeHYktSAaOQY5/X8co3ttv6iI2XwyE9jvPWDtx1o8bNpgmlVFUP4oteb9iRDT7e6byRLQC7k3WHZ/JiXL7mAAw==",
135 | "timestamp": "2021-01-25T16:33:26.588501Z",
136 | "title": "Add meta \u0026 readme"
137 | },
138 | "readme": {
139 | "qri":"rm:0"
140 | },
141 | "stats": {
142 | "qri":"sa:0",
143 | "stats": {"todo": "add real stats"}
144 | }
145 | }
--------------------------------------------------------------------------------
/vals/coding.go:
--------------------------------------------------------------------------------
1 | package vals
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | )
7 |
8 | // ConvertDecoded converts an interface that has been decoded into standard go types to a Value
9 | func ConvertDecoded(d interface{}) (Value, error) {
10 | var err error
11 | if d == nil {
12 | return Null(true), nil
13 | }
14 | switch v := d.(type) {
15 | case uint8:
16 | return Integer(v), nil
17 | case uint16:
18 | return Integer(v), nil
19 | case uint32:
20 | return Integer(v), nil
21 | case uint64:
22 | return Integer(v), nil
23 | case float64:
24 | return Number(v), nil
25 | case int:
26 | return Integer(v), nil
27 | case int32:
28 | return Integer(int(v)), nil
29 | case int64:
30 | return Integer(int(v)), nil
31 | case string:
32 | return String(v), nil
33 | case bool:
34 | return Boolean(v), nil
35 | case []interface{}:
36 | arr := make(Array, len(v))
37 | for i, val := range v {
38 | arr[i], err = ConvertDecoded(val)
39 | if err != nil {
40 | return arr, err
41 | }
42 | }
43 | return &arr, nil
44 | case map[string]interface{}:
45 | obj := make(Object, len(v))
46 | for key, val := range v {
47 | obj[key], err = ConvertDecoded(val)
48 | if err != nil {
49 | return obj, err
50 | }
51 | }
52 | return &obj, nil
53 | case map[interface{}]interface{}:
54 | obj := make(Object, len(v))
55 | for keyi, val := range v {
56 | key, ok := keyi.(string)
57 | if !ok {
58 | return nil, fmt.Errorf("only strings may be used as keys. got %#v", keyi)
59 | }
60 | obj[key], err = ConvertDecoded(val)
61 | if err != nil {
62 | return obj, err
63 | }
64 | }
65 | return &obj, nil
66 | default:
67 | return nil, fmt.Errorf("unrecognized decoded type: %#v", v)
68 | }
69 | }
70 |
71 | // UnmarshalJSON turns a slice of JSON bytes into a Value
72 | func UnmarshalJSON(data []byte) (v Value, err error) {
73 | switch ParseType(data) {
74 | case TypeObject:
75 | return unmarshalObject(data)
76 | case TypeArray:
77 | return unmarshalArray(data)
78 | case TypeString:
79 | s := String("")
80 | v = &s
81 | case TypeInteger:
82 | i := Integer(0)
83 | v = &i
84 | case TypeNumber:
85 | n := Number(0)
86 | v = &n
87 | case TypeBoolean:
88 | b := Boolean(false)
89 | v = &b
90 | case TypeNull:
91 | n := Null(true)
92 | v = &n
93 | }
94 |
95 | err = json.Unmarshal(data, v)
96 | return
97 | }
98 |
99 | type decodeObj map[string]json.RawMessage
100 |
101 | func unmarshalObject(data []byte) (Value, error) {
102 | do := decodeObj{}
103 | if err := json.Unmarshal(data, &do); err != nil {
104 | return nil, err
105 | }
106 |
107 | obj := make(Object, len(do))
108 | for key, rm := range do {
109 | val, err := UnmarshalJSON([]byte(rm))
110 | if err != nil {
111 | return nil, err
112 | }
113 | switch t := val.(type) {
114 | case *String:
115 | obj[key] = *t
116 | case *Number:
117 | obj[key] = *t
118 | case *Integer:
119 | obj[key] = *t
120 | case *Null:
121 | obj[key] = *t
122 | case Object:
123 | obj[key] = t
124 | case Array:
125 | obj[key] = t
126 | case *Boolean:
127 | obj[key] = *t
128 | }
129 | }
130 |
131 | return obj, nil
132 | }
133 |
134 | type decodeArray []json.RawMessage
135 |
136 | func unmarshalArray(data []byte) (Value, error) {
137 | da := decodeArray{}
138 | if err := json.Unmarshal(data, &da); err != nil {
139 | return nil, err
140 | }
141 |
142 | arr := make(Array, len(da))
143 | for i, rm := range da {
144 | val, err := UnmarshalJSON([]byte(rm))
145 | if err != nil {
146 | return nil, err
147 | }
148 | switch t := val.(type) {
149 | case *String:
150 | arr[i] = *t
151 | case *Number:
152 | arr[i] = *t
153 | case *Integer:
154 | arr[i] = *t
155 | case *Null:
156 | arr[i] = *t
157 | case Object:
158 | arr[i] = t
159 | case Array:
160 | arr[i] = t
161 | case *Boolean:
162 | arr[i] = *t
163 | }
164 | }
165 |
166 | return arr, nil
167 | }
168 |
--------------------------------------------------------------------------------
/data_format_test.go:
--------------------------------------------------------------------------------
1 | package dataset
2 |
3 | import (
4 | "bytes"
5 | "testing"
6 | )
7 |
8 | func TestSupportedDataFormats(t *testing.T) {
9 | expect := []DataFormat{
10 | CBORDataFormat,
11 | JSONDataFormat,
12 | CSVDataFormat,
13 | XLSXDataFormat,
14 | NDJSONDataFormat,
15 | }
16 |
17 | for i, f := range SupportedDataFormats() {
18 | if expect[i] != f {
19 | t.Errorf("index %d mismatch. expected: %s got: %s", i, expect, f)
20 | }
21 | }
22 | }
23 |
24 | func TestDataFormatString(t *testing.T) {
25 | cases := []struct {
26 | f DataFormat
27 | expect string
28 | }{
29 | {UnknownDataFormat, ""},
30 | {CSVDataFormat, "csv"},
31 | {JSONDataFormat, "json"},
32 | {XMLDataFormat, "xml"},
33 | {XLSXDataFormat, "xlsx"},
34 | {CBORDataFormat, "cbor"},
35 | {NDJSONDataFormat, "ndjson"},
36 | }
37 |
38 | for i, c := range cases {
39 | if got := c.f.String(); got != c.expect {
40 | t.Errorf("case %d mismatch. expected: %q, got: %q", i, c.expect, got)
41 | continue
42 | }
43 | }
44 | }
45 |
46 | func TestParseDataFormatString(t *testing.T) {
47 | cases := []struct {
48 | in string
49 | expect DataFormat
50 | err string
51 | }{
52 | {"", UnknownDataFormat, ""},
53 | {".csv", CSVDataFormat, ""},
54 | {"csv", CSVDataFormat, ""},
55 | {".json", JSONDataFormat, ""},
56 | {"json", JSONDataFormat, ""},
57 | {".xml", XMLDataFormat, ""},
58 | {"xml", XMLDataFormat, ""},
59 | {".xlsx", XLSXDataFormat, ""},
60 | {"xlsx", XLSXDataFormat, ""},
61 | {"cbor", CBORDataFormat, ""},
62 | {".cbor", CBORDataFormat, ""},
63 | {".ndjson", NDJSONDataFormat, ""},
64 | {"ndjson", NDJSONDataFormat, ""},
65 | {".jsonl", NDJSONDataFormat, ""},
66 | {"jsonl", NDJSONDataFormat, ""},
67 | }
68 |
69 | for i, c := range cases {
70 | got, err := ParseDataFormatString(c.in)
71 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
72 | t.Errorf("case %d error mismatch '%s' != '%s'", i, c.expect, err)
73 | continue
74 | }
75 | if got != c.expect {
76 | t.Errorf("case %d response mismatch. expected: %s got: %s", i, c.expect, got)
77 | continue
78 | }
79 | }
80 | }
81 |
82 | func TestDataFormatMarshalJSON(t *testing.T) {
83 | cases := []struct {
84 | format DataFormat
85 | expect []byte
86 | err string
87 | }{
88 | {CSVDataFormat, []byte(`"csv"`), ""},
89 | {JSONDataFormat, []byte(`"json"`), ""},
90 | {XMLDataFormat, []byte(`"xml"`), ""},
91 | {XLSXDataFormat, []byte(`"xlsx"`), ""},
92 | {CBORDataFormat, []byte(`"cbor"`), ""},
93 | {NDJSONDataFormat, []byte(`"ndjson"`), ""},
94 | }
95 | for i, c := range cases {
96 | got, err := c.format.MarshalJSON()
97 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
98 | t.Errorf("case %d error mismatch. expected: %s, got: %s", i, c.err, err)
99 | continue
100 | }
101 | if !bytes.Equal(got, c.expect) {
102 | t.Errorf(`case %d response mismatch. expected: %s, got: %s`, i, string(c.expect), string(got))
103 | continue
104 | }
105 | }
106 | }
107 |
108 | func TestDataFormatUnmarshalJSON(t *testing.T) {
109 | cases := []struct {
110 | data []byte
111 | expect DataFormat
112 | err string
113 | }{
114 | {[]byte(`"csv"`), CSVDataFormat, ""},
115 | {[]byte(`"json"`), JSONDataFormat, ""},
116 | {[]byte(`"xml"`), XMLDataFormat, ""},
117 | {[]byte(`"xlsx"`), XLSXDataFormat, ""},
118 | {[]byte(`"cbor"`), CBORDataFormat, ""},
119 | {[]byte(`"ndjson"`), NDJSONDataFormat, ""},
120 | }
121 |
122 | for i, c := range cases {
123 | a := DataFormat(0)
124 | got := &a
125 | err := got.UnmarshalJSON(c.data)
126 | if !(err == nil && c.err == "" || err != nil && err.Error() == c.err) {
127 | t.Errorf("case %d error mismatch. expected: %s, got: %s", i, c.err, err)
128 | continue
129 | }
130 | if *got != c.expect {
131 | t.Errorf(`case %d response mismatch. expected: %s, got: %s`, i, c.expect, *got)
132 | continue
133 | }
134 |
135 | }
136 | }
137 |
--------------------------------------------------------------------------------
/validate/dataset.go:
--------------------------------------------------------------------------------
1 | package validate
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/qri-io/dataset"
7 | "github.com/qri-io/jsonschema"
8 | )
9 |
10 | // Dataset checks that a dataset is valid for use
11 | // returning the first error encountered, nil if valid
12 | func Dataset(ds *dataset.Dataset) error {
13 | if ds == nil {
14 | return nil
15 | }
16 |
17 | if ds.Commit == nil {
18 | err := fmt.Errorf("commit is required")
19 | log.Debug(err.Error())
20 | return err
21 | } else if err := Commit(ds.Commit); err != nil {
22 | err := fmt.Errorf("commit: %s", err.Error())
23 | log.Debug(err.Error())
24 | return err
25 | }
26 | if ds.Structure != nil {
27 | if err := Structure(ds.Structure); err != nil {
28 | return fmt.Errorf("structure: %s", err.Error())
29 | }
30 | }
31 |
32 | return nil
33 | }
34 |
35 | // Commit checks that a dataset Commit is valid for use
36 | // returning the first error encountered, nil if valid
37 | func Commit(cm *dataset.Commit) error {
38 | if cm == nil {
39 | return nil
40 | }
41 |
42 | if cm.Title == "" {
43 | // return fmt.Errorf("title is required")
44 |
45 | } else if len(cm.Title) > 100 {
46 | return fmt.Errorf("title is too long. %d length exceeds 100 character limit", len(cm.Title))
47 | }
48 |
49 | return nil
50 | }
51 |
52 | // Structure checks that a dataset structure is valid for use
53 | // returning the first error encountered, nil if valid
54 | func Structure(s *dataset.Structure) error {
55 | if s == nil {
56 | return nil
57 | }
58 |
59 | df := s.DataFormat()
60 | if df == dataset.UnknownDataFormat {
61 | return fmt.Errorf("format is required")
62 | } else if df == dataset.CSVDataFormat {
63 | if s.Schema == nil {
64 | return fmt.Errorf("csv data format requires a schema")
65 | }
66 | }
67 |
68 | if err := Schema(s.Schema); err != nil {
69 | return fmt.Errorf("schema: %s", err.Error())
70 | }
71 |
72 | return nil
73 | }
74 |
// csvMetaSchema is a jsonschema for validating CSV schema definitions:
// a top-level array-of-arrays whose column entries are objects carrying
// a string "title" and an unconstrained "type".
// NOTE(review): "minItems" is declared inside an object schema below, but
// minItems only constrains arrays — it likely belongs on the enclosing
// "items" array instead; confirm intent before moving it.
var csvMetaSchema = jsonschema.Must(`{
  "type": "object",
  "properties": {
    "type": {
      "const": "array"
    },
    "items": {
      "type": "object",
      "properties": {
        "type": {
          "const": "array"
        },
        "items": {
          "type": "array",
          "items": {
            "type": "object",
            "minItems": 1,
            "properties": {
              "title": {
                "type": "string"
              },
              "type": true
            }
          }
        }
      }
    }
  }
}`)
105 |
106 | // jsonMetaSchema is a jsonschema for validating JSON schema definitions
107 | // var jsonMetaSchema = jsonschema.Must(``)
108 |
// Schema checks that a dataset schema is valid for use
// returning the first error encountered, nil if valid.
// Currently only presence is enforced; structural validation of the
// schema document itself is still unfinished (see TODO below).
func Schema(sch map[string]interface{}) error {
	if sch == nil {
		return fmt.Errorf("schema is required")
	}

	// TODO (b5): Um, like, finish this

	// if len(s.Fields) == 0 {
	// 	return fmt.Errorf("fields are required")
	// } else if err := Fields(s.Fields); err != nil {
	// 	return fmt.Errorf("fields: %s", err.Error())
	// }

	return nil
}
126 |
127 | // Fields checks that a slice of dataset fields is valid for use
128 | // returning the first error encountered, nil if valid
129 | // func Fields(fields []*dataset.Field) error {
130 | // if fields == nil {
131 | // return nil
132 | // }
133 |
134 | // checkedFieldNames := map[string]bool{}
135 | // for _, field := range fields {
136 | // if err := ValidName(field.Name); err != nil {
137 | // return err
138 | // }
139 | // seen := checkedFieldNames[field.Name]
140 | // if seen {
141 | // return fmt.Errorf("error: cannot use the same name, '%s' more than once", field.Name)
142 | // }
143 | // checkedFieldNames[field.Name] = true
144 | // }
145 | // return nil
146 | // }
147 |
--------------------------------------------------------------------------------
/dstest/dstest_test.go:
--------------------------------------------------------------------------------
1 | package dstest
2 |
3 | import (
4 | "bytes"
5 | "io/ioutil"
6 | "os"
7 | "testing"
8 |
9 | "github.com/qri-io/dataset"
10 | )
11 |
12 | func TestDatasetChecksum(t *testing.T) {
13 | expect := "085e607818aae2920e0e4b57c321c3b58e17b85d"
14 | sum := DatasetChecksum(&dataset.Dataset{})
15 | if sum != expect {
16 | t.Errorf("empty pod hash mismatch. expected: %s, got: %s", expect, sum)
17 | }
18 | }
19 |
20 | func TestLoadTestCases(t *testing.T) {
21 | tcs, err := LoadTestCases("testdata")
22 | if err != nil {
23 | t.Error(err)
24 | }
25 | if len(tcs) == 0 {
26 | t.Errorf("expected at least one test case to load")
27 | }
28 | }
29 |
30 | func TestBodyFilepath(t *testing.T) {
31 | fp, err := BodyFilepath("testdata/complete")
32 | if err != nil {
33 | t.Error(err.Error())
34 | return
35 | }
36 | if fp != "testdata/complete/body.csv" {
37 | t.Errorf("%s != %s", "testdata/complete/body.csv", fp)
38 | }
39 | }
40 |
// TestReadInputTransformScript exercises the missing-directory path:
// a bad dir must surface os.ErrNotExist.
func TestReadInputTransformScript(t *testing.T) {
	if _, _, err := ReadInputTransformScript("bad_dir"); err != os.ErrNotExist {
		t.Error("expected os.ErrNotExist on bad tf script read")
	}
}
46 |
47 | func TestReadInputReadmeScript(t *testing.T) {
48 | if _, _, err := ReadInputReadmeScript("bad_dir"); err != os.ErrNotExist {
49 | t.Error("expected os.ErrNotExist on bad tf script read")
50 | }
51 | _, _, err := ReadInputReadmeScript("testdata/complete")
52 | if err != nil {
53 | t.Fatal("could not open 'readme.md' file: %w", err)
54 | }
55 | }
56 |
57 | func TestNewTestCaseFromDir(t *testing.T) {
58 | var err error
59 | if _, err = NewTestCaseFromDir("testdata"); err == nil {
60 | t.Errorf("expected error")
61 | return
62 | }
63 |
64 | tc, err := NewTestCaseFromDir("testdata/complete")
65 | if err != nil {
66 | t.Errorf("error reading test dir: %s", err.Error())
67 | return
68 | }
69 |
70 | name := "complete"
71 | if tc.Name != name {
72 | t.Errorf("expected name to equal: %s. got: %s", name, tc.Name)
73 | }
74 |
75 | fn := "body.csv"
76 | if tc.BodyFilename != fn {
77 | t.Errorf("expected BodyFilename to equal: %s. got: %s", fn, tc.BodyFilename)
78 | }
79 |
80 | data := []byte(`city,pop,avg_age,in_usa
81 | toronto,40000000,55.5,false
82 | new york,8500000,44.4,true
83 | chicago,300000,44.4,true
84 | chatham,35000,65.25,true
85 | raleigh,250000,50.65,true
86 | `)
87 | if !bytes.Equal(tc.Body, data) {
88 | t.Errorf("data mismatch")
89 | }
90 |
91 | mf := tc.BodyFile()
92 | if mf.FileName() != tc.BodyFilename {
93 | t.Errorf("filename mismatch: %s != %s", mf.FileName(), tc.BodyFilename)
94 | }
95 |
96 | if ts, ok := tc.TransformScriptFile(); !ok {
97 | t.Errorf("expected tranform script to load")
98 | } else {
99 | if ts.FileName() != "transform.star" {
100 | t.Errorf("expected TransformScript filename to be transform.star")
101 | }
102 | }
103 | tc.TransformScript = nil
104 | if _, ok := tc.TransformScriptFile(); ok {
105 | t.Error("shouldn't generate TransformScript File if bytes are nil")
106 | }
107 |
108 | if vz, ok := tc.VizScriptFile(); !ok {
109 | t.Errorf("expected viz script to load")
110 | } else {
111 | if vz.FileName() != "template.html" {
112 | t.Errorf("expected VizScript filename to be template.html")
113 | }
114 | }
115 | tc.VizScript = nil
116 | if _, ok := tc.VizScriptFile(); ok {
117 | t.Error("shouldn't generate VizScript File if bytes are nil")
118 | }
119 |
120 | if rm, ok := tc.ReadmeScriptFile(); !ok {
121 | t.Errorf("expected readme script to load")
122 | } else {
123 | if rm.FileName() != "readme.md" {
124 | t.Errorf("expected ReadmeScript filename to be template.html")
125 | }
126 | }
127 | tc.ReadmeScript = nil
128 | if _, ok := tc.ReadmeScriptFile(); ok {
129 | t.Error("shouldn't generate ReadmeScript File if bytes are nil")
130 | }
131 |
132 | mfdata, err := ioutil.ReadAll(mf)
133 | if err != nil {
134 | t.Errorf("error reading file: %s", err.Error())
135 | }
136 |
137 | if !bytes.Equal(mfdata, data) {
138 | t.Errorf("memfile data mismatch")
139 | }
140 |
141 | rendered, err := tc.RenderedFile()
142 | if err != nil {
143 | t.Errorf("reading %s: %s", RenderedFilename, err)
144 | }
145 | if rendered == nil {
146 | t.Error("expected rendered to not equal nil")
147 | }
148 | }
149 |
--------------------------------------------------------------------------------
/detect/determineFields_test.go:
--------------------------------------------------------------------------------
1 | package detect
2 |
3 | import (
4 | "bytes"
5 | "testing"
6 |
7 | "github.com/google/go-cmp/cmp"
8 | "github.com/qri-io/dataset"
9 | )
10 |
// egCorruptCsvData opens with unbalanced quotes — presumably meant to
// exercise csv parse-failure paths (not referenced in this file's tests)
var egCorruptCsvData = []byte(`
"""fhkajslfnakjlcdnajcl ashklj asdhcjklads ch,,,\dagfd
`)

// egNaicsCsvData carries a recognizable header row followed by data rows
var egNaicsCsvData = []byte(`
STATE,FIRM,PAYR_N,PAYRFL_N,STATEDSCR,NAICSDSCR,entrsizedscr
00,--,74883.53,5621697325,United States,Total,01: Total
00,--,35806.37,241347624,United States,Total,02: 0-4`)

// egNoHeaderData1 mixes string and boolean columns, no header row
var egNoHeaderData1 = []byte(`
example,false,other,stuff
ex,true,text,col
`)

// egNoHeaderData2 ends each row with an integer column, no header row
var egNoHeaderData2 = []byte(`
this,example,has,a,number,column,1
this,example,has,a,number,column,2
this,example,has,a,number,column,3`)

// egNoHeaderData3 has a numeric middle column, no header row
var egNoHeaderData3 = []byte(`
one, 1, three
one, 2, three`)

// egNoHeaderData4 has no leading newline and no header row
var egNoHeaderData4 = []byte(`one,two,3
four,five,6`)

// egNonDeterministicHeader repeats identical rows, so header detection
// cannot distinguish a header from data (not referenced in this file's tests)
var egNonDeterministicHeader = []byte(`
not,possible,to,tell,if,this,csv,data,has,a,header
not,possible,to,tell,if,this,csv,data,has,a,header
not,possible,to,tell,if,this,csv,data,has,a,header
not,possible,to,tell,if,this,csv,data,has,a,header
`)
43 |
44 | func TestDetermineCSVSchema(t *testing.T) {
45 |
46 | runTestCase(t, "noHeaderData1", egNoHeaderData1,
47 | map[string]interface{}{
48 | "items": map[string]interface{}{
49 | "items": []interface{}{
50 | map[string]interface{}{
51 | "title": "field_1",
52 | "type": "string",
53 | },
54 | map[string]interface{}{
55 | "title": "field_2",
56 | "type": "boolean",
57 | }, map[string]interface{}{
58 | "title": "field_3",
59 | "type": "string",
60 | }, map[string]interface{}{
61 | "title": "field_4",
62 | "type": "string",
63 | },
64 | },
65 | "type": "array",
66 | },
67 | "type": "array",
68 | })
69 |
70 | runTestCase(t, "noHeaderData2", egNoHeaderData2,
71 | map[string]interface{}{
72 | "items": map[string]interface{}{
73 | "items": []interface{}{
74 | map[string]interface{}{
75 | "title": "field_1",
76 | "type": "string",
77 | },
78 | map[string]interface{}{
79 | "title": "field_2",
80 | "type": "string",
81 | }, map[string]interface{}{
82 | "title": "field_3",
83 | "type": "string",
84 | }, map[string]interface{}{
85 | "title": "field_4",
86 | "type": "string",
87 | }, map[string]interface{}{
88 | "title": "field_5",
89 | "type": "string",
90 | }, map[string]interface{}{
91 | "title": "field_6",
92 | "type": "string",
93 | }, map[string]interface{}{
94 | "title": "field_7",
95 | "type": "integer",
96 | },
97 | },
98 | "type": "array",
99 | },
100 | "type": "array",
101 | })
102 |
103 | runTestCase(t, "noHeaderData3", egNoHeaderData3,
104 | map[string]interface{}{
105 | "items": map[string]interface{}{
106 | "items": []interface{}{
107 | map[string]interface{}{
108 | "title": "field_1",
109 | "type": "string",
110 | },
111 | map[string]interface{}{
112 | "title": "field_2",
113 | "type": "integer",
114 | }, map[string]interface{}{
115 | "title": "field_3",
116 | "type": "string",
117 | },
118 | },
119 | "type": "array",
120 | },
121 | "type": "array",
122 | })
123 |
124 | runTestCase(t, "noHeaderData4", egNoHeaderData4,
125 | map[string]interface{}{
126 | "items": map[string]interface{}{
127 | "items": []interface{}{
128 | map[string]interface{}{
129 | "title": "field_1",
130 | "type": "string",
131 | },
132 | map[string]interface{}{
133 | "title": "field_2",
134 | "type": "string",
135 | }, map[string]interface{}{
136 | "title": "field_3",
137 | "type": "integer",
138 | },
139 | },
140 | "type": "array",
141 | },
142 | "type": "array",
143 | })
144 | }
145 |
146 | func runTestCase(t *testing.T, description string, input []byte, expect map[string]interface{}) {
147 | st := dataset.Structure{Format: "csv"}
148 | reader := bytes.NewReader(input)
149 | schema, _, err := CSVSchema(&st, reader)
150 | if err != nil {
151 | t.Fatal(err)
152 | }
153 | if diff := cmp.Diff(expect, schema); diff != "" {
154 | t.Errorf("mismatch for \"%s\" (-want +got):\n%s\n", description, diff)
155 | }
156 | }
157 |
--------------------------------------------------------------------------------
/dsio/xlsx_test.go:
--------------------------------------------------------------------------------
1 | package dsio
2 |
3 | import (
4 | "bytes"
5 | "os"
6 | "testing"
7 |
8 | "github.com/qri-io/dataset"
9 | "github.com/qri-io/dataset/dstest"
10 | )
11 |
// xlsxStruct describes the "Sheet1" worksheet used by the xlsx tests:
// rows of seven columns spanning every schema value type
var xlsxStruct = &dataset.Structure{
	Format: "xlsx",
	FormatConfig: map[string]interface{}{
		"sheetName": "Sheet1",
	},
	Schema: map[string]interface{}{
		"type": "array",
		"items": map[string]interface{}{
			"type": "array",
			"items": []interface{}{
				map[string]interface{}{"title": "col_a", "type": "string"},
				map[string]interface{}{"title": "col_b", "type": "number"},
				map[string]interface{}{"title": "col_c", "type": "integer"},
				map[string]interface{}{"title": "col_d", "type": "boolean"},
				map[string]interface{}{"title": "col_e", "type": "object"},
				map[string]interface{}{"title": "col_f", "type": "array"},
				map[string]interface{}{"title": "col_g", "type": "null"},
			},
		},
	},
}
33 |
34 | func TestXLSXReader(t *testing.T) {
35 | f, err := os.Open("testdata/xlsx/simple/body.xlsx")
36 | if err != nil {
37 | t.Fatal(err.Error())
38 | }
39 |
40 | rdr, err := NewEntryReader(xlsxStruct, f)
41 | if err != nil {
42 | t.Errorf("error allocating EntryReader: %s", err.Error())
43 | return
44 | }
45 | count := 0
46 | for {
47 | ent, err := rdr.ReadEntry()
48 | if err != nil {
49 | if err.Error() == "EOF" {
50 | break
51 | }
52 | t.Errorf("unexpected error: %s", err.Error())
53 | return
54 | }
55 |
56 | if arr, ok := ent.Value.([]interface{}); ok {
57 | if len(arr) != 2 {
58 | t.Errorf("invalid row length for row %d. expected %d, got %d", count, 7, len(arr))
59 | continue
60 | }
61 | } else {
62 | t.Errorf("expected value to []interface{}. got: %#v", ent.Value)
63 | continue
64 | }
65 |
66 | count++
67 | }
68 | if count != 4 {
69 | t.Errorf("expected: %d rows, got: %d", 4, count)
70 | }
71 | }
72 |
73 | func TestColIndexToLetters(t *testing.T) {
74 | cases := []struct {
75 | in int
76 | expect string
77 | }{
78 | {0, "A"},
79 | {25, "Z"},
80 | {26, "AA"},
81 | }
82 | for i, c := range cases {
83 | got := ColIndexToLetters(c.in)
84 | if got != c.expect {
85 | t.Errorf("case %d expected: %s, got: %s", i, c.expect, got)
86 | }
87 | }
88 | }
89 |
90 | func TestXLSXWriter(t *testing.T) {
91 | rows := []Entry{
92 | // TODO - vary up test input
93 | {Value: []interface{}{"a", float64(12), 23, nil}},
94 | {Value: []interface{}{"a", float64(12), 23, []interface{}{"foo", "bar"}}},
95 | {Value: []interface{}{"a", float64(12), 23, map[string]interface{}{"foo": "bar"}}},
96 | {Value: []interface{}{"a", float64(12), int64(23), false}},
97 | {Value: []interface{}{"a", float64(12), 23, false}},
98 | }
99 |
100 | buf := &bytes.Buffer{}
101 | rw, err := NewEntryWriter(xlsxStruct, buf)
102 | if err != nil {
103 | t.Errorf("error allocating EntryWriter: %s", err.Error())
104 | return
105 | }
106 | st := rw.Structure()
107 | if diff := dstest.CompareStructures(st, xlsxStruct); diff != "" {
108 | t.Errorf("structure mismatch: %s", diff)
109 | return
110 | }
111 |
112 | for i, row := range rows {
113 | if err := rw.WriteEntry(row); err != nil {
114 | t.Errorf("row %d write error: %s", i, err.Error())
115 | }
116 | }
117 |
118 | if err := rw.Close(); err != nil {
119 | t.Errorf("close reader error: %s", err.Error())
120 | return
121 | }
122 | }
123 |
// TestXLSXCompression checks that both the xlsx reader and writer refuse
// a structure that also specifies a compression setting.
func TestXLSXCompression(t *testing.T) {
	if _, err := NewXLSXReader(&dataset.Structure{Format: "xlsx", Compression: "gzip"}, nil); err == nil {
		t.Error("expected xlsx to fail when using compression")
	}
	if _, err := NewXLSXWriter(&dataset.Structure{Format: "xlsx", Compression: "gzip"}, nil); err == nil {
		t.Error("expected xlsx to fail when using compression")
	}
}
132 |
133 | /*
134 | TODO(dustmop): Disabled, testdata/movies/data.xlsx doesn't exist
135 |
136 | func BenchmarkXLSXReader(b *testing.B) {
137 | st := &dataset.Structure{Format: "xlsx", Schema: tabular.BaseTabularSchema}
138 |
139 | for n := 0; n < b.N; n++ {
140 | file, err := os.Open("testdata/movies/data.xlsx")
141 | if err != nil {
142 | b.Errorf("unexpected error: %s", err.Error())
143 | }
144 | r, err := NewXLSXReader(st, file)
145 | if err != nil {
146 | b.Fatalf("unexpected error: %s", err.Error())
147 | }
148 | for {
149 | _, err = r.ReadEntry()
150 | if err != nil {
151 | break
152 | }
153 | }
154 | }
155 | }
156 | */
157 |
--------------------------------------------------------------------------------