├── test ├── fixtures │ ├── sample.csv │ └── finance-vix │ │ ├── .datahub │ │ └── flow.yaml │ │ ├── data │ │ └── vix-daily.csv │ │ ├── datapackage.json │ │ └── README.md ├── authorization.test.js ├── push │ └── push.test.js └── cli.test.js ├── lib └── utils │ ├── logo.js │ ├── output │ ├── info.js │ ├── error.js │ └── wait.js │ ├── tools.js │ ├── update.js │ └── error.js ├── .gitmodules ├── .gitignore ├── docs ├── login.md ├── init.md ├── push-flow.md ├── cat.md ├── get.md ├── validate.md ├── info.md ├── help.md └── push.md ├── bin ├── data-help.js ├── data-init.js ├── data-login.js ├── data-cat.js ├── data-push-flow.js ├── data-info.js ├── data-validate.js ├── data.js ├── data-get.js └── data-push.js ├── test-script.sh ├── .travis.yml ├── package.json ├── README.md └── DESIGN.md /test/fixtures/sample.csv: -------------------------------------------------------------------------------- 1 | number,string,boolean 2 | 1,two,true 3 | 3,four,false 4 | -------------------------------------------------------------------------------- /lib/utils/logo.js: -------------------------------------------------------------------------------- 1 | module.exports.box = '📦' 2 | module.exports.elephant = '🐘' 3 | module.exports.square = '❒' 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "test/fixtures/test-data"] 2 | path = test/fixtures/test-data 3 | url = https://github.com/frictionlessdata/test-data 4 | -------------------------------------------------------------------------------- /lib/utils/output/info.js: -------------------------------------------------------------------------------- 1 | const chalk = require('chalk') 2 | 3 | // Prints an informational message 4 | module.exports = msg => { 5 | console.log(`${chalk.gray('>')} ${msg}`) 6 | } 7 | -------------------------------------------------------------------------------- /lib/utils/output/error.js: -------------------------------------------------------------------------------- 1 | const chalk = require('chalk') 2 | 3 | // Prints an error message 4 | module.exports = msg => { 5 | if (msg.message) { 6 | msg = msg.message 7 | } 8 | console.log(`${chalk.red('> Error!')} ${msg}`) 9 | } 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | lib-cov 4 | *.seed 5 | *.log 6 | *.dat 7 | *.out 8 | *.pid 9 | *.gz 10 | 11 | pids/ 12 | logs/ 13 | results/ 14 | node_modules/ 15 | .idea/ 16 | 17 | npm-debug.log 18 | package-lock.json 19 | 20 | sandbox/* 21 | packed/* 22 | -------------------------------------------------------------------------------- /docs/login.md: -------------------------------------------------------------------------------- 1 | ``` 2 | ■ data login 3 | ``` 4 | 5 | Login to DataHub system using your Google/GitHub account. 
6 | 7 | ## Options: 8 | 9 | ``` 10 | -i, --interactive Displays the authentication URL 11 | -h, --help Outputs usage information 12 | ``` 13 | -------------------------------------------------------------------------------- /bin/data-help.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | const path = require('path') 3 | 4 | const {customMarked} = require('../lib/utils/tools.js') 5 | 6 | const helpMarkdown = fs.readFileSync(path.join(__dirname, '../docs/help.md'), 'utf8') 7 | 8 | console.log('\n' + customMarked(helpMarkdown)) 9 | -------------------------------------------------------------------------------- /lib/utils/output/wait.js: -------------------------------------------------------------------------------- 1 | const ora = require('ora') 2 | const chalk = require('chalk') 3 | const {eraseLine} = require('ansi-escapes') 4 | 5 | // Prints a spinner followed by the given text 6 | module.exports = msg => { 7 | const spinner = ora(chalk.gray(msg)) 8 | spinner.color = 'gray' 9 | spinner.start() 10 | 11 | return () => { 12 | spinner.stop() 13 | process.stdout.write(eraseLine) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /test-script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ev 3 | echo '>>> Now running shell script...' 4 | npm i -g git+https://github.com/datahq/data-cli.git 5 | data --version 6 | data help 7 | data info https://datahub.io/core/finance-vix 8 | 9 | echo '>>> Installing data-cli with yarn...' 10 | npm uninstall -g data-cli 11 | 12 | yarn global add git+https://github.com/datahq/data-cli.git 13 | data --version 14 | data info https://datahub.io/core/finance-vix 15 | -------------------------------------------------------------------------------- /docs/init.md: -------------------------------------------------------------------------------- 1 | ``` 2 | ■ data init 3 | ``` 4 | 5 | Initialize a Data Package in the current working directory. 6 | It will scan the current working directory and nested directories for the files and generate a `datapackage.json`. 7 | 8 | ## Options: 9 | 10 | ``` 11 | -h, --help Output usage information 12 | -i, --interactive Run init in interactive mode 13 | ``` 14 | 15 | ## Example: 16 | 17 | ``` 18 | # Initialize Data Package in current working directory: 19 | ■ data init 20 | ``` 21 | -------------------------------------------------------------------------------- /docs/push-flow.md: -------------------------------------------------------------------------------- 1 | ``` 2 | ■ data push-flow [PATH] 3 | ``` 4 | `PATH` (optional) is the path to the data package. 
5 | 6 | ## Options: 7 | 8 | -h, --help Output usage information 9 | 10 | ## Examples: 11 | 12 | \- Uploads the Data Package in the current working directory to the DataHub: 13 | 14 | ■ data push-flow 15 | 16 | The data package should contain .datahub/flow.yaml 17 | 18 | \- Uploads the Data Package at the given path to the DataHub: 19 | 20 | ■ data push-flow core/finance-vix/ 21 | 22 | core/finance-vix/ should contain datapackage.json and .datahub/flow.yaml 23 | -------------------------------------------------------------------------------- /test/fixtures/finance-vix/.datahub/flow.yaml: -------------------------------------------------------------------------------- 1 | meta: 2 | dataset: finance-vix 3 | findability: published 4 | owner: test 5 | ownerid: testid 6 | version: 1 7 | inputs: 8 | - kind: datapackage 9 | parameters: 10 | resource-mapping: 11 | vix-daily: http:/testing.com/vixcurrent.csv 12 | url: http:/testing.com/.datahub/datapackage.json 13 | processing: 14 | - 15 | input: vix-daily 16 | tabulator: 17 | skip_rows: 2 18 | headers: 19 | - Date 20 | - VIXOpen 21 | - VIXHigh 22 | - VIXLow 23 | - VIXClose 24 | output: vix-daily 25 | schedule: 'every 1d' 26 | -------------------------------------------------------------------------------- /docs/cat.md: -------------------------------------------------------------------------------- 1 | ``` 2 | ■ data cat [OPTIONS] PATH-OR-URL [OUT-PATH] 3 | ``` 4 | Read a data file and write its output to stdout, or to `OUT-PATH` if `OUT-PATH` is provided. 5 | 6 | Input data formats supported: 7 | 8 | * csv 9 | * excel 10 | 11 | Output formats supported: 12 | 13 | * ascii table (default - if no format specified) 14 | * csv 15 | * excel (.xlsx) 16 | * markdown (.md) 17 | 18 | ## Options 19 | 20 | --format Explicitly provide the input file format, e.g., if it does not have a conventional name 21 | 22 | ## Examples 23 | 24 | Reading from stdin: 25 | 26 | ``` 27 | ■ cat PATH | data cat _ [OUT-PATH] 28 | 29 | ■ curl URL | data cat _ [OUT-PATH] 30 | ``` 31 | -------------------------------------------------------------------------------- /docs/get.md: -------------------------------------------------------------------------------- 1 | ``` 2 | ■ data get 3 | ``` 4 | 5 | Get a dataset from the given URL. 6 | 7 | URL can be one of: 8 | 9 | * dataset in DataHub (e.g., https://datahub.io/core/co2-ppm) 10 | * dataset in GitHub (e.g., https://github.com/datasets/co2-ppm) 11 | * direct URL to dataset 12 | 13 | ## Options: 14 | 15 | ``` 16 | -h, --help Outputs usage information 17 | ``` 18 | 19 | ## Example: 20 | 21 | ``` 22 | # Get dataset from DataHub 23 | # The following dataset will be saved in core/co2-ppm 24 | ■ data get https://datahub.io/core/co2-ppm 25 | 26 | # From GitHub 27 | # The following dataset will be saved in datasets/co2-ppm 28 | ■ data get https://github.com/datasets/co2-ppm 29 | ``` 30 | -------------------------------------------------------------------------------- /docs/validate.md: -------------------------------------------------------------------------------- 1 | 2 | # Validate a descriptor 3 | 4 | ## Usage: 5 | 6 | ``` 7 | # Validates datapackage.json in the given path/URL, or in the cwd if not given: 8 | ■ data validate [path | URL] 9 | 10 | # If a descriptor is invalid, it will print out validation errors.
11 | ``` 12 | 13 | ## Options: 14 | 15 | ``` 16 | -h, --help Output usage information 17 | ``` 18 | 19 | ## Example: 20 | 21 | ``` 22 | # Validate descriptor in current working directory: 23 | ■ data validate 24 | 25 | # Validate descriptor from local path: 26 | ■ data validate test/fixtures/datapackage.json 27 | 28 | # Validate descriptor from URL: 29 | ■ data validate https://bits-staging.datapackaged.com/metadata/core/gdp/_v/latest/datapackage.json 30 | ``` 31 | -------------------------------------------------------------------------------- /test/fixtures/finance-vix/data/vix-daily.csv: -------------------------------------------------------------------------------- 1 | Date,VIXOpen,VIXHigh,VIXLow,VIXClose 2 | 2004-01-02,17.96,18.68,17.54,18.22 3 | 2004-01-05,18.45,18.49,17.44,17.49 4 | 2004-01-06,17.66,17.67,16.19,16.73 5 | 2004-01-07,16.72,16.75,15.05,15.05 6 | 2004-01-08,15.42,15.68,15.32,15.61 7 | 2004-01-09,16.15,16.88,15.57,16.75 8 | 2004-01-12,17.32,17.46,16.79,16.82 9 | 2004-01-13,16.06,18.33,16.53,18.04 10 | 2004-01-14,17.29,17.03,16.04,16.75 11 | 2004-01-15,17.07,17.31,15.49,15.56 12 | 2004-01-16,15.04,15.44,14.09,15 13 | 2004-01-20,15.77,16.13,15.09,15.21 14 | 2004-01-21,15.63,15.63,14.24,14.34 15 | 2004-01-22,14.02,14.87,14.01,14.71 16 | 2004-01-23,14.73,15.05,14.56,14.84 17 | 2004-01-26,15.78,15.78,14.52,14.55 18 | 2004-01-27,15.28,15.44,14.74,15.35 19 | 2004-01-28,15.37,17.06,15.29,16.78 20 | 2004-01-29,16.88,17.66,16.79,17.14 21 | -------------------------------------------------------------------------------- /docs/info.md: -------------------------------------------------------------------------------- 1 | 2 | # Preview a Dataset 3 | 4 | ## Usage: 5 | 6 | ``` 7 | # Get information about Dataset: 8 | ■ data info [path] 9 | ``` 10 | 11 | ## Options: 12 | 13 | ``` 14 | -h, --help Output usage information 15 | --format Explicitly provide input file format, e.g., if it does not have conventional name 16 | ``` 17 | 18 | ## Example: 19 | 20 | ``` 21 | # Get information about Dataset in current working directory: 22 | ■ data info 23 | 24 | # Get information about Dataset providing local path: 25 | ■ data info dir/finance-vix 26 | 27 | # Or you can get info about remote dataset: 28 | ■ data info https://raw.githubusercontent.com/datasets/gdp/master/datapackage.json 29 | 30 | # Additionally, you can preview local or remote tabular data file: 31 | ■ data info https://raw.githubusercontent.com/datahq/core-datasets-tools/master/examples.csv 32 | -------------------------------------------------------------------------------- /lib/utils/tools.js: -------------------------------------------------------------------------------- 1 | // Markdown 2 | const marked = require('marked') 3 | const TerminalRenderer = require('marked-terminal') 4 | // Global packages 5 | const globalPackages = require('global-packages') 6 | 7 | const {elephant} = require('./logo') 8 | 9 | marked.setOptions({ 10 | renderer: new TerminalRenderer() 11 | }) 12 | module.exports.customMarked = marked 13 | 14 | const installedWithNPM = async () => { 15 | let packages 16 | 17 | try { 18 | packages = await globalPackages() 19 | } catch (err) { 20 | console.log(err) 21 | return false 22 | } 23 | 24 | if (!Array.isArray(packages)) { 25 | return false 26 | } 27 | 28 | const related = packages.find(item => item.name === 'now') 29 | 30 | if (!related || related.linked === true) { 31 | return false 32 | } 33 | 34 | if (related.linked === false) { 35 | return true 36 | } 37 | 38 | return false 39 | } 40 | 
module.exports.installedWithNPM = installedWithNPM 41 | -------------------------------------------------------------------------------- /docs/help.md: -------------------------------------------------------------------------------- 1 | ``` 2 | ❒ data [options] 3 | ``` 4 | ## Commands: 5 | 6 | DataHub: 7 | 8 | push [path] Push data at `path` to the DataHub 9 | 10 | General: 11 | 12 | get [url] Retrieve data at `url` to local disk 13 | info [path/url] Get info on data (file or dataset) at path or url 14 | cat path [out] Read data at path and write to out (or stdout) 15 | 16 | Data Package specific: 17 | 18 | init Create a Data Package 19 | validate Validate Data Package structure 20 | 21 | Administrative: 22 | 23 | help [cmd] Show help on cmd 24 | login Login or signup to the DataHub 25 | 26 | ## Options: 27 | 28 | -h, --help Output usage information 29 | -v, --version Output the version 30 | 31 | ## Examples 32 | 33 | Push a Data Package (in the current directory) 34 | 35 | ■ data push 36 | 37 | Get a Data Package from the DataHub owned by `core` and with name `finance-vix` 38 | 39 | ■ data get https://datahub.io/core/finance-vix 40 | 41 | Get a Data Package on github 42 | 43 | ■ data get https://github.com/datasets/gdp 44 | 45 | -------------------------------------------------------------------------------- /test/fixtures/finance-vix/datapackage.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "finance-vix", 3 | "title": "VIX - CBOE Volatility Index", 4 | "homepage": "http://www.cboe.com/micro/VIX/", 5 | "version": "0.1.0", 6 | "license": "PDDL-1.0", 7 | "sources": [{ 8 | "title": "CBOE VIX Page", 9 | "web": "http://www.cboe.com/micro/vix/historical.aspx" 10 | }], 11 | "resources": [ 12 | { 13 | "name": "vix-daily", 14 | "path": "data/vix-daily.csv", 15 | "format": "csv", 16 | "mediatype": "text/csv", 17 | "schema": { 18 | "fields": [ 19 | { 20 | "name": "Date", 21 | "type": "date", 22 | "description": "" 23 | }, 24 | { 25 | "name": "VIXOpen", 26 | "type": "number", 27 | "description": "" 28 | }, 29 | { 30 | "name": "VIXHigh", 31 | "type": "number", 32 | "description": "" 33 | }, 34 | { 35 | "name": "VIXLow", 36 | "type": "number", 37 | "description": "" 38 | }, 39 | { 40 | "name": "VIXClose", 41 | "type": "number", 42 | "description": "" 43 | } 44 | ], 45 | "primaryKey": "Date" 46 | } 47 | } 48 | ], 49 | "views": [ 50 | { 51 | "id": "Graph", 52 | "type": "Graph", 53 | "state": { 54 | "graphType": "lines", 55 | "group": "Date", 56 | "series": [ "VIXClose" ] 57 | } 58 | } 59 | ] 60 | } 61 | -------------------------------------------------------------------------------- /lib/utils/update.js: -------------------------------------------------------------------------------- 1 | const pkg = require('../../package.json') 2 | const updateNotifier = require('update-notifier') 3 | const boxen = require('boxen') 4 | 5 | module.exports = () => { 6 | const notifier = updateNotifier({ 7 | pkg, 8 | updateCheckInterval: 1000 9 | }) 10 | 11 | if (!notifier.update) { 12 | return 13 | } 14 | 15 | // Depending on running OS show appropriate instructions: 16 | const introduction = 'If you\'ve installed data tool using our executable binary then follow instructions below:\n' 17 | const instructions = { 18 | 'darwin': `\ncurl -L https://github.com/datahq/data-cli/releases/download/v${notifier.update.latest}/data-macos.gz -o ./data.gz 19 | gunzip -f data.gz && chmod +x data && sudo mv data /usr/local/bin/data`, 20 | 'linux': `\nwget 
https://github.com/datahq/data-cli/releases/download/v${notifier.update.latest}/data-linux.gz 21 | gunzip -f data-linux.gz && chmod +x data-linux && sudo mv data-linux /usr/local/bin/data`, 22 | 'win32': `\nDepending on your Windows distribution and configurations, you may need to use different path when moving the executable.\n 23 | You need to run 'move' command as administrator: 24 | curl -k --insecure -L https://github.com/datahq/data-cli/releases/download/v${notifier.update.latest}/data-win.exe.gz -o ./data.gz 25 | gzip -d data.gz && move data "C:\\Windows\\System32\\data.exe"` 26 | } 27 | const summary = `\ndata -v # should print ${notifier.update.latest}` 28 | 29 | if (notifier.update) { 30 | notifier.notify({ 31 | defer: false, 32 | isGlobal: true 33 | }) 34 | console.log(introduction + instructions[process.platform] + summary) 35 | } else { 36 | return 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /bin/data-init.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | // Packages 4 | const fs = require('fs') 5 | const path = require('path') 6 | const minimist = require('minimist') 7 | const {Init} = require('datahub-client') 8 | 9 | // Ours 10 | const {customMarked} = require('../lib/utils/tools.js') 11 | const info = require('../lib/utils/output/info.js') 12 | 13 | const argv = minimist(process.argv.slice(2), { 14 | string: ['init'], 15 | boolean: ['help', 'interactive'], 16 | alias: { 17 | help: 'h', 18 | interactive: 'i' 19 | } 20 | }) 21 | 22 | const initMarkdown = fs.readFileSync(path.join(__dirname, '../docs/init.md'), 'utf8') 23 | const help = () => { 24 | console.log('\n' + customMarked(initMarkdown)) 25 | } 26 | 27 | if (argv.help) { 28 | help() 29 | process.exit(0) 30 | } 31 | 32 | 33 | const checkDpIsThere = (path_ = process.cwd()) => { 34 | const files = fs.readdirSync(path_) 35 | return files.indexOf('datapackage.json') > -1 36 | } 37 | 38 | 39 | (async() => { 40 | 41 | const initializer = new Init({interactive: argv.interactive, path_: argv._[0]}) 42 | // Listen for events: 43 | initializer 44 | .on('message', (message) => { 45 | info(message) 46 | }) 47 | .on('exit', (message) => { 48 | info(message) 49 | process.exit(0) 50 | }) 51 | 52 | // Get a descriptor generated: 53 | let descriptor = {} 54 | if (checkDpIsThere(argv._[0])) { 55 | descriptor = await initializer.updateDataset() 56 | } else { 57 | descriptor = await initializer.createDataset() 58 | } 59 | // Now save the generated descriptor: 60 | const content = JSON.stringify(descriptor, null, 2) 61 | const dest = path.join(argv._[0] || '', 'datapackage.json') 62 | fs.writeFile(dest, content, 'utf8', err => { 63 | if (err) { 64 | throw new Error(err) 65 | } else { 66 | info(`\n💾 Descriptor is saved in "${dest}"`) 67 | } 68 | }) 69 | 70 | })() 71 | -------------------------------------------------------------------------------- /lib/utils/error.js: -------------------------------------------------------------------------------- 1 | // Packages: 2 | const Raven = require('raven') 3 | 4 | // Ours: 5 | const error = require('./output/error') 6 | const info = require('./output/info') 7 | const {version} = require('../../package.json') 8 | const {installedWithNPM} = require('./tools') 9 | 10 | 11 | async function handleError(err, {debug = false} = {}) { 12 | if (process.env.datahub !== 'dev') { // Send report to Sentry if not dev env 13 | // Setup Sentry: 14 | 
Raven.config('https://e29902aa81ed414d867f51bd0d1ab91a:2b18fef80e954ba68d8f4351aab99672@sentry.io/305079', { 15 | release: version, 16 | extra: { 17 | args: process.argv, 18 | nodejsOrBin: installedWithNPM ? process.version : 'bin', 19 | os: process.platform 20 | } 21 | }) 22 | 23 | await new Promise((resolve, reject) => { 24 | // Capture errors: 25 | Raven.captureException(err, (sendErr, eventId) => { 26 | // Once report is sent resolve the promise. However, we resolve it even 27 | // if it failed to send a report: 28 | resolve() 29 | }) 30 | }) 31 | } 32 | 33 | // Coerce Strings to Error instances 34 | if (typeof err === 'string') { 35 | err = new Error(err) 36 | } 37 | 38 | if (debug) { 39 | console.log(`> [debug] handling error: ${err.stack}`) 40 | } 41 | 42 | if (err.code === 'ECONNREFUSED' || err.code === 'ENOTFOUND') { 43 | error(`Connection error: ${err.message}`) 44 | } else { 45 | if (err.constructor.name === 'Array') { 46 | err.forEach(err => error(err.message)) 47 | } else { 48 | error(err) 49 | // Check if error is due to `xdg-open` module on Linux and print instructions: 50 | if (err.message && err.message.includes('xdg-open')) { 51 | info('Run following command and try again, please:\ncp /usr/bin/xdg-open /usr/local/bin/xdg-open') 52 | } 53 | } 54 | } 55 | } 56 | 57 | module.exports = { 58 | handleError, 59 | error 60 | } 61 | -------------------------------------------------------------------------------- /test/fixtures/finance-vix/README.md: -------------------------------------------------------------------------------- 1 | CBOE Volatility Index (VIX) time-series dataset including daily open, close, 2 | high and low. The CBOE Volatility Index (VIX) is a key measure of market 3 | expectations of near-term volatility conveyed by S&P 500 stock index option 4 | prices introduced in 1993. 5 | 6 | ## Data 7 | 8 | From the [VIX FAQ][faq]: 9 | 10 | > In 1993, the Chicago Board Options Exchange® (CBOE®) introduced the CBOE 11 | > Volatility Index®, VIX®, and it quickly became the benchmark for stock market 12 | > volatility. It is widely followed and has been cited in hundreds of news 13 | > articles in the Wall Street Journal, Barron's and other leading financial 14 | > publications. Since volatility often signifies financial turmoil, VIX is 15 | > often referred to as the "investor fear gauge". 16 | > 17 | > VIX measures market expectation of near term volatility conveyed by stock 18 | > index option prices. The original VIX was constructed using the implied 19 | > volatilities of eight different OEX option series so that, at any given time, 20 | > it represented the implied volatility of a hypothetical at-the-money OEX 21 | > option with exactly 30 days to expiration. 22 | > 23 | > The New VIX still measures the market's expectation of 30-day volatility, but 24 | > in a way that conforms to the latest thinking and research among industry 25 | > practitioners. The New VIX is based on S&P 500 index option prices and 26 | > incorporates information from the volatility "skew" by using a wider range of 27 | > strike prices rather than just at-the-money series. 28 | 29 | [faq]: http://www.cboe.com/micro/vix/faq.aspx 30 | 31 | ## Preparation 32 | 33 | Run the shell script: 34 | 35 | . scripts/process.sh 36 | 37 | Output data is in `data/`. 38 | 39 | ### TODO 40 | 41 | * Incorporate computed historical data (1990-2003) 42 | * Consider incorporating VOX data 43 | 44 | ## License 45 | 46 | No obvious statement on [historical data page][historical]. 
Given size and 47 | factual nature of the data and its source from a US company would imagine this 48 | was public domain and as such have licensed the Data Package under the Public 49 | Domain Dedication and License (PDDL). 50 | 51 | [historical]: http://www.cboe.com/micro/vix/historical.aspx 52 | -------------------------------------------------------------------------------- /test/authorization.test.js: -------------------------------------------------------------------------------- 1 | const test = require('ava') 2 | const {Agent} = require('datahub-client') 3 | 4 | // ========================== 5 | // USER RIGHTS & RESTRICTIONS 6 | 7 | const mainPath = '/anuveyatsu/finance-vix' 8 | const dpJsonPath = mainPath + '/datapackage.json' 9 | const resourceCsvPath = mainPath + '/r/vix-daily.csv' 10 | const resourceJsonPath = mainPath + '/r/vix-daily.json' 11 | const zipPath = mainPath + '/r/finance-vix_zip.zip' 12 | 13 | const agent = new Agent('https://datahub.io') 14 | 15 | test('Access private dataset as unauthorized user', async t => { 16 | let response = await agent.fetch(mainPath) 17 | t.is(response.status, 404) 18 | response = await agent.fetch(dpJsonPath) 19 | t.is(response.status, 404) 20 | response = await agent.fetch(resourceCsvPath) 21 | t.is(response.status, 404) 22 | response = await agent.fetch(resourceJsonPath) 23 | t.is(response.status, 404) 24 | response = await agent.fetch(zipPath) 25 | t.is(response.status, 404) 26 | }) 27 | 28 | test('Access private dataset as non-owner user', async t => { 29 | // Token for 'test' user (Travis knows it): 30 | const token = process.env.token 31 | let response = await agent.fetch(mainPath + `?jwt=${token}`) 32 | t.is(response.status, 404) 33 | response = await agent.fetch(dpJsonPath + `?jwt=${token}`) 34 | t.is(response.status, 404) 35 | response = await agent.fetch(resourceCsvPath + `?jwt=${token}`) 36 | t.is(response.status, 404) 37 | response = await agent.fetch(resourceJsonPath + `?jwt=${token}`) 38 | t.is(response.status, 404) 39 | response = await agent.fetch(zipPath + `?jwt=${token}`) 40 | t.is(response.status, 404) 41 | }) 42 | 43 | test('Access private dataset as owner', async t => { 44 | // Owner's token is stored as secret env var on Travis 45 | const token = process.env.SECRET_OWNER_TOKEN 46 | let response = await agent.fetch(mainPath + `?jwt=${token}`) 47 | t.is(response.status, 200) 48 | response = await agent.fetch(dpJsonPath + `?jwt=${token}`) 49 | t.is(response.status, 200) 50 | response = await agent.fetch(resourceCsvPath + `?jwt=${token}`) 51 | t.is(response.status, 200) 52 | response = await agent.fetch(resourceJsonPath + `?jwt=${token}`) 53 | t.is(response.status, 200) 54 | response = await agent.fetch(zipPath + `?jwt=${token}`) 55 | t.is(response.status, 200) 56 | }) 57 | -------------------------------------------------------------------------------- /bin/data-login.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const fs = require('fs') 4 | const path = require('path') 5 | const minimist = require('minimist') 6 | const inquirer = require('inquirer') 7 | 8 | const {customMarked} = require('../lib/utils/tools.js') 9 | const {config} = require('datahub-client') 10 | const {handleError} = require('../lib/utils/error') 11 | const info = require('../lib/utils/output/info.js') 12 | const {login, authenticate} = require('datahub-client') 13 | const wait = require('../lib/utils/output/wait') 14 | 15 | const argv = minimist(process.argv.slice(2), { 16 | 
string: ['login'], 17 | boolean: ['help', 'interactive'], 18 | alias: {help: 'h', interactive: 'i'} 19 | }) 20 | 21 | const configMarkdown = fs.readFileSync(path.join(__dirname, '../docs/login.md'), 'utf8') 22 | const help = () => { 23 | console.log('\n' + customMarked(configMarkdown)) 24 | } 25 | 26 | if (argv.help) { 27 | help() 28 | process.exit(0) 29 | } 30 | 31 | Promise.resolve().then(async () => { 32 | const stopSpinner = wait('Logging in ...') 33 | const apiUrl = config.get('api') 34 | const token = config.get('token') 35 | let out 36 | 37 | try { 38 | out = await authenticate(apiUrl, token) 39 | } catch (err) { 40 | await handleError(err) 41 | process.exit(1) 42 | } 43 | if (out.authenticated) { 44 | stopSpinner() 45 | info('You are already logged in.') 46 | process.exit(0) 47 | } 48 | // Signup or signin 49 | stopSpinner() 50 | 51 | // Do choosing login method here 52 | const loginChoices = Object.keys(out.providers).map(provider => { 53 | return provider.charAt(0).toUpperCase() + provider.slice(1) 54 | }) 55 | const result = await inquirer.prompt([ 56 | { 57 | type: 'list', 58 | name: 'loginProvider', 59 | message: 'Login with...', 60 | choices: loginChoices, 61 | filter: val => { 62 | return val.toLowerCase() 63 | } 64 | } 65 | ]) 66 | const authUrl = out.providers[result.loginProvider].url 67 | info('Opening browser and waiting for you to authenticate online') 68 | if (argv.interactive) { 69 | info('Please, copy and paste following URL in your browser:\n' + authUrl) 70 | } else { 71 | info('Note: If nothing is loaded in browser please run `data login -i`') 72 | } 73 | 74 | try { 75 | await login(apiUrl, authUrl, config.get('domain')) 76 | } catch (err) { 77 | await handleError(err) 78 | process.exit(1) 79 | } 80 | info('You are logged in!') 81 | process.exit(0) 82 | }) 83 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | matrix: 3 | include: 4 | - os: linux 5 | node_js: '8' 6 | env: TEST=true 7 | sudo: required 8 | - os: linux 9 | node_js: '7' 10 | env: TEST=false 11 | sudo: required 12 | - os: linux 13 | node_js: '9' 14 | env: TEST=false 15 | sudo: required 16 | - os: osx 17 | node_js: '7' 18 | env: TEST=false 19 | - os: osx 20 | node_js: '8' 21 | env: TEST=true DEPLOY=true 22 | - os: osx 23 | node_js: '9' 24 | env: TEST=false 25 | install: if $TEST; then npm install; else echo 'skipping this script...'; fi 26 | script: if $TEST; then npm test; else echo 'skipping this script...'; fi 27 | before_deploy: if $DEPLOY; then git submodule init && git submodule update && npm run push:test && npm run pack; else echo 'skipping this script...'; fi 28 | deploy: 29 | - provider: releases 30 | api_key: 31 | secure: "Ids51LgFBEoOR1q1DrmxNRP/ryPHPEAuf7vsO7aBVy+qFwcaRzwmvxvjPVa5907Sv7zkccg+qbFW9+W1HQ7IYi4nyaX/4o8WaxQgev2aki2KD5ztQlsWeGRcquAZfZplLrUbXJkEW8cnXcGIE8QKc08yiQa25+3s8fsBi797z8pj04iLAvkK600lFbKls90rQH7FsIlIrc0LarvNNFqEi/qtUoPeMQ6ksWoE1emvRZfLaIh8IIgzfN00MxX/hZb+50v2eZaypjfODZZL5GDvnCM+fofqCyEnJwo0JLxRHz7rHKN4/nAN6Rbvqx2bGQu7WtWCX3nT+u7rdgg3gjqR4oeHHpNqhalwXpfPWU/3G9HiyCSODqa/bps8FODM3hm8+LENQrsWYWZ6/mLxY76F97MqHTQHSz4rB74aKDc+nZ1Teqw8oykC0T47JwguH+ZIfKt7bOW9awWFkEbPSOrWNGQV7ZVjhxKlr4bazeYUyHFhTLjkawH8P1jRSt3H2a+AZES5cDuXEwKER7I7hZU7FCNNbFxUHJo0rSG/dynFk9jmiSmUELAKkkj/C0UTP3r55Zkq7LleOlfhXELmk6cEr+uQIAm/OzQ/CKL+5Vq1uhSNqmdxFGURHQNUQrv2tmysyNEzo+N9mwtOQH43JRjT7PnhuTWI/BOrqi8kk7csUaw=" 32 | file_glob: true 33 | file: "packed/*" 34 | 
skip_cleanup: true 35 | on: 36 | tags: true 37 | condition: $DEPLOY = true 38 | - provider: npm 39 | email: datahub@datopian.com 40 | api_key: 41 | secure: "riGRy8fqyJPXeTJulBENhxLLktvVimqTlyrjckdifenI8q0vxbTvw4fRep3fKQqgNFrh0dccIagtPO8RSuf0Se9dK+M6mBM2dc8W6t84i0jg+EDoavvhgHlVfXFNEx50lWz2H1EZH4I9MvFbAiXQm7svhaXSMwudzdlHFq/K+0xjDkVgnv4AWOnkezf8XqyOmBfVPcS6mvfEMZgtQPR41eaFM7GZ1hAwOZnOwLSTRMljBiDlBSKp89ahNsmoDua3JMZ8/5s5pp1fBzlHJx6knNF9lSTjXQtJEd1ZGZljdyjIawwCdohzcR37P6iRlCLVAOGKrbeFMUnprUk23HFg3eD86cUtly+jdZd7YqBTSBQ4m9r+3G5YKbUdCbavC0pnc3/cKwP3tYRnLN5PPxo9pTypHwvVzADgG4XBnvXsE07k8F+QBdIJce7JpM1QjDi5xiqJyOEW1YIpVnxOBtO6qc/w+cmlZzcBcdbOfss0+mEU0WHJFj+FE8jDGtt3TJK9PvkV5EKo0KtDGmZVrOeW63CJ3SE47jOcS0GwtxdlSzTsKK9Ic3b9s1pHAoqHy0n/PJEXAJ6NcS0MyeHPPwg1NWWQMIXEt1JQf6iXdcWVVCZBBXShdX4KUUQLkOHXSNv1vIKgE/5UP3bjJ+FHazLQjPjquMIRuGmy6yAwD/GaaLI=" 42 | on: 43 | tags: true 44 | condition: $DEPLOY = true 45 | after_script: 46 | - chmod ugo+x test-script.sh 47 | - ./test-script.sh 48 | -------------------------------------------------------------------------------- /bin/data-cat.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | const fs = require('fs') 3 | const path = require('path') 4 | 5 | const minimist = require('minimist') 6 | const {File, isUrl} = require('data.js') 7 | const {writers} = require('datahub-client').cat 8 | 9 | // Ours 10 | const {customMarked} = require('../lib/utils/tools.js') 11 | const info = require('../lib/utils/output/info.js') 12 | const {handleError, error} = require('../lib/utils/error') 13 | 14 | const argv = minimist(process.argv.slice(2), { 15 | string: ['cat'], 16 | boolean: ['help'], 17 | alias: {help: 'h'} 18 | }) 19 | 20 | const getMarkdown = fs.readFileSync(path.join(__dirname, '../docs/cat.md'), 'utf8') 21 | const help = () => { 22 | console.log('\n' + customMarked(getMarkdown)) 23 | } 24 | 25 | if (argv.help) { 26 | help() 27 | process.exit(0) 28 | } 29 | 30 | const pathParts = argv._[0] ? path.parse(argv._[0]) : {name: null} 31 | 32 | let outFileExt, outFormat 33 | if (argv._[1] && argv._[1] !== 'stdout') { 34 | outFileExt = path.extname(argv._[1]) || '.noext' 35 | outFormat = outFileExt.slice(1) 36 | } else { 37 | outFormat = 'ascii' 38 | } 39 | 40 | const writersDatabase = { 41 | ascii: writers.ascii, 42 | csv: writers.csv, 43 | xlsx: writers.xlsx, 44 | md: writers.md, 45 | html: writers.html 46 | } 47 | 48 | const dumpIt = async (res, {sheet}={}) => { 49 | let stream 50 | if (outFormat in writersDatabase) { 51 | try { 52 | stream = await writersDatabase[outFormat](res, {sheet}) 53 | } catch (err) { 54 | if (isUrl(argv._[0])) { 55 | error('Provided URL is invalid') 56 | } 57 | await handleError(err) 58 | process.exit(1) 59 | } 60 | 61 | if (outFormat === 'ascii') { // Write to stdout 62 | stream.pipe(process.stdout) 63 | } else { // Write to file 64 | const writeStream = fs.createWriteStream(argv._[1], {flags : 'w'}) 65 | stream.pipe(writeStream) 66 | writeStream.on('close', () => { 67 | info(`All done! 
Your data is saved in "${argv._[1]}"`) 68 | }) 69 | } 70 | } else { 71 | info(`Sorry, provided output format is not supported.`) 72 | } 73 | } 74 | 75 | if (pathParts.name === '_' || (!pathParts.name && process.stdin.constructor.name === 'Socket')) { 76 | dumpIt(process.stdin) 77 | } else if (pathParts.name) { 78 | // Check both 'sheet' and 'sheets' args as users can use both of them: 79 | let sheet = argv.sheet || argv.sheets 80 | // Check if it can be coerced to integer, if so we assume it's sheet index: 81 | sheet = !!parseInt(sheet) ? parseInt(sheet) - 1 : sheet 82 | const res = File.load(argv._[0], {format: argv.format}) 83 | dumpIt(res, {sheet}) 84 | } else { 85 | info('No input is provided. Please, run "data cat --help" for usage information.') 86 | } 87 | -------------------------------------------------------------------------------- /docs/push.md: -------------------------------------------------------------------------------- 1 | 2 | ■ data push [PATH] 3 | 4 | `PATH` (optional) is the path to the data file or data package. 5 | 6 | ## Options: 7 | 8 | -h, --help Output usage information. 9 | 10 | --format Explicitly set the format for a file. Useful when a file does not follow conventional 11 | naming. E.g., `--format=csv` 12 | 13 | -i, --interactive Enable interactive mode. Useful when pushing a single file. 14 | 15 | --schedule Set up a schedule so the DataHub will automatically re-import the remote file on 16 | a regular basis. E.g., `every 90s`, `every 5m`, `every 2d`. The number is always 17 | an integer, the selector is `s/m/h/d/w` (second -> week) and you can’t schedule for 18 | less than 60 seconds. 19 | 20 | --sheets Set which sheets should be processed when pushing Excel files. By default, only 21 | the first sheet is processed. You can use the `--sheets=all` option to push "all" sheets. 22 | You can also list sheet numbers, e.g., `--sheets=1,2`. If you wanted to push only 23 | the second sheet, you would do `--sheets=2`. Sheet numbering starts from 1. 24 | 25 | --name Set the name of the dataset without interaction when pushing a single file. E.g., `--name=my-dataset` 26 | 27 | ### findability options: 28 | 29 | These options define the dataset visibility on the DataHub.io site: 30 | 31 | --public (default) Everybody can see the dataset in the search results. 32 | Everybody can access the dataset by the URL link. 33 | 34 | --unlisted Other users will not see the dataset in the search results. 35 | You will see the dataset in the search results. 36 | Everybody can access the dataset by the URL link. 37 | 38 | --private Other users cannot access the dataset. 39 | Other users will not see the dataset in the search results. 40 | You will see the dataset in the search results. 41 | 42 | ## Examples: 43 | 44 | Uploads the Data Package in the current working directory to the DataHub: 45 | 46 | ■ data push 47 | 48 | Uploads the Data Package at the given path to the DataHub (core/finance-vix/ should contain datapackage.json): 49 | 50 | ■ data push core/finance-vix/ 51 | 52 | By default, all pushed datasets are public.
To make them unlisted: 53 | 54 | ■ data push core/finance-vix/ --unlisted 55 | 56 | Uploads a file from URL to DataHub on weekly basis and sets file format as CSV: 57 | 58 | ■ data push URL --schedule="every 1w" --format=csv 59 | 60 | Uploads a Excel file and processes only the second sheet: 61 | 62 | ■ data push myExcel.xlsx --sheets=2 63 | -------------------------------------------------------------------------------- /bin/data-push-flow.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | const fs = require('fs') 3 | const path = require('path') 4 | 5 | const minimist = require('minimist') 6 | const urljoin = require('url-join') 7 | const {DataHub} = require('datahub-client') 8 | const {authenticate} = require('datahub-client') 9 | const {config} = require('datahub-client') 10 | const { write: copyToClipboard } = require('clipboardy') 11 | 12 | // Ours 13 | const {customMarked} = require('../lib/utils/tools.js') 14 | const {handleError} = require('../lib/utils/error') 15 | const wait = require('../lib/utils/output/wait') 16 | const info = require('../lib/utils/output/info.js') 17 | 18 | 19 | const argv = minimist(process.argv.slice(2), { 20 | string: ['push-flow'], 21 | boolean: ['help', 'debug', 'interactive'], 22 | alias: {help: 'h', interactive: 'i'} 23 | }) 24 | 25 | const pushMarkdown = fs.readFileSync(path.join(__dirname, '../docs/push-flow.md'), 'utf8') 26 | const help = () => { 27 | console.log('\n' + customMarked(pushMarkdown)) 28 | } 29 | 30 | if (argv.help) { 31 | help() 32 | process.exit(0) 33 | } 34 | 35 | Promise.resolve().then(async () => { 36 | let stopSpinner = () => {} 37 | // First check if user is authenticated 38 | const apiUrl = config.get('api') 39 | const token = config.get('token') 40 | let out 41 | try { 42 | out = await authenticate(apiUrl, token) 43 | } catch (err) { 44 | await handleError(err) 45 | process.exit(1) 46 | } 47 | if (!out.authenticated) { 48 | info('You need to login in order to push your data. Please, use `data login` command.') 49 | process.exit(0) 50 | } 51 | try { 52 | const datasetPath = argv._[0] || process.cwd() 53 | stopSpinner = wait('Commencing push ...') 54 | 55 | const datahubConfigs = { 56 | apiUrl: config.get('api'), 57 | token: config.get('token'), 58 | debug: argv.debug, 59 | ownerid: config.get('profile') ? config.get('profile').id : config.get('id'), 60 | owner: config.get('profile') ? 
config.get('profile').username : config.get('username') 61 | } 62 | const datahub = new DataHub(datahubConfigs) 63 | const res = await datahub.pushFlow( 64 | path.join(datasetPath ,'.datahub/flow.yaml'), 65 | path.join(datasetPath ,'.datahub/datapackage.json') 66 | ) 67 | const revisionId = res.flow_id.split('/').pop() 68 | const datasetName = res.dataset_id.split('/').pop() 69 | stopSpinner() 70 | const message = '🙌 your data is published!\n' 71 | const url = urljoin(config.get('domain'), datahubConfigs.owner, datasetName,'v',revisionId) 72 | await copyToClipboard(url) 73 | console.log(message + '🔗 ' + url + ' (copied to clipboard)') 74 | } catch (err) { 75 | stopSpinner() 76 | if (argv.debug) { 77 | console.log('> [debug]\n' + err.stack) 78 | } 79 | await handleError(err) 80 | process.exit(1) 81 | } 82 | }) 83 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "data-cli", 3 | "version": "0.10.1", 4 | "description": "CLI for working with data packages", 5 | "main": "./lib/index.js", 6 | "bin": { 7 | "data": "./bin/data.js" 8 | }, 9 | "directories": { 10 | "test": "test" 11 | }, 12 | "scripts": { 13 | "test": "ava -v", 14 | "push:test": "ava -v test/push/ -m 'push*'", 15 | "watch:test": "npm t -- --watch", 16 | "lint": "xo --quiet", 17 | "data": "node bin/data.js", 18 | "gzip": "rm -rf packed/*.gz && ls packed/data* | xargs gzip -k", 19 | "pack": "pkg bin/data.js -c package.json -o packed/data --options no-warnings && npm run gzip", 20 | "precommit": "npm run lint && npm run build" 21 | }, 22 | "xo": { 23 | "space": true, 24 | "semicolon": false, 25 | "rules": { 26 | "no-var": "warn", 27 | "no-use-before-define": 1, 28 | "no-await-in-loop": 1, 29 | "import/prefer-default-export": 1, 30 | "no-negated-condition": 1, 31 | "guard-for-in": 1 32 | }, 33 | "ignores": [ 34 | "test/fixtures/*/**" 35 | ] 36 | }, 37 | "ava": { 38 | "failFast": false, 39 | "files": [ 40 | "test/*test.js" 41 | ] 42 | }, 43 | "pkg": { 44 | "scripts": [ 45 | "bin/*", 46 | "lib/**/*" 47 | ], 48 | "targets": [ 49 | "node7-linux-x64", 50 | "node7-macos-x64", 51 | "node7-win-x64" 52 | ] 53 | }, 54 | "repository": { 55 | "type": "git", 56 | "url": "git+https://github.com/datahq/data-cli.git" 57 | }, 58 | "keywords": [ 59 | "data", 60 | "data package", 61 | "datahub", 62 | "datapackage" 63 | ], 64 | "author": "Rufus Pollock and DataHQ", 65 | "license": "ISC", 66 | "bugs": { 67 | "url": "https://github.com/datahq/data-cli/issues" 68 | }, 69 | "homepage": "https://datahub.io/docs", 70 | "dependencies": { 71 | "ansi-escapes": "^3.0.0", 72 | "boxen": "^1.3.0", 73 | "chalk": "^2.3.0", 74 | "clipboardy": "^1.2.3", 75 | "data.js": "^0.11.5", 76 | "datahub-client": "^0.5.8", 77 | "first-run": "^1.2.0", 78 | "global-packages": "^1.0.2", 79 | "human-readable-ids": "^1.0.3", 80 | "inquirer": "^5.1.0", 81 | "inquirer-test": "^2.0.1", 82 | "jsonlint": "^1.6.2", 83 | "marked": "^0.3.6", 84 | "marked-terminal": "^2.0.0", 85 | "minimist": "^1.2.0", 86 | "mkdirp": "^0.5.1", 87 | "ora": "^1.3.0", 88 | "pkg": "4.2.5", 89 | "progress": "^2.0.0", 90 | "raven": "^2.4.2", 91 | "universal-analytics": "^0.4.16", 92 | "unzip": "^0.1.11", 93 | "update-notifier": "^2.3.0", 94 | "url-join": "^2.0.2", 95 | "xlsx": "^0.17.0" 96 | }, 97 | "devDependencies": { 98 | "ava": "^0.25.0", 99 | "cross-spawn": "^5.1.0", 100 | "stream-to-array": "^2.3.0", 101 | "xo": "^0.18.2" 102 | }, 103 | "resolutions": { 104 | 
"graceful-fs": "^4.2.4" 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /bin/data-info.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | const fs = require('fs') 3 | const path = require('path') 4 | 5 | const minimist = require('minimist') 6 | const data = require('data.js') 7 | const {info, Agent} = require('datahub-client') 8 | 9 | const {customMarked} = require('../lib/utils/tools.js') 10 | const {handleError} = require('../lib/utils/error') 11 | const printInfo = require('../lib/utils/output/info') 12 | 13 | const argv = minimist(process.argv.slice(2), { 14 | string: ['info'], 15 | boolean: ['help'], 16 | alias: {help: 'h'} 17 | }) 18 | 19 | const infoMarkdown = fs.readFileSync(path.join(__dirname, '../docs/info.md'), 'utf8') 20 | const help = () => { 21 | console.log('\n' + customMarked(infoMarkdown)) 22 | } 23 | 24 | if (argv.help) { 25 | help() 26 | process.exit(0) 27 | } 28 | 29 | const fileOrDatasetIdentifier = argv._[0] ? argv._[0] : './' 30 | 31 | Promise.resolve().then(async () => { 32 | // If given path is a URL then fetch headers and check if status is OK: 33 | const agent = new Agent(fileOrDatasetIdentifier, {debug: argv.debug}) 34 | if (data.isUrl(fileOrDatasetIdentifier)) { 35 | const response = await agent.fetch('/') 36 | if (response.status >= 400) { 37 | throw new Error(`Provided URL returns ${response.status} status code.`) 38 | } 39 | } 40 | 41 | try { 42 | const parsedIdentifier = await data.parseDatasetIdentifier(fileOrDatasetIdentifier) 43 | const isdataset = data.isDataset(fileOrDatasetIdentifier) 44 | const githubDataset = parsedIdentifier.type === 'github' && parsedIdentifier.name.slice((parsedIdentifier.name.lastIndexOf('.') - 1 >>> 0) + 2) === '' 45 | if (isdataset || parsedIdentifier.type === "datahub" || githubDataset) { 46 | const dataset = await data.Dataset.load(fileOrDatasetIdentifier) 47 | const out = info.infoPackage(dataset) 48 | console.log(customMarked(out)) 49 | } else { 50 | const file = data.File.load(fileOrDatasetIdentifier, {format: argv.format}) 51 | const knownTabularFormats = ['csv', 'tsv', 'dsv'] 52 | if (knownTabularFormats.includes(file.descriptor.format)) { 53 | await file.addSchema() 54 | } 55 | // Only print table if resource is tabular: 56 | let table 57 | let tabularFormatsAndExcel = knownTabularFormats.concat(['xls', 'xlsx']) 58 | if (tabularFormatsAndExcel.includes(file.descriptor.format)) { 59 | table = await info.infoResource(file) 60 | } 61 | console.log(customMarked('**File descriptor:**')) 62 | console.log(JSON.stringify(file.descriptor, null, 2)) 63 | if (table) { 64 | console.log(table) 65 | console.log(customMarked('*Only showing first 10 lines. 
There might be more data.*')) 66 | } 67 | } 68 | } catch (err) { 69 | if (!argv._[0]) { 70 | printInfo('Running `data info` without an argument will search a `datapackage.json` file in the current working directory.') 71 | } 72 | await handleError(err) 73 | process.exit(1) 74 | } 75 | agent.close() 76 | }) 77 | -------------------------------------------------------------------------------- /bin/data-validate.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | // Packages 4 | const fs = require('fs') 5 | const path = require('path') 6 | const minimist = require('minimist') 7 | const jsonlint = require('jsonlint') 8 | const {Validator} = require('datahub-client') 9 | const {Dataset} = require('data.js') 10 | const {eraseLines} = require('ansi-escapes') 11 | 12 | // Ours 13 | const {customMarked} = require('../lib/utils/tools') 14 | const {error} = require('../lib/utils/error') 15 | const wait = require('../lib/utils/output/wait') 16 | const info = require('../lib/utils/output/info') 17 | 18 | const argv = minimist(process.argv.slice(2), { 19 | string: ['validate'], 20 | boolean: ['help'], 21 | alias: {help: 'h'} 22 | }) 23 | 24 | const validateMarkdown = fs.readFileSync(path.join(__dirname, '../docs/validate.md'), 'utf8') 25 | const help = () => { 26 | console.log('\n' + customMarked(validateMarkdown)) 27 | } 28 | 29 | if (argv.help) { 30 | help() 31 | process.exit(0) 32 | } 33 | 34 | let path_ = argv._[0] 35 | 36 | if (!path_) { 37 | path_ = process.cwd() 38 | } 39 | 40 | const validator = new Validator({identifier: path_}) 41 | const stopSpinner = wait('') 42 | 43 | validator.on('message', (message) => { 44 | if (message.constructor.name === 'String') { 45 | process.stdout.write(eraseLines(1)) 46 | info(message) 47 | } else { 48 | process.stdout.write(eraseLines(2)) 49 | info(message.name + ': ' + message.status) 50 | } 51 | }) 52 | 53 | validator.validate().then(result => { 54 | if (result === true) { 55 | stopSpinner() 56 | process.stdout.write(eraseLines(2)) 57 | info('Your Data Package is valid!') 58 | } else { 59 | stopSpinner() 60 | process.stdout.write(eraseLines(2)) 61 | // result is a TableSchemaError with attributes: message, rowNumber, and errors 62 | // each error in errors is of form { message, rowNumber, columnNumber } 63 | 64 | // HACK: strip out confusing "(see 'error.errors')" in error message 65 | if (result.message) { 66 | error(`Validation has failed for "${result.resource}"`) 67 | const msg = result.message.replace(" (see 'error.errors')", '') + ' on line ' + result.rowNumber 68 | error(msg) 69 | result.errors.forEach(err => { 70 | error(err.message) 71 | }) 72 | } 73 | else { 74 | if (result.constructor.name === 'Array') { 75 | result.forEach(err => error(err.message)) 76 | } else { 77 | error(result) 78 | } 79 | } 80 | } 81 | }).catch(err => { 82 | stopSpinner() 83 | process.stdout.write(eraseLines(2)) 84 | error(err.message) 85 | if (err.resource) { 86 | error(`Resource: ${err.resource}`) 87 | error(`Path: ${err.path}`) 88 | } 89 | // Get path to datapackage.json 90 | if (fs.lstatSync(path_).isDirectory()) { 91 | // Check datapackage.json in this dir and if doesn't exist then throw error: 92 | path_ = path.join(path_, 'datapackage.json') 93 | if (!fs.existsSync(path_)) { 94 | error('datapackage.json not found in the given directory') 95 | } 96 | } 97 | // Read given path 98 | let content 99 | try { 100 | content = fs.readFileSync(path_) 101 | } catch (err) { 102 | error(err.message) 103 | 
process.exit(1) 104 | } 105 | 106 | var lint = jsonlint.parse(content.toString()) 107 | if (lint.error) { 108 | error(`Invalid JSON: on line ${lint.line}, character ${lint.character}\n\n ${lint.error}\n\n${lint.evidence}`) 109 | process.exit(1) 110 | } 111 | }) 112 | -------------------------------------------------------------------------------- /bin/data.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | // Native 3 | const {resolve} = require('path') 4 | 5 | // Packages 6 | const ua = require('universal-analytics') 7 | const {config} = require('datahub-client') 8 | const firstRun = require('first-run') 9 | 10 | const {version} = require('../package.json') 11 | 12 | // Ours 13 | const {error, handleError} = require('../lib/utils/error') 14 | const updateNotifier = require('../lib/utils/update') 15 | 16 | // Increase MaxListenersExceededWarning level for cases when the remote dataset has a lot of resources, 17 | // to avoid: Warning: Possible EventEmitter memory leak detected. X end listeners added. 18 | // ~11 requests is required to validate remote 1 tabular resource, so I set a limit to match a dataset with 10 files. 19 | require('events').EventEmitter.defaultMaxListeners = 120; 20 | 21 | // Handle all uncaught exceptions and unhandled rejections 22 | process.on('uncaughtException', async (err) => { 23 | await handleError(err) 24 | process.exit(1) 25 | }) 26 | 27 | process.on('unhandledRejection', async (err) => { 28 | await handleError(err) 29 | process.exit(1) 30 | }) 31 | 32 | // Check and notify if any updates are available: 33 | updateNotifier() 34 | 35 | // Check if the current path exists and throw and error 36 | // if the user is trying to deploy a non-existing path! 37 | // This needs to be done exactly in this place, because 38 | // the utility imports are taking advantage of it 39 | try { 40 | process.cwd() 41 | } catch (err) { 42 | if (err.code === 'ENOENT' && err.syscall === 'uv_cwd') { 43 | console.log(`Current path doesn't exist!`) 44 | } else { 45 | console.log(err) 46 | } 47 | process.exit(1) 48 | } 49 | 50 | const commands = new Set([ 51 | 'help', 52 | 'get', 53 | 'push', 54 | 'push-flow', 55 | 'validate', 56 | 'info', 57 | 'init', 58 | 'cat', 59 | 'login' 60 | ]) 61 | 62 | // Parse args and dispatch to relevant command 63 | let args = process.argv.slice(2) 64 | 65 | if (args[0] === '-v' || args[0] === '--version') { 66 | console.log(`${version}`) 67 | process.exit() 68 | } 69 | 70 | // Default command 71 | let cmd = 'help' 72 | const index = args.findIndex(a => commands.has(a)) 73 | 74 | if (index > -1) { 75 | cmd = args[index] 76 | args.splice(index, 1) 77 | 78 | // Dispatch to the underlying command and help will be called there 79 | if (cmd === 'help' && index < args.length && commands.has(args[index])) { 80 | cmd = args[index] 81 | args.splice(index, 1) 82 | args.unshift('--help') 83 | } 84 | if (cmd.includes(' ')) { 85 | const parts = cmd.split(' ') 86 | cmd = parts.shift() 87 | args = [].concat(parts, args) 88 | } 89 | } else if (args[0] === '-h' || args[0] === '--help') { 90 | cmd = 'help' 91 | } else if (args.length === 0) { // One final option is no command in which case show help 92 | cmd = 'help' 93 | } else { 94 | error(`Command does not exist "` + args[0] + '"') 95 | console.error(`\nTo see a list of available commands run:`) 96 | console.error(`\n data help\n`) 97 | process.exit(1) 98 | } 99 | 100 | const bin = resolve(__dirname, 'data-' + cmd + '.js') 101 | 102 | // Track events using GA: 
103 | // Developers should set 'datahub' env var to 'dev' so their usage doesn't get tracked: 104 | if (process.env.datahub !== 'dev') { 105 | const visitor = ua('UA-80458846-4') 106 | // If user is logged in then use the datahub userid with GA - it allows us to 107 | // track a user activity cross-platform, eg, connect activity on CLI and website: 108 | const userid = config.get('profile') ? config.get('profile').id : config.get('id') 109 | if (userid) { 110 | visitor.set('uid', userid) 111 | } 112 | // If this is the first run of the app, then track it in GA: 113 | if (firstRun()) { 114 | visitor.event('cli', 'first-run', process.platform).send() 115 | } 116 | // Track which version is run and on which OS: 117 | visitor.event('cli-usage-by-os-and-version', process.platform, version).send() 118 | // Event category is 'cli', action is the command and label is all arguments: 119 | const commandToTrack = args.length === 0 ? 'noArgs' : cmd 120 | visitor.event('cli', commandToTrack, process.argv.slice(3, process.argv.length).toString()).send() 121 | } 122 | 123 | // Prepare process.argv for subcommand 124 | process.argv = process.argv.slice(0, 2).concat(args) 125 | 126 | // Load sub command 127 | // With custom parameter to make "pkg" happy 128 | require(bin, 'may-exclude') 129 | -------------------------------------------------------------------------------- /bin/data-get.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | // Packages 4 | const fs = require('fs') 5 | const path = require('path') 6 | const url = require('url') 7 | const mkdirp = require('mkdirp') 8 | const minimist = require('minimist') 9 | const {Dataset, File, isDataset, parseDatasetIdentifier} = require('data.js') 10 | const {get, config} = require('datahub-client') 11 | const unzip = require('unzip') 12 | 13 | // Ours 14 | const {customMarked} = require('../lib/utils/tools.js') 15 | const wait = require('../lib/utils/output/wait') 16 | const {handleError} = require('../lib/utils/error') 17 | 18 | const argv = minimist(process.argv.slice(2), { 19 | string: ['get'], 20 | boolean: ['help', 'debug'], 21 | alias: {help: 'h'} 22 | }) 23 | 24 | const getMarkdown = fs.readFileSync(path.join(__dirname, '../docs/get.md'), 'utf8') 25 | const help = () => { 26 | console.log('\n' + customMarked(getMarkdown)) 27 | } 28 | 29 | if (argv.help || !argv._[0]) { 30 | help() 31 | process.exit(0) 32 | } 33 | 34 | let identifier = argv._[0] 35 | 36 | const run = async () => { 37 | const stopSpinner = wait('Loading...') 38 | try { 39 | const start = new Date() 40 | let pathToSave 41 | const parsedIdentifier = await parseDatasetIdentifier(identifier) 42 | const itIsDataset = isDataset(identifier) 43 | const githubDataset = parsedIdentifier.type === 'github' && parsedIdentifier.name.slice((parsedIdentifier.name.lastIndexOf('.') - 1 >>> 0) + 2) === '' 44 | 45 | if (itIsDataset || githubDataset) { 46 | const dataset = await Dataset.load(identifier) 47 | const owner = dataset.identifier.owner || '' 48 | const name = dataset.identifier.name 49 | 50 | pathToSave = path.join(owner, name) 51 | 52 | if (!checkDestIsEmpty(owner, name)) { 53 | throw new Error(`${owner}/${name} is not empty!`) 54 | } 55 | 56 | /** usual dataset download */ 57 | const allResources = await get(dataset) 58 | // Save all files on disk 59 | const myPromises = allResources.map(async resource => { 60 | return saveIt(owner, name, resource) 61 | }) 62 | await Promise.all(myPromises) 63 | 64 | } else if 
(parsedIdentifier.type === "datahub") { 65 | // Remove trailing slash: 66 | if(identifier.substr(-1) === '/' && identifier.length > 1) { 67 | identifier = identifier.slice(0, identifier.length - 1) 68 | } 69 | // We assume that if /r/ is in identifier then it's r link. 70 | if (identifier.includes('/r/')) { 71 | pathToSave = await saveFileFromUrl(identifier, argv.format) 72 | } else { 73 | // Try to guess owner and dataset name here. We're not loading Dataset object 74 | // because we want to handle private datasets as well: 75 | const idParts = identifier.split('/') 76 | const owner = idParts[idParts.length - 2] 77 | const name = idParts[idParts.length - 1] 78 | const token = config.get('token') 79 | pathToSave = path.join(owner, name) 80 | 81 | if (!checkDestIsEmpty(owner, name)) { 82 | throw new Error(`${owner}/${name} is not empty!`) 83 | } 84 | 85 | /** For datasets from the datahub we get zipped version and unzip it. 86 | - less traffic 87 | - zipped version has a fancy file structure 88 | #issue: https://github.com/datahq/datahub-qa/issues/86 */ 89 | const zipped_dataset_url = `https://datahub.io/${owner}/${name}/r/${name}_zip.zip?jwt=${token}` 90 | const archive_path = await saveFileFromUrl(zipped_dataset_url, 'zip') 91 | // unzip archive into destination folder 92 | fs.createReadStream(archive_path) 93 | .pipe(unzip.Extract({ path: pathToSave })) 94 | // removing the archive file once we extracted all the dataset files 95 | .on('finish', () => {fs.unlinkSync(archive_path)}) 96 | } 97 | } else { // If it is not a dataset - download the file 98 | if (parsedIdentifier.type === 'github' && !githubDataset) { 99 | identifier += `?raw=true` 100 | } 101 | pathToSave = await saveFileFromUrl(identifier, argv.format) 102 | } 103 | 104 | // show time statistic & success message 105 | stopSpinner() 106 | const end = new Date() - start 107 | console.log(`Time elapsed: ${(end / 1000).toFixed(2)} s`) 108 | console.log(`Dataset/file is saved in "${pathToSave}"`) 109 | 110 | } catch (err) { 111 | stopSpinner() 112 | if (argv.debug) { 113 | console.log('> [debug]\n' + err.stack) 114 | } 115 | await handleError(err) 116 | process.exit(1) 117 | } 118 | } 119 | 120 | run() 121 | 122 | /** 123 | * Download file from url and save it locally using data.js 'File' object. 124 | * returns path, where the file was saved ( ${filename}.${fileformat} ) 125 | * Using: let savedPath = await saveFileFromUrl(url, format) 126 | * @param url: url to get the file 127 | * @param format: csv, json, zip, etc 128 | * @returns {Promise} 129 | */ 130 | const saveFileFromUrl = (url, format) => { 131 | return new Promise(async (resolve, reject) =>{ 132 | const file = await File.load(url, {format: format}) 133 | const destPath = [file.descriptor.name, file.descriptor.format].join('.') 134 | let stream 135 | try { 136 | stream = await file.stream() 137 | } catch (err) { 138 | if (err.message === 'Not Found') { 139 | err.message += ' or Forbidden.' 
140 | } 141 | await handleError(err) 142 | process.exit(1) 143 | } 144 | stream.pipe(fs.createWriteStream(destPath)).on('finish', () => { 145 | resolve(destPath) 146 | }) 147 | }) 148 | } 149 | 150 | const saveIt = (owner, name, resource) => { 151 | return new Promise(async (resolve, reject) => { 152 | // We can only save if the path is defined 153 | if (resource.descriptor.path) { 154 | const pathParts = url.parse(resource.descriptor.path) 155 | let destPath 156 | if (pathParts.protocol === 'http:' || pathParts.protocol === 'https:') { 157 | const relativePath = resource.descriptor.path.split('/').slice(5).join('/') 158 | destPath = path.join(owner, name, relativePath) 159 | } else { 160 | destPath = path.join(owner, name, resource.descriptor.path) 161 | } 162 | mkdirp.sync(path.dirname(destPath)) 163 | const stream = await resource.stream() 164 | stream.pipe(fs.createWriteStream(destPath)).on('finish', () => { 165 | resolve() 166 | }) 167 | } 168 | }) 169 | } 170 | 171 | // TODO: Move this somewhere to utils 172 | const checkDestIsEmpty = (owner, name) => { 173 | const dest = path.join(owner, name) 174 | return !fs.existsSync(dest) || fs.readdirSync(dest).length === 0; 175 | } 176 | 177 | module.exports = { 178 | checkDestIsEmpty 179 | } 180 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | **"Data-cli"** is an important part of the [DataHub](https://datahub.io/docs/about) project. It is a command-line tool that helps you manage your data, much as `git` helps you manage code. 4 | 5 | For example, suppose your work has produced a few data files and a description, and you want to share them with your colleagues. With **"data-cli"** you just need to: 6 | 7 | ```shell 8 | cd data-folder 9 | data init # convert your data files into a data package 10 | > "Answer a few questions here, e.g. dataset name, files to include, etc" 11 | data push # upload the dataset to the DataHub 12 | > "As a result you'll get a link to share: 13 | http://datahub.io/user-name/data-package-name 14 | ``` 15 | 16 | That's it! Your data is online. You can make your data unlisted or private, add some pretty graphics, and more. Please read http://datahub.io/docs for details. 17 | 18 | With `data-cli` you can also: 19 | 20 | * Get data from online sources 21 | * Get info about data files and datasets (local and remote) 22 | * Validate your data to ensure its quality 23 | * Initialize a new dataset (as a Data Package) 24 | 25 | ## Usage examples: 26 | 27 | Here we show usage examples for common `data` commands. To see the full documentation for a command, click on the command name or proceed to the [help pages](https://github.com/datahq/data-cli/tree/master/docs). 28 | 29 | ### data login 30 | 31 | You should log in the first time you use data-cli: 32 | ```bash 33 | $ data login 34 | ? Login with... Github 35 | > Opening browser and waiting for you to authenticate online 36 | > You are logged in! 37 | ``` 38 | 39 | ### [data push](https://github.com/datahq/data-cli/blob/master/docs/push.md) 40 | 41 | Upload a dataset or a single file to the DataHub: 42 | ```bash 43 | $ data push mydata.csv 44 | ? Please, confirm name for this dataset: 45 | 0-selfish-cougar-7 mydataset 46 | ? Please, confirm title for this dataset: 47 | Mydataset Mydataset 48 | Uploading [******************************] 100% (0.0s left) 49 | your data is published!
50 | 🔗 https://datahub.io/myname/mydataset/v/1 (copied to clipboard) 51 | ``` 52 | 53 | Alternatively, you can set the name without interaction: 54 | ```bash 55 | $ data push mydata.csv --name=mydataset 56 | Uploading [******************************] 100% (0.0s left) 57 | your data is published! 58 | 🔗 https://datahub.io/myname/mydataset/v/1 (copied to clipboard) 59 | ``` 60 | 61 | **Note:** by default, your dataset's findability is public (the `--public` flag). Use the `--unlisted` flag if you do not want it to appear in the search results. 62 | 63 | ### [data get](https://github.com/datahq/data-cli/blob/master/docs/get.md) 64 | 65 | Get a dataset from the DataHub or GitHub: 66 | ```bash 67 | $ data get http://datahub.io/core/gold-prices 68 | Time elapsed: 1.72 s 69 | Dataset/file is saved in "core/gold-prices" 70 | ``` 71 | 72 | ### [data info](https://github.com/datahq/data-cli/blob/master/docs/info.md) 73 | 74 | Shows info about a dataset (local or remote): 75 | ```bash 76 | $ data info http://datahub.io/core/gold-prices 77 | # Gold Prices (Monthly in USD) 78 | 79 | Monthly gold prices since 1950 in USD (London market). Data is sourced from the Bundesbank. 80 | 81 | ## Data 82 | * [Bundesbank statistic ... [see more below] 83 | 84 | ## RESOURCES 85 | ┌───────────────────┬────────┬───────┬───────┐ 86 | │ Name │ Format │ Size │ Title │ 87 | ├───────────────────┼────────┼───────┼───────┤ 88 | │ data_csv │ csv │ 16172 │ │ 89 | ├───────────────────┼────────┼───────┼───────┤ 90 | │ data_json │ json │ 32956 │ │ 91 | ├───────────────────┼────────┼───────┼───────┤ 92 | │ gold-prices_zip │ zip │ 17755 │ │ 93 | ├───────────────────┼────────┼───────┼───────┤ 94 | │ data │ csv │ 16170 │ │ 95 | └───────────────────┴────────┴───────┴───────┘ 96 | 97 | ## README 98 | Monthly gold prices since 1950 in USD (London market). Data is sourced from the Bundesbank. 99 | ... 100 | 101 | ### Licence 102 | ... 103 | ``` 104 | 105 | ### [data cat](https://github.com/datahq/data-cli/blob/master/docs/cat.md) 106 | 107 | Works similarly to the Unix `cat` command, but it also handles remote resources and can convert tabular data into different formats: 108 | ```bash 109 | $ data cat http://datahub.io/core/gold-prices/r/0.csv 110 | ┌──────────────────────────────────────┬──────────────────────────────────────┐ 111 | │ date │ price │ 112 | ├──────────────────────────────────────┼──────────────────────────────────────┤ 113 | │ 1950-02-01 │ 34.730 │ 114 | ├──────────────────────────────────────┼──────────────────────────────────────┤ 115 | │ 1950-03-01 │ 34.730 │ 116 | 117 | ........... 118 | ``` 119 | You can also convert tabular data into different formats (the source can be remote as well): 120 | ```bash 121 | $ data cat prices.csv prices.md 122 | > All done! Your data is saved in "prices.md" 123 | user@pc:~/Downloads$ cat prices.md 124 | | date | price | 125 | | ---------- | -------- | 126 | | 1950-02-01 | 34.730 | 127 | | 1950-03-01 | 34.730 | 128 | ``` 129 | 130 | ### [data init](https://github.com/datahq/data-cli/blob/master/docs/init.md) 131 | 132 | Data-cli has an `init` command that automatically generates Data Package metadata: it scans the current directory for data files and infers a [table schema] for tabular files: 133 | ```bash 134 | $ data init 135 | This process initializes a new datapackage.json file. 136 | Once there is a datapackage.json file, you can still run `data init` 137 | to update/extend it. 138 | Press ^C at any time to quit. 139 | 140 | ? Enter Data Package name prices 141 | ?
Enter Data Package title prices 142 | ? Do you want to add following file as a resource "prices.csv" - y/n? y 143 | prices.csv is just added to resources 144 | ? Do you want to add following file as a resource "prices.xls" - y/n? y 145 | prices.xls is just added to resources 146 | 147 | ? Going to write to /home/user/Downloads/datapackage.json: 148 | { 149 | "name": "prices", 150 | "title": "prices", 151 | "resources": [ 152 | { 153 | "path": "prices.csv", 154 | "name": "prices", 155 | "format": "csv", 156 | .... 157 | }, 158 | "schema": { 159 | "fields": [ 160 | { 161 | "name": "date", 162 | "type": "date", 163 | "format": "default" 164 | }, 165 | { 166 | ........ 167 | { 168 | "path": "prices.xls", 169 | "pathType": "local", 170 | "name": "prices", 171 | "format": "xls", 172 | "mediatype": "application/vnd.ms-excel", 173 | "encoding": "windows-1250" 174 | } 175 | ] 176 | } 177 | 178 | 179 | Is that OK - y/n? y 180 | datapackage.json file is saved in /home/user/Downloads/datapackage.json 181 | ``` 182 | 183 | ### [data validate](https://github.com/datahq/data-cli/blob/master/docs/validate.md) 184 | 185 | ```bash 186 | $ data validate path/to/correct/datapackage 187 | > Your Data Package is valid! 188 | ``` 189 | ```bash 190 | $ data validate path/to/invalid-data 191 | > Error! Validation has failed for "missing-column" 192 | > Error! The column header names do not match the field names in the schema on line 2 193 | 194 | ``` 195 | 196 | ### data help 197 | 198 | You can also run the "help" command in your terminal to see the command docs: 199 | ```shell 200 | $ data help 201 | 'General description' 202 | $ data help push 203 | > 'push command description' 204 | 205 | # data help get 206 | # data help init 207 | # etc ... 208 | ``` 209 | 210 | ## Installation 211 | 212 | ``` 213 | npm install data-cli --global 214 | ``` 215 | After installation you can run `data-cli` using the name `data`: 216 | ``` 217 | data --version 218 | > 0.8.9 219 | ``` 220 | 221 | If you're not using NPM you can install the `data-cli` binaries by following [these instructions](https://datahub.io/docs/getting-started/installing-data#installing-binaries). 222 | 223 | # For developers 224 | 225 | [![Build Status](https://travis-ci.org/datahq/data-cli.svg?branch=master)](https://travis-ci.org/datahq/data-cli) 226 | [![XO code style](https://img.shields.io/badge/code_style-XO-5ed9c7.svg)](https://github.com/sindresorhus/xo) 227 | [![Issues](https://img.shields.io/badge/issue-tracker-orange.svg)](https://github.com/datahq/data-cli/issues) 228 | 229 | ## Configuration 230 | 231 | Configuration is in `~/.config/datahub/config.json`. In general, you should not need to edit this by hand. You can also override any of the variables in there using environment variables or on the command line by using the same name.
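For illustration only, here is a rough sketch of what `config.json` might contain. This is not an authoritative reference: the keys are inferred from what the CLI reads via `config.get()` elsewhere in this repo (`api`, `domain`, `token`, `profile`), and every value below is a placeholder.

```json
{
  "api": "<DataHub API endpoint>",
  "domain": "https://datahub.io",
  "token": "<JWT token obtained via data login>",
  "profile": {
    "id": "<your user id>",
    "username": "<your username>"
  }
}
```

For example, to point the CLI at a different API endpoint for a single command: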
232 | 233 | ``` 234 | $ data login --api https://api-testing.datahub.io 235 | ``` 236 | 237 | NB: you can set a custom location for the `config.json` config file using the `DATAHUB_JSON` environment variable e.g.: 238 | 239 | ``` 240 | export DATAHUB_JSON=~/.config/datahub/my-special-config.json 241 | ``` 242 | 243 | ## Environment 244 | 245 | *You need to have Node.js version >7.6* 246 | 247 | **NOTE:** if you're a developer, you need to set `datahub=dev` environment variable so your usage of the CLI isn't tracked in the analytics: 248 | 249 | It is recommended that you set this up permanently, e.g., MacOS users need to edit `~/.bash_profile` file - add this script in your `~/.bash_profile`: 250 | 251 | ```bash 252 | # The next line sets 'datahub' env var so data-cli doesn't send tracking data to Analytics 253 | export datahub=dev 254 | ``` 255 | 256 | and then restart your terminal. 257 | 258 | ## Install 259 | 260 | ``` 261 | $ npm install 262 | ``` 263 | 264 | ## Running tests 265 | 266 | We use Ava for our tests. For running tests use: 267 | 268 | ``` 269 | $ [sudo] npm test 270 | ``` 271 | 272 | To run tests in watch mode: 273 | 274 | ``` 275 | $ [sudo] npm run watch:test 276 | ``` 277 | 278 | We also have tests for `push` command that publishes some of test datasets to DataHub. While Travis runs all tests on every commit, the `push` tests are run only on tagged commits. To run these tests locally you need to have credentials for 'test' user and use following command: 279 | 280 | ``` 281 | $ [sudo] npm test test/push/push.test.js 282 | ``` 283 | 284 | ## Lint 285 | 286 | We use XO for checking our code for JS standard/convention/style: 287 | 288 | ```bash 289 | # When you run tests, it first runs lint: 290 | $ npm test 291 | 292 | # To run lint separately: 293 | $ npm run lint # shows errors only 294 | 295 | # Fixing erros automatically: 296 | $ xo --fix 297 | ``` 298 | -------------------------------------------------------------------------------- /bin/data-push.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | const fs = require('fs') 3 | const path = require('path') 4 | 5 | const minimist = require('minimist') 6 | const urljoin = require('url-join') 7 | const inquirer = require('inquirer') 8 | const hri = require('human-readable-ids').hri 9 | const {Dataset, File, xlsxParser, isDataset, isUrl} = require('data.js') 10 | const { write: copyToClipboard } = require('clipboardy') 11 | const toArray = require('stream-to-array') 12 | const {DataHub, Validator, authenticate, config, Agent} = require('datahub-client') 13 | const ua = require('universal-analytics') 14 | const ProgressBar = require('progress') 15 | 16 | // Ours 17 | const {customMarked} = require('../lib/utils/tools.js') 18 | const {error, handleError} = require('../lib/utils/error') 19 | const wait = require('../lib/utils/output/wait') 20 | const info = require('../lib/utils/output/info.js') 21 | 22 | 23 | const argv = minimist(process.argv.slice(2), { 24 | string: ['push', 'sheets'], 25 | boolean: ['help', 'test', 'debug', 'interactive', 'unlisted', 'private', 'zip', 'sqlite'], 26 | alias: {help: 'h', interactive: 'i', sheets: 'sheet'} 27 | }) 28 | 29 | const pushMarkdown = fs.readFileSync(path.join(__dirname, '../docs/push.md'), 'utf8') 30 | const help = () => { 31 | console.log('\n' + customMarked(pushMarkdown)) 32 | } 33 | 34 | if (argv.help) { 35 | help() 36 | process.exit(0) 37 | } 38 | 39 | Promise.resolve().then(async () => { 40 | let stopSpinner = () 
=> {} 41 | // First check if user is authenticated 42 | const apiUrl = config.get('api') 43 | const token = config.get('token') 44 | let out = {authenticated: true} 45 | if (!argv.test) { 46 | try { 47 | out = await authenticate(apiUrl, token) 48 | } catch (err) { 49 | await handleError(err) 50 | process.exit(1) 51 | } 52 | } 53 | if (!out.authenticated) { 54 | info('You need to login in order to push your data. Please, use `data login` command.') 55 | process.exit(0) 56 | } 57 | try { 58 | const filePath = argv._[0] || process.cwd() 59 | let dataset 60 | if (isDataset(filePath)) { 61 | if (isUrl(filePath)) { 62 | console.log('Error: You can push only local datasets.') 63 | process.exit(0) 64 | } 65 | try { 66 | dataset = await Dataset.load(filePath) 67 | } catch(err){ 68 | info("You can run:") 69 | info("'data validate' to check your data.") 70 | info("'data init' to create a datapackage.") 71 | info("'data help push' to get more info.") 72 | await handleError(err) 73 | process.exit(1) 74 | } 75 | } else { 76 | dataset = await prepareDatasetFromFile(filePath) 77 | } 78 | 79 | dataset.resources.forEach(resource => { 80 | if (resource.constructor.name === 'FileInline') { 81 | throw new Error('We do not support dataset with inlined data') 82 | } 83 | }) 84 | 85 | stopSpinner = wait('Commencing push ...') 86 | 87 | const datahubConfigs = { 88 | apiUrl: config.get('api'), 89 | token: config.get('token'), 90 | debug: argv.debug, 91 | ownerid: config.get('profile') ? config.get('profile').id : config.get('id'), 92 | owner: config.get('profile') ? config.get('profile').username : config.get('username') 93 | } 94 | let findability = 'published' 95 | if (argv.unlisted) { 96 | findability = 'unlisted' 97 | } 98 | if (argv.private) { 99 | findability = 'private' 100 | } 101 | const datahub = new DataHub(datahubConfigs) 102 | const options = { 103 | findability: findability, 104 | sheets: argv.sheets ? 
argv.sheets.toString() : undefined, 105 | outputs: { 106 | zip: argv.zip, 107 | sqlite: argv.sqlite 108 | }, 109 | schedule: argv.schedule 110 | } 111 | 112 | // Validate metadata prior to pushing: 113 | // Let's normalize resource names as it is common when they're capitalized 114 | // or have spaces - they're generated from file names: 115 | for (const idx in dataset.descriptor.resources) { 116 | if (!dataset.descriptor.resources[idx].name.match(validationPatterns['nameValidation'])) { 117 | dataset.descriptor.resources[idx].name = dataset.descriptor.resources[idx].name.replace(/\s+/g, '-').toLowerCase() 118 | dataset.resources[idx].descriptor.name = dataset.resources[idx].descriptor.name.replace(/\s+/g, '-').toLowerCase() 119 | } 120 | } 121 | const validator = new Validator() 122 | await validator.validateMetadata(dataset.descriptor) 123 | stopSpinner() 124 | // Show the progress bars for each file being uploaded: 125 | const progressBars = [] 126 | // Listen for 'upload' events being emited from the DataHub class: 127 | datahub.on('upload', (message) => { 128 | // Check if a bar is already initiated: 129 | const barItem = progressBars.find(item => item.file === message.file) 130 | if (barItem) { 131 | try { 132 | if (message.completed) { 133 | if (process.platform !== 'win32') { 134 | barItem.bar.interrupt('Completed: ' + message.file) 135 | } else { 136 | info('Completed: ' + message.file) 137 | } 138 | } else { 139 | barItem.bar.tick(message.chunk.length) 140 | } 141 | } catch (err) { 142 | info(err.message) 143 | } 144 | } else { // If a bar doesn't exist initiate one: 145 | progressBars.push({ 146 | file: message.file, 147 | bar: new ProgressBar(` Uploading [:bar] :percent (:etas left) ${message.file}`, { 148 | complete: '*', 149 | incomplete: ' ', 150 | width: 30, 151 | total: message.total, 152 | clear: process.platform === 'win32' ? 
false : true 153 | }) 154 | }) 155 | } 156 | }) 157 | 158 | const res = await datahub.push(dataset, options) 159 | // Analytics: 160 | if (process.env.datahub !== 'dev') { 161 | const visitor = ua('UA-80458846-4') 162 | visitor.set('uid', datahubConfigs.ownerid) 163 | // Check if it's the first push: 164 | const agent = new Agent(datahubConfigs.apiUrl, {debug: argv.debug}) 165 | let response = await agent.fetch( 166 | `/metastore/search/events?owner="${datahubConfigs.owner}"&size=0`, 167 | {headers: {'Auth-Token': token}} 168 | ) 169 | if (response.ok) { 170 | response = await response.json() 171 | if (response.summary && response.summary.total === 0) { // It's the first push 172 | visitor.event('cli', 'push-first').send() 173 | } 174 | // Count sucessful pushes: 175 | visitor.event('cli', 'push-success').send() 176 | } 177 | } 178 | // Print success message and provide URL to showcase page: 179 | let revisionId = res.flow_id.split('/').pop() 180 | const message = '\n🙌 your data is published!\n' 181 | const url = urljoin(config.get('domain'), datahubConfigs.owner, dataset.descriptor.name, 'v', revisionId) 182 | let copied = ' (copied to clipboard)' 183 | try { 184 | await copyToClipboard(url) 185 | } catch (err) { 186 | copied = '' 187 | console.log(`Warning: Failed to copy to clipboard - ${err.message}`) 188 | } 189 | console.log(message + '🔗 ' + url + copied) 190 | } catch (err) { 191 | stopSpinner() 192 | if (argv.debug) { 193 | console.log('> [debug]\n' + err.stack) 194 | } 195 | await handleError(err) 196 | process.exit(1) 197 | } 198 | }) 199 | 200 | const prepareDatasetFromFile = async filePath => { 201 | let file 202 | if (isUrl(filePath)) { 203 | file = await File.load(filePath, {format: argv.format}) 204 | } else { 205 | const pathParts = path.parse(filePath) 206 | file = await File.load(pathParts.base, {basePath: pathParts.dir, format: argv.format}) 207 | } 208 | // List of formats that are known as tabular 209 | const knownTabularFormats = ['csv', 'tsv', 'dsv'] 210 | if (knownTabularFormats.includes(file.descriptor.format)) { 211 | try { 212 | await file.addSchema() 213 | } catch(err){ 214 | error("tabular file is invalid: " + file.path) 215 | error(err.message) 216 | if (argv.debug){ 217 | console.log('> [debug]\n' + err.stack) 218 | } 219 | process.exit(1) 220 | } 221 | 222 | if (argv.interactive) { 223 | // Prompt user with headers and fieldTypes 224 | const headers = file.descriptor.schema.fields.map(field => field.name) 225 | const fieldTypes = file.descriptor.schema.fields.map(field => field.type) 226 | const questions = [ 227 | ask('headers', headers, 'y', 'yesOrNo'), 228 | ask('types', fieldTypes, 'y', 'yesOrNo') 229 | ] 230 | const answers = await inquirer.prompt(questions) 231 | 232 | if (answers.headers === 'n' || answers.types === 'n') { 233 | // Maybe nicer exit - user has chosen not to proceed for now ... 
234 | throw new Error('Please, generate datapackage.json (you can use "data init") and push.') 235 | } 236 | } 237 | } 238 | 239 | let dpName, dpTitle 240 | if (argv.name) { // If name is provided in args then no user prompting: 241 | dpName = argv.name.toString() 242 | // Make unslugifies version for title: 243 | dpTitle = dpName.replace(/-+/g, ' ') 244 | dpTitle = dpTitle.charAt(0).toUpperCase() + dpTitle.slice(1) 245 | } else { 246 | dpName = file.descriptor.name.replace(/\s+/g, '-').toLowerCase() 247 | // Add human readable id so that this packge does not conflict with other 248 | // packages (name is coming from the file name which could just be 249 | // data.csv) 250 | dpName += '-' + hri.random() 251 | // Confirm dpName with user: 252 | let answer = await inquirer.prompt([ask('name', dpName, dpName, 'nameValidation')]) 253 | dpName = answer.name 254 | // Make unslugifies version for title: 255 | dpTitle = dpName.replace(/-+/g, ' ') 256 | dpTitle = dpTitle.charAt(0).toUpperCase() + dpTitle.slice(1) 257 | // Confirm title with user: 258 | answer = await inquirer.prompt([ask('title', dpTitle, dpTitle)]) 259 | dpTitle = answer.title 260 | } 261 | 262 | const metadata = { 263 | name: dpName, 264 | title: dpTitle, 265 | resources: [] 266 | } 267 | const dataset = await Dataset.load(metadata) 268 | dataset.addResource(file) 269 | return dataset 270 | } 271 | 272 | const validationPatterns = { 273 | yesOrNo: /^[y,n]+$/, 274 | nameValidation: /^([-a-z0-9._\/])+$/ 275 | } 276 | 277 | const ask = (property, data, defaultValue, validation) => { 278 | const inquirerObj = { 279 | type: 'input', 280 | name: property, 281 | message: `Please, confirm ${property} for this dataset:\n${data}`, 282 | default: () => { 283 | return defaultValue 284 | } 285 | } 286 | if (validation) { 287 | inquirerObj.validate = value => { 288 | const pass = value.match(validationPatterns[validation]) 289 | if (pass) { 290 | return true 291 | } 292 | return `Provided value must match following pattern: ${validationPatterns[validation]}` 293 | } 294 | } 295 | return inquirerObj 296 | } 297 | -------------------------------------------------------------------------------- /DESIGN.md: -------------------------------------------------------------------------------- 1 | # Design of `data` 2 | 3 | **Note: this material is from 2017-2018** 4 | 5 | # What is the `data` tool 6 | 7 | The `data` tool performs 3 complementary tasks: 8 | 9 | * Data Publishing: it is the **DataHub command line interface** with support for pushing and getting data from the DataHub 10 | * Data Packaging: it is a command line **Data Package manager** for creating, inspecting, validating and accessing data packages and their resources 11 | * Data Wrangling: it is a lightweight command line **Data Wrangler tool** performing tasks like inspecting and cat'ing data files. 
12 | 13 | Illustrative set of commands 14 | 15 | ```bash 16 | # ===== 17 | # Publishing 18 | 19 | # data publishing 20 | data push FILE 21 | data push DIRECTORY 22 | 23 | # get a file (from DataHub) 24 | data get FILE / URL 25 | 26 | # delete a published dataset 27 | data delete 28 | 29 | # ===== 30 | # Packaging 31 | 32 | # create a data package 33 | data init 34 | 35 | # validate a data package 36 | data validate 37 | 38 | # ===== 39 | # Wrangling 40 | 41 | # data (pre)viewing and conversion 42 | data cat FILE 43 | # data conversion 44 | data cat FILE OUTFILE 45 | 46 | # DIR: data package summary (assumes it is a data package) 47 | # FIlE: print out meta and stream a summary of data (can turn off maybe in future with --no-preview) 48 | data info {FILE-or-DIR} 49 | ``` 50 | 51 | **Overview diagram** 52 | 53 | State of feature is indicated: 54 | 55 | * light green = working well 56 | * pink = working but needs work (and is priority) 57 | * pink dashed = does not exist and priority 58 | * green = exists and needs work but not priority 59 | * grey = does not exist and not a priority 60 | 61 | ```mermaid 62 | graph LR 63 | 64 | datahub["DataHub CLI tool
publish (and get)"] 65 | dpm[Data Package Manager
works with DPs] 66 | wrangler[Data Wrangler
] 67 | wrangle2["Complex wrangling"] 68 | 69 | data[data cli] 70 | 71 | data --> datahub 72 | data --> dpm 73 | data --> wrangler 74 | 75 | datahub --> push 76 | datahub --> login 77 | datahub --> get 78 | datahub --> delete 79 | 80 | dpm --> init 81 | dpm --> validate 82 | dpm --> inspect 83 | 84 | wrangler --> info 85 | wrangler --> cat 86 | wrangler --> wrangle2 87 | 88 | classDef done fill:lightgreen,stroke:#333,stroke-width:2px; 89 | classDef ok fill:green,stroke:#333,stroke-width:2px; 90 | classDef priority fill:pink,stroke:#333,stroke-width:2px; 91 | classDef prioritynotstarted fill:pink,stroke:#333,stroke-width:2px,stroke-dasharray: 5, 5; 92 | classDef prioritylow fill:grey,stroke:#333,stroke-width:1px,stroke-dasharray; 93 | class login done; 94 | class push,validate priority; 95 | class get,wrangler,delete prioritylow; 96 | class init,info,cat ok; 97 | ``` 98 | 99 | Why do we have 3 features in one: 100 | 101 | * Doing push requires a lot of other stuff including the data package lib and (some) data wrangling (see diagram in next section) 102 | * To encourage people to use the data tool for itself not just for the DataHub. (But: people have lots of other tools - do they need another one?) 103 | 104 | Question: should we focus the tool just on the DataHub part (plus some Data Packages)? 105 | 106 | * In general we focus on MVP right now (remove what is not essential!) 107 | * => drop the data wrangling stuff (?) 108 | * However, we actually need most of this and it is useful to have some of this to hand (but be careful about feature creep) 109 | 110 | ## Doing `push` requires lots of the other stuff 111 | 112 | This diagram shows what is involved with doing a push 113 | 114 | ```mermaid 115 | graph TD 116 | 117 | push[push] 118 | show[show/cat/convert] 119 | validate 120 | 121 | push --> hubapi 122 | 123 | subgraph data.js 124 | infer 125 | caststream[cast stream] 126 | objstream[object stream] 127 | tableschema[tableschema] 128 | descriptor["descriptor
(pkg/resource)"] 129 | stream[raw stream] 130 | parseid["parse identifier
(Infer filetype)"] 131 | userprovided[User
Provided] 132 | ui 133 | end 134 | 135 | subgraph datahub-client 136 | hubapi --> login 137 | hubapi --> pushlib[push] 138 | end 139 | 140 | push --> descriptor 141 | push --> stream 142 | descriptor --> tableschema 143 | 144 | show --> caststream 145 | 146 | tableschema --> infer 147 | tableschema -.-> userprovided 148 | infer --> objstream 149 | infer -.-> ui 150 | 151 | objstream --> stream 152 | stream --> parseid 153 | caststream --> objstream 154 | caststream --> tableschema 155 | 156 | hubapi[hubapi

DataHub API wrapper] 157 | ``` 158 | 159 | For more on data.js Library - See https://hackmd.io/CwZgnOCMDs0LQEMAckBMdSsnMAjaIcADKgKYBmArGOSACbRJhA==?both 160 | 161 | 162 | 163 | # What is wanted (user perspective) 164 | 165 | [DataHub] *As a Publisher I want to* 166 | 167 | 1. Push / publish a local file: data push {file} 168 | 2. Push / publish a remote url: data push {url} 169 | 3. Push / publish a data package: ... 170 | 4. Get a data package, modify and republish (?) 171 | 5. Add a view 172 | 173 | These are in priority order. 174 | 175 | [Data Packager] I want to 176 | 177 | * create a package with data files 178 | * validate a data package or file 179 | * inspect data packages 180 | * read a resource (cast) 181 | 182 | [Wrangler] I want to 183 | 184 | * inspect data files quickly (size, type etc) 185 | * convert them ... e.g. xls -> csv, csv -> json 186 | 187 | ## An imagined session 188 | 189 | ```bash 190 | # search around for datasets 191 | 192 | # I found some csv or excel or even pdf - great! I want to see if useful 193 | data cat URL 194 | 195 | # or maybe just get some info on it e.g. its size, last updated 196 | data info URL 197 | 198 | # let's download it to inspect more ... 199 | data get url 200 | 201 | data cat file 202 | 203 | # hmm - it will need some cleaning up. 204 | 205 | # time to make some notes ... 206 | vi README.md 207 | 208 | # or maybe i just want to save this file online now ... 209 | data push file 210 | 211 | # more expert users may want to creata data package first ... 212 | data init 213 | data push 214 | ``` 215 | 216 | # Operations 217 | 218 | ## `push` 219 | 220 | 221 | ```bash 222 | # create a dataset with this file (auto-generate) 223 | data push myfile.csv 224 | 225 | # what about specifying the dataset target 226 | data push myfile.csv my-existing-dataset 227 | 228 | # or with username 229 | data push {myfile.csv} @myname/xyz 230 | 231 | # or prompt for the dataset name 232 | data push myfile.csv 233 | 234 | > Dataset name: my-ram-xyz 235 | > Dataset title: ... 236 | 237 | If a dataset already exists 238 | # TODO: what if we prompt for file name from user and it is the same as an existing dataset 239 | 240 | # we can store this somewhere ... 241 | .datahub/config 242 | default=myname/mydataset 243 | 244 | data push --dataset=xyz/name myfile.csv -- replaces the file 245 | 246 | # replaces the schema 247 | data push --dataset=name --schema myschema.yaml 248 | 249 | # updates the schema 250 | data push --dataset=name --schema myschema.json 251 | 252 | # fetches the schema 253 | data get --dataset=name --schema --format=yaml 254 | 255 | 256 | data push x [to y] 257 | data push myfile.csv [{dataset}] 258 | 259 | # what could be a problem 260 | 261 | data push myfile1.csv myfile2.csv [{dataset}] 262 | 263 | dataset = [user/]name 264 | ``` 265 | 266 | 267 | As a Publisher i want to publish a file and create a new dataset 268 | 269 | ``` 270 | data push myfile.csv [dataset-name] 271 | ``` 272 | 273 | As a Publisher I want to replace a file in an existing dataset 274 | 275 | As a Publisher I want to add a file to an existing dataset 276 | 277 | ``` 278 | # if existing file with path myfile.csv or name myfile then we replace 279 | data push myfile.csv dataset-name 280 | 281 | # if existing file with name existing-file then we replace, otherwise we add as `existing-file` 282 | data push myfile.csv dataset-name:existing-file 283 | 284 | # if existing file exists so this would replace WARN the user ... 
285 | data push myfile.csv dataset-name 286 | ``` 287 | 288 | As a Publisher I want to delete a file from an existing dataset 289 | 290 | ``` 291 | data strip dataset-name[:file-name] 292 | ``` 293 | 294 | 295 | 296 | ### data push {file} - Push / publish a local file: 297 | 298 | ``` 299 | ✓ data push some.csv 300 | ✓ data push some.xls 301 | ✓? data push some.xlsx 302 | ✓? data push a-random-file.xxx 303 | ✓? data push some.pdf // like a random file ... 304 | ✓? data push some.png // does it get viewed ... 305 | 306 | ✓? data push some.zip // inside are files => should act like pushing a directory? 307 | 308 | => should create data package with just README and no resources 309 | data push README.md (?) 310 | 311 | data push datapackage.json (?) 312 | 313 | data push file1.csv file2.csv (?) 314 | data push somefile.parquet 315 | ``` 316 | 317 | * Do we guess media type 318 | * Do we add hash 319 | * Do we prompt for file title 320 | * Do we validate before pushing 321 | * Do we add file size 322 | 323 | ### Issues 324 | 325 | * No progress bar on upload 326 | * No updates about what is happening on a push (we could update on each step successfully completed) 327 | * We should skip re-uploading a file to rawstore if already uploaded 328 | * TODO: does rawstore tell us this atm? 329 | * no support for data push and complex flows and flows involving automation 330 | * Could we automate creating the flow.yml? 331 | 332 | 2017-12-26 pushing to https://datahub.io/rufuspollock1/world-wealth-and-income-database/v/1 says at the top "Your data is safely stored and is getting processed - it will be here soon!" BUT I can already see a page (after a scan i do realise no files are there but that's sort of weird) and there is no other update information. Looking in JS debug i find: 333 | 334 | ``` 335 | Uncaught (in promise) TypeError: Cannot read property 'forEach' of undefined 336 | at https://datahub.io/static/dpr-js/dist/bundle.js:69:10210 337 | at r (https://datahub.io/static/dpr-js/dist/bundle.js:43:6760) 338 | at Generator._invoke (https://datahub.io/static/dpr-js/dist/bundle.js:43:7805) 339 | at Generator.e.(anonymous function) [as next] (https://datahub.io/static/dpr-js/dist/bundle.js:43:6939) 340 | at r (https://datahub.io/static/dpr-js/dist/bundle.js:50:5810) 341 | at https://datahub.io/static/dpr-js/dist/bundle.js:50:5912 342 | at 343 | ``` 344 | 345 | 346 | 2017-12-23 try pushing a data package with just a readme (not even a datapackage.json) and check whether it works (why? I'd like to push research datasets where i'm still in the process of digging stuff up) 347 | 348 | ## `get` 349 | 350 | ### Issues 351 | 352 | 2017-12-23 data get command hung on me and I can't debug ... (no debug flag) => I looked into adding this but introducing debug per command is a pain (we do it once by hand atm for push) => it should be systematic => refactoring dispatch code in the cli (and maybe therefore into the datahub-lib code ...) 353 | 354 | data get command should have option to pull "remote" resources to local paths ... 355 | 356 | 2017-12-23 (?) data get command should pretty print the datapackage.json 357 | 358 | ## `login` 359 | 360 | Login is working 361 | 362 | ## `cat` 363 | 364 | ... 365 | 366 | ## `init` 367 | 368 | ### Issues 369 | 370 | data init guesses types wrong e.g. for world 371 | incomes database. 
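(A possible workaround, sketched rather than verified: after `data init`, hand-edit the inferred `type` values in the generated `datapackage.json`. The field names below are invented for illustration; the structure follows the init example in the README.)

```json
"schema": {
  "fields": [
    {"name": "country", "type": "string"},
    {"name": "year", "type": "integer"},
    {"name": "income_share", "type": "number"}
  ]
}
```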
372 | 373 | 2017-12-26 data init does not add a license field by default 374 | 375 | ## `validate` 376 | 377 | ### Issues with data validation 378 | 379 | * Does not run offline as errors on failure to access a profile (who cares about profiles by default - 99% of what i want to check is the data ...) 380 | 381 | ## `info` 382 | 383 | ``` 384 | For FILE: 385 | 386 | {file-path} 387 | size: .. 388 | md5: ... 389 | format: ... 390 | encoding: ... 391 | 392 | {PREVIEW} 393 | 394 | data info 395 | 396 | ``` 397 | 398 | ## `delete` / `purge` / `hide` 399 | 400 | As Publisher I want to delete a dataset permanently so it does not exist 401 | 402 | ``` 403 | data purge dataset-name 404 | 405 | # prompt user to type out dataset name to 406 | 407 | 408 | # TODO: since people look for delete do we want to explain you hide the dataset 409 | ?? 410 | data hide 411 | data unpublish 412 | data delete # respond with use data push --findability ... 413 | ``` 414 | 415 | # Push File - Detailed Analysis 416 | 417 | Levels: 418 | 419 | 0. Already have Data Package (?) 420 | 1. Good CSV 421 | 2. Good Excel 422 | 3. Bad data (i.e. has ...) 423 | 3. Something else 424 | 425 | ``` 426 | data push {file-or-directory} 427 | ``` 428 | 429 | How does data push work? 430 | 431 | ``` 432 | # you are pushing the raw file 433 | # and the extraction to get one or more data tables ... 434 | # in the background we are creating a data package + pipeline 435 | data push {file} 436 | 437 | Algorithm: 438 | 439 | 1. Detect type / format 440 | 2. Choose the data (e.g. sheet from excel) 441 | 3. Review the headers 442 | 4. Infer data-types and review 443 | 5. [Add constraints] 444 | 6. Data validation 445 | 7. Upload 446 | 8. Get back a link - view page (or the raw url) e.g. http://datapackaged.com/core/finance-vix 447 | * You can view, share, publish, [fork] 448 | 449 | 1. Detect file type 450 | => file extension 451 | 1. Offer guess 452 | 2. Probable guess (options?) 453 | 3. Unknown - tell us 454 | 455 | 1B. Detect encoding (for CSV) 456 | 457 | 2. Choose the data 458 | 1. 1 sheet => ok 459 | 2. Multiple sheets guess and offer 460 | 3. Multiple sheets - ask them (which to include) 461 | 462 | 2B: bad data case - e.g. selecting within table 463 | 464 | 3. Review the headers 465 | * Here is what we found 466 | * More than one option for headers - try to reconcile 467 | * 468 | 469 | 470 | ### Upload: 471 | 472 | * raw file with name a function of the md5 hash 473 | * Pros: efficient on space (e.g. same file stored once but means you need to worry about garbage collection?) 474 | * the pipeline description: description of data and everything else we did [into database] 475 | 476 | Then pipeline runs e.g. load into a database or into a data package 477 | 478 | * stores output somewhere ... 479 | 480 | Viewable online ... 481 | 482 | Note: 483 | data push url # does not store file 484 | data push file # store in rawstore 485 | 486 | ### BitStore 487 | 488 | /rawstore/ - content addressed storage (md5 or sha hashed) 489 | /packages/{owner}/{name}/{tag-or-pipeline} 490 | ``` 491 | 492 | 493 | Try this for a CSV file 494 | 495 | ``` 496 | data push mydata.csv 497 | 498 | # review headers 499 | 500 | # data types ... 501 | 502 | 503 | Upload 504 | 505 | * csv file gets stored as blob md5 ... 506 | * output of the pipeline stored ... 507 | * canonical CSV gets generated ... 508 | ``` 509 | 510 | 511 | 512 | Data Push directory 513 | 514 | ``` 515 | data push {directory} 516 | 517 | # could just do data push file for each file but ... 
518 | # that could be tedious 519 | # once I've mapped one file you try reusing that mapping for others ... 520 | # .data directory that stores the pipeline and the datapackage.json 521 | ``` 522 | 523 | 524 | ## Push File - Sequence Diagram 525 | 526 | CLI architecture 527 | 528 | ```mermaid 529 | sequenceDiagram 530 | 531 | participant bin/push.js 532 | participant datahub 533 | participant data 534 | 535 | bin/push.js ->> data: new Package(path) 536 | data -->> bin/push.js: pkg obj 537 | bin/push.js ->> data: pkg.load() 538 | data ->> data: load 539 | data -->> bin/push.js: loaded pkg 540 | bin/push.js ->> datahub: new DataHub({...}) 541 | datahub -->> bin/push.js: datahub obj 542 | bin/push.js ->> datahub: datahub.push(pkg) 543 | datahub ->> data: pkg.resources 544 | data -->> datahub: loaded resources 545 | datahub ->> data: Resource.load(datapackage.json) 546 | datahub ->> datahub: authorize 547 | datahub ->> datahub: upload 548 | datahub ->> datahub: makeSourceSpec 549 | datahub ->> datahub: upload source spec 550 | datahub -->> bin/push.js: 0 or 1 551 | ``` 552 | 553 | ### Analysis 554 | 555 | What are the components involved ...? 556 | 557 | ```mermaid 558 | graph TD 559 | 560 | cli[data push myfile.csv] 561 | 562 | cli --> parseid["parsePath(myfile.csv)"

path=myfile.csv, sourceType=local, format=csv, mediaType=text/csv] 563 | 564 | parseid --> getstr["getStream(descriptor) - depending on url or local"

node stream] 565 | 566 | getstr --> parsestr["parseStream(rawStream)"

object iterator/node obj stream] 567 | 568 | parsestr --> infer["infer(objstream)

tableschema.json"] 569 | 570 | infer --> ui[show user infer and check] 571 | ``` 572 | 573 | # Appendix: Notes on NodeJS streams 574 | 575 | https://www.bennadel.com/blog/2692-you-have-to-explicitly-end-streams-after-pipes-break-in-node-js.htm 576 | 577 | https://gist.github.com/spion/ecdc92bc5de5b381da30 578 | 579 | https://github.com/maxogden/mississippi <-- recommended for managing node streams in consistent ways 580 | -------------------------------------------------------------------------------- /test/push/push.test.js: -------------------------------------------------------------------------------- 1 | // These tests are run only on tagged commits 2 | 3 | const test = require('ava') 4 | const clipboardy = require('clipboardy') 5 | 6 | const {runcli} = require('../cli.test.js') 7 | 8 | 9 | // ===================== 10 | // DATA-CLI PUSH correct 11 | 12 | // QA tests [pushing valid CSV file] 13 | 14 | test.serial('push command succeeds with regular CSV file', async t => { 15 | const path_ = 'test/fixtures/test-data/files/csv/separators/comma.csv' 16 | const args = '--name=comma-separated' 17 | const result = await runcli('push', path_, args) 18 | const stdout = result.stdout.split('\n') 19 | const hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 20 | const hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/comma-separated/v/')) 21 | t.truthy(hasPublishedMessage) 22 | t.truthy(hasURLtoShowcase) 23 | const whatsInClipboard = await clipboardy.read() 24 | t.true(whatsInClipboard.includes('https://datahub.io/test/comma-separated/v/')) 25 | }) 26 | 27 | // end of [pushing valid CSV file] 28 | 29 | test.serial('push --public', async t => { 30 | const path_ = 'test/fixtures/test-data/files/csv/separators/comma.csv' 31 | const args = ['--name=public-test', '--public', '--debug'] 32 | const result = await runcli('push', path_, ...args) 33 | 34 | t.truthy(result.stdout.includes('"findability": "published"')) 35 | }) 36 | 37 | // QA tests [pushing valid dataset from path] 38 | 39 | test.serial('push command succeeds for valid dataset', async t => { 40 | const path_ = 'test/fixtures/test-data/packages/basic-csv' 41 | const result = await runcli('push', path_) 42 | const stdout = result.stdout.split('\n') 43 | const hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 44 | const hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/basic-csv/v/')) 45 | t.truthy(hasPublishedMessage) 46 | t.truthy(hasURLtoShowcase) 47 | const whatsInClipboard = await clipboardy.read() 48 | t.true(whatsInClipboard.includes('https://datahub.io/test/basic-csv/v/')) 49 | }) 50 | 51 | // end of [pushing valid dataset from path] 52 | 53 | // QA tests [pushing valid dataset from working directory] 54 | test.serial('pushing valid dataset from working directory', async t =>{ 55 | const path_ = 'test/fixtures/test-data/packages/basic-csv' 56 | const usualWorkingDir = process.cwd() 57 | process.chdir(path_) 58 | console.log('Working directory changed: ' + process.cwd()) 59 | // push test 60 | const result = await runcli('push') 61 | const stdout = result.stdout.split('\n') 62 | const hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 63 | const hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/basic-csv/v/')) 64 | t.truthy(hasPublishedMessage) 65 | t.truthy(hasURLtoShowcase) 66 | const whatsInClipboard = await clipboardy.read() 67 | 
t.true(whatsInClipboard.includes('https://datahub.io/test/basic-csv/v/')) 68 | // change working dir to default, so other tests will not fail 69 | process.chdir(usualWorkingDir) 70 | console.log('Working directory restored: ' + process.cwd()) 71 | }) 72 | 73 | // QA tests [pushing multiple CSV files together] - should push only one file and show a WARNING message 74 | // https://datahub.io/test/zero/v/87 75 | test.serial.failing('pushing multiple CSV files Warning message', async t => { 76 | const path_ = 'test/fixtures/test-data/files/csv/separators/comma.csv' 77 | const path2_ = 'test/fixtures/test-data/files/csv/separators/colon.csv' 78 | const args = '--name=comma-separated' 79 | const result = await runcli('push', path_, path2_, args) 80 | const stdout = result.stdout.split('\n') 81 | const hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 82 | const hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/comma-separated/v/')) 83 | const hasWarningMessage = stdout.find(item => item.includes(`Warning: pushing only the ${path_} file.`)) 84 | t.truthy(hasPublishedMessage) 85 | t.truthy(hasURLtoShowcase) 86 | t.truthy(hasWarningMessage) 87 | const whatsInClipboard = await clipboardy.read() 88 | t.true(whatsInClipboard.includes('https://datahub.io/test/comma-separated/v/')) 89 | }) 90 | 91 | 92 | // QA tests [pushing valid dataset with path to datapackage.json] 93 | 94 | test.serial('push command succeeds for valid dataset with path to dp.json', async t => { 95 | const path_ = 'test/fixtures/test-data/packages/basic-csv/datapackage.json' 96 | const result = await runcli('push', path_) 97 | const stdout = result.stdout.split('\n') 98 | const hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 99 | const hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/basic-csv/v/')) 100 | t.truthy(hasPublishedMessage) 101 | t.truthy(hasURLtoShowcase) 102 | const whatsInClipboard = await clipboardy.read() 103 | t.true(whatsInClipboard.includes('https://datahub.io/test/basic-csv/v/')) 104 | }) 105 | 106 | // end of [pushing valid dataset with path to datapackage.json] 107 | 108 | // QA tests [pushing valid CSV from URL] 109 | 110 | test.serial('push command succeeds with regular CSV file from URL', async t => { 111 | const url_ = 'https://raw.githubusercontent.com/frictionlessdata/test-data/master/files/csv/separators/comma.csv' 112 | const args = '--name=comma-separated' 113 | const result = await runcli('push', url_, args) 114 | const stdout = result.stdout.split('\n') 115 | const hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 116 | const hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/comma-separated/v/')) 117 | t.truthy(hasPublishedMessage) 118 | t.truthy(hasURLtoShowcase) 119 | const whatsInClipboard = await clipboardy.read() 120 | t.true(whatsInClipboard.includes('https://datahub.io/test/comma-separated/v/')) 121 | }) 122 | 123 | // end of [pushing valid CSV from URL] 124 | 125 | 126 | // ======================== 127 | // Invalid metadata or data 128 | 129 | // QA tests [Push: Invalid datapackage.json] 130 | 131 | test('push command fails with invalid JSON descriptor', async t => { 132 | let path_ = 'test/fixtures/test-data/packages/invalid-json-single-quotes' 133 | let result = await runcli('push', path_) 134 | let stdout = result.stdout.split('\n') 135 | let hasErrorMsg = stdout.find(item => item.includes('> Error! 
Unexpected token \' in JSON at position 27')) 136 | t.truthy(hasErrorMsg) 137 | // Suggests running validate command: 138 | const hasSuggestionMsg = stdout.find(item => item.includes('> \'data validate\' to check your data.')) 139 | t.truthy(hasSuggestionMsg) 140 | 141 | path_ = 'test/fixtures/test-data/packages/invalid-json-missing-comma' 142 | result = await runcli('push', path_) 143 | stdout = result.stdout.split('\n') 144 | hasErrorMsg = stdout.find(item => item.includes('> Error! Unexpected string in JSON at position 113')) 145 | t.truthy(hasErrorMsg) 146 | }) 147 | 148 | // end of [Push: Invalid datapackage.json] 149 | 150 | // QA tests [Push: Invalid descriptor metadata] 151 | 152 | test('push command fails with descriptor validation error', async t => { 153 | let path_ = 'test/fixtures/test-data/packages/invalid-descriptor' 154 | let result = await runcli('push', path_) 155 | let stdout = result.stdout.split('\n') 156 | const hasErrorMsg = stdout.find(item => item.includes('Descriptor validation error:')) 157 | t.truthy(hasErrorMsg) 158 | let hasErrorDetails = stdout.find(item => item.includes('String does not match pattern: ^([-a-z0-9._/])+$')) 159 | t.truthy(hasErrorDetails) 160 | hasErrorDetails = stdout.find(item => item.includes('at \"/name\" in descriptor')) 161 | t.truthy(hasErrorDetails) 162 | }) 163 | 164 | // end of [Push: Invalid descriptor metadata] 165 | 166 | // QA tests [Push: Missing descriptor] 167 | 168 | test('push command fails if descriptor is missing', async t => { 169 | let path_ = 'test/fixtures/test-data/packages' 170 | let result = await runcli('push', path_) 171 | let stdout = result.stdout.split('\n') 172 | const hasErrorMsg = stdout.find(item => item.includes('> Error! No datapackage.json at destination.')) 173 | t.truthy(hasErrorMsg) 174 | let suggestsToDoValidate = stdout.find(item => item.includes('data validate')) 175 | let suggestsToDoInit = stdout.find(item => item.includes('data init')) 176 | t.truthy(suggestsToDoValidate) 177 | t.truthy(suggestsToDoInit) 178 | }) 179 | 180 | // end of [Push: Missing descriptor] 181 | 182 | // QA tests [Push: pushing remote data package] 183 | 184 | test('push command fails for remote datasets', async t => { 185 | let path_ = 'https://github.com/frictionlessdata/test-data/blob/master/packages/basic-csv/datapackage.json' 186 | let result = await runcli('push', path_) 187 | let stdout = result.stdout.split('\n') 188 | const hasErrorMsg = stdout.find(item => item.includes('Error: You can push only local datasets.')) 189 | t.truthy(hasErrorMsg) 190 | }) 191 | 192 | // end of [Push: pushing remote data package] 193 | 194 | // QA tests [Push: pushing valid dataset with remote resource] 195 | 196 | test('push command succeeds for valid dataset with remote resource', async t => { 197 | let path_ = 'test/fixtures/test-data/packages/remote-csv' 198 | let result = await runcli('push', path_) 199 | let stdout = result.stdout.split('\n') 200 | const hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 201 | const hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/remote-resource/v/')) 202 | t.truthy(hasPublishedMessage) 203 | t.truthy(hasURLtoShowcase) 204 | const whatsInClipboard = await clipboardy.read() 205 | t.true(whatsInClipboard.includes('https://datahub.io/test/remote-resource/v/')) 206 | }) 207 | 208 | // end of [Push: pushing valid dataset with remote resource] 209 | 210 | // QA tests [Pushing invalid CSV file (irrespective of schema)] 211 | // Also includes 
[pushing invalid CSV from URL ] 212 | 213 | test.failing('push command fails for invalid local CSV file', async t => { 214 | const path_ = 'test/fixtures/test-data/packages/invalid-data/extra-column.csv' 215 | const args = '--name=extra-column' 216 | const result = await runcli('push', path_, args) 217 | const stdout = result.stdout.split('\n') 218 | const hasErrorMsg = stdout.find(item => item.includes('> Error! Number of columns is inconsistent on line 2')) 219 | t.truthy(hasErrorMsg) 220 | }) 221 | 222 | // end of [Pushing invalid CSV file (irrespective of schema)] 223 | 224 | // QA tests [Pushing packaged invalid CSV file (irrespective of schema)] 225 | 226 | test.serial('push command succeeds with packaged invalid CSV', async t => { 227 | const path_ = 'test/fixtures/test-data/packages/invalid-data' 228 | const result = await runcli('push', path_) 229 | const stdout = result.stdout.split('\n') 230 | const hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 231 | const hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/basic-csv/v/')) 232 | t.truthy(hasPublishedMessage) 233 | t.truthy(hasURLtoShowcase) 234 | const whatsInClipboard = await clipboardy.read() 235 | t.true(whatsInClipboard.includes('https://datahub.io/test/basic-csv/v/')) 236 | }) 237 | 238 | // end of [Pushing packaged invalid CSV file (irrespective of schema)] 239 | 240 | // QA tests [Push non existing file] 241 | 242 | test('push command fails for non-existing file', async t => { 243 | let path_ = 'non-existing.csv' 244 | let result = await runcli('push', path_) 245 | let stdout = result.stdout.split('\n') 246 | const hasErrorMsg = stdout.find(item => item.includes('> Error! ENOENT: no such file or directory, lstat \'non-existing.csv\'')) 247 | t.truthy(hasErrorMsg) 248 | }) 249 | 250 | // end of [Push non existing file] 251 | 252 | // QA tests [pushing empty but correct files] 253 | 254 | test('push command for empty files: no ext, html, txt, json', async t => { 255 | let path_ = 'test/fixtures/test-data/files/empty-files/empty' 256 | let args = '--name=empty-no-extension' 257 | let result = await runcli('push', path_, args) 258 | let stdout = result.stdout.split('\n') 259 | let hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 260 | let hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/empty-no-extension/v/')) 261 | t.truthy(hasPublishedMessage) 262 | t.truthy(hasURLtoShowcase) 263 | 264 | path_ = 'test/fixtures/test-data/files/empty-files/empty.html' 265 | args = '--name=empty-html' 266 | result = await runcli('push', path_, args) 267 | stdout = result.stdout.split('\n') 268 | hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 269 | hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/empty-html/v/')) 270 | t.truthy(hasPublishedMessage) 271 | t.truthy(hasURLtoShowcase) 272 | let whatsInClipboard = await clipboardy.read() 273 | t.true(whatsInClipboard.includes('https://datahub.io/test/empty-html/v/')) 274 | 275 | path_ = 'test/fixtures/test-data/files/empty-files/empty.txt' 276 | args = '--name=empty-txt' 277 | result = await runcli('push', path_, args) 278 | stdout = result.stdout.split('\n') 279 | hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 280 | hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/empty-txt/v/')) 281 | t.truthy(hasPublishedMessage) 282 | 
t.truthy(hasURLtoShowcase) 283 | 284 | path_ = 'test/fixtures/test-data/files/empty-files/empty.json' 285 | args = '--name=empty-json' 286 | result = await runcli('push', path_, args) 287 | stdout = result.stdout.split('\n') 288 | hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 289 | hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/empty-json/v/')) 290 | t.truthy(hasPublishedMessage) 291 | t.truthy(hasURLtoShowcase) 292 | whatsInClipboard = await clipboardy.read() 293 | t.true(whatsInClipboard.includes('https://datahub.io/test/empty-json/v/')) 294 | }) 295 | 296 | test('push command fails for empty files tabular files such as csv,xls', async t => { 297 | let path_ = 'test/fixtures/test-data/files/empty-files/empty.csv' 298 | let args = '--name=empty-csv' 299 | let result = await runcli('push', path_, args) 300 | let stdout = result.stdout.split('\n') 301 | let hasErrorMsg = stdout.find(item => item.includes('tabular file is invalid: test/fixtures/test-data/files/empty-files/empty.csv')) 302 | t.truthy(hasErrorMsg) 303 | 304 | path_ = 'test/fixtures/test-data/files/empty-files/empty.xls' 305 | result = await runcli('push', path_, args) 306 | args = '--name=empty-xls' 307 | stdout = result.stdout.split('\n') 308 | hasErrorMsg = stdout.find(item => item.includes('You cannot push an empty sheet. Please, add some data and try again.')) 309 | t.truthy(hasErrorMsg) 310 | }) 311 | 312 | // end of [pushing empty but correct files] 313 | 314 | // QA tests [pushing 0 bytes files] 315 | 316 | test('push command fails for zero byte files', async t => { 317 | let path_ = 'test/fixtures/test-data/files/zero-files/zero' 318 | let args = '--name=zero' 319 | let result = await runcli('push', path_, args) 320 | let stdout = result.stdout.split('\n') 321 | let hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 322 | let hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/zero/v/')) 323 | t.truthy(hasPublishedMessage) 324 | t.truthy(hasURLtoShowcase) 325 | 326 | path_ = 'test/fixtures/test-data/files/zero-files/zero.csv' 327 | result = await runcli('push', path_, args) 328 | stdout = result.stdout.split('\n') 329 | t.true(stdout[0].includes('> Error! 
tabular file is invalid:')) 330 | 331 | path_ = 'test/fixtures/test-data/files/zero-files/zero.html' 332 | result = await runcli('push', path_, args) 333 | stdout = result.stdout.split('\n') 334 | hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 335 | hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/zero/v/')) 336 | t.truthy(hasPublishedMessage) 337 | t.truthy(hasURLtoShowcase) 338 | 339 | path_ = 'test/fixtures/test-data/files/zero-files/zero.txt' 340 | result = await runcli('push', path_, args) 341 | stdout = result.stdout.split('\n') 342 | hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 343 | hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/zero/v/')) 344 | t.truthy(hasPublishedMessage) 345 | t.truthy(hasURLtoShowcase) 346 | 347 | path_ = 'test/fixtures/test-data/files/zero-files/zero.json' 348 | result = await runcli('push', path_, args) 349 | stdout = result.stdout.split('\n') 350 | hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 351 | hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/zero/v/')) 352 | t.truthy(hasPublishedMessage) 353 | t.truthy(hasURLtoShowcase) 354 | 355 | path_ = 'test/fixtures/test-data/files/zero-files/zero.xls' 356 | result = await runcli('push', path_, args) 357 | stdout = result.stdout.split('\n') 358 | let hasErrorMsg = stdout.find(item => item.includes('You can not push empty files, please add some data and try again')) 359 | t.truthy(hasErrorMsg) 360 | }) 361 | 362 | // end of [pushing 0 bytes files] 363 | 364 | 365 | // ========== 366 | // Formatting 367 | 368 | // QA tests [pushing valid CSV with force formatting wrong extention (from path and URl)] 369 | 370 | test.serial('push command succeeds for CSV with wrong ext but force formatting', async t => { 371 | const path_ = 'test/fixtures/test-data/files/wrong-extension-files/comma.txt' 372 | let argName = '--name=comma-separated' 373 | let argFormat = '--format=csv' 374 | let result = await runcli('push', path_, argName, argFormat) 375 | let stdout = result.stdout.split('\n') 376 | let hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 377 | let hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/comma-separated/v/')) 378 | t.truthy(hasPublishedMessage) 379 | t.truthy(hasURLtoShowcase) 380 | let whatsInClipboard = await clipboardy.read() 381 | t.true(whatsInClipboard.includes('https://datahub.io/test/comma-separated/v/')) 382 | 383 | const url_ = 'https://raw.githubusercontent.com/frictionlessdata/test-data/master/files/wrong-extension-files/comma.txt' 384 | result = await runcli('push', path_, argName, argFormat) 385 | stdout = result.stdout.split('\n') 386 | hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 387 | hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/comma-separated/v/')) 388 | t.truthy(hasPublishedMessage) 389 | t.truthy(hasURLtoShowcase) 390 | whatsInClipboard = await clipboardy.read() 391 | t.true(whatsInClipboard.includes('https://datahub.io/test/comma-separated/v/')) 392 | }) 393 | 394 | // end of [pushing valid CSV with force formatting wrong extention (from path and URl)] 395 | 396 | // QA tests [pushing valid XLS and XLSX with force formatting] 397 | 398 | test('push command succeeds for Excel with wrong ext but force formatting', async t => { 399 | let path_ = 
'test/fixtures/test-data/files/wrong-extension-files/sample-1-sheet.txt' 400 | let argName = '--name=sample-excel-with-force-formatting' 401 | let argFormat = '--format=xls' 402 | let result = await runcli('push', path_, argName, argFormat) 403 | let stdout = result.stdout.split('\n') 404 | let hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 405 | let hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/sample-excel-with-force-formatting/v/')) 406 | t.truthy(hasPublishedMessage) 407 | t.truthy(hasURLtoShowcase) 408 | let whatsInClipboard = await clipboardy.read() 409 | t.true(whatsInClipboard.includes('https://datahub.io/test/sample-excel-with-force-formatting/v/')) 410 | 411 | path_ = 'test/fixtures/test-data/files/wrong-extension-files/sample-1-sheet.pdf' 412 | argFormat = '--format=xlsx' 413 | result = await runcli('push', path_, argName, argFormat) 414 | stdout = result.stdout.split('\n') 415 | hasPublishedMessage = stdout.find(item => item.includes('your data is published!')) 416 | hasURLtoShowcase = stdout.find(item => item.includes('https://datahub.io/test/sample-excel-with-force-formatting/v/')) 417 | t.truthy(hasPublishedMessage) 418 | t.truthy(hasURLtoShowcase) 419 | whatsInClipboard = await clipboardy.read() 420 | t.true(whatsInClipboard.includes('https://datahub.io/test/sample-excel-with-force-formatting/v/')) 421 | }) 422 | 423 | // end of [pushing valid XLS and XLSX with force formatting] 424 | 425 | // QA test [pushing non-CSV with force formatting] 426 | 427 | test('push command fails for non-CSV with force formatting', async t => { 428 | let path_ = 'test/fixtures/test-data/files/excel/sample-1-sheet.xls' 429 | const argName = '--name=not-csv-as-csv' 430 | const argFormat = '--format=csv' 431 | let result = await runcli('push', path_, argName, argFormat) 432 | let stdout = result.stdout.split('\n') 433 | let hasExpectedErrorMsg = stdout.find(item => item.includes('> Error! tabular file is invalid')) 434 | t.truthy(hasExpectedErrorMsg) 435 | 436 | let url_ = 'https://raw.githubusercontent.com/frictionlessdata/test-data/master/files/excel/sample-1-sheet.xls' 437 | result = await runcli('push', url_, argName, argFormat) 438 | stdout = result.stdout.split('\n') 439 | hasExpectedErrorMsg = stdout.find(item => item.includes('> Error! tabular file is invalid')) 440 | t.truthy(hasExpectedErrorMsg) 441 | 442 | path_ = 'test/fixtures/test-data/files/excel/sample-1-sheet.xlsx' 443 | result = await runcli('push', path_, argName, argFormat) 444 | stdout = result.stdout.split('\n') 445 | hasExpectedErrorMsg = stdout.find(item => item.includes('> Error! tabular file is invalid')) 446 | t.truthy(hasExpectedErrorMsg) 447 | 448 | url_ = 'https://raw.githubusercontent.com/frictionlessdata/test-data/master/files/excel/sample-1-sheet.xlsx' 449 | result = await runcli('push', url_, argName, argFormat) 450 | stdout = result.stdout.split('\n') 451 | hasExpectedErrorMsg = stdout.find(item => item.includes('> Error!
tabular file is invalid')) 452 | t.truthy(hasExpectedErrorMsg) 453 | }) 454 | 455 | // end of [pushing non-CSV with force formatting] 456 | 457 | // QA test [pushing non-CSV with force formatting (non-tabular)] 458 | 459 | test('push command fails for non-CSV (non-tabular) files with force formatting', async t => { 460 | let path_ = 'test/fixtures/test-data/files/other/sample.json' 461 | const argName = '--name=not-csv-as-csv' 462 | const argFormat = '--format=csv' 463 | let result = await runcli('push', path_, argName, argFormat) 464 | let stdout = result.stdout.split('\n') 465 | let hasExpectedErrorMsg = stdout.find(item => item.includes('> Error! tabular file is invalid')) 466 | t.truthy(hasExpectedErrorMsg) 467 | 468 | let url_ = 'https://raw.githubusercontent.com/frictionlessdata/test-data/master/files/other/sample.json' 469 | result = await runcli('push', url_, argName, argFormat) 470 | stdout = result.stdout.split('\n') 471 | hasExpectedErrorMsg = stdout.find(item => item.includes('> Error! tabular file is invalid')) 472 | t.truthy(hasExpectedErrorMsg) 473 | }) 474 | 475 | // end of [pushing non-CSV with force formatting (non-tabular)] 476 | 477 | 478 | // =========== 479 | // Excel files 480 | 481 | // QA test [pushing excel file with 1 sheet] 482 | 483 | test.serial('push command succeeds for simple Excel with 1 sheet', async t => { 484 | let path_ = 'test/fixtures/test-data/files/excel/sample-1-sheet.xls' 485 | const argName = '--name=test-excel-1-sheet' 486 | let result = await runcli('push', path_, argName, '--debug') 487 | // Check what's printed in console while in debug mode, e.g., if schema is included: 488 | let hasSchemaForFirstSheet = result.stdout.includes('"name": "number"') 489 | t.truthy(hasSchemaForFirstSheet) 490 | let hasPublishedMessage = result.stdout.includes('your data is published!') 491 | let hasURLtoShowcase = result.stdout.includes('https://datahub.io/test/test-excel-1-sheet/v/') 492 | t.truthy(hasPublishedMessage) 493 | t.truthy(hasURLtoShowcase) 494 | let whatsInClipboard = await clipboardy.read() 495 | t.true(whatsInClipboard.includes('https://datahub.io/test/test-excel-1-sheet/v/')) 496 | 497 | path_ = 'test/fixtures/test-data/files/excel/sample-1-sheet.xlsx' 498 | result = await runcli('push', path_, argName, '--debug') 499 | hasSchemaForFirstSheet = result.stdout.includes('"name": "number"') 500 | t.truthy(hasSchemaForFirstSheet) 501 | hasPublishedMessage = result.stdout.includes('your data is published!') 502 | hasURLtoShowcase = result.stdout.includes('https://datahub.io/test/test-excel-1-sheet/v/') 503 | t.truthy(hasPublishedMessage) 504 | t.truthy(hasURLtoShowcase) 505 | whatsInClipboard = await clipboardy.read() 506 | t.true(whatsInClipboard.includes('https://datahub.io/test/test-excel-1-sheet/v/')) 507 | }) 508 | 509 | // end of [pushing excel file with 1 sheet] 510 | 511 | // QA test [pushing excel file with selected sheets] 512 | // also includes: 513 | // [pushing excel file with selected non-existing sheet] 514 | // [pushing excel file with all sheets] 515 | // [pushing excel file with list of sheets] 516 | 517 | test.serial('push command succeeds for Excel with selected sheet', async t => { 518 | let path_ = 'test/fixtures/test-data/files/excel/sample-2-sheets.xls' 519 | const argName = '--name=test-excel-2-sheets' 520 | let argSheets = '--sheets=2' 521 | let result = await runcli('push', path_, argName, argSheets, '--debug') 522 | // Check what's printed in console while in debug mode, e.g., if schema is included: 523 | let
hasSchemaForSecondSheet = result.stdout.includes('"name": "header4"') 524 | t.truthy(hasSchemaForSecondSheet) 525 | let hasPublishedMessage = result.stdout.includes('your data is published!') 526 | let hasURLtoShowcase = result.stdout.includes('https://datahub.io/test/test-excel-2-sheets/v/') 527 | t.truthy(hasPublishedMessage) 528 | t.truthy(hasURLtoShowcase) 529 | let whatsInClipboard = await clipboardy.read() 530 | t.true(whatsInClipboard.includes('https://datahub.io/test/test-excel-2-sheets/v/')) 531 | 532 | path_ = 'test/fixtures/test-data/files/excel/sample-2-sheets.xlsx' 533 | result = await runcli('push', path_, argName, argSheets, '--debug') 534 | // Check what's printed in console while in debug mode, e.g., if schema is included: 535 | hasSchemaForSecondSheet = result.stdout.includes('"name": "header4"') 536 | t.truthy(hasSchemaForSecondSheet) 537 | hasPublishedMessage = result.stdout.includes('your data is published!') 538 | hasURLtoShowcase = result.stdout.includes('https://datahub.io/test/test-excel-2-sheets/v/') 539 | t.truthy(hasPublishedMessage) 540 | t.truthy(hasURLtoShowcase) 541 | whatsInClipboard = await clipboardy.read() 542 | t.true(whatsInClipboard.includes('https://datahub.io/test/test-excel-2-sheets/v/')) 543 | 544 | argSheets = '--sheets=5' 545 | result = await runcli('push', path_, argName, argSheets, '--debug') 546 | let hasErrorMsg = result.stdout.includes('Error! sheet index 5 is out of range') 547 | t.truthy(hasErrorMsg) 548 | 549 | argSheets = '--sheets=all' 550 | result = await runcli('push', path_, argName, argSheets, '--debug') 551 | let hasSchemaForFirstSheet = result.stdout.includes('"name": "header1"') 552 | hasSchemaForSecondSheet = result.stdout.includes('"name": "header4"') 553 | t.truthy(hasSchemaForFirstSheet) 554 | t.truthy(hasSchemaForSecondSheet) 555 | 556 | argSheets = '--sheets=1,2' 557 | result = await runcli('push', path_, argName, argSheets, '--debug') 558 | hasSchemaForFirstSheet = result.stdout.includes('"name": "header1"') 559 | hasSchemaForSecondSheet = result.stdout.includes('"name": "header4"') 560 | t.truthy(hasSchemaForFirstSheet) 561 | t.truthy(hasSchemaForSecondSheet) 562 | }) 563 | 564 | // end of [pushing excel file with selected sheets] 565 | 566 | 567 | test('push command fails for resources with invalid URL as path', async t => { 568 | const url_ = 'https://github.com/datasets/testtest' 569 | const argName = '--name=test' 570 | let result = await runcli('push', url_, argName) 571 | let stdout = result.stdout.split('\n') 572 | let hasErrorMsg = stdout.find(item => item.includes('> Error! Invalid URL. 404 Not Found: https://github.com/datasets/testtest')) 573 | t.truthy(hasErrorMsg) 574 | 575 | // Pushing a dataset with remote resource: 576 | const path_ = 'test/fixtures/test-data/packages/invalid-remote-path/' 577 | result = await runcli('push', path_, argName) 578 | stdout = result.stdout.split('\n') 579 | hasErrorMsg = stdout.find(item => item.includes('> Error! 
')) 580 | t.truthy(hasErrorMsg) 581 | }) 582 | -------------------------------------------------------------------------------- /test/cli.test.js: -------------------------------------------------------------------------------- 1 | // Test the CLI directly 2 | const fs = require('fs') 3 | const path = require('path') 4 | 5 | const test = require('ava') 6 | const {spawn} = require('cross-spawn') 7 | const run = require('inquirer-test') 8 | const {ENTER} = require('inquirer-test') 9 | 10 | const {version} = require('../package.json') 11 | 12 | const runcli = (...args) => { 13 | return new Promise((resolve, reject) => { 14 | const command = path.resolve(__dirname, '../bin/data.js') 15 | args.push('--test') 16 | const data = spawn(command, args) 17 | 18 | let stdout = '' 19 | data.stdout.on('data', data => { 20 | stdout += data 21 | }) 22 | 23 | data.on('error', err => { 24 | reject(err) 25 | }) 26 | 27 | data.on('close', code => { 28 | resolve({ 29 | code, 30 | stdout 31 | }) 32 | }) 33 | }) 34 | } 35 | 36 | // ========== 37 | // The basics 38 | 39 | 40 | 41 | test.after.always('cleanup', t => { 42 | let deleteFolderRecursive = (path) => { 43 | if (fs.existsSync(path)) { 44 | fs.readdirSync(path).forEach((file, index) => { 45 | let curPath = path + "/" + file; 46 | if (fs.lstatSync(curPath).isDirectory()) { // recurse 47 | deleteFolderRecursive(curPath); 48 | } else { // delete file 49 | fs.unlinkSync(curPath); 50 | } 51 | }) 52 | fs.rmdirSync(path); 53 | } 54 | } 55 | deleteFolderRecursive('finance-vix') 56 | deleteFolderRecursive('test/small-dataset-100kb') 57 | deleteFolderRecursive('test/medium-dataset-1mb') 58 | deleteFolderRecursive('test/big-dataset-10mb') 59 | deleteFolderRecursive('test/private-cli-test') 60 | try { 61 | fs.unlinkSync('test/fixtures/test-data/files/geo/datapackage.json') 62 | fs.unlinkSync('sample.csv') 63 | fs.unlinkSync('sample-1-sheet.xls') 64 | fs.unlinkSync('0.csv') 65 | } catch (err) { 66 | console.log('Finished cleanup without deleting some files.') 67 | } 68 | 69 | }) 70 | 71 | test('"data -v --version" prints version', async t => { 72 | let result = await runcli('-v') 73 | 74 | t.is(result.code, 0) 75 | let stdout = result.stdout.split('\n') 76 | t.true(stdout.length > 1) 77 | t.true(stdout[0].includes(`${version}`)) 78 | 79 | result = await runcli('--version') 80 | 81 | t.is(result.code, 0) 82 | stdout = result.stdout.split('\n') 83 | t.true(stdout.length > 1) 84 | t.true(stdout[0].includes(`${version}`)) 85 | }) 86 | 87 | test('"data help" prints help message', async t => { 88 | const result = await runcli('help') 89 | 90 | t.is(result.code, 0) 91 | const stdout = result.stdout.split('\n') 92 | t.true(stdout.length > 1) 93 | const hasExpectedMsg = stdout.find(item => item.includes('❒ data [options] ')) 94 | t.truthy(hasExpectedMsg) 95 | }) 96 | 97 | 98 | // ======================================= 99 | // DATA-CLI GET 100 | 101 | test('get command with local dataset', async t => { 102 | const identifier = 'test/fixtures/finance-vix' 103 | const result = await runcli('get', identifier) 104 | const stdout = result.stdout.split('\n') 105 | const hasTimeElapsedMsg = stdout.find(item => item.includes('Time elapsed:')) 106 | const hasSuccessMsg = stdout.find(item => item.includes('Dataset/file is saved in "finance-vix"')) 107 | t.truthy(hasTimeElapsedMsg) 108 | t.truthy(hasSuccessMsg) 109 | }) 110 | 111 | test('get command with local file', async t => { 112 | const identifier = 'test/fixtures/sample.csv' 113 | const result = await runcli('get', identifier) 114 | 
const stdout = result.stdout.split('\n') 115 | const hasTimeElapsedMsg = stdout.find(item => item.includes('Time elapsed:')) 116 | const hasSuccessMsg = stdout.find(item => item.includes('Dataset/file is saved in "sample.csv"')) 117 | t.truthy(hasTimeElapsedMsg) 118 | t.truthy(hasSuccessMsg) 119 | }) 120 | 121 | // QA tests [Get: r links from DataHub] 122 | 123 | test('get command with r links from DataHub', async t => { 124 | const identifier = 'https://datahub.io/test/small-dataset-100kb/r/0.csv' 125 | const result = await runcli('get', identifier) 126 | const stdout = result.stdout.split('\n') 127 | const hasTimeElapsedMsg = stdout.find(item => item.includes('Time elapsed:')) 128 | const hasSuccessMsg = stdout.find(item => item.includes('Dataset/file is saved in "0.csv"')) 129 | t.truthy(hasTimeElapsedMsg) 130 | t.truthy(hasSuccessMsg) 131 | }) 132 | 133 | // end of [Get: r links from DataHub] 134 | 135 | // QA tests [Get: Small dataset from DataHub] 136 | 137 | test('get command with small dataset from DataHub', async t => { 138 | const identifier = 'https://datahub.io/test/small-dataset-100kb/' 139 | const result = await runcli('get', identifier) 140 | const stdout = result.stdout.split('\n') 141 | const hasTimeElapsedMsg = stdout.find(item => item.includes('Time elapsed:')) 142 | const hasSuccessMsg = stdout.find(item => item.includes('Dataset/file is saved in "test/small-dataset-100kb"')) 143 | t.truthy(hasTimeElapsedMsg) 144 | t.truthy(hasSuccessMsg) 145 | }) 146 | 147 | // end of [Get: Small dataset from DataHub] 148 | 149 | // QA tests [Get: Medium dataset from DataHub] 150 | 151 | test('get command with medium dataset from DataHub', async t => { 152 | const identifier = 'https://datahub.io/test/medium-dataset-1mb' 153 | const result = await runcli('get', identifier) 154 | const stdout = result.stdout.split('\n') 155 | const hasTimeElapsedMsg = stdout.find(item => item.includes('Time elapsed:')) 156 | const hasSuccessMsg = stdout.find(item => item.includes('Dataset/file is saved in "test/medium-dataset-1mb"')) 157 | t.truthy(hasTimeElapsedMsg) 158 | t.truthy(hasSuccessMsg) 159 | }) 160 | 161 | // end of [Get: Meduim dataset from DataHub] 162 | 163 | // QA tests [Get: Big dataset from DataHub] 164 | 165 | test('get command with big dataset from DataHub', async t => { 166 | const identifier = 'https://datahub.io/test/big-dataset-10mb' 167 | const result = await runcli('get', identifier) 168 | const stdout = result.stdout.split('\n') 169 | const hasTimeElapsedMsg = stdout.find(item => item.includes('Time elapsed:')) 170 | const hasSuccessMsg = stdout.find(item => item.includes('Dataset/file is saved in "test/big-dataset-10mb"')) 171 | t.truthy(hasTimeElapsedMsg) 172 | t.truthy(hasSuccessMsg) 173 | }) 174 | 175 | // end of [Get: Big dataset from DataHub] 176 | 177 | // QA tests [Get: get excel file] 178 | 179 | test('get command with excel file', async t => { 180 | const identifier = 'https://github.com/frictionlessdata/test-data/blob/master/files/excel/sample-1-sheet.xls' 181 | const result = await runcli('get', identifier) 182 | const stdout = result.stdout.split('\n') 183 | const hasTimeElapsedMsg = stdout.find(item => item.includes('Time elapsed:')) 184 | const hasSuccessMsg = stdout.find(item => item.includes('Dataset/file is saved in "sample-1-sheet.xls"')) 185 | t.truthy(hasTimeElapsedMsg) 186 | t.truthy(hasSuccessMsg) 187 | }) 188 | 189 | // end of [Get: get excel file] 190 | 191 | // QA tests [Get: get private dataset] 192 | 193 | test('get command with private dataset', 
async t => { 194 | const identifier = 'https://datahub.io/test/private-cli-test' 195 | // Note that token for test user is set in env var. First we pass wrong token 196 | // as an argument and expect 404 or 403: 197 | const token = 'non-owner-token' 198 | let result = await runcli('get', identifier, `--token=${token}`) 199 | let stdout = result.stdout.split('\n') 200 | const hasErrorMsg = stdout.find(item => item.includes('> Error! 404: Not Found. Requested URL')) 201 | t.truthy(hasErrorMsg) 202 | 203 | // Now use correct token from env var: 204 | result = await runcli('get', identifier) 205 | stdout = result.stdout.split('\n') 206 | const hasTimeElapsedMsg = stdout.find(item => item.includes('Time elapsed:')) 207 | t.truthy(hasTimeElapsedMsg) 208 | t.true(fs.existsSync('test/private-cli-test/datapackage.json')) 209 | }) 210 | 211 | // end of QA tests [Get: get private dataset] 212 | 213 | 214 | // ======================================= 215 | // CLI commands: validate, cat, info, init 216 | 217 | test('Init command in non-interactive mode', async t => { 218 | const result = await runcli('init', 'test/fixtures/test-data/files/geo/') 219 | t.true(result.stdout.includes('This process initializes a new datapackage.json file')) 220 | t.true(result.stdout.includes('Descriptor is saved in')) 221 | }) 222 | 223 | // QA tests [Info: basic dataset] 224 | 225 | test('Info: basic dataset', async t => { 226 | let identifier = 'test/fixtures/test-data/packages/basic-csv' 227 | let result = await runcli('info', identifier) 228 | let stdout = result.stdout.split('\n') 229 | let hasMsg = stdout.find(item => item.includes('# basic-csv')) 230 | t.truthy(hasMsg) 231 | hasMsg = stdout.find(item => item.includes('comma-separated')) 232 | t.truthy(hasMsg) 233 | 234 | identifier = 'https://github.com/frictionlessdata/test-data/tree/master/packages/basic-csv' 235 | result = await runcli('info', identifier) 236 | stdout = result.stdout.split('\n') 237 | hasMsg = stdout.find(item => item.includes('# basic-csv')) 238 | t.truthy(hasMsg) 239 | hasMsg = stdout.find(item => item.includes('comma-separated')) 240 | t.truthy(hasMsg) 241 | }) 242 | 243 | // end of [Info: basic dataset] 244 | 245 | // QA tests [Info: dataset with multiple resources] 246 | 247 | test('Info: dataset with multiple resources', async t => { 248 | let identifier = 'test/fixtures/test-data/packages/different-separators' 249 | let result = await runcli('info', identifier) 250 | let stdout = result.stdout.split('\n') 251 | let hasCaretsResource = stdout.find(item => item.includes('carets')) 252 | let hasCommaResource = stdout.find(item => item.includes('comma')) 253 | t.truthy(hasCaretsResource) 254 | t.truthy(hasCommaResource) 255 | 256 | identifier = 'https://github.com/frictionlessdata/test-data/tree/master/packages/different-separators' 257 | result = await runcli('info', identifier) 258 | stdout = result.stdout.split('\n') 259 | hasCaretsResource = stdout.find(item => item.includes('carets')) 260 | hasCommaResource = stdout.find(item => item.includes('comma')) 261 | t.truthy(hasCaretsResource) 262 | t.truthy(hasCommaResource) 263 | }) 264 | 265 | // end if [Info: dataset with multiple resources] 266 | 267 | // QA tests [Info: basic CSV] 268 | 269 | test('Info: basic CSV', async t => { 270 | let identifier = 'test/fixtures/test-data/files/csv/100kb.csv' 271 | let result = await runcli('info', identifier) 272 | let stdout = result.stdout.split('\n') 273 | let hasDialect = stdout.find(item => item.includes('dialect')) 274 | let hasSchema = 
stdout.find(item => item.includes('schema')) 275 | let hasEncodings = stdout.find(item => item.includes('encoding')) 276 | let hasCreatedDate = stdout.find(item => item.includes('created')) 277 | let hasValueInTheTenthRow = stdout.find(item => item.includes('Sharlene')) 278 | let hasValueInTheEleventhRow = stdout.find(item => item.includes('Misti')) 279 | t.truthy(hasDialect) 280 | t.truthy(hasSchema) 281 | t.truthy(hasEncodings) 282 | t.falsy(hasCreatedDate) 283 | t.truthy(hasValueInTheTenthRow) 284 | t.falsy(hasValueInTheEleventhRow) 285 | }) 286 | 287 | // end of [Info: basic CSV] 288 | 289 | // QA tests [Info: non-tabular file] 290 | 291 | test('Info: non-tabular file', async t => { 292 | let identifier = 'test/fixtures/test-data/files/other/sample.pdf' 293 | let result = await runcli('info', identifier) 294 | let stdout = result.stdout.split('\n') 295 | let hasName = stdout.find(item => item.includes('name')) 296 | let hasFormat = stdout.find(item => item.includes('format')) 297 | let hasPath = stdout.find(item => item.includes('path')) 298 | let hasDialect = stdout.find(item => item.includes('dialect')) 299 | t.truthy(hasName) 300 | t.truthy(hasFormat) 301 | t.truthy(hasPath) 302 | t.falsy(hasDialect) 303 | 304 | identifier = 'https://github.com/frictionlessdata/test-data/raw/master/files/other/sample.pdf' 305 | result = await runcli('info', identifier) 306 | stdout = result.stdout.split('\n') 307 | hasName = stdout.find(item => item.includes('name')) 308 | hasFormat = stdout.find(item => item.includes('format')) 309 | hasPath = stdout.find(item => item.includes('path')) 310 | hasDialect = stdout.find(item => item.includes('dialect')) 311 | t.truthy(hasName) 312 | t.truthy(hasFormat) 313 | t.truthy(hasPath) 314 | t.falsy(hasDialect) 315 | }) 316 | 317 | // QA tests [Info: from datahub and github] 318 | 319 | test('info command with a dataset from GitHub', async t => { 320 | const identifier = 'https://github.com/datasets/finance-vix' 321 | const result = await runcli('info', identifier) 322 | const stdout = result.stdout.split('\n') 323 | const hasReadme = stdout.find(item => item.includes('CBOE Volatility Index (VIX) time-series dataset including')) 324 | const hasResource = stdout.find(item => item.includes('vix-daily')) 325 | t.truthy(hasReadme) 326 | t.truthy(hasResource) 327 | }) 328 | 329 | test('info command with a dataset from DataHub', async t => { 330 | const identifier = 'https://datahub.io/core/finance-vix' 331 | const result = await runcli('info', identifier) 332 | const stdout = result.stdout.split('\n') 333 | const hasReadme = stdout.find(item => item.includes('CBOE Volatility Index (VIX) time-series dataset including')) 334 | const hasResource = stdout.find(item => item.includes('vix-daily')) 335 | t.truthy(hasReadme) 336 | t.truthy(hasResource) 337 | }) 338 | 339 | // end of [Info: from datahub and github] 340 | 341 | // QA tests [Proper error messages] 342 | 343 | test('info command - no dataset or descriptor at URL', async t => { 344 | const url_ = 'https://datahub.io' 345 | const result = await runcli('info', url_) 346 | const stdout = result.stdout.split('\n') 347 | const hasErrorMsg = stdout.find(item => item.includes('Expected URL to a dataset or descriptor.')) 348 | t.truthy(hasErrorMsg) 349 | }) 350 | 351 | test('get command - no dataset or descriptor at URL', async t => { 352 | const url_ = 'https://datahub.io' 353 | const result = await runcli('get', url_) 354 | const stdout = result.stdout.split('\n') 355 | const hasErrorMsg = stdout.find(item => 
item.includes('Expected URL to a dataset or descriptor.')) 356 | t.truthy(hasErrorMsg) 357 | }) 358 | 359 | // end of [Proper error messages] 360 | 361 | // QA tests [Validate: basic csv resource] 362 | 363 | test('validate command - basic dataset', async t => { 364 | const path_ = 'test/fixtures/test-data/packages/basic-csv' 365 | const result = await runcli('validate', path_) 366 | const stdout = result.stdout.split('\n') 367 | const hasValidMessage = stdout.find(item => item.includes('Your Data Package is valid!')) 368 | t.truthy(hasValidMessage) 369 | }) 370 | 371 | test.serial('validate command - remote basic dataset', async t => { 372 | const url_ = 'https://github.com/frictionlessdata/test-data/tree/master/packages/basic-csv' 373 | const result = await runcli('validate', url_) 374 | const stdout = result.stdout.split('\n') 375 | const hasValidMessage = stdout.find(item => item.includes('Your Data Package is valid!')) 376 | t.truthy(hasValidMessage) 377 | }) 378 | 379 | // end of [Validate: basic csv resource] 380 | 381 | // QA tests [Validate: non-tabular resource LOCALLY] 382 | 383 | test('validate command - non-tabular resource', async t => { 384 | const path_ = 'test/fixtures/test-data/packages/non-tabular-resource' 385 | const result = await runcli('validate', path_) 386 | const stdout = result.stdout.split('\n') 387 | const hasValidMessage = stdout.find(item => item.includes('Your Data Package is valid!')) 388 | t.truthy(hasValidMessage) 389 | }) 390 | 391 | test.serial('validate command - remote dataset with non-tabular resource', async t => { 392 | const url_ = 'https://github.com/frictionlessdata/test-data/tree/master/packages/non-tabular-resource' 393 | const result = await runcli('validate', url_) 394 | const stdout = result.stdout.split('\n') 395 | const hasValidMessage = stdout.find(item => item.includes('Your Data Package is valid!')) 396 | t.truthy(hasValidMessage) 397 | }) 398 | 399 | // end of [Validate: non-tabular resource LOCALLY] 400 | 401 | // QA tests [Validate: remote resource] 402 | 403 | test('validate command - remote resource', async t => { 404 | const path_ = 'test/fixtures/test-data/packages/remote-csv' 405 | const result = await runcli('validate', path_) 406 | const stdout = result.stdout.split('\n') 407 | const hasValidMessage = stdout.find(item => item.includes('Your Data Package is valid!')) 408 | t.truthy(hasValidMessage) 409 | }) 410 | 411 | test.serial('validate command - remote dataset with remote resource', async t => { 412 | const url_ = 'https://github.com/frictionlessdata/test-data/tree/master/packages/remote-csv' 413 | const result = await runcli('validate', url_) 414 | const stdout = result.stdout.split('\n') 415 | const hasValidMessage = stdout.find(item => item.includes('Your Data Package is valid!')) 416 | t.truthy(hasValidMessage) 417 | }) 418 | 419 | // end of [Validate: remote resource] 420 | 421 | // QA tests [Validate: csv with different separators] 422 | 423 | test('validate command - csv with different separators', async t => { 424 | const path_ = 'test/fixtures/test-data/packages/different-separators' 425 | const result = await runcli('validate', path_) 426 | const stdout = result.stdout.split('\n') 427 | const hasValidMessage = stdout.find(item => item.includes('Your Data Package is valid!')) 428 | t.truthy(hasValidMessage) 429 | }) 430 | 431 | test.serial('validate command - remote dataset with csv with different separators', async t => { 432 | const url_ = 
'https://github.com/frictionlessdata/test-data/tree/master/packages/different-separators' 433 | const result = await runcli('validate', url_) 434 | const stdout = result.stdout.split('\n') 435 | const hasValidMessage = stdout.find(item => item.includes('Your Data Package is valid!')) 436 | t.truthy(hasValidMessage) 437 | }) 438 | 439 | // end of [Validate: csv with different separators] 440 | 441 | // QA tests [Validate: invalid path to resource] 442 | 443 | test('validate command - invalid local path', async t => { 444 | const path_ = 'test/fixtures/test-data/packages/invalid-local-path' 445 | const result = await runcli('validate', path_) 446 | const stdout = result.stdout.split('\n') 447 | const hasErrorMsg = stdout.find(item => item.includes('> Error! ENOENT: no such file or directory')) 448 | t.truthy(hasErrorMsg) 449 | }) 450 | 451 | // end of [Validate: invalid path to resource] 452 | 453 | // QA tests [Validate: invalid remote path to resource] 454 | 455 | test.serial('validate command - invalid remote path for resource', async t => { 456 | const path_ = 'test/fixtures/test-data/packages/invalid-remote-path' 457 | const result = await runcli('validate', path_) 458 | const stdout = result.stdout.split('\n') 459 | const hasErrorMsg = stdout.find(item => item.includes('> Error! Request failed with status code 404')) 460 | const hasResourceName = stdout.find(item => item.includes('> Error! Resource: invalid-remote-path')) 461 | const hasResourcePath = stdout.find(item => item.includes('> Error! Path: https://raw.githubusercontent.com/frictionlessdata/there/is/no/such/file.csv')) 462 | t.truthy(hasErrorMsg) 463 | t.truthy(hasResourceName) 464 | t.truthy(hasResourcePath) 465 | }) 466 | 467 | test.serial('validate command - remote dataset with invalid remote path for resource', async t => { 468 | const url_ = 'https://github.com/frictionlessdata/test-data/tree/master/packages/invalid-remote-path' 469 | const result = await runcli('validate', url_) 470 | const stdout = result.stdout.split('\n') 471 | const hasErrorMsg = stdout.find(item => item.includes('> Error! Request failed with status code 404')) 472 | const hasResourceName = stdout.find(item => item.includes('> Error! Resource: invalid-remote-path')) 473 | const hasResourcePath = stdout.find(item => item.includes('> Error! Path: https://raw.githubusercontent.com/frictionlessdata/there/is/no/such/file.csv')) 474 | t.truthy(hasErrorMsg) 475 | t.truthy(hasResourceName) 476 | t.truthy(hasResourcePath) 477 | }) 478 | 479 | // end of [Validate: invalid remote path to resource] 480 | 481 | // QA tests [Validate: csv with different field types, formats and constraints] 482 | 483 | test.serial('validate command - wrong constraints', async t => { 484 | const path_ = 'test/fixtures/test-data/packages/types-formats-and-constraints/constraints' 485 | let result = await runcli('validate', path_) 486 | let stdout = result.stdout.split('\n') 487 | let hasErrorMsg = stdout.find(item => item.includes('> Error! There are 7 type and format mismatch errors on line 3')) 488 | t.truthy(hasErrorMsg) 489 | 490 | const url_ = 'https://github.com/frictionlessdata/test-data/tree/master/packages/types-formats-and-constraints/constraints' 491 | result = await runcli('validate', url_) 492 | stdout = result.stdout.split('\n') 493 | hasErrorMsg = stdout.find(item => item.includes('> Error! 
There are 7 type and format mismatch errors on line 3')) 494 | t.truthy(hasErrorMsg) 495 | }) 496 | 497 | test.serial('validate command - wrong "date" type/format', async t => { 498 | const path_ = 'test/fixtures/test-data/packages/types-formats-and-constraints/date' 499 | let result = await runcli('validate', path_) 500 | let stdout = result.stdout.split('\n') 501 | let hasErrorMsg = stdout.find(item => item.includes('> Error! There are 2 type and format mismatch errors on line 3')) 502 | t.truthy(hasErrorMsg) 503 | 504 | const url_ = 'https://github.com/frictionlessdata/test-data/tree/master/packages/types-formats-and-constraints/date' 505 | result = await runcli('validate', url_) 506 | stdout = result.stdout.split('\n') 507 | hasErrorMsg = stdout.find(item => item.includes('> Error! There are 2 type and format mismatch errors on line 3')) 508 | t.truthy(hasErrorMsg) 509 | }) 510 | 511 | test.serial('validate command - wrong "datetime" type/format', async t => { 512 | const path_ = 'test/fixtures/test-data/packages/types-formats-and-constraints/datetime' 513 | let result = await runcli('validate', path_) 514 | let stdout = result.stdout.split('\n') 515 | let hasErrorMsg = stdout.find(item => item.includes('> Error! There are 3 type and format mismatch errors on line 3')) 516 | t.truthy(hasErrorMsg) 517 | 518 | const url_ = 'https://github.com/frictionlessdata/test-data/tree/master/packages/types-formats-and-constraints/datetime' 519 | result = await runcli('validate', url_) 520 | stdout = result.stdout.split('\n') 521 | hasErrorMsg = stdout.find(item => item.includes('> Error! There are 3 type and format mismatch errors on line 3')) 522 | t.truthy(hasErrorMsg) 523 | }) 524 | 525 | test.serial('validate command - wrong "string" type/format', async t => { 526 | const path_ = 'test/fixtures/test-data/packages/types-formats-and-constraints/string' 527 | let result = await runcli('validate', path_) 528 | let stdout = result.stdout.split('\n') 529 | let hasErrorMsg = stdout.find(item => item.includes('> Error! There are 3 type and format mismatch errors on line 3')) 530 | t.truthy(hasErrorMsg) 531 | 532 | const url_ = 'https://github.com/frictionlessdata/test-data/tree/master/packages/types-formats-and-constraints/string' 533 | result = await runcli('validate', url_) 534 | stdout = result.stdout.split('\n') 535 | hasErrorMsg = stdout.find(item => item.includes('> Error! There are 3 type and format mismatch errors on line 3')) 536 | t.truthy(hasErrorMsg) 537 | }) 538 | 539 | test.serial('validate command - wrong "time" type/format', async t => { 540 | const path_ = 'test/fixtures/test-data/packages/types-formats-and-constraints/time' 541 | let result = await runcli('validate', path_) 542 | let stdout = result.stdout.split('\n') 543 | let hasErrorMsg = stdout.find(item => item.includes('> Error! There are 3 type and format mismatch errors on line 3')) 544 | t.truthy(hasErrorMsg) 545 | 546 | const url_ = 'https://github.com/frictionlessdata/test-data/tree/master/packages/types-formats-and-constraints/time' 547 | result = await runcli('validate', url_) 548 | stdout = result.stdout.split('\n') 549 | hasErrorMsg = stdout.find(item => item.includes('> Error! 
There are 3 type and format mismatch errors on line 3')) 550 | t.truthy(hasErrorMsg) 551 | }) 552 | 553 | test.serial('validate command - wrong "year" type/format', async t => { 554 | const path_ = 'test/fixtures/test-data/packages/types-formats-and-constraints/year' 555 | let result = await runcli('validate', path_) 556 | let stdout = result.stdout.split('\n') 557 | let hasErrorMsg = stdout.find(item => item.includes('> Error! There are 1 type and format mismatch errors on line 2')) 558 | t.truthy(hasErrorMsg) 559 | 560 | const url_ = 'https://github.com/frictionlessdata/test-data/tree/master/packages/types-formats-and-constraints/year' 561 | result = await runcli('validate', url_) 562 | stdout = result.stdout.split('\n') 563 | hasErrorMsg = stdout.find(item => item.includes('> Error! There are 1 type and format mismatch errors on line 2')) 564 | t.truthy(hasErrorMsg) 565 | }) 566 | 567 | test.serial('validate command - wrong "yearmonth" type/format', async t => { 568 | const path_ = 'test/fixtures/test-data/packages/types-formats-and-constraints/yearmonth' 569 | let result = await runcli('validate', path_) 570 | let stdout = result.stdout.split('\n') 571 | let hasErrorMsg = stdout.find(item => item.includes('> Error! There are 1 type and format mismatch errors on line 2')) 572 | t.truthy(hasErrorMsg) 573 | 574 | const url_ = 'https://github.com/frictionlessdata/test-data/tree/master/packages/types-formats-and-constraints/yearmonth' 575 | result = await runcli('validate', url_) 576 | stdout = result.stdout.split('\n') 577 | hasErrorMsg = stdout.find(item => item.includes('> Error! There are 1 type and format mismatch errors on line 2')) 578 | t.truthy(hasErrorMsg) 579 | }) 580 | 581 | // end of [Validate: csv with different field types, formats and constraints] 582 | 583 | // QA tests [Cat: basic csv] 584 | 585 | test('cat command - basic behaviour', async t => { 586 | const path_ = 'test/fixtures/test-data/files/csv/all-schema-types.csv' 587 | const results = await runcli('cat', path_) 588 | const stdout = results.stdout.split('\n') 589 | const hasMsg = stdout.find(item => item.includes('│ 1.0 │')) 590 | t.truthy(hasMsg) 591 | }) 592 | 593 | test('cat command - remote csv file', async t => { 594 | const url_ = 'https://raw.githubusercontent.com/frictionlessdata/test-data/master/files/csv/all-schema-types.csv' 595 | const results = await runcli('cat', url_) 596 | const stdout = results.stdout.split('\n') 597 | const hasMsg = stdout.find(item => item.includes('│ 1.0 │')) 598 | t.truthy(hasMsg) 599 | }) 600 | 601 | test('cat command - remote non tabular file', async t => { 602 | const url_ = 'https://raw.githubusercontent.com/frictionlessdata/test-data/master/files/other/sample.txt' 603 | const results = await runcli('cat', url_) 604 | const stdout = results.stdout.split('\n') 605 | const hasErrorMsg = stdout.find(item => item.includes('> Error! We do not have a parser for that format: txt')) 606 | t.truthy(hasErrorMsg) 607 | }) 608 | 609 | test('cat command - non-existing path', async t => { 610 | const path_ = 'non/existing/path' 611 | const results = await runcli('cat', path_) 612 | const stdout = results.stdout.split('\n') 613 | const hasErrorMsg = stdout.find(item => item.includes('> Error! 
ENOENT: no such file or directory')) 614 | t.truthy(hasErrorMsg) 615 | }) 616 | 617 | test('cat command - URL that returns 404', async t => { 618 | const url_ = 'https://raw.githubusercontent.com/frictionlessdata/test-data/master/files/other/sampl.csv' 619 | const results = await runcli('cat', url_) 620 | const stdout = results.stdout.split('\n') 621 | let hasErrorMsg = stdout.find(item => item.includes('> Error! Provided URL is invalid')) 622 | t.truthy(hasErrorMsg) 623 | hasErrorMsg = stdout.find(item => item.includes('> Error! 404: Not Found. Requested URL: https://raw.githubusercontent.com/frictionlessdata/test-data/master/files/other/sampl.csv')) 624 | t.truthy(hasErrorMsg) 625 | }) 626 | 627 | // end of [Cat: basic csv] 628 | 629 | // QA tests [Cat: different separators] 630 | 631 | test('cat command - files with different separator', async t => { 632 | // Local files: 633 | let path_ = 'test/fixtures/test-data/files/csv/separators/semicolon.csv' 634 | let results = await runcli('cat', path_) 635 | let stdout = results.stdout.split('\n') 636 | let delimiterWasntRecognized = stdout.find(item => item.includes(';')) 637 | t.falsy(delimiterWasntRecognized) 638 | let hasCorrectPrint = stdout.find(item => item.includes('number')) 639 | t.truthy(hasCorrectPrint) 640 | 641 | path_ = 'test/fixtures/test-data/files/csv/separators/carets.csv' 642 | results = await runcli('cat', path_) 643 | stdout = results.stdout.split('\n') 644 | delimiterWasntRecognized = stdout.find(item => item.includes('^')) 645 | t.falsy(delimiterWasntRecognized) 646 | hasCorrectPrint = stdout.find(item => item.includes('number')) 647 | t.truthy(hasCorrectPrint) 648 | 649 | // Remote files: 650 | let url_ = 'https://raw.githubusercontent.com/frictionlessdata/test-data/master/files/csv/separators/semicolon.csv' 651 | results = await runcli('cat', url_) 652 | stdout = results.stdout.split('\n') 653 | delimiterWasntRecognized = stdout.find(item => item.includes(';')) 654 | t.falsy(delimiterWasntRecognized) 655 | hasCorrectPrint = stdout.find(item => item.includes('number')) 656 | t.truthy(hasCorrectPrint) 657 | 658 | url_ = 'https://raw.githubusercontent.com/frictionlessdata/test-data/master/files/csv/separators/carets.csv' 659 | results = await runcli('cat', url_) 660 | stdout = results.stdout.split('\n') 661 | delimiterWasntRecognized = stdout.find(item => item.includes('^')) 662 | t.falsy(delimiterWasntRecognized) 663 | hasCorrectPrint = stdout.find(item => item.includes('number')) 664 | t.truthy(hasCorrectPrint) 665 | }) 666 | 667 | // end of [Cat: different separators] 668 | 669 | // QA test [Cat: different encodings] 670 | 671 | test.failing('cat command - different encodings', async t => { 672 | const path_ = 'test/fixtures/test-data/files/csv/encodings/iso8859.csv' 673 | let results = await runcli('cat', path_) 674 | let stdout = results.stdout.split('\n') 675 | let hasCorrectPrint = stdout.find(item => item.includes('Réunion')) 676 | t.truthy(hasCorrectPrint) 677 | 678 | const url_ = 'https://raw.githubusercontent.com/frictionlessdata/test-data/master/files/csv/encodings/western-macos-roman.csv' 679 | results = await runcli('cat', url_) 680 | stdout = results.stdout.split('\n') 681 | hasCorrectPrint = stdout.find(item => item.includes('Réunion')) 682 | t.truthy(hasCorrectPrint) 683 | }) 684 | 685 | // end of [Cat: different encodings] 686 | 687 | test('cat command - local tsv file', async t => { 688 | const path_= 'test/fixtures/test-data/files/csv/separators/tab.tsv' 689 | const results = await runcli('cat', 
path_) 690 | const stdout = results.stdout.split('\n') 691 | const hasCorrectPrint = stdout.find(item => item.includes('number')) 692 | t.truthy(hasCorrectPrint) 693 | }) 694 | 695 | test('cat command - remote tsv file', async t => { 696 | const url_ = 'https://raw.githubusercontent.com/frictionlessdata/test-data/master/files/csv/separators/tab.tsv' 697 | const results = await runcli('cat', url_) 698 | const stdout = results.stdout.split('\n') 699 | const hasCorrectPrint = stdout.find(item => item.includes('number')) 700 | t.truthy(hasCorrectPrint) 701 | }) 702 | 703 | test('cat command - inconsistent columns', async t => { 704 | const path_ = 'test/fixtures/test-data/files/csv/inconsistent-column-number.csv' 705 | const results = await runcli('cat', path_) 706 | const stdout = results.stdout.split('\n') 707 | const hasErrorMsg = stdout.find(item => item.includes('> Error! Number of columns is inconsistent on line 3')) 708 | t.truthy(hasErrorMsg) 709 | }) 710 | 711 | test('cat command - remote excel file', async t => { 712 | const url_ = 'https://github.com/frictionlessdata/test-data/raw/master/files/excel/sample-1-sheet.xls' 713 | const results = await runcli('cat', url_) 714 | const stdout = results.stdout.split('\n') 715 | const hasCorrectPrint = stdout.find(item => item.includes('number')) 716 | t.truthy(hasCorrectPrint) 717 | }) 718 | 719 | test('cat command - specific excel sheet', async t => { 720 | const path_ = 'test/fixtures/test-data/files/excel/sample-2-sheets.xlsx' 721 | // With sheet name: 722 | let results = await runcli('cat', path_, '--sheet=Sheet2') 723 | let stdout = results.stdout.split('\n') 724 | let hasHeaderFrom2ndSheet = stdout.find(item => item.includes('header4')) 725 | t.truthy(hasHeaderFrom2ndSheet) 726 | // With sheet index: 727 | results = await runcli('cat', path_, '--sheet=2') 728 | stdout = results.stdout.split('\n') 729 | hasHeaderFrom2ndSheet = stdout.find(item => item.includes('header4')) 730 | t.truthy(hasHeaderFrom2ndSheet) 731 | // When sheet doesn't exist: 732 | results = await runcli('cat', path_, '--sheet=3') 733 | stdout = results.stdout.split('\n') 734 | t.is(stdout[0], '> Error! Input source is empty or doesn\'t exist.') 735 | }) 736 | 737 | module.exports = { 738 | runcli 739 | } 740 | --------------------------------------------------------------------------------