├── .gitignore ├── addons ├── datahub.json └── figshare.json ├── cli.js ├── index.js ├── lib └── fetch.js ├── package.json ├── readme.md ├── tests └── basic.js └── usage ├── add.txt ├── root.txt ├── status.txt └── update.txt /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | -------------------------------------------------------------------------------- /addons/datahub.json: -------------------------------------------------------------------------------- 1 | { 2 | "icon": "https://avatars3.githubusercontent.com/u/1630326?v=3&s=400", 3 | "type": "ckan", 4 | "name": "datahub", 5 | "description": "The free, powerful data management platform from the Open Knowledge Foundation, based on the CKAN data management system.", 6 | "searcher": "ckan-search", 7 | "url": "http://datahub.io" 8 | } 9 | -------------------------------------------------------------------------------- /addons/figshare.json: -------------------------------------------------------------------------------- 1 | { 2 | "icon": "http://dhimmel.com/wp-content/uploads/2014/02/figshare-spiralsticker.png", 3 | "type": "figshare", 4 | "name": "figshare", 5 | "description": "figshare is a repository where users can make all of their research outputs available in a citable, shareable and discoverable manner.", 6 | "searcher": "figshare", 7 | "url": "http://figshare.com" 8 | } 9 | -------------------------------------------------------------------------------- /cli.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | var args = require('minimist')(process.argv.slice(2)) 3 | var fs = require('fs') 4 | var path = require('path') 5 | var through = require('through2') 6 | var relativeDate = require('relative-date') 7 | var progress = require('progress-string') 8 | var diff = require('ansi-diff-stream')() 9 | var prettyBytes = require('pretty-bytes') 10 | var fetch = require('./lib/fetch') 11 | 12 | var dps = require('./')(args.path) 13 | exec(args._[0]) 14 | 15 | 16 | function exec (cmd) { 17 | if (cmd === 'add') { 18 | var url = args._[1] 19 | if (!url || args.help) return usage('add') 20 | args.name = args.name || args.n || args._[2] 21 | 22 | var name = args.name || normalize(location) 23 | var resource = { 24 | location: location, 25 | type: args.type, 26 | name: name 27 | } 28 | 29 | return fetch(resource, function (err, resource) { 30 | if (err) abort(err) 31 | process.stdout.write('Downloading...') 32 | if (resource.size) { 33 | var bar = progress({width: 25, total: resource.size}) 34 | var interval = setInterval(function () { 35 | var loc = dps.downloadLocation(resource) 36 | fs.lstat(loc, function (err, stat) { 37 | var percentage = ((stat.size / resource.size) * 100).toFixed(2) 38 | diff.write('[' + bar(stat.size) + `] ${percentage} %`) 39 | }) 40 | }, 250) 41 | } 42 | diff.pipe(process.stdout) 43 | dps.download(resource, function (err) { 44 | if (err) return abort(err) 45 | process.stdout.write('done!\n') 46 | console.log(resource) 47 | }) 48 | }) 49 | } 50 | 51 | if (cmd === 'rm' || cmd === 'remove') { 52 | var name = args._[1] 53 | if (!name || args.help) return usage('dps rm ') 54 | return dps.remove(name, function (err, data) { 55 | if (err) abort(err) 56 | done('Successfully deleted.') 57 | }) 58 | } 59 | 60 | if (cmd === 'update') { 61 | if (args.help) return usage('update') 62 | name = args._[1] 63 | if (!name) { 64 | return dps.update(function (err) { 65 | if (err) abort(err) 66 | done('Successfully updated.') 67 | }) 68 | } 69 | var resource = dps.get({name: name}) 70 | var cb = function (err, data) { 71 | if (err) abort(err) 72 | done(data) 73 | } 74 | dps.updateResource(resource, function (err, resource) { 75 | if (err) abort(err) 76 | console.log(resource) 77 | }) 78 | } 79 | 80 | if (cmd === 'destroy') { 81 | if (args.help) return usage('dps destroy removes everything!') 82 | return dps.destroy(function (err) { 83 | if (err) abort(err) 84 | console.log('goodbye') 85 | }) 86 | } 87 | 88 | if (cmd === 'status' || cmd === 'st') { 89 | if (args.help) return usage('status') 90 | cb = function (err, data) { 91 | if (err) abort(err) 92 | var output = '' 93 | for (var key in dps.config.resources) { 94 | if (dps.config.resources.hasOwnProperty(key)) { 95 | var resource = dps.config.resources[key] 96 | output += '\n' 97 | output += resource.name + '\n' 98 | output += resource.location + '\n' 99 | if (resource.meta.checked) output += ' checked: ' + relativeDate(new Date(resource.meta.checked)) 100 | if (resource.meta.modified) output += ' modified: ' + relativeDate(new Date(resource.meta.modified)) 101 | if (resource.size) output += ' size: ' + prettyBytes(resource.size) 102 | output += '\n' 103 | return console.log(output) 104 | } 105 | } 106 | } 107 | 108 | name = args._[1] 109 | if (name) return dps.check(name, cb) 110 | else return dps.checkAll(cb) 111 | } 112 | 113 | usage('root') 114 | } 115 | 116 | function done (message) { 117 | dps.save(function (err) { 118 | if (err) abort(err) 119 | console.log(message) 120 | }) 121 | } 122 | 123 | function abort (err) { 124 | console.error(err) 125 | console.trace(err) 126 | process.exit(1) 127 | } 128 | 129 | function usage (name) { 130 | var message = fs.readFileSync(path.join(__dirname, 'usage', name + '.txt')).toString() 131 | console.error(message) 132 | process.exit(0) 133 | } 134 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | var path = require('path') 2 | var fs = require('fs') 3 | var mkdirp = require('mkdirp') 4 | var pump = require('pump') 5 | var util = require('util') 6 | var extend = require('extend') 7 | var debug = require('debug')('dps') 8 | var got = require('got') 9 | var rimraf = require('rimraf') 10 | var parallel = require('run-parallel') 11 | var fedsearch = require('federated-search') 12 | var fetch = require('./lib/fetch.js') 13 | 14 | var CONFIG_FILE = 'dps.json' 15 | 16 | module.exports = DPS 17 | 18 | function DPS (dir) { 19 | if (!(this instanceof DPS)) return new DPS(dir) 20 | this.dir = dir || process.cwd() 21 | this.configPath = path.join(this.dir, CONFIG_FILE) 22 | this.config = readConfig(this.configPath) 23 | } 24 | 25 | DPS.prototype.downloadLocation = function (resource) { 26 | var out = path.join(this.dir, resource.name) 27 | return path.join(out, path.basename(resource.location)) 28 | } 29 | 30 | DPS.prototype.download = function (resource, cb) { 31 | var self = this 32 | var old = self.get(resource) 33 | 34 | if (old && old.meta && 35 | new Date(old.meta.modified) === resource.meta.modified) { 36 | return cb(null, resource) 37 | } 38 | var reader = got.stream(resource.location) 39 | var out = self.downloadLocation(resource) 40 | var writer = fs.createWriteStream(out) 41 | var stream = pump(reader, writer, function (err) { 42 | if (err) return cb(err) 43 | addToConfig(self.config, resource) 44 | self.save(function (err) { 45 | if (err) return cb(err) 46 | return cb(null, resource) 47 | }) 48 | }) 49 | 50 | } 51 | 52 | DPS.prototype.add = function (location, args) { 53 | // TODO: add a local directory to the tracker.. 54 | } 55 | 56 | 57 | DPS.prototype.update = function (cb) { 58 | var self = this 59 | self._parallelize(self.updateResource, cb) 60 | } 61 | 62 | DPS.prototype.updateResource = function (resource, cb) { 63 | this.download(resource, cb) 64 | } 65 | 66 | DPS.prototype.checkAll = function (cb) { 67 | this._parallelize(fetch, cb) 68 | } 69 | 70 | DPS.prototype.check = function (opts, cb) { 71 | fetch(this.get(opts), cb) 72 | } 73 | 74 | DPS.prototype.save = function (cb) { 75 | var self = this 76 | fs.writeFile(self.configPath, JSON.stringify(self.config, null, 2), cb) 77 | } 78 | 79 | DPS.prototype.remove = function (name, cb) { 80 | var self = this 81 | if (!name) return cb(new Error('Remove requires a name')) 82 | var resource = self.get({name: name}) 83 | if (!resource) return cb(new Error('Resource not found with name', name)) 84 | rimraf(resource.name, function (err) { 85 | if (err) return cb(err) 86 | removeFromConfig(self.config, name) 87 | cb() 88 | }) 89 | } 90 | 91 | DPS.prototype._get_index = function (query) { 92 | var self = this 93 | for (var i in self.config.resources) { 94 | var resource = self.config.resources[i] 95 | for (var key in query) { 96 | if (resource[key] === query[key]) return i 97 | } 98 | } 99 | } 100 | 101 | DPS.prototype._resourcePath = function (resource) { 102 | return path.join(this.dir, resource.name) 103 | } 104 | 105 | DPS.prototype.get = function (opts) { 106 | var self = this 107 | var i = self._get_index(opts) 108 | return self.config.resources[i] 109 | } 110 | 111 | DPS.prototype.destroy = function (cb) { 112 | var self = this 113 | self._parallelize(function destroyResource (resource, done) { 114 | rimraf(self._resourcePath(resource), done) 115 | }, function destroyConfig () { 116 | rimraf(self.configPath, cb) 117 | }) 118 | } 119 | 120 | DPS.prototype._parallelize = function (func, cb) { 121 | var self = this 122 | var tasks = [] 123 | for (var i in self.config.resources) { 124 | (function (i) { 125 | tasks.push(function (done) { 126 | func.call(self, self.config.resources[i], done) 127 | }) 128 | })(i) 129 | } 130 | parallel(tasks, cb) 131 | } 132 | 133 | function addToConfig (config, resource) { 134 | config.resources.push(resource) 135 | } 136 | 137 | function removeFromConfig (config, name) { 138 | var newResources = [] 139 | for (var i in config.resources) { 140 | var resource = config.resources[i] 141 | if (resource.name !== name) newResources.push(resource) 142 | } 143 | config.resources = newResources 144 | } 145 | 146 | function readConfig (configPath) { 147 | if (fs.existsSync(configPath)) return JSON.parse(fs.readFileSync(configPath)) 148 | return { 149 | resources: [] 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /lib/fetch.js: -------------------------------------------------------------------------------- 1 | var got = require('got') 2 | var debug = require('debug')('dps') 3 | 4 | module.exports = function (resource, cb) { 5 | var opts = { method: 'HEAD' } 6 | got(resource.location, opts, function (err, data, res) { 7 | if (err) return cb(err) 8 | resource.size = res.headers['content-length'] 9 | if (resource.size) resource.size = parseInt(resource.size) 10 | resource.meta = { 11 | modified: new Date(res.headers['last-modified']), 12 | checked: new Date() 13 | } 14 | resource.type = 'url' 15 | cb(null, resource) 16 | }) 17 | } 18 | 19 | function normalize (location) { 20 | return location.replace('\/', '_').replace(/[^a-z_+A-Z0-9]/ig, '') 21 | } 22 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dps", 3 | "version": "1.0.1", 4 | "description": "data per second (a data package system)", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "electron app/app.js 2>&1 | silence-chromium", 8 | "test": "standard && tape tests/*.js" 9 | }, 10 | "bin": { 11 | "dps": "./cli.js" 12 | }, 13 | "repository": { 14 | "type": "git", 15 | "url": "https://github.com/karissa/dps.git" 16 | }, 17 | "author": "Karissa McKelvey (http://karissamck.com/)", 18 | "license": "ISC", 19 | "bugs": { 20 | "url": "https://github.com/karissa/dps/issues" 21 | }, 22 | "homepage": "https://github.com/karissa/dps", 23 | "dependencies": { 24 | "ansi-diff-stream": "^1.2.0", 25 | "cli-prompt": "^0.4.2", 26 | "dat-ping": "^1.1.0", 27 | "debug": "^2.2.0", 28 | "dom": "0.0.3", 29 | "extend": "^3.0.0", 30 | "federated-search": "^1.1.1", 31 | "format-data": "^2.1.2", 32 | "got": "^4.1.1", 33 | "minimist": "^1.2.0", 34 | "mkdirp": "^0.5.1", 35 | "npm-execspawn": "^1.2.1", 36 | "page": "^1.6.3", 37 | "pretty-bytes": "^2.0.1", 38 | "progress": "^1.1.8", 39 | "progress-string": "^1.2.1", 40 | "pump": "^1.0.0", 41 | "ractive-toolkit": "^1.0.1", 42 | "relative-date": "^1.1.2", 43 | "rimraf": "^2.4.3", 44 | "run-parallel": "^1.1.2", 45 | "silence-chromium": "^2.0.0", 46 | "standard": "^5.3.1", 47 | "stream-iterate": "^1.1.1", 48 | "through2": "^2.0.0" 49 | }, 50 | "devDependencies": { 51 | "electron-prebuilt": "^0.33.0", 52 | "tape": "^4.2.0" 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # dps (data per second) 2 | 3 | Search, manage, and update datasets. **This is half vaporware right now.** 4 | 5 | With open data comes a price. It's difficult to track and manage all of the urls and APIs that provide data. Harvesting data for a simple query like 'city water' will become a nightmare, let alone if you have custom scraping processes to clean that data after it's downloaded! How do you know when the source data has changed? Enter DPS. 6 | 7 | ## Example 8 | 9 | ``` 10 | dps add http://www.opendatacache.com/cookcounty.socrata.com/api/geospatial/26nm-wd5q cookcounty.geo 11 | ``` 12 | 13 | This puts the following entry into `dps.json`: 14 | ``` 15 | { 16 | location: 'http://www.opendatacache.com/cookcounty.socrata.com/api/geospatial/26nm-wd5q', 17 | type: 'url', 18 | name: 'cookcounty.geo', 19 | size: 618400, 20 | meta: { 21 | modified: 'Wed Jun 24 2015 14:52:26 GMT-0700 (PDT)', 22 | checked: 'Tue Sep 22 2015 01:51:43 GMT-0700 (PDT)' 23 | } 24 | } 25 | ``` 26 | 27 | See status of all my downloads: 28 | ``` 29 | $ dps status 30 | http://www.opendatacache.com/cookcounty.socrata.com/api/geospatial/26nm-wd5q 31 | checked: 3 minutes ago modified: 3 months ago size: 618.4 kB 32 | 33 | http://localhost:6442 34 | checked: 10 minutes ago modified: 5 hours ago size: 4.06 kB 35 | ``` 36 | 37 | Check the sources for updates. 38 | ``` 39 | $ dps update 40 | http://www.opendatacache.com/cookcounty.socrata.com/api/geospatial/26nm-wd5q 41 | checked: just now modified: 3 months ago size: 618.4 kB 42 | 43 | http://localhost:6442 44 | checked: just now modified: 5 hours ago size: 4.06 kB 45 | ``` 46 | 47 | ## Developer install 48 | ``` 49 | git clone https://github.com/karissa/dps.git 50 | cd dps 51 | npm install 52 | ``` 53 | -------------------------------------------------------------------------------- /tests/basic.js: -------------------------------------------------------------------------------- 1 | var test = require('tape') 2 | var tmp = require('os').tmpdir() 3 | var fs = require('fs') 4 | var path = require('path') 5 | var dps = require('..')(tmp) 6 | 7 | var location = 'http://localhost:6442' 8 | 9 | test('add/get/destroy', function (t) { 10 | dps.destroy(function (err) { 11 | t.ifError(err) 12 | var downloader = dps.download(location, {name: 'cookcounty.csv'}) 13 | downloader.on('done', function (resource) { 14 | t.ifError(err) 15 | t.same(resource.location, location, 'location same') 16 | t.same(resource.name, 'cookcounty.csv', 'name same') 17 | t.ok(fs.existsSync(path.join(tmp, resource.name)), 'resource path exists') 18 | var gotten = dps.get({name: resource.name}) 19 | t.deepEquals(gotten, resource, 'resource same') 20 | dps.save(function (err) { 21 | t.ifError(err) 22 | t.ok(fs.existsSync(dps.configPath), 'config path exists') 23 | dps.destroy(function (err) { 24 | t.ifError(err) 25 | t.false(fs.existsSync(dps.configPath), 'successfully destroys config path') 26 | t.false(fs.existsSync(path.join(tmp, resource.name)), 'successfully destroys resource') 27 | t.end() 28 | }) 29 | }) 30 | }) 31 | }) 32 | }) 33 | -------------------------------------------------------------------------------- /usage/add.txt: -------------------------------------------------------------------------------- 1 | dps add [name] 2 | 3 | Adds a data source to the dps repository. 4 | 5 | Parameters: 6 | 7 | : required 8 | 9 | URL or filepath to executable. 10 | 11 | [name]: optional 12 | 13 | A user-friendly name for the data source. 14 | 15 | --type : optional 16 | 17 | Inferred from input, but can be forced. Could be `url`, `dat`, `ftp`, `zip` 18 | -------------------------------------------------------------------------------- /usage/root.txt: -------------------------------------------------------------------------------- 1 | usage: dps [--flag] [--key=value] 2 | 3 | commands: 4 | add add a data source to the list 5 | update update previously downloaded sources 6 | remove remove a source 7 | status get status on current sources 8 | destroy remove everything! 9 | 10 | type `dps --help` to view specific command help 11 | -------------------------------------------------------------------------------- /usage/status.txt: -------------------------------------------------------------------------------- 1 | dps status 2 | -------------------------------------------------------------------------------- /usage/update.txt: -------------------------------------------------------------------------------- 1 | dps update [name] 2 | 3 | Updates all datasets. 4 | 5 | Parameters: 6 | 7 | [name]: optional 8 | 9 | Will update only the dataset with the given name. 10 | 11 | Options: 12 | 13 | --trackers 14 | 15 | Will update only the tracker metadata for dps search, not the datasets themselves. 16 | 17 | --dry-run 18 | 19 | Will output a list of all datasets that would be updated. 20 | --------------------------------------------------------------------------------