├── LICENSE ├── README.md ├── index.js ├── lib └── openrefine.js ├── package.json └── test ├── mocha.opts ├── op.json ├── openrefine.js └── test.csv /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright © 2015 Pomin Wu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the “Software”), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 9 | of the Software, and to permit persons to whom the Software is furnished to do 10 | so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | Fork this project to create your own MIT license that you can always link to. 24 | 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # node-openrefine 3 | 4 | Node.js client library for controlling OpenRefine. 
5 | 6 | ## TODO / Features 7 | 8 | * [x] upload, apply operations, download results, delete project 9 | * [ ] pipe 10 | * [ ] CLI tool 11 | 12 | ## Usage 13 | 14 | ``` javascript 15 | var openrefine = require('openrefine') 16 | 17 | // another server; same usage 18 | var server = openrefine.server('http://localhost:3333') 19 | 20 | // projects metadata 21 | openrefine 22 | .projects() 23 | .then(project_metadata => ...) 24 | ``` 25 | 26 | Project metadata format: 27 | 28 | ``` javascript 29 | { 30 | "[project_id]": { 31 | "name": "[project_name]", 32 | "created": "[project_creation_time]", 33 | "modified": "[project_modification_time]", 34 | "customMetadata": {} 35 | }, 36 | ...[More projects]... 37 | } 38 | ``` 39 | 40 | Create a project and clean up some data: 41 | 42 | ``` javascript 43 | var project = openrefine 44 | .create('data_cleanup_project') // .create() auto-generates a project name 45 | .accept('csv') 46 | .accept({ 47 | separator: ',', 48 | ignoreLines: 1 49 | }) 50 | .expose('csv') 51 | .keep(true) // keep data after end() or pipe; by default data is not kept 52 | .use([ 53 | { 54 | "op": "core/column-split", 55 | "description": "Split column DATE by separator", 56 | "engineConfig": { 57 | "facets": [], 58 | "mode": "row-based" 59 | }, 60 | "columnName": "DATE", 61 | "guessCellType": true, 62 | "removeOriginalColumn": true, 63 | "mode": "separator", 64 | "separator": "-", 65 | "regex": false, 66 | "maxColumns": 0 67 | } 68 | ]) 69 | .use(customCleanupAddress()) // customCleanupAddress() returns an array of operations 70 | 71 | project 72 | .load('input.csv') 73 | .end(function (data) { 74 | // ... 75 | }) 76 | .then(() => project.destroy()) 77 | ``` 78 | 79 | Or use the stream interface: 80 | 81 | ``` javascript 82 | fs.createReadStream('input.csv') 83 | .pipe(project) 84 | .pipe(fs.createWriteStream('output.csv')) 85 | ``` 86 | 87 | A project may have some internal state (project metadata such as name and ID, previously imported data, etc.).
To open an existing project, use numeric ID of OpenRefine: 88 | 89 | ``` javascript 90 | server.open(1234567980) 91 | ``` 92 | 93 | Delete all data in a project: 94 | 95 | ``` javascript 96 | project.clean() 97 | ``` 98 | 99 | Destroy a project after use: 100 | 101 | ``` javascript 102 | project.destroy() 103 | ``` 104 | 105 | 106 | 107 | ## See also 108 | 109 | * [Refine API](https://github.com/maxogden/refine-python/wiki/Refine-API) and implementations [in Python](https://github.com/maxogden/refine-python/) and [in Ruby](https://github.com/maxogden/refine-ruby). 110 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | module.exports = { 4 | OpenRefine: require('./lib/openrefine') 5 | } 6 | -------------------------------------------------------------------------------- /lib/openrefine.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | var sa = require('superagent') 4 | var fs = require('fs') 5 | var csv = require('csv') 6 | var debug = require('debug')('openrefine') 7 | 8 | // ES6 Promise plugin for superagent 9 | function promise () { 10 | return function (req) { 11 | req.run = function () { 12 | return new Promise(function (resolve, reject) { 13 | Object.getPrototypeOf(req).end.call(req, function (err, res) { 14 | if (err) { return reject(err) } 15 | if (!res.ok) { return reject(res.error) } 16 | resolve(res) 17 | }) 18 | }) 19 | } 20 | } 21 | } 22 | 23 | // ignore HTTP 302 (promise-based) plugin for superagent 24 | function ignore302 () { 25 | return req => { 26 | var prevRun = req.run 27 | req.run = () => 28 | prevRun() 29 | .catch(err => { 30 | if (err.status === 302) { 31 | return err.response 32 | } 33 | return Promise.reject(err) 34 | }) 35 | } 36 | } 37 | 38 | class Project { 39 | constructor (project_name, endpoint) { 40 | this._name = project_name || 
'abc' 41 | this._endpoint = endpoint 42 | this._upload_queue = [] 43 | this._input_format = 'csv' 44 | this._op = [] 45 | } 46 | 47 | /** 48 | * Set or get project name. 49 | * 50 | * @param {String} name 51 | * @return {Project} for chaining 52 | * @api public 53 | */ 54 | name () { 55 | if (arguments.length > 0) { 56 | this._name = arguments[0] 57 | return this 58 | } 59 | return this._name 60 | } 61 | 62 | /** 63 | * Get project ID. 64 | * 65 | * @return {String|undefined} 66 | * @api public 67 | */ 68 | id () { 69 | return this._project_id ? String(this._project_id) : this._project_id 70 | } 71 | 72 | /** 73 | * Set or get input format. 74 | * 75 | * @param {String} format 76 | * @return {Project} for chaining 77 | * @api public 78 | */ 79 | accept () { 80 | if (arguments.length > 0) { 81 | this._input_format = arguments[0] 82 | return this 83 | } 84 | return this._input_format 85 | } 86 | 87 | /** 88 | * Set or get output format. 89 | * 90 | * @param {String} format 91 | * @return {Project} for chaining 92 | * @api public 93 | */ 94 | expose () { 95 | if (arguments.length > 0) { 96 | this._output_format = arguments[0] 97 | return this 98 | } 99 | return this._output_format 100 | } 101 | 102 | keep () { 103 | if (arguments.length > 0) { 104 | this._keep = arguments[0] 105 | return this 106 | } 107 | return this._keep 108 | } 109 | 110 | /** 111 | * Use operation in data pipeline. 
112 | * 113 | * @param {object} op 114 | * @return {Project} for chaining 115 | * @api public 116 | */ 117 | use (op) { 118 | this._op.push(this._apply_op(this._endpoint, op)) 119 | return this 120 | } 121 | 122 | _upload_data (endpoint, project_name, file_name) { 123 | return sa.post(endpoint + '/command/core/create-project-from-upload?options={"encoding":"UTF-8","separator":",","ignoreLines":-1,"headerLines":1,"skipDataLines":0,"limit":-1,"storeBlankRows":true,"guessCellValueTypes":false,"processQuotes":true,"storeBlankCellsAsNulls":true,"includeFileSources":false}') 124 | .use(promise()) 125 | .use(ignore302()) 126 | .redirects(0) 127 | .field('project-name', project_name) 128 | .attach('project-file', fs.readFileSync(file_name), file_name) 129 | } 130 | 131 | _download_data (endpoint, project_id, format) { 132 | debug('download data in ' + project_id) 133 | return sa.post(endpoint + '/command/core/export-rows/' + project_id + '.' + format) 134 | .use(promise()) 135 | .send('engine={"facets":[],"mode":"row-based"}') 136 | .send('project=' + project_id) 137 | .send('format=' + this._output_format) 138 | .run() 139 | .then(res => res.text) 140 | } 141 | 142 | _apply_op (endpoint, op) { 143 | debug('apply operations `%s`', JSON.stringify(op)) 144 | return { 145 | run: () => { 146 | return sa.post(endpoint + '/command/core/apply-operations?project=' + this._project_id) 147 | .use(promise()) 148 | .send('operations=' + JSON.stringify(op)) 149 | .run() 150 | } 151 | } 152 | } 153 | 154 | _start_queue (queue) { 155 | if (this._current_job !== undefined) { 156 | return this._current_job 157 | } 158 | if (queue.length === 0) { 159 | return Promise.resolve() 160 | } 161 | this._current_job = queue.shift().run() 162 | .catch(err => debug(err.response)) 163 | .then(res => { 164 | if (!this._project_id) { 165 | this._project_id = +res.headers.location.replace(this._endpoint + '/project?project=', '') 166 | } 167 | this._current_job = undefined 168 | return 
this._start_queue(queue) 169 | }) 170 | return this._current_job 171 | } 172 | 173 | /** 174 | * Load data into pipeline. 175 | * 176 | * @param {String} file_name 177 | * @return {Project} for chaining 178 | * @api public 179 | */ 180 | load (file_name) { 181 | debug('put ' + file_name + ' on queue') 182 | this._upload_queue.push(this._upload_data(this._endpoint, this._name, file_name)) 183 | this._start_queue(this._upload_queue).catch(debug) 184 | return this 185 | } 186 | 187 | /** 188 | * Start data loading and operations. 189 | * 190 | * @param {Function} done 191 | * @return {Promise} 192 | * @api public 193 | */ 194 | end (done) { 195 | var p = this._start_queue(this._upload_queue) 196 | .then(() => { 197 | return this._start_queue(this._op) 198 | }) 199 | .then(() => { 200 | return this._download_data(this._endpoint, this._project_id, 'csv') 201 | }) 202 | if (this.expose() === undefined) { 203 | p = p.then(text => 204 | new Promise((resolve, reject) => csv.parse(text, { delimiter: '\t', columns: true }, (err, data) => { 205 | if (err) { return reject(err) } 206 | resolve(data) 207 | })) 208 | ) 209 | } 210 | return p.then(done).catch(debug) 211 | } 212 | 213 | /** 214 | * Destroy project. 215 | * 216 | * @return {Promise} 217 | * @api public 218 | */ 219 | destroy () { 220 | return this._start_queue(this._upload_queue) 221 | .then(() => this._start_queue(this._op)) 222 | .then(() => { 223 | debug('delete project ' + this._project_id) 224 | return sa.post(this._endpoint + '/command/core/delete-project') 225 | .use(promise()) 226 | .send('project=' + this._project_id) 227 | .run() 228 | }) 229 | } 230 | } 231 | 232 | module.exports = function (endpoint) { 233 | var conf = { 234 | endpoint: endpoint || 'http://localhost:3333' 235 | } 236 | var server = {} 237 | 238 | /** 239 | * Create a new project. 
240 | * 241 | * @param {String} project_name 242 | * @return {Project} 243 | * @api public 244 | */ 245 | server.create = function (project_name) { 246 | debug('create project ' + project_name) 247 | var p = new Project(project_name, conf.endpoint) 248 | p._promise = Promise.resolve({ project_id: undefined }) 249 | return p 250 | } 251 | 252 | /** 253 | * Open an existing project. 254 | * 255 | * @param {String} project_id 256 | * @return {Project} 257 | * @api public 258 | */ 259 | server.open = function (project_id) { 260 | return new Project(undefined, conf.endpoint) 261 | .id(project_id) 262 | } 263 | 264 | /** 265 | * Get project metadata. 266 | * 267 | * @return {Promise} 268 | * @api public 269 | */ 270 | server.projects = function () { 271 | debug('get projects metadata') 272 | return sa.get(conf.endpoint + '/command/core/get-all-project-metadata') 273 | .use(promise()) 274 | .run() 275 | .then(res => res.body.projects) 276 | } 277 | 278 | server.delete = function (id) { 279 | debug('delete project ' + id) 280 | return sa.post(conf.endpoint + '/command/core/delete-project') 281 | .use(promise()) 282 | .send('project=' + id) 283 | .run() 284 | } 285 | 286 | return server 287 | } 288 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "openrefine", 3 | "version": "0.0.3", 4 | "description": "OpenRefine client in Node.js", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "mocha", 8 | "debug": "DEBUG=openrefine mocha --reporter spec" 9 | }, 10 | "repository": { 11 | "type": "git", 12 | "url": "git+https://github.com/pm5/node-openrefine.git" 13 | }, 14 | "keywords": [ 15 | "openrefine", 16 | "googlerefine" 17 | ], 18 | "author": "Pomin Wu (https://github.com/pm5)", 19 | "license": "MIT", 20 | "bugs": { 21 | "url": "https://github.com/pm5/node-openrefine/issues" 22 | }, 23 | "homepage": 
"https://github.com/pm5/node-openrefine#readme", 24 | "dependencies": { 25 | "csv": "^0.4.6", 26 | "debug": "^2.2.0", 27 | "superagent": "^1.4.0" 28 | }, 29 | "devDependencies": { 30 | "chai": "^3.4.1", 31 | "chai-as-promised": "^5.1.0", 32 | "mocha": "^2.3.3" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /test/mocha.opts: -------------------------------------------------------------------------------- 1 | --ui bdd 2 | --reporter dot 3 | -------------------------------------------------------------------------------- /test/op.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "op": "core/column-split", 4 | "description": "Split column Date by separator", 5 | "engineConfig": { 6 | "facets": [], 7 | "mode": "row-based" 8 | }, 9 | "columnName": "Date", 10 | "guessCellType": true, 11 | "removeOriginalColumn": true, 12 | "mode": "separator", 13 | "separator": "-", 14 | "regex": false, 15 | "maxColumns": 0 16 | } 17 | ] 18 | -------------------------------------------------------------------------------- /test/openrefine.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | var chai = require('chai') 4 | var chaiAsPromised = require('chai-as-promised') 5 | chai.use(chaiAsPromised) 6 | var expect = chai.expect 7 | var OpenRefine = require('../').OpenRefine 8 | var fs = require('fs') 9 | var csv = require('csv') 10 | 11 | describe('OpenRefine', () => { 12 | var test_project_name = 'dont_use_this_name' 13 | //var test_project_name = 'abc' 14 | 15 | describe('server', () => { 16 | describe('get projects metadata', () => { 17 | var projects_data 18 | before(() => 19 | OpenRefine() 20 | .projects() 21 | .then(data => projects_data = data) 22 | ) 23 | 24 | it('should have correct format', () => { 25 | expect(projects_data.projects).to.be.defined 26 | Object.keys(projects_data).forEach(id => { 27 | 
expect(projects_data[id].name).to.be.defined 28 | expect(projects_data[id].created).to.be.defined 29 | expect(projects_data[id].modified).to.be.defined 30 | }) 31 | }) 32 | }) 33 | 34 | describe('create projects', () => { 35 | it('should create projects by name', () => { 36 | var p = OpenRefine().create(test_project_name) 37 | expect(p.id()).to.be.undefined 38 | }) 39 | 40 | it('should create projects without a name', () => { 41 | var p = OpenRefine().create() 42 | expect(p.name()).to.exist 43 | }) 44 | }) 45 | 46 | describe('load projects', () => { 47 | var id 48 | before(() => { 49 | var project = OpenRefine() 50 | .create(test_project_name) 51 | project 52 | .load('test/test.csv') 53 | .end() 54 | .then(() => id = project.id()) 55 | return project 56 | }) 57 | 58 | it('should open projects by id', () => 59 | OpenRefine() 60 | .open(id) 61 | ) 62 | }) 63 | 64 | describe('delete projects', () => { 65 | var id 66 | before(() => 67 | OpenRefine() 68 | .create(test_project_name) 69 | .load('test/test.csv') 70 | ) 71 | after(() => OpenRefine().delete(id)) 72 | 73 | it('should delete projects', () => 74 | OpenRefine() 75 | .delete(id) 76 | .then(() => OpenRefine().projects()) 77 | .then(data => { 78 | expect(data).to.not.have.property(id) 79 | }) 80 | ) 81 | }) 82 | }) 83 | 84 | describe('project', () => { 85 | after((done) => 86 | OpenRefine() 87 | .projects() 88 | .then(projects => 89 | Object.keys(projects) 90 | .filter(id => projects[id].name === test_project_name) 91 | ) 92 | .then(ids => ids.forEach(OpenRefine().delete)) 93 | .then(done, done) 94 | ) 95 | 96 | describe('load data', () => { 97 | it('should load the data and output at end', () => 98 | expect( 99 | OpenRefine() 100 | .create(test_project_name) 101 | .load('test/test.csv') 102 | .end() 103 | ).to.eventually.be.ok 104 | ) 105 | }) 106 | 107 | describe('data format', () => { 108 | 109 | describe('in objects', () => { 110 | it('should expose data in array of objects', () => 111 | expect( 112 | 
OpenRefine() 113 | .create(test_project_name) 114 | .load('test/test.csv') 115 | .end() 116 | ).to.eventually.eql([ 117 | {Date: '2018-11-13', Number: '123'}, 118 | {Date: '2018-11-14', Number: '45671'}, 119 | {Date: '2018-11-15', Number: '991'}, 120 | {Date: '2018-11-16', Number: '3025'}, 121 | {Date: '2018-11-17', Number: '104234'}, 122 | ]) 123 | ) 124 | }) 125 | 126 | describe('in CSV', () => { 127 | it('should expose data in CSV', () => 128 | expect( 129 | OpenRefine() 130 | .create(test_project_name) 131 | .expose('csv') 132 | .load('test/test.csv') 133 | .end(data => data) 134 | ).to.eventually.match(/^Date,Number\n/) 135 | ) 136 | }) 137 | 138 | describe('in TSV', () => { 139 | it('should expose data in TSV', () => 140 | expect( 141 | OpenRefine() 142 | .create(test_project_name) 143 | .expose('tsv') 144 | .load('test/test.csv') 145 | .end() 146 | ).to.eventually.match(/^Date\tNumber\n/) 147 | ) 148 | }) 149 | }) 150 | 151 | describe('apply operations', () => { 152 | it('should apply operations to project', () => 153 | expect( 154 | OpenRefine() 155 | .create(test_project_name) 156 | .use(JSON.parse(fs.readFileSync('test/op.json'))) 157 | .load('test/test.csv') 158 | .end() 159 | ).to.eventually.eql([ 160 | {'Date 1': '2018', 'Date 2': '11', 'Date 3': '13', Number: '123'}, 161 | {'Date 1': '2018', 'Date 2': '11', 'Date 3': '14', Number: '45671'}, 162 | {'Date 1': '2018', 'Date 2': '11', 'Date 3': '15', Number: '991'}, 163 | {'Date 1': '2018', 'Date 2': '11', 'Date 3': '16', Number: '3025'}, 164 | {'Date 1': '2018', 'Date 2': '11', 'Date 3': '17', Number: '104234'}, 165 | ]) 166 | ) 167 | }) 168 | 169 | describe('destroy project', () => { 170 | var project 171 | before(done => { 172 | project = OpenRefine() 173 | .create(test_project_name) 174 | project 175 | .load('test/test.csv') 176 | .end() 177 | .then(() => done()) 178 | }) 179 | before(done => { 180 | project.destroy() 181 | .then(() => done()) 182 | }) 183 | 184 | it('should delete the project', () => 
{ 185 | return expect(OpenRefine() 186 | .projects() 187 | ).to.eventually.not.include.keys(project.id()) 188 | }) 189 | }) 190 | 191 | }) 192 | }) 193 | -------------------------------------------------------------------------------- /test/test.csv: -------------------------------------------------------------------------------- 1 | "Date","Number" 2 | "2018-11-13","123" 3 | "2018-11-14","45671" 4 | "2018-11-15","991" 5 | "2018-11-16","3025" 6 | "2018-11-17","104234" 7 | --------------------------------------------------------------------------------