├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── cli.js ├── cursor.js ├── index.js ├── package.json ├── preprocess ├── preprocess.js ├── tagger │ ├── lexicon_from_posjs.json │ ├── tr_from_brill_paper.txt │ └── tr_from_posjs.txt └── vector.js ├── test ├── add.js ├── create.js ├── del.js ├── preprocess.js ├── reuters-000.json └── search.js └── yuno.jpg /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/linux,osx,node 3 | 4 | ### Linux ### 5 | *~ 6 | 7 | # temporary files which can be created if a process still has a handle open of a deleted file 8 | .fuse_hidden* 9 | 10 | # KDE directory preferences 11 | .directory 12 | 13 | # Linux trash folder which might appear on any partition or disk 14 | .Trash-* 15 | 16 | 17 | ### OSX ### 18 | .DS_Store 19 | .AppleDouble 20 | .LSOverride 21 | 22 | # Icon must end with two \r 23 | Icon 24 | 25 | # Thumbnails 26 | ._* 27 | 28 | # Files that might appear in the root of a volume 29 | .DocumentRevisions-V100 30 | .fseventsd 31 | .Spotlight-V100 32 | .TemporaryItems 33 | .Trashes 34 | .VolumeIcon.icns 35 | 36 | # Directories potentially created on remote AFP share 37 | .AppleDB 38 | .AppleDesktop 39 | Network Trash Folder 40 | Temporary Items 41 | .apdisk 42 | 43 | 44 | ### Node ### 45 | # Logs 46 | logs 47 | *.log 48 | npm-debug.log* 49 | 50 | # Runtime data 51 | pids 52 | *.pid 53 | *.seed 54 | 55 | # Directory for instrumented libs generated by jscoverage/JSCover 56 | lib-cov 57 | 58 | # Coverage directory used by tools like istanbul 59 | coverage 60 | 61 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 62 | .grunt 63 | 64 | # node-waf configuration 65 | .lock-wscript 66 | 67 | # Compiled binary addons (http://nodejs.org/api/addons.html) 68 | build/Release 69 | 70 | # Dependency directories 71 | node_modules 72 | jspm_packages 73 | 74 | # Optional npm cache directory 75 | .npm 76 | 77 | # Optional REPL history 78 | .node_repl_history 79 | 80 | # local development examples 81 | examples 82 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - 'stable' 4 | - '6' 5 | - '4' 6 | - '5' 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The person who associated a work with this deed has dedicated the work to the public domain by waiving all of his or her rights to the work worldwide under copyright law, including all related and neighboring rights, to the extent allowed by law. 2 | 3 | You can copy, modify, distribute and perform the work, even for commercial purposes, all without asking permission. 4 | 5 | See https://creativecommons.org/publicdomain/zero/1.0/ for more info. 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## yunodb 2 | 3 | A portable, persistent, electron compatible fulltext search + document store database for node.js. LevelDB underneath. 4 | 5 | [![js-standard-style](https://img.shields.io/badge/code%20style-standard%20js-green.svg?style=flat-square)](https://github.com/feross/standard) [![Travis](https://img.shields.io/travis/blahah/yunodb.svg?style=flat-square)](https://travis-ci.org/blahah/yunodb) [![npm](https://img.shields.io/npm/v/yunodb.svg?style=flat-square)](https://www.npmjs.com/package/yunodb) [![cc-zero](https://img.shields.io/badge/license-CC0%20public%20domain-ff69b4.svg?style=flat-square)](https://github.com/blahah/yunodb#license---cc0) 6 | 7 | - [How it works](https://github.com/blahah/yunodb#how-it-works) 8 | - [Install](https://github.com/blahah/yunodb#install) 9 | - [Use](https://github.com/blahah/yunodb#use) 10 | - [Create / load a database](https://github.com/blahah/yunodb#create--load-a-database) 11 | - [Index mapping](https://github.com/blahah/yunodb#index-mapping) 12 | - [Add documents](https://github.com/blahah/yunodb#add-documents) 13 | - [Delete documents](https://github.com/blahah/yunodb#delete-documents) 14 | - [Search](https://github.com/blahah/yunodb#search) 15 | - [CLI](https://github.com/blahah/yunodb#cli) 16 | - [Contributing](https://github.com/blahah/yunodb#contributing) 17 | - [License - CC0](https://github.com/blahah/yunodb#license---cc0) 18 | 19 | ## How it works 20 | 21 | yuno is a JSON document store with fulltext search. It's meant for embedding in electron apps, focuses solely on text search, and in most cases should handle millions of documents easily. 22 | 23 | yuno is pretty basic - it has three components: 24 | - The document store, which is just the raw JSON objects stored in [leveldb](https://github.com/Level/levelup) 25 | - The inverted search index, powered by [search-index](https://github.com/fergiemcdowall/search-index) 26 | - A customisable [natural](https://github.com/NaturalNode/natural) language processing pipeline that is applied to documents before adding them to the index, greatly improving speed and memory usage compared to the vanilla search-index. 27 | 28 | **None of this is revolutionary** - actually it's standard in fulltext-search database engines. And all the pieces exist already in the node ecosystem. But I couldn't find a node fulltext search and document store that could handle millions of documents, persisted on disk, didn't have crazy memory requirements and could be easily bundled into an electron app. 29 | 30 | Like, db, **y** **u** **no** exist already?? 31 | 32 | ![yuno.jpg](yuno.jpg) 33 | 34 | ## Install 35 | 36 | ``` 37 | npm install --save yunodb 38 | ``` 39 | 40 | ## Use 41 | 42 | ### Create / load a database 43 | 44 | **`yuno(options, callback)`** 45 | 46 | e.g. 47 | 48 | ``` 49 | var yuno = require('yunodb') 50 | 51 | var dbopts = { 52 | location: './.yuno', 53 | keyField: 'id', 54 | indexMap: ['text'] 55 | } 56 | var db = yuno(dbopts, (err, dbhandle) => { 57 | if (err) throw err 58 | 59 | // do stuff with the db 60 | db = dbhandle 61 | }) 62 | ``` 63 | 64 | `opts` configures the two persistent datastores. Possible key-value pairs are: 65 | 66 | - **location** (String, required) - Base directory in which both datastores will be kept. 67 | - **keyField** (String, required) - [JSONpath](https://github.com/s3u/JSONPath#syntax-through-examples) specifying the field in each document to be used as a key in the document store. 68 | - **indexMap** (Array | Object, required) - [JSONpaths](https://github.com/s3u/JSONPath#syntax-through-examples) specifying the fields in each document to index for fulltext searching. See [index mapping](#index-mapping) below for details. 69 | - **deletable** (Boolean, optional) - Whether documents should be deletable. Setting to true increases index size. Default: false. 70 | - **ngramLength** (Integer | Array, optional) - ngram length(s) to use when building index. 71 | 72 | #### Index mapping 73 | 74 | It is quite rare that all fields in a database should be exposed to the user search. More often, we want to allow the user to search certain fields, but retrieve the full document for each result. The `indexMap` option allows you to specify how to index documents. 75 | 76 | There are two ways to tell `yuno` how to index: 77 | 78 | ##### 1. Pass an Array of fields 79 | 80 | The simple option - an array of fields to index. The contents of each field will be passed through the default Natural Language Processing pipeline before being added to the search index. 81 | 82 | ##### 2. Pass an Object mapping fields to processors 83 | 84 | To fine-tune the processing on a per-field basis, pass an Object where each key is a field to index. Values can be one of: 85 | 86 | - `true`/`false` whether to apply the default NLP pipeline 87 | - `function` a custom processing function. 88 | 89 | Custom processing take the field value as a single argument, and their return value (either a string or an array) will be tokenised and added to the index. 90 | 91 | ### Add documents 92 | 93 | **`db.add(documents, options, callback)`** 94 | 95 | - `documents`, array of JSON-able objects to store 96 | - `options` optional, can override the database-wide `indexMap` option 97 | - `callback`, function to call on completion, with a single argument to be passed an error if there was one 98 | 99 | e.g. 100 | 101 | ```js 102 | var docs = [ 103 | { id: 1, text: 'hello '}, 104 | { id: 2, text: 'goodbye '}, 105 | { id: 3, text: 'tortoise '} 106 | ] 107 | 108 | function done (err) { 109 | if (err) throw err 110 | console.log('successfully added', docs.length, 'documents') 111 | } 112 | 113 | db.add(docs, done) 114 | ``` 115 | 116 | or using a custom `indexMap`: 117 | 118 | ```js 119 | // trim whitespace 120 | function trim (str) { return str.trim() } 121 | 122 | db.add(docs, { text: trim }, doneAdding) 123 | ``` 124 | 125 | ### Delete documents 126 | 127 | **`db.del(documents, callback)`** 128 | 129 | - `documents`, document (object), id (string), or array of documents or ids 130 | - `callback`, function to call on completion, with a single argument to be passed an error if there was one 131 | 132 | e.g. 133 | 134 | ```js 135 | // document 136 | db.del({ id: '1234', otherkey: 'something else' }, done) 137 | 138 | // with id 139 | db.del('1234', done) 140 | 141 | // array 142 | db.del(['1234', '1235', '1236'], done) 143 | ``` 144 | 145 | ### Search 146 | 147 | **`db.search(query, opts, callback)`** 148 | 149 | Returns a cursor that can be used to page through the results. By default the `pageSize` is 50. 150 | 151 | - `query`, string search query 152 | - `opts`, (optional) options object 153 | - `callback`, function to call on completion. Takes two arguments: 154 | - `err` error or `null` 155 | - `results` object containing the result metadata and hits 156 | 157 | e.g. 158 | 159 | ```js 160 | var cursor = db.search('tortoise', function(err, results) { 161 | if (err) throw err 162 | 163 | // first 50 results 164 | console.log(results) 165 | 166 | cursor.next(function(err, results) { 167 | // next page in here 168 | }) 169 | }) 170 | ``` 171 | 172 | ### CLI 173 | 174 | yuno has a minimal command-line interface that can be used to create a database from a file containing JSON objects. 175 | 176 | Install the CLI: 177 | 178 | ```bash 179 | npm install --global yuno 180 | ``` 181 | 182 | Create a new database: 183 | 184 | ```bash 185 | yuno create 186 | ``` 187 | 188 | The JSON data file must contain JSON objects, rather than an array. For example: 189 | 190 | ```json 191 | { "id": "1234", "title": "the coleopterist's handbook" } 192 | { "id": "4321", "title": "bark and ambrosia beetles of south america" } 193 | ``` 194 | 195 | You can provide database options as a JSON file using the `--opts` argument: 196 | 197 | ```bash 198 | yuno create --opts 199 | ``` 200 | 201 | Where the options JSON looks like: 202 | 203 | ```json 204 | { 205 | "keyField": "id", 206 | "indexMap": { 207 | "title": true, 208 | } 209 | } 210 | ``` 211 | 212 | ## Contributing 213 | 214 | yuno is being built to serve my use-case of embedding pre-made databases in electron apps. If you have another use-case and would like features added, please open an issue to discuss it - I'm happy to add things that will be widely useful. 215 | 216 | Contributions are very welcome. **Please** open an issue to discuss any changes you would like to PR, or mention in an existing issue that you plan to work on it. 217 | 218 | Ideas for improving performance are particularly welcome. 219 | 220 | ## License - CC0 221 | 222 | https://creativecommons.org/publicdomain/zero/1.0/ 223 | 224 | yuno is public domain code. Do whatever you want with it. Credit would be appreciated, but it's not required. 225 | -------------------------------------------------------------------------------- /cli.js: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env node 2 | 3 | var program = require('commander') 4 | var yuno = require('.') 5 | var exists = require('path-exists').sync 6 | var JSONStream = require('JSONStream') 7 | var fs = require('fs') 8 | var _path = require('path') 9 | 10 | program 11 | .version(require('./package.json').version) 12 | 13 | program 14 | .command('create ') 15 | .description('create a database from JSON objects') 16 | .option('-o, --opts ', 'JSON file containing database options') 17 | .action(function (path, input, options) { 18 | checkFile(input, 'input JSON') 19 | var opts = {} 20 | if (options.opts) { 21 | checkFile(input, 'options JSON') 22 | opts = require(_path.resolve('.', options.opts)) 23 | } 24 | 25 | console.log('creating database at', path, 'from file', input) 26 | opts.location = path 27 | 28 | function load (err, db) { 29 | if (err) throw err 30 | populate(db, input) 31 | } 32 | 33 | yuno(opts, load) 34 | }) 35 | 36 | program 37 | .parse(process.argv) 38 | 39 | function populate (db, file) { 40 | var json = JSONStream.parse() 41 | var read = fs.createReadStream(file) 42 | 43 | var n = 0 44 | var chunk = [] 45 | 46 | json.on('data', function (entry) { 47 | chunk.push(entry) 48 | 49 | if (chunk.length === 10000) { 50 | var thischunk = chunk 51 | 52 | db.add(thischunk, {}, function (err) { 53 | if (err) return console.log(err) 54 | n += 10000 55 | console.log('written:', n) 56 | }) 57 | 58 | chunk = [] 59 | } 60 | }) 61 | 62 | json.on('end', function () { 63 | if (chunk.length === 0) return 64 | 65 | db.add(chunk, {}, function (err) { 66 | if (err) throw console.log(err) 67 | db.index.tellMeAboutMySearchIndex(function (err, info) { 68 | if (err) throw err 69 | console.log('done! added', info.totalDocs, 'docs to index') 70 | }) 71 | }) 72 | }) 73 | 74 | read.pipe(json) 75 | } 76 | 77 | function checkFile (file, name) { 78 | if (!file) { 79 | console.log('ERROR: you must provide an', name) 80 | process.exit(1) 81 | } else if (/^input/.test(name) && !exists(file)) { 82 | console.log('ERROR:', name, "file doesn't exist at path", file) 83 | process.exit(1) 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /cursor.js: -------------------------------------------------------------------------------- 1 | var _ = require('lodash') 2 | 3 | function Cursor (query, db, opts) { 4 | if (!(this instanceof Cursor)) return new Cursor(query, db, opts) 5 | 6 | var defaults = { 7 | pageSize: 50 8 | } 9 | if (!opts) opts = defaults 10 | 11 | this.pageSize = opts.pageSize || defaults.pageSize 12 | this.lastOffset = null 13 | this.query = { AND: { '*': db.preprocessor.naturalize(query) } } 14 | this.db = db 15 | } 16 | 17 | Cursor.prototype.first = function (cb) { 18 | return this.queryWithOffset(0, cb) 19 | } 20 | 21 | Cursor.prototype.next = function (cb) { 22 | var offset = (this.lastOffset === null) ? 0 : this.lastOffset + this.pageSize 23 | return this.queryWithOffset(offset, cb) 24 | } 25 | 26 | Cursor.prototype.prev = function (cb) { 27 | var offset = (this.lastOffset === null) ? 0 : this.lastOffset - this.pageSize 28 | return this.queryWithOffset(offset, cb) 29 | } 30 | 31 | Cursor.prototype.last = function (cb) { 32 | if (this.totalHits) { 33 | var penultimatePage = Math.floor(this.totalHits / this.pageSize) 34 | var lastPageOffset = penultimatePage * this.pageSize 35 | return this.queryWithOffset(lastPageOffset, cb) 36 | } 37 | 38 | cb(new Error('cannot get last page until initial query has run (try cursor.first() first)')) 39 | } 40 | 41 | Cursor.prototype.queryWithOffset = function (offset, cb) { 42 | this.lastOffset = offset 43 | 44 | var self = this 45 | var q = { 46 | query: this.query, 47 | offset: offset, 48 | pageSize: this.pageSize 49 | } 50 | this.db.index.search(q, (err, results) => { 51 | if (err) return cb(err) 52 | self.totalHits = results.totalHits 53 | results.offset = offset 54 | self.fullResults(results, cb) 55 | }) 56 | } 57 | 58 | Cursor.prototype.fullResults = function (results, cb) { 59 | var self = this 60 | 61 | var done = _.after(results.hits.length, function () { 62 | cb(null, results) 63 | }) 64 | 65 | results.hits.map((hit, i) => { 66 | self.db.docstore.get(hit.id, (err, document) => { 67 | if (err) cb(err) 68 | results.hits[i].document = document 69 | done(null) 70 | }) 71 | }) 72 | } 73 | 74 | module.exports = Cursor 75 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | var path = require('path') 2 | 3 | var levelup = require('levelup') 4 | var searchIndex = require('search-index') 5 | var _ = require('lodash') 6 | var mkdirp = require('mkdirp') 7 | var jsonpath = require('jsonpath-plus') 8 | 9 | var preprocess = require('./preprocess/preprocess.js') 10 | var Cursor = require('./cursor.js') 11 | 12 | function Yuno (opts, cb) { 13 | if (!(this instanceof Yuno)) return new Yuno(opts, cb) 14 | 15 | requiredOpts(opts, ['keyField', 'indexMap'], cb) 16 | 17 | var self = this 18 | 19 | function ready () { 20 | if (cb) cb(null, self) 21 | // TODO: events, self.emit('ready') 22 | } 23 | 24 | var docstoreOpts = opts.docstore || { 25 | keyEncoding: 'string', 26 | valueEncoding: 'json' 27 | } 28 | 29 | mkdirp.sync(opts.location) 30 | 31 | this.docstorePath = path.join(opts.location, 'docstore') 32 | this.docstore = levelup(this.docstorePath, docstoreOpts) 33 | 34 | this.indexPath = path.join(opts.location, 'index') 35 | 36 | var indexOpts = _.defaults(opts, { 37 | indexPath: this.indexPath, 38 | deletable: false, 39 | fieldedSearch: false, 40 | fieldsToStore: ['tokens'], 41 | nGramLength: 1 42 | }) 43 | 44 | searchIndex(indexOpts, (err, si) => { 45 | if (err) return cb(err) 46 | self.index = si 47 | ready() 48 | }) 49 | 50 | this.preprocessor = preprocess(opts) 51 | this.keyField = opts.keyField || 'id' 52 | } 53 | 54 | Yuno.prototype.getKey = function (doc) { 55 | return jsonpath({ json: doc, path: this.keyField })[0] 56 | } 57 | 58 | Yuno.prototype.putOp = function (doc) { 59 | return { type: 'put', key: this.getKey(doc), value: doc } 60 | } 61 | 62 | // docs: array of documents to add 63 | // opts: options for adding 64 | Yuno.prototype.add = function (docs, opts, cb) { 65 | var self = this 66 | if (_.isFunction(opts)) cb = opts 67 | if (_.isPlainObject(docs)) docs = [docs] 68 | 69 | var errs = [] 70 | var docb = _.after(2, function () { 71 | cb(errs.length > 0 ? errs[0] : null, docs.length) 72 | }) 73 | var done = function (err) { 74 | if (err) errs.push(err) 75 | docb() 76 | } 77 | 78 | this.docstore.batch(docs.map((d) => { 79 | return { type: 'put', key: '' + self.getKey(d), value: JSON.stringify(d) } 80 | }), done) 81 | 82 | this.index.add(docs.map((d) => { 83 | return { id: self.getKey(d), tokens: self.preprocessor.process(d) } 84 | }), done) 85 | // process the docs for search indexing 86 | } 87 | 88 | Yuno.prototype.get = function (key, cb) { 89 | this.docstore.get(key, cb) 90 | } 91 | 92 | Yuno.prototype.search = function (query, opts, cb) { 93 | if (_.isFunction(opts)) { 94 | cb = opts 95 | opts = null 96 | } 97 | var cursor = Cursor(query, this, opts) 98 | cursor.first(cb) 99 | return cursor 100 | } 101 | 102 | Yuno.prototype.del = function (keys, cb) { 103 | var self = this 104 | 105 | if (!(_.isArray(keys))) keys = [keys] 106 | if (_.isPlainObject(keys[0])) keys = keys.map((doc) => { self.getKey(doc) }) 107 | 108 | var errs = [] 109 | var done = _.after(2, function () { 110 | cb(errs.length > 0 ? errs[0] : null) 111 | }) 112 | 113 | this.docstore.batch(keys.map((key) => { 114 | return { type: 'del', key: key } 115 | }), done) 116 | 117 | this.index.del(keys.map((key) => { 118 | return { id: key } 119 | }), done) 120 | } 121 | 122 | Yuno.prototype.close = function (cb) { 123 | var errs = [] 124 | var done = _.after(2, function () { 125 | cb(errs.length > 0 ? errs[0] : null) 126 | }) 127 | 128 | this.docstore.close(done) 129 | this.index.close(done) 130 | } 131 | 132 | function requiredOpts (opts, keys, cb) { 133 | keys.forEach((key) => { 134 | if (!opts[key]) { 135 | cb(new Error(key + ' option is required')) 136 | } 137 | }) 138 | } 139 | 140 | module.exports = Yuno 141 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "yunodb", 3 | "version": "0.1.4", 4 | "description": "portable, persistent, electron compatible fulltext search database", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "standard && tape test/*.js" 8 | }, 9 | "bin": { 10 | "yuno": "./cli.js" 11 | }, 12 | "repository": { 13 | "type": "git", 14 | "url": "git+https://github.com/blahah/yunodb.git" 15 | }, 16 | "keywords": [ 17 | "fulltext", 18 | "search", 19 | "database", 20 | "json", 21 | "object", 22 | "store", 23 | "leveldb", 24 | "search-index", 25 | "electron" 26 | ], 27 | "author": "Richard Smith-Unna (@blahah)", 28 | "license": "CC0-1.0", 29 | "bugs": { 30 | "url": "https://github.com/blahah/yunodb/issues" 31 | }, 32 | "homepage": "https://github.com/blahah/yunodb#readme", 33 | "dependencies": { 34 | "JSONStream": "^1.1.1", 35 | "commander": "^2.9.0", 36 | "jsonpath-plus": "^0.15.0", 37 | "leveldown": "^1.4.6", 38 | "levelup": "^1.3.1", 39 | "lodash": "^4.11.1", 40 | "mkdirp": "^0.5.1", 41 | "natural": "github:blahah/natural", 42 | "path-exists": "^2.1.0", 43 | "search-index": "^0.8.12" 44 | }, 45 | "devDependencies": { 46 | "rimraf": "^2.5.2", 47 | "standard": "^6.0.8", 48 | "tape": "^4.5.1", 49 | "temporary": "0.0.8" 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /preprocess/preprocess.js: -------------------------------------------------------------------------------- 1 | var vector = require('./vector.js') 2 | var _ = require('lodash') 3 | var jsonpath = require('jsonpath-plus') 4 | 5 | function Preprocessor (opts) { 6 | if (!(this instanceof Preprocessor)) return new Preprocessor(opts) 7 | 8 | if (!opts.indexMap) throw new Error('preprocessor requires an indexMap option') 9 | 10 | this.opts = opts 11 | this.createPipeline(opts) 12 | this.cachePaths(opts) 13 | } 14 | 15 | Preprocessor.prototype.naturalize = function (str) { 16 | return vector(str) 17 | .trim() 18 | .tag() 19 | .filterPOS() 20 | .stripTags() 21 | .lowercase() 22 | .stripPunctuation() 23 | .filterNonWords() 24 | .stem().terms 25 | } 26 | 27 | Preprocessor.prototype.createPipeline = function (opts) { 28 | var indexMap = opts.indexMap 29 | if (indexMap instanceof Array) { 30 | indexMap = _.zipObject(indexMap, [true]) 31 | } 32 | 33 | var self = this 34 | this.pipeline = _.transform(indexMap, function (pipeline, action, field) { 35 | var op = _.identity 36 | if (_.isFunction(action)) { 37 | op = _.bind(action, self) 38 | } else if (action) { 39 | op = self.naturalize 40 | } 41 | pipeline[field] = op 42 | }, {}) 43 | } 44 | 45 | Preprocessor.prototype.cachePaths = function (opts) { 46 | var map = opts.indexMap 47 | this.paths = _.isArray(map) ? map : Object.keys(map) 48 | } 49 | 50 | Preprocessor.prototype.pick = function (object) { 51 | return _.zipObject(this.paths, this.paths.map(function (path) { 52 | return jsonpath({ json: object, path: path }).join(' ') 53 | })) 54 | } 55 | 56 | Preprocessor.prototype.process = function (object) { 57 | var self = this 58 | var picked = this.pick(object) 59 | var parts = _.map(picked, (value, key, o) => { 60 | var step = self.pipeline[key] 61 | return step ? step(value) : value 62 | }) 63 | return _.flatten(parts) 64 | } 65 | 66 | module.exports = Preprocessor 67 | -------------------------------------------------------------------------------- /preprocess/tagger/tr_from_brill_paper.txt: -------------------------------------------------------------------------------- 1 | TO IN NEXT-TAG AT 2 | VBN VBD PREV-WORD-IS-CAP YES 3 | VBD VBN PREV-1-OR-2-OR-3-TAG HVD 4 | VB NN PREV-1-OR-2-TAG AT 5 | NN VB PREV-TAG TO 6 | TO IN NEXT-WORD-IS-CAP YES 7 | NN VB PREV-TAG MD 8 | PPS PPO NEXT-TAG . 9 | VBN VBD PREV-TAG PPS 10 | NP NN CURRENT-WORD-IS-CAP NO -------------------------------------------------------------------------------- /preprocess/tagger/tr_from_posjs.txt: -------------------------------------------------------------------------------- 1 | VBD NN PREV-TAG DT 2 | VBP NN PREV-TAG DT 3 | VB NN PREV-TAG DT 4 | NN CD CURRENT-WORD-IS-NUMBER YES 5 | NNP CD CURRENT-WORD-IS-NUMBER YES 6 | NNS CD CURRENT-WORD-IS-NUMBER YES 7 | NNPS CD CURRENT-WORD-IS-NUMBER YES 8 | NN URL CURRENT-WORD-IS-URL YES 9 | NNS URL CURRENT-WORD-IS-URL YES 10 | NNP URL CURRENT-WORD-IS-URL YES 11 | NNPS URL CURRENT-WORD-IS-URL YES 12 | NN VBN CURRENT-WORD-ENDS-WITH ed 13 | * RB CURRENT-WORD-ENDS-WITH ly 14 | NN JJ CURRENT-WORD-ENDS-WITH al 15 | NNS JJ CURRENT-WORD-ENDS-WITH al 16 | NN VB PREV-WORD-IS would 17 | NN NNS CURRENT-WORD-ENDS-WITH s 18 | NN VBG CURRENT-WORD-ENDS-WITH ing -------------------------------------------------------------------------------- /preprocess/vector.js: -------------------------------------------------------------------------------- 1 | var natural = require('natural') 2 | var tokenize = (new natural.TreebankWordTokenizer()).tokenize 3 | var stem = natural.PorterStemmer.stem 4 | var inherits = require('util').inherits 5 | var path = require('path') 6 | var rules = path.join(__dirname, './tagger/tr_from_posjs.txt') 7 | var lexicon = path.join(__dirname, './tagger/lexicon_from_posjs.json') 8 | var tagger = new natural.BrillPOSTagger(lexicon, rules, 'N') 9 | 10 | inherits(Vector, Array) 11 | 12 | function Vector (terms) { 13 | if (!(this instanceof Vector)) return new Vector(terms) 14 | if (terms instanceof Vector) terms = terms.terms 15 | 16 | if (typeof terms === 'string') { 17 | this.eatString(terms) 18 | } else { 19 | this.terms = terms 20 | } 21 | } 22 | 23 | function stripPunctuation (term) { 24 | return term.replace(/\W+/g, '') 25 | } 26 | 27 | function stripTag (pair) { 28 | return pair[0] 29 | } 30 | 31 | function isWord (term) { 32 | return term.replace(/[0-9]+/g, '').length > 0 33 | } 34 | 35 | Vector.prototype.eatString = function (string) { 36 | this.terms = tokenize(string.replace('/', ' ')) 37 | } 38 | 39 | Vector.prototype.lowercase = function () { 40 | return Vector(this.terms.map((s) => { return s.toLowerCase() })) 41 | } 42 | 43 | Vector.prototype.trim = function () { 44 | return Vector(this.terms.map((s) => { return s.trim() })) 45 | } 46 | 47 | Vector.prototype.tag = function () { 48 | this.tags = tagger.tag(this.terms) 49 | return this 50 | } 51 | 52 | Vector.prototype.filterPOS = function () { 53 | var filtered = this.tags.filter((part) => { 54 | // see 55 | // https://en.wikipedia.org/wiki/Brown_Corpus#Part-of-speech_tags_used 56 | var tag = part[1] 57 | if (!tag) return false 58 | 59 | // keep 60 | var first = tag[0] 61 | if (first === 'N') return true // nouns 62 | if (first === 'V') return true // verbs 63 | if (first === 'J') return true // adjectives 64 | if (first === 'R') return true // adverbs 65 | 66 | // discard 67 | return false 68 | }) 69 | return Vector(filtered) 70 | } 71 | 72 | Vector.prototype.stripTags = function () { 73 | return Vector(this.terms.map(stripTag)) 74 | } 75 | 76 | Vector.prototype.stripPunctuation = function () { 77 | return Vector(this.terms.map(stripPunctuation)) 78 | } 79 | 80 | Vector.prototype.filterNonWords = function () { 81 | return Vector(this.terms.filter(isWord)) 82 | } 83 | 84 | Vector.prototype.stem = function () { 85 | return Vector(this.terms.map(stem)) 86 | } 87 | 88 | module.exports = Vector 89 | -------------------------------------------------------------------------------- /test/add.js: -------------------------------------------------------------------------------- 1 | var yuno = require('../') 2 | var test = require('tape') 3 | var tmp = require('temporary') 4 | var rimraf = require('rimraf') 5 | 6 | var path = require('path') 7 | 8 | test('add', function (t) { 9 | var tmpdir = new tmp.Dir() 10 | var dbpath = path.join(tmpdir.path, 'yuno') 11 | 12 | var opts = { 13 | location: dbpath, 14 | keyField: 'id', 15 | indexMap: ['word'] 16 | } 17 | 18 | var doc = { id: '1234', word: 'sesquipedalianism is for deipnosophists' } 19 | 20 | yuno(opts, (err, db) => { 21 | t.error(err, 'no error creating db') 22 | 23 | db.add(doc, function (err) { 24 | t.error(err, 'no error adding document') 25 | 26 | db.docstore.get(doc.id, function (err, value) { 27 | t.error(err, 'no error retrieving doc from docstore') 28 | t.equals(JSON.stringify(doc), value, 'doc is exactly as inserted') 29 | rimraf(dbpath, {}, t.end) 30 | }) 31 | }) 32 | }) 33 | }) 34 | -------------------------------------------------------------------------------- /test/create.js: -------------------------------------------------------------------------------- 1 | var yuno = require('../') 2 | var test = require('tape') 3 | var tmp = require('temporary') 4 | var rimraf = require('rimraf') 5 | 6 | var path = require('path') 7 | var fs = require('fs') 8 | 9 | test('create', function (t) { 10 | t.plan(2) 11 | 12 | var tmpdir = new tmp.Dir() 13 | var dbpath = path.join(tmpdir.path, 'yuno') 14 | 15 | var opts = { 16 | location: dbpath, 17 | keyField: 'id', 18 | indexMap: ['word'] 19 | } 20 | 21 | function cb (err) { 22 | t.error(err, 'no error on create db') 23 | 24 | var stats = fs.lstatSync(dbpath) 25 | t.ok(stats.isDirectory(), 'db directory exists') 26 | 27 | rimraf(dbpath, {}, t.end) 28 | } 29 | 30 | yuno(opts, cb) 31 | }) 32 | -------------------------------------------------------------------------------- /test/del.js: -------------------------------------------------------------------------------- 1 | var yuno = require('../') 2 | var test = require('tape') 3 | var tmp = require('temporary') 4 | 5 | var path = require('path') 6 | 7 | test('add', function (t) { 8 | var tmpdir = new tmp.Dir() 9 | var dbpath = path.join(tmpdir.path, 'yuno') 10 | 11 | var opts = { 12 | location: dbpath, 13 | keyField: 'id', 14 | indexMap: ['word'] 15 | } 16 | 17 | var doc = { id: '1234', word: 'sesquipedalianism is for deipnosophists' } 18 | 19 | yuno(opts, (err, db) => { 20 | t.error(err, 'no error creating db') 21 | 22 | db.add(doc, function (err) { 23 | t.error(err, 'no error adding document') 24 | 25 | db.del(doc.id, function (err) { 26 | t.error(err, 'no error deleting doc') 27 | 28 | db.get(doc.id, function (err) { 29 | t.ok(err, 'document no longer exists in index') 30 | t.end() 31 | }) 32 | }) 33 | }) 34 | }) 35 | }) 36 | -------------------------------------------------------------------------------- /test/preprocess.js: -------------------------------------------------------------------------------- 1 | var preprocessor = require('../preprocess/preprocess.js') 2 | var test = require('tape') 3 | 4 | test('preprocess', function (t) { 5 | var opts = { 6 | indexMap: ['vec'] 7 | } 8 | 9 | var p = preprocessor(opts) 10 | 11 | var obj = { 12 | vec: 'the big green manitee jumped over the laziest lungfish' 13 | } 14 | var expected = [ 15 | 'big', 'green', 'manite', 'jump', 'laziest', 'lungfish' 16 | ] 17 | 18 | t.deepEqual(p.naturalize(obj.vec), expected, 'naturalize a string') 19 | 20 | t.deepEqual(p.process(obj), expected, 'process an object (indexMap is array)') 21 | 22 | opts.indexMap = { vec: (x) => 'a' } 23 | p = preprocessor(opts) 24 | t.deepEqual(p.process(obj), ['a'], 'process an object (indexMap is object)') 25 | 26 | obj.first = { 27 | second: ['intact', 'truncated'] 28 | } 29 | 30 | // test json paths in index map 31 | opts.indexMap = ['first.second[1]'] 32 | p = preprocessor(opts) 33 | t.deepEqual(p.process(obj), ['truncat']) 34 | 35 | t.end() 36 | }) 37 | -------------------------------------------------------------------------------- /test/search.js: -------------------------------------------------------------------------------- 1 | var yuno = require('../') 2 | var test = require('tape') 3 | var tmp = require('temporary') 4 | var rimraf = require('rimraf') 5 | 6 | var path = require('path') 7 | 8 | var reuters = require('./reuters-000.json') 9 | 10 | test('basic search', function (t) { 11 | t.timeoutAfter(1000) 12 | 13 | var tmpdir = new tmp.Dir() 14 | var dbpath = path.join(tmpdir.path, 'yuno') 15 | 16 | var opts = { 17 | location: dbpath, 18 | keyField: 'id', 19 | indexMap: ['word'] 20 | } 21 | 22 | var docs = [ 23 | { id: '1234', word: 'sesquipedalianism is for deipnosophists' }, 24 | { id: '4321', word: 'deipnosophists are annoying' } 25 | ] 26 | 27 | yuno(opts, (err, db) => { 28 | t.error(err, 'no error creating db') 29 | 30 | db.add(docs, function (err) { 31 | t.error(err, 'no error adding document') 32 | 33 | db.search('deipnosophists', function (err, result) { 34 | t.error(err, 'no error searching single') 35 | t.equals(result.totalHits, 2, 'correct number of hits A') 36 | t.equals(result.hits[0].document, JSON.stringify(docs[1]), 'doc 1 is exactly as inserted') 37 | 38 | rimraf(dbpath, {}, t.end) 39 | }) 40 | }) 41 | }) 42 | }) 43 | 44 | test('paging search', function (t) { 45 | t.timeoutAfter(10000) 46 | 47 | var tmpdir = new tmp.Dir() 48 | var dbpath = path.join(tmpdir.path, 'yuno') 49 | 50 | var opts = { 51 | location: dbpath, 52 | keyField: 'id', 53 | indexMap: { 54 | title: true, 55 | body: true, 56 | topics: true, 57 | places: true, 58 | date: false 59 | } 60 | } 61 | 62 | yuno(opts, (err, db) => { 63 | t.error(err, 'no error creating db') 64 | 65 | db.add(reuters, function (err) { 66 | t.error(err, 'no error adding document') 67 | 68 | var cursor = db.search('new york', function (err, result) { 69 | t.error(err, 'no error searching single') 70 | 71 | t.equals(result.totalHits, 55, 'correct number of hits B') 72 | t.equals(result.hits.length, 50, 'correct first page size') 73 | 74 | cursor.next((err, result) => { 75 | t.error(err, 'no error paging') 76 | 77 | t.equals(result.hits.length, 5, 'correct second page size') 78 | rimraf(dbpath, {}, t.end) 79 | }) 80 | }) 81 | }) 82 | }) 83 | }) 84 | -------------------------------------------------------------------------------- /yuno.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blahah/yunodb/290a4ffec2094cf97bce260a88df1281931f4779/yuno.jpg --------------------------------------------------------------------------------