├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── cli.js
├── cursor.js
├── index.js
├── package.json
├── preprocess
    ├── preprocess.js
    ├── tagger
    │   ├── lexicon_from_posjs.json
    │   ├── tr_from_brill_paper.txt
    │   └── tr_from_posjs.txt
    └── vector.js
├── test
    ├── add.js
    ├── create.js
    ├── del.js
    ├── preprocess.js
    ├── reuters-000.json
    └── search.js
└── yuno.jpg


/.gitignore:
--------------------------------------------------------------------------------
 1 | 
 2 | # Created by https://www.gitignore.io/api/linux,osx,node
 3 | 
 4 | ### Linux ###
 5 | *~
 6 | 
 7 | # temporary files which can be created if a process still has a handle open of a deleted file
 8 | .fuse_hidden*
 9 | 
10 | # KDE directory preferences
11 | .directory
12 | 
13 | # Linux trash folder which might appear on any partition or disk
14 | .Trash-*
15 | 
16 | 
17 | ### OSX ###
18 | .DS_Store
19 | .AppleDouble
20 | .LSOverride
21 | 
22 | # Icon must end with two \r
23 | Icon
24 | 
25 | # Thumbnails
26 | ._*
27 | 
28 | # Files that might appear in the root of a volume
29 | .DocumentRevisions-V100
30 | .fseventsd
31 | .Spotlight-V100
32 | .TemporaryItems
33 | .Trashes
34 | .VolumeIcon.icns
35 | 
36 | # Directories potentially created on remote AFP share
37 | .AppleDB
38 | .AppleDesktop
39 | Network Trash Folder
40 | Temporary Items
41 | .apdisk
42 | 
43 | 
44 | ### Node ###
45 | # Logs
46 | logs
47 | *.log
48 | npm-debug.log*
49 | 
50 | # Runtime data
51 | pids
52 | *.pid
53 | *.seed
54 | 
55 | # Directory for instrumented libs generated by jscoverage/JSCover
56 | lib-cov
57 | 
58 | # Coverage directory used by tools like istanbul
59 | coverage
60 | 
61 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
62 | .grunt
63 | 
64 | # node-waf configuration
65 | .lock-wscript
66 | 
67 | # Compiled binary addons (http://nodejs.org/api/addons.html)
68 | build/Release
69 | 
70 | # Dependency directories
71 | node_modules
72 | jspm_packages
73 | 
74 | # Optional npm cache directory
75 | .npm
76 | 
77 | # Optional REPL history
78 | .node_repl_history
79 | 
80 | # local development examples
81 | examples
82 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: node_js
2 | node_js:
3 |   - 'stable'
4 |   - '6'
5 |   - '4'
6 |   - '5'
7 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The person who associated a work with this deed has dedicated the work to the public domain by waiving all of his or her rights to the work worldwide under copyright law, including all related and neighboring rights, to the extent allowed by law.
2 | 
3 | You can copy, modify, distribute and perform the work, even for commercial purposes, all without asking permission.
4 | 
5 | See https://creativecommons.org/publicdomain/zero/1.0/ for more info.
6 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ## yunodb
  2 | 
  3 | A portable, persistent, electron compatible fulltext search + document store database for node.js. LevelDB underneath.
  4 | 
  5 | [![js-standard-style](https://img.shields.io/badge/code%20style-standard%20js-green.svg?style=flat-square)](https://github.com/feross/standard)  [![Travis](https://img.shields.io/travis/blahah/yunodb.svg?style=flat-square)](https://travis-ci.org/blahah/yunodb)  [![npm](https://img.shields.io/npm/v/yunodb.svg?style=flat-square)](https://www.npmjs.com/package/yunodb)  [![cc-zero](https://img.shields.io/badge/license-CC0%20public%20domain-ff69b4.svg?style=flat-square)](https://github.com/blahah/yunodb#license---cc0)
  6 | 
  7 | - [How it works](https://github.com/blahah/yunodb#how-it-works)
  8 | - [Install](https://github.com/blahah/yunodb#install)
  9 | - [Use](https://github.com/blahah/yunodb#use)
 10 |   - [Create / load a database](https://github.com/blahah/yunodb#create--load-a-database)
 11 |     - [Index mapping](https://github.com/blahah/yunodb#index-mapping)
 12 |   - [Add documents](https://github.com/blahah/yunodb#add-documents)
 13 |   - [Delete documents](https://github.com/blahah/yunodb#delete-documents)
 14 |   - [Search](https://github.com/blahah/yunodb#search)
 15 |   - [CLI](https://github.com/blahah/yunodb#cli)
 16 | - [Contributing](https://github.com/blahah/yunodb#contributing)
 17 | - [License - CC0](https://github.com/blahah/yunodb#license---cc0)
 18 | 
 19 | ## How it works
 20 | 
 21 | yuno is a JSON document store with fulltext search. It's meant for embedding in electron apps, focuses solely on text search, and in most cases should handle millions of documents easily.
 22 | 
 23 | yuno is pretty basic - it has three components:
 24 | - The document store, which is just the raw JSON objects stored in [leveldb](https://github.com/Level/levelup)
 25 | - The inverted search index, powered by [search-index](https://github.com/fergiemcdowall/search-index)
 26 | - A customisable [natural](https://github.com/NaturalNode/natural) language processing pipeline that is applied to documents before adding them to the index, greatly improving speed and memory usage compared to the vanilla search-index.
 27 | 
 28 | **None of this is revolutionary** - actually it's standard in fulltext-search database engines. And all the pieces exist already in the node ecosystem. But I couldn't find a node fulltext search and document store that could handle millions of documents, persisted on disk, didn't have crazy memory requirements and could be easily bundled into an electron app.
 29 | 
 30 | Like, db, **y** **u** **no** exist already??
 31 | 
 32 | ![yuno.jpg](yuno.jpg)
 33 | 
 34 | ## Install
 35 | 
 36 | ```
 37 | npm install --save yunodb
 38 | ```
 39 | 
 40 | ## Use
 41 | 
 42 | ### Create / load a database
 43 | 
 44 | **`yuno(options, callback)`**
 45 | 
 46 | e.g.
 47 | 
 48 | ```
 49 | var yuno = require('yunodb')
 50 | 
 51 | var dbopts = {
 52 |   location: './.yuno',
 53 |   keyField: 'id',
 54 |   indexMap: ['text']
 55 | }
 56 | var db = yuno(dbopts, (err, dbhandle) => {
 57 |   if (err) throw err
 58 | 
 59 |   // do stuff with the db
 60 |   db = dbhandle
 61 | })
 62 | ```
 63 | 
 64 | `opts` configures the two persistent datastores. Possible key-value pairs are:
 65 | 
 66 | - **location** (String, required) - Base directory in which both datastores will be kept.
 67 | - **keyField** (String, required) - [JSONpath](https://github.com/s3u/JSONPath#syntax-through-examples) specifying the field in each document to be used as a key in the document store.
 68 | - **indexMap** (Array | Object, required) - [JSONpaths](https://github.com/s3u/JSONPath#syntax-through-examples) specifying the fields in each document to index for fulltext searching. See [index mapping](#index-mapping) below for details.
 69 | - **deletable** (Boolean, optional) - Whether documents should be deletable. Setting to true increases index size. Default: false.
 70 | - **ngramLength** (Integer | Array, optional) - ngram length(s) to use when building index.
 71 | 
 72 | #### Index mapping
 73 | 
 74 | It is quite rare that all fields in a database should be exposed to the user search. More often, we want to allow the user to search certain fields, but retrieve the full document for each result. The `indexMap` option allows you to specify how to index documents.
 75 | 
 76 | There are two ways to tell `yuno` how to index:
 77 | 
 78 | ##### 1. Pass an Array of fields
 79 | 
 80 | The simple option - an array of fields to index. The contents of each field will be passed through the default Natural Language Processing pipeline before being added to the search index.
 81 | 
 82 | ##### 2. Pass an Object mapping fields to processors
 83 | 
 84 | To fine-tune the processing on a per-field basis, pass an Object where each key is a field to index. Values can be one of:
 85 | 
 86 | - `true`/`false` whether to apply the default NLP pipeline
 87 | - `function` a custom processing function.
 88 | 
 89 | Custom processing take the field value as a single argument, and their return value (either a string or an array) will be tokenised and added to the index.
 90 | 
 91 | ### Add documents
 92 | 
 93 | **`db.add(documents, options, callback)`**
 94 | 
 95 | - `documents`, array of JSON-able objects to store
 96 | - `options` optional, can override the database-wide `indexMap` option
 97 | - `callback`, function to call on completion, with a single argument to be passed an error if there was one
 98 | 
 99 | e.g.
100 | 
101 | ```js
102 | var docs = [
103 |   { id: 1, text: 'hello '},
104 |   { id: 2, text: 'goodbye '},
105 |   { id: 3, text: 'tortoise '}
106 | ]
107 | 
108 | function done (err) {
109 |   if (err) throw err
110 |   console.log('successfully added', docs.length, 'documents')
111 | }
112 | 
113 | db.add(docs, done)
114 | ```
115 | 
116 | or using a custom `indexMap`:
117 | 
118 | ```js
119 | // trim whitespace
120 | function trim (str) { return str.trim() }
121 | 
122 | db.add(docs, { text: trim }, doneAdding)
123 | ```
124 | 
125 | ### Delete documents
126 | 
127 | **`db.del(documents, callback)`**
128 | 
129 | - `documents`, document (object), id (string), or array of documents or ids
130 | - `callback`, function to call on completion, with a single argument to be passed an error if there was one
131 | 
132 | e.g.
133 | 
134 | ```js
135 | // document
136 | db.del({ id: '1234', otherkey: 'something else' }, done)
137 | 
138 | // with id
139 | db.del('1234', done)
140 | 
141 | // array
142 | db.del(['1234', '1235', '1236'], done)
143 | ```
144 | 
145 | ### Search
146 | 
147 | **`db.search(query, opts, callback)`**
148 | 
149 | Returns a cursor that can be used to page through the results. By default the `pageSize` is 50.
150 | 
151 | - `query`, string search query
152 | - `opts`, (optional) options object
153 | - `callback`, function to call on completion. Takes two arguments:
154 |   - `err` error or `null`
155 |   - `results` object containing the result metadata and hits
156 | 
157 | e.g.
158 | 
159 | ```js
160 | var cursor = db.search('tortoise', function(err, results) {
161 |   if (err) throw err
162 | 
163 |   // first 50 results
164 |   console.log(results)
165 | 
166 |   cursor.next(function(err, results) {
167 |     // next page in here
168 |   })
169 | })
170 | ```
171 | 
172 | ### CLI
173 | 
174 | yuno has a minimal command-line interface that can be used to create a database from a file containing JSON objects.
175 | 
176 | Install the CLI:
177 | 
178 | ```bash
179 | npm install --global yuno
180 | ```
181 | 
182 | Create a new database:
183 | 
184 | ```bash
185 | yuno create <database path> <JSON data>
186 | ```
187 | 
188 | The JSON data file must contain JSON objects, rather than an array. For example:
189 | 
190 | ```json
191 | { "id": "1234", "title": "the coleopterist's handbook" }
192 | { "id": "4321", "title": "bark and ambrosia beetles of south america" }
193 | ```
194 | 
195 | You can provide database options as a JSON file using the `--opts` argument:
196 | 
197 | ```bash
198 | yuno create --opts <JSON options> <database path> <JSON data>
199 | ```
200 | 
201 | Where the options JSON looks like:
202 | 
203 | ```json
204 | {
205 |   "keyField": "id",
206 |   "indexMap": {
207 |     "title": true,
208 |   }
209 | }
210 | ```
211 | 
212 | ## Contributing
213 | 
214 | yuno is being built to serve my use-case of embedding pre-made databases in electron apps. If you have another use-case and would like features added, please open an issue to discuss it - I'm happy to add things that will be widely useful.
215 | 
216 | Contributions are very welcome. **Please** open an issue to discuss any changes you would like to PR, or mention in an existing issue that you plan to work on it.
217 | 
218 | Ideas for improving performance are particularly welcome.
219 | 
220 | ## License - CC0
221 | 
222 | https://creativecommons.org/publicdomain/zero/1.0/
223 | 
224 | yuno is public domain code. Do whatever you want with it. Credit would be appreciated, but it's not required.
225 | 


--------------------------------------------------------------------------------
/cli.js:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env node
 2 | 
 3 | var program = require('commander')
 4 | var yuno = require('.')
 5 | var exists = require('path-exists').sync
 6 | var JSONStream = require('JSONStream')
 7 | var fs = require('fs')
 8 | var _path = require('path')
 9 | 
10 | program
11 |   .version(require('./package.json').version)
12 | 
13 | program
14 |   .command('create <path> <input>')
15 |   .description('create a database from JSON objects')
16 |   .option('-o, --opts <file>', 'JSON file containing database options')
17 |   .action(function (path, input, options) {
18 |     checkFile(input, 'input JSON')
19 |     var opts = {}
20 |     if (options.opts) {
21 |       checkFile(input, 'options JSON')
22 |       opts = require(_path.resolve('.', options.opts))
23 |     }
24 | 
25 |     console.log('creating database at', path, 'from file', input)
26 |     opts.location = path
27 | 
28 |     function load (err, db) {
29 |       if (err) throw err
30 |       populate(db, input)
31 |     }
32 | 
33 |     yuno(opts, load)
34 |   })
35 | 
36 | program
37 |   .parse(process.argv)
38 | 
39 | function populate (db, file) {
40 |   var json = JSONStream.parse()
41 |   var read = fs.createReadStream(file)
42 | 
43 |   var n = 0
44 |   var chunk = []
45 | 
46 |   json.on('data', function (entry) {
47 |     chunk.push(entry)
48 | 
49 |     if (chunk.length === 10000) {
50 |       var thischunk = chunk
51 | 
52 |       db.add(thischunk, {}, function (err) {
53 |         if (err) return console.log(err)
54 |         n += 10000
55 |         console.log('written:', n)
56 |       })
57 | 
58 |       chunk = []
59 |     }
60 |   })
61 | 
62 |   json.on('end', function () {
63 |     if (chunk.length === 0) return
64 | 
65 |     db.add(chunk, {}, function (err) {
66 |       if (err) throw console.log(err)
67 |       db.index.tellMeAboutMySearchIndex(function (err, info) {
68 |         if (err) throw err
69 |         console.log('done! added', info.totalDocs, 'docs to index')
70 |       })
71 |     })
72 |   })
73 | 
74 |   read.pipe(json)
75 | }
76 | 
77 | function checkFile (file, name) {
78 |   if (!file) {
79 |     console.log('ERROR: you must provide an', name)
80 |     process.exit(1)
81 |   } else if (/^input/.test(name) && !exists(file)) {
82 |     console.log('ERROR:', name, "file doesn't exist at path", file)
83 |     process.exit(1)
84 |   }
85 | }
86 | 


--------------------------------------------------------------------------------
/cursor.js:
--------------------------------------------------------------------------------
 1 | var _ = require('lodash')
 2 | 
 3 | function Cursor (query, db, opts) {
 4 |   if (!(this instanceof Cursor)) return new Cursor(query, db, opts)
 5 | 
 6 |   var defaults = {
 7 |     pageSize: 50
 8 |   }
 9 |   if (!opts) opts = defaults
10 | 
11 |   this.pageSize = opts.pageSize || defaults.pageSize
12 |   this.lastOffset = null
13 |   this.query = { AND: { '*': db.preprocessor.naturalize(query) } }
14 |   this.db = db
15 | }
16 | 
17 | Cursor.prototype.first = function (cb) {
18 |   return this.queryWithOffset(0, cb)
19 | }
20 | 
21 | Cursor.prototype.next = function (cb) {
22 |   var offset = (this.lastOffset === null) ? 0 : this.lastOffset + this.pageSize
23 |   return this.queryWithOffset(offset, cb)
24 | }
25 | 
26 | Cursor.prototype.prev = function (cb) {
27 |   var offset = (this.lastOffset === null) ? 0 : this.lastOffset - this.pageSize
28 |   return this.queryWithOffset(offset, cb)
29 | }
30 | 
31 | Cursor.prototype.last = function (cb) {
32 |   if (this.totalHits) {
33 |     var penultimatePage = Math.floor(this.totalHits / this.pageSize)
34 |     var lastPageOffset = penultimatePage * this.pageSize
35 |     return this.queryWithOffset(lastPageOffset, cb)
36 |   }
37 | 
38 |   cb(new Error('cannot get last page until initial query has run (try cursor.first() first)'))
39 | }
40 | 
41 | Cursor.prototype.queryWithOffset = function (offset, cb) {
42 |   this.lastOffset = offset
43 | 
44 |   var self = this
45 |   var q = {
46 |     query: this.query,
47 |     offset: offset,
48 |     pageSize: this.pageSize
49 |   }
50 |   this.db.index.search(q, (err, results) => {
51 |     if (err) return cb(err)
52 |     self.totalHits = results.totalHits
53 |     results.offset = offset
54 |     self.fullResults(results, cb)
55 |   })
56 | }
57 | 
58 | Cursor.prototype.fullResults = function (results, cb) {
59 |   var self = this
60 | 
61 |   var done = _.after(results.hits.length, function () {
62 |     cb(null, results)
63 |   })
64 | 
65 |   results.hits.map((hit, i) => {
66 |     self.db.docstore.get(hit.id, (err, document) => {
67 |       if (err) cb(err)
68 |       results.hits[i].document = document
69 |       done(null)
70 |     })
71 |   })
72 | }
73 | 
74 | module.exports = Cursor
75 | 


--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
  1 | var path = require('path')
  2 | 
  3 | var levelup = require('levelup')
  4 | var searchIndex = require('search-index')
  5 | var _ = require('lodash')
  6 | var mkdirp = require('mkdirp')
  7 | var jsonpath = require('jsonpath-plus')
  8 | 
  9 | var preprocess = require('./preprocess/preprocess.js')
 10 | var Cursor = require('./cursor.js')
 11 | 
 12 | function Yuno (opts, cb) {
 13 |   if (!(this instanceof Yuno)) return new Yuno(opts, cb)
 14 | 
 15 |   requiredOpts(opts, ['keyField', 'indexMap'], cb)
 16 | 
 17 |   var self = this
 18 | 
 19 |   function ready () {
 20 |     if (cb) cb(null, self)
 21 |     // TODO: events, self.emit('ready')
 22 |   }
 23 | 
 24 |   var docstoreOpts = opts.docstore || {
 25 |     keyEncoding: 'string',
 26 |     valueEncoding: 'json'
 27 |   }
 28 | 
 29 |   mkdirp.sync(opts.location)
 30 | 
 31 |   this.docstorePath = path.join(opts.location, 'docstore')
 32 |   this.docstore = levelup(this.docstorePath, docstoreOpts)
 33 | 
 34 |   this.indexPath = path.join(opts.location, 'index')
 35 | 
 36 |   var indexOpts = _.defaults(opts, {
 37 |     indexPath: this.indexPath,
 38 |     deletable: false,
 39 |     fieldedSearch: false,
 40 |     fieldsToStore: ['tokens'],
 41 |     nGramLength: 1
 42 |   })
 43 | 
 44 |   searchIndex(indexOpts, (err, si) => {
 45 |     if (err) return cb(err)
 46 |     self.index = si
 47 |     ready()
 48 |   })
 49 | 
 50 |   this.preprocessor = preprocess(opts)
 51 |   this.keyField = opts.keyField || 'id'
 52 | }
 53 | 
 54 | Yuno.prototype.getKey = function (doc) {
 55 |   return jsonpath({ json: doc, path: this.keyField })[0]
 56 | }
 57 | 
 58 | Yuno.prototype.putOp = function (doc) {
 59 |   return { type: 'put', key: this.getKey(doc), value: doc }
 60 | }
 61 | 
 62 | // docs: array of documents to add
 63 | // opts: options for adding
 64 | Yuno.prototype.add = function (docs, opts, cb) {
 65 |   var self = this
 66 |   if (_.isFunction(opts)) cb = opts
 67 |   if (_.isPlainObject(docs)) docs = [docs]
 68 | 
 69 |   var errs = []
 70 |   var docb = _.after(2, function () {
 71 |     cb(errs.length > 0 ? errs[0] : null, docs.length)
 72 |   })
 73 |   var done = function (err) {
 74 |     if (err) errs.push(err)
 75 |     docb()
 76 |   }
 77 | 
 78 |   this.docstore.batch(docs.map((d) => {
 79 |     return { type: 'put', key: '' + self.getKey(d), value: JSON.stringify(d) }
 80 |   }), done)
 81 | 
 82 |   this.index.add(docs.map((d) => {
 83 |     return { id: self.getKey(d), tokens: self.preprocessor.process(d) }
 84 |   }), done)
 85 |   // process the docs for search indexing
 86 | }
 87 | 
 88 | Yuno.prototype.get = function (key, cb) {
 89 |   this.docstore.get(key, cb)
 90 | }
 91 | 
 92 | Yuno.prototype.search = function (query, opts, cb) {
 93 |   if (_.isFunction(opts)) {
 94 |     cb = opts
 95 |     opts = null
 96 |   }
 97 |   var cursor = Cursor(query, this, opts)
 98 |   cursor.first(cb)
 99 |   return cursor
100 | }
101 | 
102 | Yuno.prototype.del = function (keys, cb) {
103 |   var self = this
104 | 
105 |   if (!(_.isArray(keys))) keys = [keys]
106 |   if (_.isPlainObject(keys[0])) keys = keys.map((doc) => { self.getKey(doc) })
107 | 
108 |   var errs = []
109 |   var done = _.after(2, function () {
110 |     cb(errs.length > 0 ? errs[0] : null)
111 |   })
112 | 
113 |   this.docstore.batch(keys.map((key) => {
114 |     return { type: 'del', key: key }
115 |   }), done)
116 | 
117 |   this.index.del(keys.map((key) => {
118 |     return { id: key }
119 |   }), done)
120 | }
121 | 
122 | Yuno.prototype.close = function (cb) {
123 |   var errs = []
124 |   var done = _.after(2, function () {
125 |     cb(errs.length > 0 ? errs[0] : null)
126 |   })
127 | 
128 |   this.docstore.close(done)
129 |   this.index.close(done)
130 | }
131 | 
132 | function requiredOpts (opts, keys, cb) {
133 |   keys.forEach((key) => {
134 |     if (!opts[key]) {
135 |       cb(new Error(key + ' option is required'))
136 |     }
137 |   })
138 | }
139 | 
140 | module.exports = Yuno
141 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "yunodb",
 3 |   "version": "0.1.4",
 4 |   "description": "portable, persistent, electron compatible fulltext search database",
 5 |   "main": "index.js",
 6 |   "scripts": {
 7 |     "test": "standard && tape test/*.js"
 8 |   },
 9 |   "bin": {
10 |     "yuno": "./cli.js"
11 |   },
12 |   "repository": {
13 |     "type": "git",
14 |     "url": "git+https://github.com/blahah/yunodb.git"
15 |   },
16 |   "keywords": [
17 |     "fulltext",
18 |     "search",
19 |     "database",
20 |     "json",
21 |     "object",
22 |     "store",
23 |     "leveldb",
24 |     "search-index",
25 |     "electron"
26 |   ],
27 |   "author": "Richard Smith-Unna <rik@fathomlabs.io> (@blahah)",
28 |   "license": "CC0-1.0",
29 |   "bugs": {
30 |     "url": "https://github.com/blahah/yunodb/issues"
31 |   },
32 |   "homepage": "https://github.com/blahah/yunodb#readme",
33 |   "dependencies": {
34 |     "JSONStream": "^1.1.1",
35 |     "commander": "^2.9.0",
36 |     "jsonpath-plus": "^0.15.0",
37 |     "leveldown": "^1.4.6",
38 |     "levelup": "^1.3.1",
39 |     "lodash": "^4.11.1",
40 |     "mkdirp": "^0.5.1",
41 |     "natural": "github:blahah/natural",
42 |     "path-exists": "^2.1.0",
43 |     "search-index": "^0.8.12"
44 |   },
45 |   "devDependencies": {
46 |     "rimraf": "^2.5.2",
47 |     "standard": "^6.0.8",
48 |     "tape": "^4.5.1",
49 |     "temporary": "0.0.8"
50 |   }
51 | }
52 | 


--------------------------------------------------------------------------------
/preprocess/preprocess.js:
--------------------------------------------------------------------------------
 1 | var vector = require('./vector.js')
 2 | var _ = require('lodash')
 3 | var jsonpath = require('jsonpath-plus')
 4 | 
 5 | function Preprocessor (opts) {
 6 |   if (!(this instanceof Preprocessor)) return new Preprocessor(opts)
 7 | 
 8 |   if (!opts.indexMap) throw new Error('preprocessor requires an indexMap option')
 9 | 
10 |   this.opts = opts
11 |   this.createPipeline(opts)
12 |   this.cachePaths(opts)
13 | }
14 | 
15 | Preprocessor.prototype.naturalize = function (str) {
16 |   return vector(str)
17 |     .trim()
18 |     .tag()
19 |     .filterPOS()
20 |     .stripTags()
21 |     .lowercase()
22 |     .stripPunctuation()
23 |     .filterNonWords()
24 |     .stem().terms
25 | }
26 | 
27 | Preprocessor.prototype.createPipeline = function (opts) {
28 |   var indexMap = opts.indexMap
29 |   if (indexMap instanceof Array) {
30 |     indexMap = _.zipObject(indexMap, [true])
31 |   }
32 | 
33 |   var self = this
34 |   this.pipeline = _.transform(indexMap, function (pipeline, action, field) {
35 |     var op = _.identity
36 |     if (_.isFunction(action)) {
37 |       op = _.bind(action, self)
38 |     } else if (action) {
39 |       op = self.naturalize
40 |     }
41 |     pipeline[field] = op
42 |   }, {})
43 | }
44 | 
45 | Preprocessor.prototype.cachePaths = function (opts) {
46 |   var map = opts.indexMap
47 |   this.paths = _.isArray(map) ? map : Object.keys(map)
48 | }
49 | 
50 | Preprocessor.prototype.pick = function (object) {
51 |   return _.zipObject(this.paths, this.paths.map(function (path) {
52 |     return jsonpath({ json: object, path: path }).join(' ')
53 |   }))
54 | }
55 | 
56 | Preprocessor.prototype.process = function (object) {
57 |   var self = this
58 |   var picked = this.pick(object)
59 |   var parts = _.map(picked, (value, key, o) => {
60 |     var step = self.pipeline[key]
61 |     return step ? step(value) : value
62 |   })
63 |   return _.flatten(parts)
64 | }
65 | 
66 | module.exports = Preprocessor
67 | 


--------------------------------------------------------------------------------
/preprocess/tagger/tr_from_brill_paper.txt:
--------------------------------------------------------------------------------
 1 | TO IN NEXT-TAG AT
 2 | VBN VBD PREV-WORD-IS-CAP YES
 3 | VBD VBN PREV-1-OR-2-OR-3-TAG HVD
 4 | VB NN PREV-1-OR-2-TAG AT
 5 | NN VB PREV-TAG TO
 6 | TO IN NEXT-WORD-IS-CAP YES
 7 | NN VB PREV-TAG MD
 8 | PPS PPO NEXT-TAG .
 9 | VBN VBD PREV-TAG PPS
10 | NP NN CURRENT-WORD-IS-CAP NO


--------------------------------------------------------------------------------
/preprocess/tagger/tr_from_posjs.txt:
--------------------------------------------------------------------------------
 1 | VBD NN PREV-TAG DT
 2 | VBP NN PREV-TAG DT
 3 | VB NN PREV-TAG DT
 4 | NN CD CURRENT-WORD-IS-NUMBER YES
 5 | NNP CD CURRENT-WORD-IS-NUMBER YES
 6 | NNS CD CURRENT-WORD-IS-NUMBER YES
 7 | NNPS CD CURRENT-WORD-IS-NUMBER YES
 8 | NN URL CURRENT-WORD-IS-URL YES
 9 | NNS URL CURRENT-WORD-IS-URL YES
10 | NNP URL CURRENT-WORD-IS-URL YES
11 | NNPS URL CURRENT-WORD-IS-URL YES
12 | NN VBN CURRENT-WORD-ENDS-WITH ed
13 | * RB CURRENT-WORD-ENDS-WITH ly
14 | NN JJ CURRENT-WORD-ENDS-WITH al
15 | NNS JJ CURRENT-WORD-ENDS-WITH al
16 | NN VB PREV-WORD-IS would
17 | NN NNS CURRENT-WORD-ENDS-WITH s
18 | NN VBG CURRENT-WORD-ENDS-WITH ing


--------------------------------------------------------------------------------
/preprocess/vector.js:
--------------------------------------------------------------------------------
 1 | var natural = require('natural')
 2 | var tokenize = (new natural.TreebankWordTokenizer()).tokenize
 3 | var stem = natural.PorterStemmer.stem
 4 | var inherits = require('util').inherits
 5 | var path = require('path')
 6 | var rules = path.join(__dirname, './tagger/tr_from_posjs.txt')
 7 | var lexicon = path.join(__dirname, './tagger/lexicon_from_posjs.json')
 8 | var tagger = new natural.BrillPOSTagger(lexicon, rules, 'N')
 9 | 
10 | inherits(Vector, Array)
11 | 
12 | function Vector (terms) {
13 |   if (!(this instanceof Vector)) return new Vector(terms)
14 |   if (terms instanceof Vector) terms = terms.terms
15 | 
16 |   if (typeof terms === 'string') {
17 |     this.eatString(terms)
18 |   } else {
19 |     this.terms = terms
20 |   }
21 | }
22 | 
23 | function stripPunctuation (term) {
24 |   return term.replace(/\W+/g, '')
25 | }
26 | 
27 | function stripTag (pair) {
28 |   return pair[0]
29 | }
30 | 
31 | function isWord (term) {
32 |   return term.replace(/[0-9]+/g, '').length > 0
33 | }
34 | 
35 | Vector.prototype.eatString = function (string) {
36 |   this.terms = tokenize(string.replace('/', ' '))
37 | }
38 | 
39 | Vector.prototype.lowercase = function () {
40 |   return Vector(this.terms.map((s) => { return s.toLowerCase() }))
41 | }
42 | 
43 | Vector.prototype.trim = function () {
44 |   return Vector(this.terms.map((s) => { return s.trim() }))
45 | }
46 | 
47 | Vector.prototype.tag = function () {
48 |   this.tags = tagger.tag(this.terms)
49 |   return this
50 | }
51 | 
52 | Vector.prototype.filterPOS = function () {
53 |   var filtered = this.tags.filter((part) => {
54 |     // see
55 |     // https://en.wikipedia.org/wiki/Brown_Corpus#Part-of-speech_tags_used
56 |     var tag = part[1]
57 |     if (!tag) return false
58 | 
59 |     // keep
60 |     var first = tag[0]
61 |     if (first === 'N') return true // nouns
62 |     if (first === 'V') return true // verbs
63 |     if (first === 'J') return true // adjectives
64 |     if (first === 'R') return true // adverbs
65 | 
66 |     // discard
67 |     return false
68 |   })
69 |   return Vector(filtered)
70 | }
71 | 
72 | Vector.prototype.stripTags = function () {
73 |   return Vector(this.terms.map(stripTag))
74 | }
75 | 
76 | Vector.prototype.stripPunctuation = function () {
77 |   return Vector(this.terms.map(stripPunctuation))
78 | }
79 | 
80 | Vector.prototype.filterNonWords = function () {
81 |   return Vector(this.terms.filter(isWord))
82 | }
83 | 
84 | Vector.prototype.stem = function () {
85 |   return Vector(this.terms.map(stem))
86 | }
87 | 
88 | module.exports = Vector
89 | 


--------------------------------------------------------------------------------
/test/add.js:
--------------------------------------------------------------------------------
 1 | var yuno = require('../')
 2 | var test = require('tape')
 3 | var tmp = require('temporary')
 4 | var rimraf = require('rimraf')
 5 | 
 6 | var path = require('path')
 7 | 
 8 | test('add', function (t) {
 9 |   var tmpdir = new tmp.Dir()
10 |   var dbpath = path.join(tmpdir.path, 'yuno')
11 | 
12 |   var opts = {
13 |     location: dbpath,
14 |     keyField: 'id',
15 |     indexMap: ['word']
16 |   }
17 | 
18 |   var doc = { id: '1234', word: 'sesquipedalianism is for deipnosophists' }
19 | 
20 |   yuno(opts, (err, db) => {
21 |     t.error(err, 'no error creating db')
22 | 
23 |     db.add(doc, function (err) {
24 |       t.error(err, 'no error adding document')
25 | 
26 |       db.docstore.get(doc.id, function (err, value) {
27 |         t.error(err, 'no error retrieving doc from docstore')
28 |         t.equals(JSON.stringify(doc), value, 'doc is exactly as inserted')
29 |         rimraf(dbpath, {}, t.end)
30 |       })
31 |     })
32 |   })
33 | })
34 | 


--------------------------------------------------------------------------------
/test/create.js:
--------------------------------------------------------------------------------
 1 | var yuno = require('../')
 2 | var test = require('tape')
 3 | var tmp = require('temporary')
 4 | var rimraf = require('rimraf')
 5 | 
 6 | var path = require('path')
 7 | var fs = require('fs')
 8 | 
 9 | test('create', function (t) {
10 |   t.plan(2)
11 | 
12 |   var tmpdir = new tmp.Dir()
13 |   var dbpath = path.join(tmpdir.path, 'yuno')
14 | 
15 |   var opts = {
16 |     location: dbpath,
17 |     keyField: 'id',
18 |     indexMap: ['word']
19 |   }
20 | 
21 |   function cb (err) {
22 |     t.error(err, 'no error on create db')
23 | 
24 |     var stats = fs.lstatSync(dbpath)
25 |     t.ok(stats.isDirectory(), 'db directory exists')
26 | 
27 |     rimraf(dbpath, {}, t.end)
28 |   }
29 | 
30 |   yuno(opts, cb)
31 | })
32 | 


--------------------------------------------------------------------------------
/test/del.js:
--------------------------------------------------------------------------------
 1 | var yuno = require('../')
 2 | var test = require('tape')
 3 | var tmp = require('temporary')
 4 | 
 5 | var path = require('path')
 6 | 
 7 | test('add', function (t) {
 8 |   var tmpdir = new tmp.Dir()
 9 |   var dbpath = path.join(tmpdir.path, 'yuno')
10 | 
11 |   var opts = {
12 |     location: dbpath,
13 |     keyField: 'id',
14 |     indexMap: ['word']
15 |   }
16 | 
17 |   var doc = { id: '1234', word: 'sesquipedalianism is for deipnosophists' }
18 | 
19 |   yuno(opts, (err, db) => {
20 |     t.error(err, 'no error creating db')
21 | 
22 |     db.add(doc, function (err) {
23 |       t.error(err, 'no error adding document')
24 | 
25 |       db.del(doc.id, function (err) {
26 |         t.error(err, 'no error deleting doc')
27 | 
28 |         db.get(doc.id, function (err) {
29 |           t.ok(err, 'document no longer exists in index')
30 |           t.end()
31 |         })
32 |       })
33 |     })
34 |   })
35 | })
36 | 


--------------------------------------------------------------------------------
/test/preprocess.js:
--------------------------------------------------------------------------------
 1 | var preprocessor = require('../preprocess/preprocess.js')
 2 | var test = require('tape')
 3 | 
 4 | test('preprocess', function (t) {
 5 |   var opts = {
 6 |     indexMap: ['vec']
 7 |   }
 8 | 
 9 |   var p = preprocessor(opts)
10 | 
11 |   var obj = {
12 |     vec: 'the big green manitee jumped over the laziest lungfish'
13 |   }
14 |   var expected = [
15 |     'big', 'green', 'manite', 'jump', 'laziest', 'lungfish'
16 |   ]
17 | 
18 |   t.deepEqual(p.naturalize(obj.vec), expected, 'naturalize a string')
19 | 
20 |   t.deepEqual(p.process(obj), expected, 'process an object (indexMap is array)')
21 | 
22 |   opts.indexMap = { vec: (x) => 'a' }
23 |   p = preprocessor(opts)
24 |   t.deepEqual(p.process(obj), ['a'], 'process an object (indexMap is object)')
25 | 
26 |   obj.first = {
27 |     second: ['intact', 'truncated']
28 |   }
29 | 
30 |   // test json paths in index map
31 |   opts.indexMap = ['first.second[1]']
32 |   p = preprocessor(opts)
33 |   t.deepEqual(p.process(obj), ['truncat'])
34 | 
35 |   t.end()
36 | })
37 | 


--------------------------------------------------------------------------------
/test/search.js:
--------------------------------------------------------------------------------
 1 | var yuno = require('../')
 2 | var test = require('tape')
 3 | var tmp = require('temporary')
 4 | var rimraf = require('rimraf')
 5 | 
 6 | var path = require('path')
 7 | 
 8 | var reuters = require('./reuters-000.json')
 9 | 
10 | test('basic search', function (t) {
11 |   t.timeoutAfter(1000)
12 | 
13 |   var tmpdir = new tmp.Dir()
14 |   var dbpath = path.join(tmpdir.path, 'yuno')
15 | 
16 |   var opts = {
17 |     location: dbpath,
18 |     keyField: 'id',
19 |     indexMap: ['word']
20 |   }
21 | 
22 |   var docs = [
23 |     { id: '1234', word: 'sesquipedalianism is for deipnosophists' },
24 |     { id: '4321', word: 'deipnosophists are annoying' }
25 |   ]
26 | 
27 |   yuno(opts, (err, db) => {
28 |     t.error(err, 'no error creating db')
29 | 
30 |     db.add(docs, function (err) {
31 |       t.error(err, 'no error adding document')
32 | 
33 |       db.search('deipnosophists', function (err, result) {
34 |         t.error(err, 'no error searching single')
35 |         t.equals(result.totalHits, 2, 'correct number of hits A')
36 |         t.equals(result.hits[0].document, JSON.stringify(docs[1]), 'doc 1 is exactly as inserted')
37 | 
38 |         rimraf(dbpath, {}, t.end)
39 |       })
40 |     })
41 |   })
42 | })
43 | 
44 | test('paging search', function (t) {
45 |   t.timeoutAfter(10000)
46 | 
47 |   var tmpdir = new tmp.Dir()
48 |   var dbpath = path.join(tmpdir.path, 'yuno')
49 | 
50 |   var opts = {
51 |     location: dbpath,
52 |     keyField: 'id',
53 |     indexMap: {
54 |       title: true,
55 |       body: true,
56 |       topics: true,
57 |       places: true,
58 |       date: false
59 |     }
60 |   }
61 | 
62 |   yuno(opts, (err, db) => {
63 |     t.error(err, 'no error creating db')
64 | 
65 |     db.add(reuters, function (err) {
66 |       t.error(err, 'no error adding document')
67 | 
68 |       var cursor = db.search('new york', function (err, result) {
69 |         t.error(err, 'no error searching single')
70 | 
71 |         t.equals(result.totalHits, 55, 'correct number of hits B')
72 |         t.equals(result.hits.length, 50, 'correct first page size')
73 | 
74 |         cursor.next((err, result) => {
75 |           t.error(err, 'no error paging')
76 | 
77 |           t.equals(result.hits.length, 5, 'correct second page size')
78 |           rimraf(dbpath, {}, t.end)
79 |         })
80 |       })
81 |     })
82 |   })
83 | })
84 | 


--------------------------------------------------------------------------------
/yuno.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/blahah/yunodb/290a4ffec2094cf97bce260a88df1281931f4779/yuno.jpg


--------------------------------------------------------------------------------