├── .dockerignore ├── .drone.yml ├── .eslintrc.json ├── .gitignore ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── code-of-conduct.md ├── environments └── prod.yml ├── package-lock.json ├── package.json ├── src ├── correlations.js ├── fts │ ├── Porter2.js │ ├── Porter2.snowball │ ├── Query.js │ ├── Stemmer.js │ ├── Trie.js │ └── fts.js ├── index.js ├── pool.js └── worker-searcher.js ├── test ├── integration_test.js ├── manifests │ ├── atlas-master.json │ └── bi-connector-master.json ├── queries.json ├── regression_test.js ├── stemmed-corpus.txt ├── test_pool.js ├── test_query.js ├── test_stemmer.js ├── test_trie.js └── util.js └── tools └── update_stemmer.js /.dockerignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | npm-debug.log 3 | test 4 | manifests 5 | .git 6 | -------------------------------------------------------------------------------- /.drone.yml: -------------------------------------------------------------------------------- 1 | pipeline: 2 | publish: 3 | image: plugins/ecr 4 | secrets: [ecr_access_key, ecr_secret_key] 5 | registry: 795250896452.dkr.ecr.us-east-1.amazonaws.com 6 | repo: 795250896452.dkr.ecr.us-east-1.amazonaws.com/docs/${DRONE_REPO_NAME} 7 | create_repository: true 8 | tags: 9 | - git-${DRONE_COMMIT_SHA:0:7} 10 | - latest 11 | when: 12 | branch: [master, staging] 13 | event: push 14 | 15 | deploy-staging: 16 | image: quay.io/mongodb/drone-helm:v2 17 | release: marian 18 | namespace: docs 19 | environment: 20 | - API_SERVER=https://api.staging.corp.mongodb.com 21 | prefix: STAGING 22 | secrets: [staging_kubernetes_token] 23 | helm_repos: mongodb=https://10gen.github.io/helm-charts 24 | chart: mongodb/web-app 25 | chart_version: 4.7.3 26 | tiller_ns: docs 27 | client_only: true 28 | values_files: ["environments/prod.yml"] 29 | values: "image.tag=git-${DRONE_COMMIT_SHA:0:7},image.repository=795250896452.dkr.ecr.us-east-1.amazonaws.com/docs/${DRONE_REPO_NAME},ingress.enabled=true,ingress.hosts[0]=marian.docs.staging.corp.mongodb.com" 30 | when: 31 | branch: staging 32 | event: push 33 | 34 | deploy-prod: 35 | image: quay.io/mongodb/drone-helm:v2 36 | release: marian 37 | namespace: docs 38 | environment: 39 | - API_SERVER=https://api.prod.corp.mongodb.com 40 | prefix: PROD 41 | secrets: [prod_kubernetes_token] 42 | helm_repos: mongodb=https://10gen.github.io/helm-charts 43 | chart: mongodb/web-app 44 | chart_version: 4.7.3 45 | tiller_ns: docs 46 | client_only: true 47 | values_files: ["environments/prod.yml"] 48 | values: "image.tag=git-${DRONE_COMMIT_SHA:0:7},image.repository=795250896452.dkr.ecr.us-east-1.amazonaws.com/docs/${DRONE_REPO_NAME},ingress.enabled=true,ingress.hosts[0]=marian.docs.prod.corp.mongodb.com" 49 | when: 50 | branch: master 51 | event: push 52 | -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": { 3 | "worker": true, 4 | "node": true, 5 | "commonjs": true, 6 | "es6": true 7 | }, 8 | "parserOptions": { 9 | "ecmaVersion": 8 10 | }, 11 | "extends": "eslint:recommended", 12 | "rules": { 13 | "indent": [ 14 | "error", 15 | 4 16 | ], 17 | "linebreak-style": [ 18 | "error", 19 | "unix" 20 | ], 21 | "quotes": [ 22 | "error", 23 | "single" 24 | ], 25 | "semi": [ 26 | "error", 27 | "never" 28 | ], 29 | "no-console": "off", 30 | "valid-jsdoc": ["error", { 31 | "requireParamDescription": false, 32 | "requireReturnDescription": false 33 | }] 34 
| } 35 | } 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:14-alpine 2 | RUN mkdir -p /usr/src/app 3 | WORKDIR /usr/src/app 4 | COPY . /usr/src/app 5 | RUN apk add --no-cache git 6 | RUN npm install --production 7 | 8 | EXPOSE 8080 9 | ENTRYPOINT ["node", "--max-old-space-size=4096", "src/index.js", "bucket:docs-mongodb-org-prd/search-indexes/"] 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2017 MongoDB, inc. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | NPM ?= $(shell which npm) 2 | NODE ?= $(shell which node) 3 | MOCHA ?= ./node_modules/.bin/mocha 4 | ESLINT ?= ./node_modules/.bin/eslint 5 | MANIFEST_SOURCE ?= bucket:docs-mongodb-org-prd/search-indexes/ 6 | export MANIFEST_SOURCE 7 | 8 | .PHONY: all lint test integration regression run 9 | 10 | all: lint test 11 | 12 | lint: node_modules/.CURRENT 13 | ${ESLINT} src/*.js src/fts/*.js test/*.js 14 | 15 | test: node_modules/.CURRENT lint src/fts/Porter2.js 16 | ${MOCHA} test/test_*.js 17 | 18 | integration: test 19 | ${MOCHA} --timeout 5000 test/integration_test.js 20 | 21 | regression: integration 22 | MAX_WORKERS=1 ${MOCHA} --timeout 200000 test/regression_test.js 23 | 24 | run: src/fts/Porter2.js 25 | ${NODE} --max-old-space-size=4096 ./src/index.js ${MANIFEST_SOURCE} 26 | 27 | snowball src/fts/Porter2.js: src/fts/Porter2.snowball 28 | ${NODE} tools/update_stemmer.js $^ src/fts/Porter2.js 29 | 30 | node_modules/.CURRENT: package.json 31 | ${NPM} -s install --build-from-source 32 | touch $@ 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Marian 2 | 3 | Marian is an HTTP full-text search service. 4 | 5 | ## Running Marian 6 | 7 | ### Prerequisites 8 | 9 | You will need Node.js v8.0 or later. 10 | 11 | 12 | 13 | ### Launching the Marian Server 14 | 15 | ``` 16 | npm install 17 | MAX_WORKERS=2 node ./src/index.js [MANIFEST_SOURCE] 18 | ``` 19 | 20 | Marian will then read the manifest directory given in `MANIFEST_SOURCE` and 21 | begin listening for requests on port 8080. 22 | 23 | ### Manifest Source 24 | 25 | Marian requires a manifest source directory. This may be either a local 26 | path or an Amazon S3 path. For example, `dir:./manifests/` or 27 | `bucket:docs-mongodb-org-prod/search-indexes/`.
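For illustration, a single manifest file in that directory might look like the following sketch, conforming to the schema shown below. Every value here (the URL, slug, titles, and text) is invented for the example:

```
{
  "url": "https://docs.example.com/manual",
  "includeInGlobalSearch": true,
  "aliases": [],
  "documents": [
    {
      "slug": "aggregation",
      "title": "Aggregation Pipeline",
      "headings": ["Pipeline Stages", "Expressions"],
      "text": "The aggregation pipeline processes documents in stages ...",
      "preview": "An overview of the aggregation pipeline.",
      "tags": "aggregation pipeline",
      "links": ["https://docs.example.com/manual/reference/"]
    }
  ]
}
```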
28 | 29 | The path must contain only JSON files having the following JSON schema: 30 | 31 | ``` 32 | { 33 | "$id": "http://example.com/example.json", 34 | "type": "object", 35 | "definitions": {}, 36 | "$schema": "http://json-schema.org/draft-07/schema#", 37 | "properties": { 38 | "url": { 39 | "$id": "/properties/url", 40 | "type": "string", 41 | "title": "The Url Schema ", 42 | "default": "" 43 | }, 44 | "includeInGlobalSearch": { 45 | "$id": "/properties/includeInGlobalSearch", 46 | "type": "boolean", 47 | "title": "The Includeinglobalsearch Schema ", 48 | "default": false 49 | }, 50 | "aliases": { 51 | "$id": "/properties/aliases", 52 | "type": "array" 53 | }, 54 | "documents": { 55 | "$id": "/properties/documents", 56 | "type": "array", 57 | "items": { 58 | "$id": "/properties/documents/items", 59 | "type": "object", 60 | "properties": { 61 | "slug": { 62 | "$id": "/properties/documents/items/properties/slug", 63 | "type": "string", 64 | "title": "The Slug Schema ", 65 | "default": "" 66 | }, 67 | "title": { 68 | "$id": "/properties/documents/items/properties/title", 69 | "type": "string", 70 | "title": "The Title Schema ", 71 | "default": "" 72 | }, 73 | "headings": { 74 | "$id": "/properties/documents/items/properties/headings", 75 | "type": "array", 76 | "items": { 77 | "$id": "/properties/documents/items/properties/headings/items", 78 | "type": "string", 79 | "title": "The 0th Schema ", 80 | "default": "" 81 | } 82 | }, 83 | "text": { 84 | "$id": "/properties/documents/items/properties/text", 85 | "type": "string", 86 | "title": "The Text Schema ", 87 | "default": "" 88 | }, 89 | "preview": { 90 | "$id": "/properties/documents/items/properties/preview", 91 | "type": "string", 92 | "title": "The Preview Schema ", 93 | "default": "" 94 | }, 95 | "tags": { 96 | "$id": "/properties/documents/items/properties/tags", 97 | "type": "string", 98 | "title": "The Tags Schema ", 99 | "default": "" 100 | }, 101 | "links": { 102 | "$id": "/properties/documents/items/properties/links", 103 | "type": "array", 104 | "items": { 105 | "$id": "/properties/documents/items/properties/links/items", 106 | "type": "string", 107 | "title": "The 0th Schema ", 108 | "default": "" 109 | } 110 | } 111 | } 112 | } 113 | } 114 | } 115 | } 116 | ``` 117 | 118 | ## Marian REST API 119 | 120 | ``` 121 | 122 | GET /search?q=<query>[&searchProperty=<searchProperty>] 123 | Returns search results. For example, see https://marian.mongodb.com/search?q=aggregation%20pipeline 124 | GET /status 125 | Returns a status document. 126 | POST /refresh 127 | When this endpoint is POSTed, Marian will rescan the manifest source 128 | directory. 129 | 130 | ``` 131 | -------------------------------------------------------------------------------- /code-of-conduct.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation.
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at [INSERT EMAIL ADDRESS]. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | 78 | -------------------------------------------------------------------------------- /environments/prod.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | # guaranteed amount of resources 3 | requests: 4 | cpu: 100m 5 | memory: 8000Mi 6 | # maximum allowed resources 7 | limits: 8 | ## same as 2000m 9 | cpu: 2 10 | memory: 8000Mi 11 | 12 | probes: 13 | enabled: true 14 | path: /status 15 | headers: {} 16 | liveness: 17 | httpGet: true 18 | initialDelaySeconds: 10 19 | periodSeconds: 60 20 | timeoutSeconds: 1 21 | successThreshold: 1 22 | failureThreshold: 3 23 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "marian", 3 | "version": "0.1.0", 4 | "description": "a tiny search engine", 5 | "main": "src/index.js", 6 | "files": [ 7 | "src/index.js", 8 | "src/worker-searcher.js" 9 | ], 10 | "author": "Andrew Aldridge ", 11 | "license": "AGPL-3.0", 12 | "dependencies": { 13 | "aws-sdk": "^2.508.0", 14 | "basic-logger": "^0.4.4", 15 | "dictionary-en-us": "^2.1.1", 16 | "dive": "^0.5.0", 17 | "nspell": "^2.1.2", 18 | "tiny-worker": "^2.2.0" 19 | }, 20 | "devDependencies": { 21 | "eslint": "^6.1.0", 22 | "mocha": "^7.2.0" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/correlations.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const CORRELATIONS = [ 4 | ['regexp', 'regex', 0.8], 5 | ['regular expression', 'regex', 0.8], 6 | ['ip', 'address', 0.1], 7 | ['address', 'ip', 0.1], 8 | ['join', 'lookup', 0.6], 9 | ['join', 'sql', 0.25], 10 | ['aggregation', 'sql', 0.1], 11 | ['aggregation', 'pipeline', 0.1], 12 | ['least', 'min', 0.6], 13 | ['set security', 'keyfile', 1.0], 14 | ['cluster security', 'keyfile', 1.0], 15 | ['x509', 'x.509', 1.0], 16 | ['auth', 'authentication', 0.25]] 17 | 18 | exports.correlations = CORRELATIONS 19 | 20 | // Words that, if given in the query, are required to exist in the output results. 21 | // Users can simulate this by wrapping terms in double-quotes to get the same result. 22 | exports.MANDATORY = new Set(['realm', 'atlas', 'compass']) 23 | -------------------------------------------------------------------------------- /src/fts/Porter2.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | 'use strict' 3 | 4 | class Among { 5 | constructor (s, substring_i, result) { 6 | this.s = s; 7 | this.substring_i = substring_i; 8 | this.result = result; 9 | this.method = null; 10 | } 11 | } 12 | 13 | /** 14 | * This class was automatically generated by a Snowball to JS compiler 15 | * It implements the stemming algorithm defined by a snowball script. 
16 | */ 17 | 18 | class Porter2 { 19 | constructor () { 20 | this.a_0 = [ 21 | new Among("arsen", -1, -1), 22 | new Among("commun", -1, -1), 23 | new Among("gener", -1, -1) 24 | ]; 25 | 26 | this.a_1 = [ 27 | new Among("'", -1, 1), 28 | new Among("'s'", 0, 1), 29 | new Among("'s", -1, 1) 30 | ]; 31 | 32 | this.a_2 = [ 33 | new Among("ied", -1, 2), 34 | new Among("s", -1, 3), 35 | new Among("ies", 1, 2), 36 | new Among("sses", 1, 1), 37 | new Among("ss", 1, -1), 38 | new Among("us", 1, -1) 39 | ]; 40 | 41 | this.a_3 = [ 42 | new Among("", -1, 3), 43 | new Among("bb", 0, 2), 44 | new Among("dd", 0, 2), 45 | new Among("ff", 0, 2), 46 | new Among("gg", 0, 2), 47 | new Among("bl", 0, 1), 48 | new Among("mm", 0, 2), 49 | new Among("nn", 0, 2), 50 | new Among("pp", 0, 2), 51 | new Among("rr", 0, 2), 52 | new Among("at", 0, 1), 53 | new Among("tt", 0, 2), 54 | new Among("iz", 0, 1) 55 | ]; 56 | 57 | this.a_4 = [ 58 | new Among("ed", -1, 2), 59 | new Among("eed", 0, 1), 60 | new Among("ing", -1, 2), 61 | new Among("edly", -1, 2), 62 | new Among("eedly", 3, 1), 63 | new Among("ingly", -1, 2) 64 | ]; 65 | 66 | this.a_5 = [ 67 | new Among("anci", -1, 3), 68 | new Among("enci", -1, 2), 69 | new Among("ogi", -1, 13), 70 | new Among("li", -1, 16), 71 | new Among("bli", 3, 12), 72 | new Among("abli", 4, 4), 73 | new Among("alli", 3, 8), 74 | new Among("fulli", 3, 14), 75 | new Among("lessli", 3, 15), 76 | new Among("ousli", 3, 10), 77 | new Among("entli", 3, 5), 78 | new Among("aliti", -1, 8), 79 | new Among("biliti", -1, 12), 80 | new Among("iviti", -1, 11), 81 | new Among("tional", -1, 1), 82 | new Among("ational", 14, 7), 83 | new Among("alism", -1, 8), 84 | new Among("ation", -1, 7), 85 | new Among("ization", 17, 6), 86 | new Among("izer", -1, 6), 87 | new Among("ator", -1, 7), 88 | new Among("iveness", -1, 11), 89 | new Among("fulness", -1, 9), 90 | new Among("ousness", -1, 10) 91 | ]; 92 | 93 | this.a_6 = [ 94 | new Among("icate", -1, 4), 95 | new Among("ative", -1, 6), 96 | new Among("alize", -1, 3), 97 | new Among("iciti", -1, 4), 98 | new Among("ical", -1, 4), 99 | new Among("tional", -1, 1), 100 | new Among("ational", 5, 2), 101 | new Among("ful", -1, 5), 102 | new Among("ness", -1, 5) 103 | ]; 104 | 105 | this.a_7 = [ 106 | new Among("ic", -1, 1), 107 | new Among("ance", -1, 1), 108 | new Among("ence", -1, 1), 109 | new Among("able", -1, 1), 110 | new Among("ible", -1, 1), 111 | new Among("ate", -1, 1), 112 | new Among("ive", -1, 1), 113 | new Among("ize", -1, 1), 114 | new Among("iti", -1, 1), 115 | new Among("al", -1, 1), 116 | new Among("ism", -1, 1), 117 | new Among("ion", -1, 2), 118 | new Among("er", -1, 1), 119 | new Among("ous", -1, 1), 120 | new Among("ant", -1, 1), 121 | new Among("ent", -1, 1), 122 | new Among("ment", 15, 1), 123 | new Among("ement", 16, 1) 124 | ]; 125 | 126 | this.a_8 = [ 127 | new Among("e", -1, 1), 128 | new Among("l", -1, 2) 129 | ]; 130 | 131 | this.a_9 = [ 132 | new Among("succeed", -1, -1), 133 | new Among("proceed", -1, -1), 134 | new Among("exceed", -1, -1), 135 | new Among("canning", -1, -1), 136 | new Among("inning", -1, -1), 137 | new Among("earring", -1, -1), 138 | new Among("herring", -1, -1), 139 | new Among("outing", -1, -1) 140 | ]; 141 | 142 | this.a_10 = [ 143 | new Among("andes", -1, -1), 144 | new Among("atlas", -1, -1), 145 | new Among("bias", -1, -1), 146 | new Among("cosmos", -1, -1), 147 | new Among("dying", -1, 3), 148 | new Among("early", -1, 12), 149 | new Among("gently", -1, 10), 150 | new Among("howe", -1, -1), 151 | new Among("idly", 
-1, 9), 152 | new Among("importance", -1, 8), 153 | new Among("important", -1, -1), 154 | new Among("lying", -1, 4), 155 | new Among("news", -1, -1), 156 | new Among("only", -1, 13), 157 | new Among("replica", -1, 6), 158 | new Among("retryable", -1, 7), 159 | new Among("singly", -1, 14), 160 | new Among("skies", -1, 2), 161 | new Among("skis", -1, 1), 162 | new Among("sky", -1, -1), 163 | new Among("tying", -1, 5), 164 | new Among("ugly", -1, 11) 165 | ]; 166 | 167 | this.g_v = [17, 65, 16, 1] ; 168 | 169 | this.g_v_WXY = [1, 17, 65, 208, 1] ; 170 | 171 | this.g_valid_LI = [55, 141, 2] ; 172 | 173 | this.B_Y_found = false; 174 | this.I_p2 = 0; 175 | this.I_p1 = 0; 176 | } 177 | 178 | r_prelude () 179 | { 180 | // (, line 28 181 | // unset Y_found, line 29 182 | this.B_Y_found = false; 183 | // do, line 30 184 | var v_1 = this.cursor; 185 | var lab0 = true; 186 | lab0: while (lab0 == true) 187 | { 188 | lab0 = false; 189 | // (, line 30 190 | // [, line 30 191 | this.bra = this.cursor; 192 | // literal, line 30 193 | if (!(this.eq_s("'"))) 194 | { 195 | break lab0; 196 | } 197 | // ], line 30 198 | this.ket = this.cursor; 199 | // delete, line 30 200 | if (!this.slice_del()) 201 | { 202 | return false; 203 | } 204 | } 205 | this.cursor = v_1; 206 | // do, line 31 207 | var v_2 = this.cursor; 208 | var lab1 = true; 209 | lab1: while (lab1 == true) 210 | { 211 | lab1 = false; 212 | // (, line 31 213 | // [, line 31 214 | this.bra = this.cursor; 215 | // literal, line 31 216 | if (!(this.eq_s("y"))) 217 | { 218 | break lab1; 219 | } 220 | // ], line 31 221 | this.ket = this.cursor; 222 | // <-, line 31 223 | if (!this.slice_from("Y")) 224 | { 225 | return false; 226 | } 227 | // set Y_found, line 31 228 | this.B_Y_found = true; 229 | } 230 | this.cursor = v_2; 231 | // do, line 32 232 | var v_3 = this.cursor; 233 | var lab2 = true; 234 | lab2: while (lab2 == true) 235 | { 236 | lab2 = false; 237 | // repeat, line 32 238 | replab3: while(true) 239 | { 240 | var v_4 = this.cursor; 241 | var lab4 = true; 242 | lab4: while (lab4 == true) 243 | { 244 | lab4 = false; 245 | // (, line 32 246 | // goto, line 32 247 | golab5: while(true) 248 | { 249 | var v_5 = this.cursor; 250 | var lab6 = true; 251 | lab6: while (lab6 == true) 252 | { 253 | lab6 = false; 254 | // (, line 32 255 | if (!(this.in_grouping(this.g_v, 97, 121))) 256 | { 257 | break lab6; 258 | } 259 | // [, line 32 260 | this.bra = this.cursor; 261 | // literal, line 32 262 | if (!(this.eq_s("y"))) 263 | { 264 | break lab6; 265 | } 266 | // ], line 32 267 | this.ket = this.cursor; 268 | this.cursor = v_5; 269 | break golab5; 270 | } 271 | this.cursor = v_5; 272 | if (this.cursor >= this.limit) 273 | { 274 | break lab4; 275 | } 276 | this.cursor++; 277 | } 278 | // <-, line 32 279 | if (!this.slice_from("Y")) 280 | { 281 | return false; 282 | } 283 | // set Y_found, line 32 284 | this.B_Y_found = true; 285 | continue replab3; 286 | } 287 | this.cursor = v_4; 288 | break replab3; 289 | } 290 | } 291 | this.cursor = v_3; 292 | return true; 293 | } 294 | 295 | r_mark_regions () 296 | { 297 | // (, line 35 298 | this.I_p1 = this.limit; 299 | this.I_p2 = this.limit; 300 | // do, line 38 301 | var v_1 = this.cursor; 302 | var lab0 = true; 303 | lab0: while (lab0 == true) 304 | { 305 | lab0 = false; 306 | // (, line 38 307 | // or, line 44 308 | var lab1 = true; 309 | lab1: while (lab1 == true) 310 | { 311 | lab1 = false; 312 | var v_2 = this.cursor; 313 | var lab2 = true; 314 | lab2: while (lab2 == true) 315 | { 316 | lab2 = false; 317 | // 
among, line 39 318 | if (this.find_among(this.a_0) == 0) 319 | { 320 | break lab2; 321 | } 322 | break lab1; 323 | } 324 | this.cursor = v_2; 325 | // (, line 44 326 | // gopast, line 44 327 | golab3: while(true) 328 | { 329 | var lab4 = true; 330 | lab4: while (lab4 == true) 331 | { 332 | lab4 = false; 333 | if (!(this.in_grouping(this.g_v, 97, 121))) 334 | { 335 | break lab4; 336 | } 337 | break golab3; 338 | } 339 | if (this.cursor >= this.limit) 340 | { 341 | break lab0; 342 | } 343 | this.cursor++; 344 | } 345 | // gopast, line 44 346 | golab5: while(true) 347 | { 348 | var lab6 = true; 349 | lab6: while (lab6 == true) 350 | { 351 | lab6 = false; 352 | if (!(this.out_grouping(this.g_v, 97, 121))) 353 | { 354 | break lab6; 355 | } 356 | break golab5; 357 | } 358 | if (this.cursor >= this.limit) 359 | { 360 | break lab0; 361 | } 362 | this.cursor++; 363 | } 364 | } 365 | // setmark p1, line 45 366 | this.I_p1 = this.cursor; 367 | // gopast, line 46 368 | golab7: while(true) 369 | { 370 | var lab8 = true; 371 | lab8: while (lab8 == true) 372 | { 373 | lab8 = false; 374 | if (!(this.in_grouping(this.g_v, 97, 121))) 375 | { 376 | break lab8; 377 | } 378 | break golab7; 379 | } 380 | if (this.cursor >= this.limit) 381 | { 382 | break lab0; 383 | } 384 | this.cursor++; 385 | } 386 | // gopast, line 46 387 | golab9: while(true) 388 | { 389 | var lab10 = true; 390 | lab10: while (lab10 == true) 391 | { 392 | lab10 = false; 393 | if (!(this.out_grouping(this.g_v, 97, 121))) 394 | { 395 | break lab10; 396 | } 397 | break golab9; 398 | } 399 | if (this.cursor >= this.limit) 400 | { 401 | break lab0; 402 | } 403 | this.cursor++; 404 | } 405 | // setmark p2, line 46 406 | this.I_p2 = this.cursor; 407 | } 408 | this.cursor = v_1; 409 | return true; 410 | } 411 | 412 | r_shortv () 413 | { 414 | // (, line 52 415 | // or, line 54 416 | var lab0 = true; 417 | lab0: while (lab0 == true) 418 | { 419 | lab0 = false; 420 | var v_1 = this.limit - this.cursor; 421 | var lab1 = true; 422 | lab1: while (lab1 == true) 423 | { 424 | lab1 = false; 425 | // (, line 53 426 | if (!(this.out_grouping_b(this.g_v_WXY, 89, 121))) 427 | { 428 | break lab1; 429 | } 430 | if (!(this.in_grouping_b(this.g_v, 97, 121))) 431 | { 432 | break lab1; 433 | } 434 | if (!(this.out_grouping_b(this.g_v, 97, 121))) 435 | { 436 | break lab1; 437 | } 438 | break lab0; 439 | } 440 | this.cursor = this.limit - v_1; 441 | // (, line 55 442 | if (!(this.out_grouping_b(this.g_v, 97, 121))) 443 | { 444 | return false; 445 | } 446 | if (!(this.in_grouping_b(this.g_v, 97, 121))) 447 | { 448 | return false; 449 | } 450 | // atlimit, line 55 451 | if (this.cursor > this.limit_backward) 452 | { 453 | return false; 454 | } 455 | } 456 | return true; 457 | } 458 | 459 | r_R1 () 460 | { 461 | if (!(this.I_p1 <= this.cursor)) 462 | { 463 | return false; 464 | } 465 | return true; 466 | } 467 | 468 | r_R2 () 469 | { 470 | if (!(this.I_p2 <= this.cursor)) 471 | { 472 | return false; 473 | } 474 | return true; 475 | } 476 | 477 | r_Step_1a () 478 | { 479 | var among_var; 480 | // (, line 61 481 | // try, line 62 482 | var v_1 = this.limit - this.cursor; 483 | var lab0 = true; 484 | lab0: while (lab0 == true) 485 | { 486 | lab0 = false; 487 | // (, line 62 488 | // [, line 63 489 | this.ket = this.cursor; 490 | // substring, line 63 491 | among_var = this.find_among_b(this.a_1); 492 | if (among_var == 0) 493 | { 494 | this.cursor = this.limit - v_1; 495 | break lab0; 496 | } 497 | // ], line 63 498 | this.bra = this.cursor; 499 | switch (among_var) { 500 | 
case 0: 501 | this.cursor = this.limit - v_1; 502 | break lab0; 503 | case 1: 504 | // (, line 65 505 | // delete, line 65 506 | if (!this.slice_del()) 507 | { 508 | return false; 509 | } 510 | break; 511 | } 512 | } 513 | // [, line 68 514 | this.ket = this.cursor; 515 | // substring, line 68 516 | among_var = this.find_among_b(this.a_2); 517 | if (among_var == 0) 518 | { 519 | return false; 520 | } 521 | // ], line 68 522 | this.bra = this.cursor; 523 | switch (among_var) { 524 | case 0: 525 | return false; 526 | case 1: 527 | // (, line 69 528 | // <-, line 69 529 | if (!this.slice_from("ss")) 530 | { 531 | return false; 532 | } 533 | break; 534 | case 2: 535 | // (, line 71 536 | // or, line 71 537 | var lab1 = true; 538 | lab1: while (lab1 == true) 539 | { 540 | lab1 = false; 541 | var v_2 = this.limit - this.cursor; 542 | var lab2 = true; 543 | lab2: while (lab2 == true) 544 | { 545 | lab2 = false; 546 | // (, line 71 547 | // hop, line 71 548 | { 549 | var c = this.cursor - 2; 550 | if (this.limit_backward > c || c > this.limit) 551 | { 552 | break lab2; 553 | } 554 | this.cursor = c; 555 | } 556 | // <-, line 71 557 | if (!this.slice_from("i")) 558 | { 559 | return false; 560 | } 561 | break lab1; 562 | } 563 | this.cursor = this.limit - v_2; 564 | // <-, line 71 565 | if (!this.slice_from("ie")) 566 | { 567 | return false; 568 | } 569 | } 570 | break; 571 | case 3: 572 | // (, line 72 573 | // next, line 72 574 | if (this.cursor <= this.limit_backward) 575 | { 576 | return false; 577 | } 578 | this.cursor--; 579 | // gopast, line 72 580 | golab3: while(true) 581 | { 582 | var lab4 = true; 583 | lab4: while (lab4 == true) 584 | { 585 | lab4 = false; 586 | if (!(this.in_grouping_b(this.g_v, 97, 121))) 587 | { 588 | break lab4; 589 | } 590 | break golab3; 591 | } 592 | if (this.cursor <= this.limit_backward) 593 | { 594 | return false; 595 | } 596 | this.cursor--; 597 | } 598 | // delete, line 72 599 | if (!this.slice_del()) 600 | { 601 | return false; 602 | } 603 | break; 604 | } 605 | return true; 606 | } 607 | 608 | r_Step_1b () 609 | { 610 | var among_var; 611 | // (, line 77 612 | // [, line 78 613 | this.ket = this.cursor; 614 | // substring, line 78 615 | among_var = this.find_among_b(this.a_4); 616 | if (among_var == 0) 617 | { 618 | return false; 619 | } 620 | // ], line 78 621 | this.bra = this.cursor; 622 | switch (among_var) { 623 | case 0: 624 | return false; 625 | case 1: 626 | // (, line 80 627 | // call R1, line 80 628 | if (!this.r_R1()) 629 | { 630 | return false; 631 | } 632 | // <-, line 80 633 | if (!this.slice_from("ee")) 634 | { 635 | return false; 636 | } 637 | break; 638 | case 2: 639 | // (, line 82 640 | // test, line 83 641 | var v_1 = this.limit - this.cursor; 642 | // gopast, line 83 643 | golab0: while(true) 644 | { 645 | var lab1 = true; 646 | lab1: while (lab1 == true) 647 | { 648 | lab1 = false; 649 | if (!(this.in_grouping_b(this.g_v, 97, 121))) 650 | { 651 | break lab1; 652 | } 653 | break golab0; 654 | } 655 | if (this.cursor <= this.limit_backward) 656 | { 657 | return false; 658 | } 659 | this.cursor--; 660 | } 661 | this.cursor = this.limit - v_1; 662 | // delete, line 83 663 | if (!this.slice_del()) 664 | { 665 | return false; 666 | } 667 | // test, line 84 668 | var v_3 = this.limit - this.cursor; 669 | // substring, line 84 670 | among_var = this.find_among_b(this.a_3); 671 | if (among_var == 0) 672 | { 673 | return false; 674 | } 675 | this.cursor = this.limit - v_3; 676 | switch (among_var) { 677 | case 0: 678 | return false; 679 | case 1: 
680 | // (, line 86 681 | // <+, line 86 682 | { 683 | var c = this.cursor; 684 | this.insert(this.cursor, this.cursor, "e"); 685 | this.cursor = c; 686 | } 687 | break; 688 | case 2: 689 | // (, line 89 690 | // [, line 89 691 | this.ket = this.cursor; 692 | // next, line 89 693 | if (this.cursor <= this.limit_backward) 694 | { 695 | return false; 696 | } 697 | this.cursor--; 698 | // ], line 89 699 | this.bra = this.cursor; 700 | // delete, line 89 701 | if (!this.slice_del()) 702 | { 703 | return false; 704 | } 705 | break; 706 | case 3: 707 | // (, line 90 708 | // atmark, line 90 709 | if (this.cursor != this.I_p1) 710 | { 711 | return false; 712 | } 713 | // test, line 90 714 | var v_4 = this.limit - this.cursor; 715 | // call shortv, line 90 716 | if (!this.r_shortv()) 717 | { 718 | return false; 719 | } 720 | this.cursor = this.limit - v_4; 721 | // <+, line 90 722 | { 723 | var c = this.cursor; 724 | this.insert(this.cursor, this.cursor, "e"); 725 | this.cursor = c; 726 | } 727 | break; 728 | } 729 | break; 730 | } 731 | return true; 732 | } 733 | 734 | r_Step_1c () 735 | { 736 | // (, line 96 737 | // [, line 97 738 | this.ket = this.cursor; 739 | // or, line 97 740 | var lab0 = true; 741 | lab0: while (lab0 == true) 742 | { 743 | lab0 = false; 744 | var v_1 = this.limit - this.cursor; 745 | var lab1 = true; 746 | lab1: while (lab1 == true) 747 | { 748 | lab1 = false; 749 | // literal, line 97 750 | if (!(this.eq_s_b("y"))) 751 | { 752 | break lab1; 753 | } 754 | break lab0; 755 | } 756 | this.cursor = this.limit - v_1; 757 | // literal, line 97 758 | if (!(this.eq_s_b("Y"))) 759 | { 760 | return false; 761 | } 762 | } 763 | // ], line 97 764 | this.bra = this.cursor; 765 | if (!(this.out_grouping_b(this.g_v, 97, 121))) 766 | { 767 | return false; 768 | } 769 | // not, line 98 770 | { 771 | var v_2 = this.limit - this.cursor; 772 | var lab2 = true; 773 | lab2: while (lab2 == true) 774 | { 775 | lab2 = false; 776 | // atlimit, line 98 777 | if (this.cursor > this.limit_backward) 778 | { 779 | break lab2; 780 | } 781 | return false; 782 | } 783 | this.cursor = this.limit - v_2; 784 | } 785 | // <-, line 99 786 | if (!this.slice_from("i")) 787 | { 788 | return false; 789 | } 790 | return true; 791 | } 792 | 793 | r_Step_2 () 794 | { 795 | var among_var; 796 | // (, line 102 797 | // [, line 103 798 | this.ket = this.cursor; 799 | // substring, line 103 800 | among_var = this.find_among_b(this.a_5); 801 | if (among_var == 0) 802 | { 803 | return false; 804 | } 805 | // ], line 103 806 | this.bra = this.cursor; 807 | // call R1, line 103 808 | if (!this.r_R1()) 809 | { 810 | return false; 811 | } 812 | switch (among_var) { 813 | case 0: 814 | return false; 815 | case 1: 816 | // (, line 104 817 | // <-, line 104 818 | if (!this.slice_from("tion")) 819 | { 820 | return false; 821 | } 822 | break; 823 | case 2: 824 | // (, line 105 825 | // <-, line 105 826 | if (!this.slice_from("ence")) 827 | { 828 | return false; 829 | } 830 | break; 831 | case 3: 832 | // (, line 106 833 | // <-, line 106 834 | if (!this.slice_from("ance")) 835 | { 836 | return false; 837 | } 838 | break; 839 | case 4: 840 | // (, line 107 841 | // <-, line 107 842 | if (!this.slice_from("able")) 843 | { 844 | return false; 845 | } 846 | break; 847 | case 5: 848 | // (, line 108 849 | // <-, line 108 850 | if (!this.slice_from("ent")) 851 | { 852 | return false; 853 | } 854 | break; 855 | case 6: 856 | // (, line 110 857 | // <-, line 110 858 | if (!this.slice_from("ize")) 859 | { 860 | return false; 861 | } 862 | 
break; 863 | case 7: 864 | // (, line 112 865 | // <-, line 112 866 | if (!this.slice_from("ate")) 867 | { 868 | return false; 869 | } 870 | break; 871 | case 8: 872 | // (, line 114 873 | // <-, line 114 874 | if (!this.slice_from("al")) 875 | { 876 | return false; 877 | } 878 | break; 879 | case 9: 880 | // (, line 115 881 | // <-, line 115 882 | if (!this.slice_from("ful")) 883 | { 884 | return false; 885 | } 886 | break; 887 | case 10: 888 | // (, line 117 889 | // <-, line 117 890 | if (!this.slice_from("ous")) 891 | { 892 | return false; 893 | } 894 | break; 895 | case 11: 896 | // (, line 119 897 | // <-, line 119 898 | if (!this.slice_from("ive")) 899 | { 900 | return false; 901 | } 902 | break; 903 | case 12: 904 | // (, line 121 905 | // <-, line 121 906 | if (!this.slice_from("ble")) 907 | { 908 | return false; 909 | } 910 | break; 911 | case 13: 912 | // (, line 122 913 | // literal, line 122 914 | if (!(this.eq_s_b("l"))) 915 | { 916 | return false; 917 | } 918 | // <-, line 122 919 | if (!this.slice_from("og")) 920 | { 921 | return false; 922 | } 923 | break; 924 | case 14: 925 | // (, line 123 926 | // <-, line 123 927 | if (!this.slice_from("ful")) 928 | { 929 | return false; 930 | } 931 | break; 932 | case 15: 933 | // (, line 124 934 | // <-, line 124 935 | if (!this.slice_from("less")) 936 | { 937 | return false; 938 | } 939 | break; 940 | case 16: 941 | // (, line 125 942 | if (!(this.in_grouping_b(this.g_valid_LI, 99, 116))) 943 | { 944 | return false; 945 | } 946 | // delete, line 125 947 | if (!this.slice_del()) 948 | { 949 | return false; 950 | } 951 | break; 952 | } 953 | return true; 954 | } 955 | 956 | r_Step_3 () 957 | { 958 | var among_var; 959 | // (, line 129 960 | // [, line 130 961 | this.ket = this.cursor; 962 | // substring, line 130 963 | among_var = this.find_among_b(this.a_6); 964 | if (among_var == 0) 965 | { 966 | return false; 967 | } 968 | // ], line 130 969 | this.bra = this.cursor; 970 | // call R1, line 130 971 | if (!this.r_R1()) 972 | { 973 | return false; 974 | } 975 | switch (among_var) { 976 | case 0: 977 | return false; 978 | case 1: 979 | // (, line 131 980 | // <-, line 131 981 | if (!this.slice_from("tion")) 982 | { 983 | return false; 984 | } 985 | break; 986 | case 2: 987 | // (, line 132 988 | // <-, line 132 989 | if (!this.slice_from("ate")) 990 | { 991 | return false; 992 | } 993 | break; 994 | case 3: 995 | // (, line 133 996 | // <-, line 133 997 | if (!this.slice_from("al")) 998 | { 999 | return false; 1000 | } 1001 | break; 1002 | case 4: 1003 | // (, line 135 1004 | // <-, line 135 1005 | if (!this.slice_from("ic")) 1006 | { 1007 | return false; 1008 | } 1009 | break; 1010 | case 5: 1011 | // (, line 137 1012 | // delete, line 137 1013 | if (!this.slice_del()) 1014 | { 1015 | return false; 1016 | } 1017 | break; 1018 | case 6: 1019 | // (, line 139 1020 | // call R2, line 139 1021 | if (!this.r_R2()) 1022 | { 1023 | return false; 1024 | } 1025 | // delete, line 139 1026 | if (!this.slice_del()) 1027 | { 1028 | return false; 1029 | } 1030 | break; 1031 | } 1032 | return true; 1033 | } 1034 | 1035 | r_Step_4 () 1036 | { 1037 | var among_var; 1038 | // (, line 143 1039 | // [, line 144 1040 | this.ket = this.cursor; 1041 | // substring, line 144 1042 | among_var = this.find_among_b(this.a_7); 1043 | if (among_var == 0) 1044 | { 1045 | return false; 1046 | } 1047 | // ], line 144 1048 | this.bra = this.cursor; 1049 | // call R2, line 144 1050 | if (!this.r_R2()) 1051 | { 1052 | return false; 1053 | } 1054 | switch (among_var) { 
1055 | case 0: 1056 | return false; 1057 | case 1: 1058 | // (, line 147 1059 | // delete, line 147 1060 | if (!this.slice_del()) 1061 | { 1062 | return false; 1063 | } 1064 | break; 1065 | case 2: 1066 | // (, line 148 1067 | // or, line 148 1068 | var lab0 = true; 1069 | lab0: while (lab0 == true) 1070 | { 1071 | lab0 = false; 1072 | var v_1 = this.limit - this.cursor; 1073 | var lab1 = true; 1074 | lab1: while (lab1 == true) 1075 | { 1076 | lab1 = false; 1077 | // literal, line 148 1078 | if (!(this.eq_s_b("s"))) 1079 | { 1080 | break lab1; 1081 | } 1082 | break lab0; 1083 | } 1084 | this.cursor = this.limit - v_1; 1085 | // literal, line 148 1086 | if (!(this.eq_s_b("t"))) 1087 | { 1088 | return false; 1089 | } 1090 | } 1091 | // delete, line 148 1092 | if (!this.slice_del()) 1093 | { 1094 | return false; 1095 | } 1096 | break; 1097 | } 1098 | return true; 1099 | } 1100 | 1101 | r_Step_5 () 1102 | { 1103 | var among_var; 1104 | // (, line 152 1105 | // [, line 153 1106 | this.ket = this.cursor; 1107 | // substring, line 153 1108 | among_var = this.find_among_b(this.a_8); 1109 | if (among_var == 0) 1110 | { 1111 | return false; 1112 | } 1113 | // ], line 153 1114 | this.bra = this.cursor; 1115 | switch (among_var) { 1116 | case 0: 1117 | return false; 1118 | case 1: 1119 | // (, line 154 1120 | // or, line 154 1121 | var lab0 = true; 1122 | lab0: while (lab0 == true) 1123 | { 1124 | lab0 = false; 1125 | var v_1 = this.limit - this.cursor; 1126 | var lab1 = true; 1127 | lab1: while (lab1 == true) 1128 | { 1129 | lab1 = false; 1130 | // call R2, line 154 1131 | if (!this.r_R2()) 1132 | { 1133 | break lab1; 1134 | } 1135 | break lab0; 1136 | } 1137 | this.cursor = this.limit - v_1; 1138 | // (, line 154 1139 | // call R1, line 154 1140 | if (!this.r_R1()) 1141 | { 1142 | return false; 1143 | } 1144 | // not, line 154 1145 | { 1146 | var v_2 = this.limit - this.cursor; 1147 | var lab2 = true; 1148 | lab2: while (lab2 == true) 1149 | { 1150 | lab2 = false; 1151 | // call shortv, line 154 1152 | if (!this.r_shortv()) 1153 | { 1154 | break lab2; 1155 | } 1156 | return false; 1157 | } 1158 | this.cursor = this.limit - v_2; 1159 | } 1160 | } 1161 | // delete, line 154 1162 | if (!this.slice_del()) 1163 | { 1164 | return false; 1165 | } 1166 | break; 1167 | case 2: 1168 | // (, line 155 1169 | // call R2, line 155 1170 | if (!this.r_R2()) 1171 | { 1172 | return false; 1173 | } 1174 | // literal, line 155 1175 | if (!(this.eq_s_b("l"))) 1176 | { 1177 | return false; 1178 | } 1179 | // delete, line 155 1180 | if (!this.slice_del()) 1181 | { 1182 | return false; 1183 | } 1184 | break; 1185 | } 1186 | return true; 1187 | } 1188 | 1189 | r_exception2 () 1190 | { 1191 | // (, line 159 1192 | // [, line 161 1193 | this.ket = this.cursor; 1194 | // substring, line 161 1195 | if (this.find_among_b(this.a_9) == 0) 1196 | { 1197 | return false; 1198 | } 1199 | // ], line 161 1200 | this.bra = this.cursor; 1201 | // atlimit, line 161 1202 | if (this.cursor > this.limit_backward) 1203 | { 1204 | return false; 1205 | } 1206 | return true; 1207 | } 1208 | 1209 | r_exception1 () 1210 | { 1211 | var among_var; 1212 | // (, line 171 1213 | // [, line 173 1214 | this.bra = this.cursor; 1215 | // substring, line 173 1216 | among_var = this.find_among(this.a_10); 1217 | if (among_var == 0) 1218 | { 1219 | return false; 1220 | } 1221 | // ], line 173 1222 | this.ket = this.cursor; 1223 | // atlimit, line 173 1224 | if (this.cursor < this.limit) 1225 | { 1226 | return false; 1227 | } 1228 | switch (among_var) { 1229 | 
case 0: 1230 | return false; 1231 | case 1: 1232 | // (, line 177 1233 | // <-, line 177 1234 | if (!this.slice_from("ski")) 1235 | { 1236 | return false; 1237 | } 1238 | break; 1239 | case 2: 1240 | // (, line 178 1241 | // <-, line 178 1242 | if (!this.slice_from("sky")) 1243 | { 1244 | return false; 1245 | } 1246 | break; 1247 | case 3: 1248 | // (, line 179 1249 | // <-, line 179 1250 | if (!this.slice_from("die")) 1251 | { 1252 | return false; 1253 | } 1254 | break; 1255 | case 4: 1256 | // (, line 180 1257 | // <-, line 180 1258 | if (!this.slice_from("lie")) 1259 | { 1260 | return false; 1261 | } 1262 | break; 1263 | case 5: 1264 | // (, line 181 1265 | // <-, line 181 1266 | if (!this.slice_from("tie")) 1267 | { 1268 | return false; 1269 | } 1270 | break; 1271 | case 6: 1272 | // (, line 182 1273 | // <-, line 182 1274 | if (!this.slice_from("replic")) 1275 | { 1276 | return false; 1277 | } 1278 | break; 1279 | case 7: 1280 | // (, line 183 1281 | // <-, line 183 1282 | if (!this.slice_from("retriabl")) 1283 | { 1284 | return false; 1285 | } 1286 | break; 1287 | case 8: 1288 | // (, line 184 1289 | // <-, line 184 1290 | if (!this.slice_from("important")) 1291 | { 1292 | return false; 1293 | } 1294 | break; 1295 | case 9: 1296 | // (, line 188 1297 | // <-, line 188 1298 | if (!this.slice_from("idl")) 1299 | { 1300 | return false; 1301 | } 1302 | break; 1303 | case 10: 1304 | // (, line 189 1305 | // <-, line 189 1306 | if (!this.slice_from("gentl")) 1307 | { 1308 | return false; 1309 | } 1310 | break; 1311 | case 11: 1312 | // (, line 190 1313 | // <-, line 190 1314 | if (!this.slice_from("ugli")) 1315 | { 1316 | return false; 1317 | } 1318 | break; 1319 | case 12: 1320 | // (, line 191 1321 | // <-, line 191 1322 | if (!this.slice_from("earli")) 1323 | { 1324 | return false; 1325 | } 1326 | break; 1327 | case 13: 1328 | // (, line 192 1329 | // <-, line 192 1330 | if (!this.slice_from("onli")) 1331 | { 1332 | return false; 1333 | } 1334 | break; 1335 | case 14: 1336 | // (, line 193 1337 | // <-, line 193 1338 | if (!this.slice_from("singl")) 1339 | { 1340 | return false; 1341 | } 1342 | break; 1343 | } 1344 | return true; 1345 | } 1346 | 1347 | r_postlude () 1348 | { 1349 | // (, line 210 1350 | // Boolean test Y_found, line 210 1351 | if (!this.B_Y_found) 1352 | { 1353 | return false; 1354 | } 1355 | // repeat, line 210 1356 | replab0: while(true) 1357 | { 1358 | var v_1 = this.cursor; 1359 | var lab1 = true; 1360 | lab1: while (lab1 == true) 1361 | { 1362 | lab1 = false; 1363 | // (, line 210 1364 | // goto, line 210 1365 | golab2: while(true) 1366 | { 1367 | var v_2 = this.cursor; 1368 | var lab3 = true; 1369 | lab3: while (lab3 == true) 1370 | { 1371 | lab3 = false; 1372 | // (, line 210 1373 | // [, line 210 1374 | this.bra = this.cursor; 1375 | // literal, line 210 1376 | if (!(this.eq_s("Y"))) 1377 | { 1378 | break lab3; 1379 | } 1380 | // ], line 210 1381 | this.ket = this.cursor; 1382 | this.cursor = v_2; 1383 | break golab2; 1384 | } 1385 | this.cursor = v_2; 1386 | if (this.cursor >= this.limit) 1387 | { 1388 | break lab1; 1389 | } 1390 | this.cursor++; 1391 | } 1392 | // <-, line 210 1393 | if (!this.slice_from("y")) 1394 | { 1395 | return false; 1396 | } 1397 | continue replab0; 1398 | } 1399 | this.cursor = v_1; 1400 | break replab0; 1401 | } 1402 | return true; 1403 | } 1404 | 1405 | stem () 1406 | { 1407 | // (, line 212 1408 | // or, line 214 1409 | var lab0 = true; 1410 | lab0: while (lab0 == true) 1411 | { 1412 | lab0 = false; 1413 | var v_1 = this.cursor; 
1414 | var lab1 = true; 1415 | lab1: while (lab1 == true) 1416 | { 1417 | lab1 = false; 1418 | // call exception1, line 214 1419 | if (!this.r_exception1()) 1420 | { 1421 | break lab1; 1422 | } 1423 | break lab0; 1424 | } 1425 | this.cursor = v_1; 1426 | var lab2 = true; 1427 | lab2: while (lab2 == true) 1428 | { 1429 | lab2 = false; 1430 | // not, line 215 1431 | { 1432 | var v_2 = this.cursor; 1433 | var lab3 = true; 1434 | lab3: while (lab3 == true) 1435 | { 1436 | lab3 = false; 1437 | // hop, line 215 1438 | { 1439 | var c = this.cursor + 3; 1440 | if (0 > c || c > this.limit) 1441 | { 1442 | break lab3; 1443 | } 1444 | this.cursor = c; 1445 | } 1446 | break lab2; 1447 | } 1448 | this.cursor = v_2; 1449 | } 1450 | break lab0; 1451 | } 1452 | this.cursor = v_1; 1453 | // (, line 215 1454 | // do, line 216 1455 | var v_3 = this.cursor; 1456 | var lab4 = true; 1457 | lab4: while (lab4 == true) 1458 | { 1459 | lab4 = false; 1460 | // call prelude, line 216 1461 | if (!this.r_prelude()) 1462 | { 1463 | break lab4; 1464 | } 1465 | } 1466 | this.cursor = v_3; 1467 | // do, line 217 1468 | var v_4 = this.cursor; 1469 | var lab5 = true; 1470 | lab5: while (lab5 == true) 1471 | { 1472 | lab5 = false; 1473 | // call mark_regions, line 217 1474 | if (!this.r_mark_regions()) 1475 | { 1476 | break lab5; 1477 | } 1478 | } 1479 | this.cursor = v_4; 1480 | // backwards, line 218 1481 | this.limit_backward = this.cursor; this.cursor = this.limit; 1482 | // (, line 218 1483 | // do, line 220 1484 | var v_5 = this.limit - this.cursor; 1485 | var lab6 = true; 1486 | lab6: while (lab6 == true) 1487 | { 1488 | lab6 = false; 1489 | // call Step_1a, line 220 1490 | if (!this.r_Step_1a()) 1491 | { 1492 | break lab6; 1493 | } 1494 | } 1495 | this.cursor = this.limit - v_5; 1496 | // or, line 222 1497 | var lab7 = true; 1498 | lab7: while (lab7 == true) 1499 | { 1500 | lab7 = false; 1501 | var v_6 = this.limit - this.cursor; 1502 | var lab8 = true; 1503 | lab8: while (lab8 == true) 1504 | { 1505 | lab8 = false; 1506 | // call exception2, line 222 1507 | if (!this.r_exception2()) 1508 | { 1509 | break lab8; 1510 | } 1511 | break lab7; 1512 | } 1513 | this.cursor = this.limit - v_6; 1514 | // (, line 222 1515 | // do, line 224 1516 | var v_7 = this.limit - this.cursor; 1517 | var lab9 = true; 1518 | lab9: while (lab9 == true) 1519 | { 1520 | lab9 = false; 1521 | // call Step_1b, line 224 1522 | if (!this.r_Step_1b()) 1523 | { 1524 | break lab9; 1525 | } 1526 | } 1527 | this.cursor = this.limit - v_7; 1528 | // do, line 225 1529 | var v_8 = this.limit - this.cursor; 1530 | var lab10 = true; 1531 | lab10: while (lab10 == true) 1532 | { 1533 | lab10 = false; 1534 | // call Step_1c, line 225 1535 | if (!this.r_Step_1c()) 1536 | { 1537 | break lab10; 1538 | } 1539 | } 1540 | this.cursor = this.limit - v_8; 1541 | // do, line 227 1542 | var v_9 = this.limit - this.cursor; 1543 | var lab11 = true; 1544 | lab11: while (lab11 == true) 1545 | { 1546 | lab11 = false; 1547 | // call Step_2, line 227 1548 | if (!this.r_Step_2()) 1549 | { 1550 | break lab11; 1551 | } 1552 | } 1553 | this.cursor = this.limit - v_9; 1554 | // do, line 228 1555 | var v_10 = this.limit - this.cursor; 1556 | var lab12 = true; 1557 | lab12: while (lab12 == true) 1558 | { 1559 | lab12 = false; 1560 | // call Step_3, line 228 1561 | if (!this.r_Step_3()) 1562 | { 1563 | break lab12; 1564 | } 1565 | } 1566 | this.cursor = this.limit - v_10; 1567 | // do, line 229 1568 | var v_11 = this.limit - this.cursor; 1569 | var lab13 = true; 1570 | lab13: while 
(lab13 == true) 1571 | { 1572 | lab13 = false; 1573 | // call Step_4, line 229 1574 | if (!this.r_Step_4()) 1575 | { 1576 | break lab13; 1577 | } 1578 | } 1579 | this.cursor = this.limit - v_11; 1580 | // do, line 231 1581 | var v_12 = this.limit - this.cursor; 1582 | var lab14 = true; 1583 | lab14: while (lab14 == true) 1584 | { 1585 | lab14 = false; 1586 | // call Step_5, line 231 1587 | if (!this.r_Step_5()) 1588 | { 1589 | break lab14; 1590 | } 1591 | } 1592 | this.cursor = this.limit - v_12; 1593 | } 1594 | this.cursor = this.limit_backward; 1595 | // do, line 234 1596 | var v_13 = this.cursor; 1597 | var lab15 = true; 1598 | lab15: while (lab15 == true) 1599 | { 1600 | lab15 = false; 1601 | // call postlude, line 234 1602 | if (!this.r_postlude()) 1603 | { 1604 | break lab15; 1605 | } 1606 | } 1607 | this.cursor = v_13; 1608 | } 1609 | return true; 1610 | } 1611 | 1612 | setCurrent (value) 1613 | { 1614 | this.current = value; 1615 | this.cursor = 0; 1616 | this.limit = this.current.length; 1617 | this.limit_backward = 0; 1618 | this.bra = this.cursor; 1619 | this.ket = this.limit; 1620 | } 1621 | 1622 | /** 1623 | * Get the this.current string. 1624 | */ 1625 | getCurrent () 1626 | { 1627 | return this.current; 1628 | } 1629 | 1630 | 1631 | copy_from (other) 1632 | { 1633 | this.current = other.current; 1634 | this.cursor = other.cursor; 1635 | this.limit = other.limit; 1636 | this.limit_backward = other.limit_backward; 1637 | this.bra = other.bra; 1638 | this.ket = other.ket; 1639 | } 1640 | 1641 | in_grouping (s, min, max) 1642 | { 1643 | if (this.cursor >= this.limit) return false; 1644 | var ch = this.current.charCodeAt(this.cursor); 1645 | if (ch > max || ch < min) return false; 1646 | ch -= min; 1647 | if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) return false; 1648 | this.cursor++; 1649 | return true; 1650 | } 1651 | 1652 | in_grouping_b (s, min, max) 1653 | { 1654 | if (this.cursor <= this.limit_backward) return false; 1655 | var ch = this.current.charCodeAt(this.cursor - 1); 1656 | if (ch > max || ch < min) return false; 1657 | ch -= min; 1658 | if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) return false; 1659 | this.cursor--; 1660 | return true; 1661 | } 1662 | 1663 | out_grouping (s, min, max) 1664 | { 1665 | if (this.cursor >= this.limit) return false; 1666 | var ch = this.current.charCodeAt(this.cursor); 1667 | if (ch > max || ch < min) { 1668 | this.cursor++; 1669 | return true; 1670 | } 1671 | ch -= min; 1672 | if ((s[ch >>> 3] & (0X1 << (ch & 0x7))) == 0) { 1673 | this.cursor++; 1674 | return true; 1675 | } 1676 | return false; 1677 | } 1678 | 1679 | out_grouping_b (s, min, max) 1680 | { 1681 | if (this.cursor <= this.limit_backward) return false; 1682 | var ch = this.current.charCodeAt(this.cursor - 1); 1683 | if (ch > max || ch < min) { 1684 | this.cursor--; 1685 | return true; 1686 | } 1687 | ch -= min; 1688 | if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) { 1689 | this.cursor--; 1690 | return true; 1691 | } 1692 | return false; 1693 | } 1694 | 1695 | eq_s (s) 1696 | { 1697 | if (this.limit - this.cursor < s.length) return false; 1698 | if (this.current.slice(this.cursor, this.cursor + s.length) != s) 1699 | { 1700 | return false; 1701 | } 1702 | this.cursor += s.length; 1703 | return true; 1704 | } 1705 | 1706 | eq_s_b (s) 1707 | { 1708 | if (this.cursor - this.limit_backward < s.length) return false; 1709 | if (this.current.slice(this.cursor - s.length, this.cursor) != s) 1710 | { 1711 | return false; 1712 | } 1713 | this.cursor -= s.length; 1714 | return true; 
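// Both eq_s and eq_s_b move the cursor only on a successful match:
// eq_s scans forward from the cursor, eq_s_b scans backward from it.
// An assumed usage sketch, mirroring how the generated rules call them:
//
//     const s = new Porter2()
//     s.setCurrent('flying')   // cursor = 0, limit = 6
//     s.eq_s('fly')            // true; cursor advances to 3
//     s.cursor = s.limit
//     s.eq_s_b('ing')          // true; cursor retreats back to 3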
1715 | } 1716 | 1717 | find_among (v ) 1718 | { 1719 | var i = 0; 1720 | var j = v.length; 1721 | 1722 | var c = this.cursor; 1723 | var l = this.limit; 1724 | 1725 | var common_i = 0; 1726 | var common_j = 0; 1727 | 1728 | var first_key_inspected = false; 1729 | 1730 | while (true) 1731 | { 1732 | var k = i + ((j - i) >>> 1); 1733 | var diff = 0; 1734 | var common = common_i < common_j ? common_i : common_j; // smaller 1735 | var w = v[k]; 1736 | var i2; 1737 | for (i2 = common; i2 < w.s.length; i2++) 1738 | { 1739 | if (c + common == l) 1740 | { 1741 | diff = -1; 1742 | break; 1743 | } 1744 | diff = this.current.charCodeAt(c + common) - w.s.charCodeAt(i2); 1745 | if (diff != 0) break; 1746 | common++; 1747 | } 1748 | if (diff < 0) 1749 | { 1750 | j = k; 1751 | common_j = common; 1752 | } 1753 | else 1754 | { 1755 | i = k; 1756 | common_i = common; 1757 | } 1758 | if (j - i <= 1) 1759 | { 1760 | if (i > 0) break; // v->s has been inspected 1761 | if (j == i) break; // only one item in v 1762 | 1763 | // - but now we need to go round once more to get 1764 | // v->s inspected. This looks messy, but is actually 1765 | // the optimal approach. 1766 | 1767 | if (first_key_inspected) break; 1768 | first_key_inspected = true; 1769 | } 1770 | } 1771 | while (true) 1772 | { 1773 | var w = v[i]; 1774 | if (common_i >= w.s.length) 1775 | { 1776 | this.cursor = c + w.s.length; 1777 | if (w.method == null) 1778 | { 1779 | return w.result; 1780 | } 1781 | var res = w.method(this); 1782 | this.cursor = c + w.s.length; 1783 | if (res) 1784 | { 1785 | return w.result; 1786 | } 1787 | } 1788 | i = w.substring_i; 1789 | if (i < 0) return 0; 1790 | } 1791 | return -1; // not reachable 1792 | } 1793 | 1794 | // find_among_b is for backwards processing. Same comments apply 1795 | find_among_b (v ) 1796 | { 1797 | var i = 0; 1798 | var j = v.length; 1799 | 1800 | var c = this.cursor; 1801 | var lb = this.limit_backward; 1802 | 1803 | var common_i = 0; 1804 | var common_j = 0; 1805 | 1806 | var first_key_inspected = false; 1807 | 1808 | while (true) 1809 | { 1810 | var k = i + ((j - i) >> 1); 1811 | var diff = 0; 1812 | var common = common_i < common_j ? common_i : common_j; 1813 | var w = v[k]; 1814 | var i2; 1815 | for (i2 = w.s.length - 1 - common; i2 >= 0; i2--) 1816 | { 1817 | if (c - common == lb) 1818 | { 1819 | diff = -1; 1820 | break; 1821 | } 1822 | diff = this.current.charCodeAt(c - 1 - common) - w.s.charCodeAt(i2); 1823 | if (diff != 0) break; 1824 | common++; 1825 | } 1826 | if (diff < 0) 1827 | { 1828 | j = k; 1829 | common_j = common; 1830 | } 1831 | else 1832 | { 1833 | i = k; 1834 | common_i = common; 1835 | } 1836 | if (j - i <= 1) 1837 | { 1838 | if (i > 0) break; 1839 | if (j == i) break; 1840 | if (first_key_inspected) break; 1841 | first_key_inspected = true; 1842 | } 1843 | } 1844 | while (true) 1845 | { 1846 | var w = v[i]; 1847 | if (common_i >= w.s.length) 1848 | { 1849 | this.cursor = c - w.s.length; 1850 | if (w.method == null) return w.result; 1851 | var res = w.method(this); 1852 | this.cursor = c - w.s.length; 1853 | if (res) return w.result; 1854 | } 1855 | i = w.substring_i; 1856 | if (i < 0) return 0; 1857 | } 1858 | return -1; // not reachable 1859 | } 1860 | 1861 | /* to replace chars between c_bra and c_ket in this.current by the 1862 | * chars in s. 
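 * For example, with this.current === 'skies', replace_s(2, 5, 'y')
 * produces 'sky' and returns the length adjustment -2 (new length minus
 * old length), shifting any cursor at or beyond c_ket left accordingly.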
1863 | */ 1864 | replace_s (c_bra, c_ket, s) 1865 | { 1866 | var adjustment = s.length - (c_ket - c_bra); 1867 | this.current = this.current.slice(0, c_bra) + s + this.current.slice(c_ket); 1868 | this.limit += adjustment; 1869 | if (this.cursor >= c_ket) this.cursor += adjustment; 1870 | else if (this.cursor > c_bra) this.cursor = c_bra; 1871 | return adjustment; 1872 | } 1873 | 1874 | slice_check () 1875 | { 1876 | if (this.bra < 0 || 1877 | this.bra > this.ket || 1878 | this.ket > this.limit || 1879 | this.limit > this.current.length) 1880 | { 1881 | return false; 1882 | } 1883 | return true; 1884 | } 1885 | 1886 | slice_from (s) 1887 | { 1888 | var result = false; 1889 | if (this.slice_check()) 1890 | { 1891 | this.replace_s(this.bra, this.ket, s); 1892 | result = true; 1893 | } 1894 | return result; 1895 | } 1896 | 1897 | slice_del () 1898 | { 1899 | return this.slice_from(""); 1900 | } 1901 | 1902 | insert (c_bra, c_ket, s) 1903 | { 1904 | var adjustment = this.replace_s(c_bra, c_ket, s); 1905 | if (c_bra <= this.bra) this.bra += adjustment; 1906 | if (c_bra <= this.ket) this.ket += adjustment; 1907 | } 1908 | 1909 | /* Copy the slice into the supplied StringBuffer */ 1910 | slice_to () 1911 | { 1912 | var result = ''; 1913 | if (this.slice_check()) 1914 | { 1915 | result = this.current.slice(this.bra, this.ket); 1916 | } 1917 | return result; 1918 | } 1919 | 1920 | assign_to () 1921 | { 1922 | return this.current.slice(0, this.limit); 1923 | } 1924 | 1925 | stemWord (word) 1926 | { 1927 | this.setCurrent(word); 1928 | this.stem(); 1929 | return this.getCurrent(); 1930 | } 1931 | } 1932 | 1933 | exports.Porter2 = Porter2 1934 | -------------------------------------------------------------------------------- /src/fts/Porter2.snowball: -------------------------------------------------------------------------------- 1 | // This is derived from http://snowball.tartarus.org/algorithms/english/stemmer.html 2 | // But with additional irregular and invariant forms. 3 | 4 | integers ( p1 p2 ) 5 | booleans ( Y_found ) 6 | 7 | routines ( 8 | prelude postlude 9 | mark_regions 10 | shortv 11 | R1 R2 12 | Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5 13 | exception1 14 | exception2 15 | ) 16 | 17 | externals ( stem ) 18 | 19 | groupings ( v v_WXY valid_LI ) 20 | 21 | stringescapes {} 22 | 23 | define v 'aeiouy' 24 | define v_WXY v + 'wxY' 25 | 26 | define valid_LI 'cdeghkmnrt' 27 | 28 | define prelude as ( 29 | unset Y_found 30 | do ( ['{'}'] delete) 31 | do ( ['y'] <-'Y' set Y_found) 32 | do repeat(goto (v ['y']) <-'Y' set Y_found) 33 | ) 34 | 35 | define mark_regions as ( 36 | $p1 = limit 37 | $p2 = limit 38 | do( 39 | among ( 40 | 'gener' 41 | 'commun' // added May 2005 42 | 'arsen' // added Nov 2006 (arsenic/arsenal) 43 | // ... extensions possible here ... 
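        // when one of these prefixes matches, p1 is set immediately after it:
        // for 'generalizations', for instance, R1 is 'alizations' and R2 'izations'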
44 | ) or (gopast v gopast non-v) 45 | setmark p1 46 | gopast v gopast non-v setmark p2 47 | ) 48 | ) 49 | 50 | backwardmode ( 51 | 52 | define shortv as ( 53 | ( non-v_WXY v non-v ) 54 | or 55 | ( non-v v atlimit ) 56 | ) 57 | 58 | define R1 as $p1 <= cursor 59 | define R2 as $p2 <= cursor 60 | 61 | define Step_1a as ( 62 | try ( 63 | [substring] among ( 64 | '{'}' '{'}s' '{'}s{'}' 65 | (delete) 66 | ) 67 | ) 68 | [substring] among ( 69 | 'sses' (<-'ss') 70 | 'ied' 'ies' 71 | ((hop 2 <-'i') or <-'ie') 72 | 's' (next gopast v delete) 73 | 'us' 'ss' 74 | ) 75 | ) 76 | 77 | define Step_1b as ( 78 | [substring] among ( 79 | 'eed' 'eedly' 80 | (R1 <-'ee') 81 | 'ed' 'edly' 'ing' 'ingly' 82 | ( 83 | test gopast v delete 84 | test substring among( 85 | 'at' 'bl' 'iz' 86 | (<+ 'e') 87 | 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt' 88 | // ignoring double c, h, j, k, q, v, w, and x 89 | ([next] delete) 90 | '' (atmark p1 test shortv <+ 'e') 91 | ) 92 | ) 93 | ) 94 | ) 95 | 96 | define Step_1c as ( 97 | ['y' or 'Y'] 98 | non-v not atlimit 99 | <-'i' 100 | ) 101 | 102 | define Step_2 as ( 103 | [substring] R1 among ( 104 | 'tional' (<-'tion') 105 | 'enci' (<-'ence') 106 | 'anci' (<-'ance') 107 | 'abli' (<-'able') 108 | 'entli' (<-'ent') 109 | 'izer' 'ization' 110 | (<-'ize') 111 | 'ational' 'ation' 'ator' 112 | (<-'ate') 113 | 'alism' 'aliti' 'alli' 114 | (<-'al') 115 | 'fulness' (<-'ful') 116 | 'ousli' 'ousness' 117 | (<-'ous') 118 | 'iveness' 'iviti' 119 | (<-'ive') 120 | 'biliti' 'bli' 121 | (<-'ble') 122 | 'ogi' ('l' <-'og') 123 | 'fulli' (<-'ful') 124 | 'lessli' (<-'less') 125 | 'li' (valid_LI delete) 126 | ) 127 | ) 128 | 129 | define Step_3 as ( 130 | [substring] R1 among ( 131 | 'tional' (<- 'tion') 132 | 'ational' (<- 'ate') 133 | 'alize' (<-'al') 134 | 'icate' 'iciti' 'ical' 135 | (<-'ic') 136 | 'ful' 'ness' 137 | (delete) 138 | 'ative' 139 | (R2 delete) // 'R2' added Dec 2001 140 | ) 141 | ) 142 | 143 | define Step_4 as ( 144 | [substring] R2 among ( 145 | 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement' 146 | 'ment' 'ent' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize' 147 | (delete) 148 | 'ion' ('s' or 't' delete) 149 | ) 150 | ) 151 | 152 | define Step_5 as ( 153 | [substring] among ( 154 | 'e' (R2 or (R1 not shortv) delete) 155 | 'l' (R2 'l' delete) 156 | ) 157 | ) 158 | 159 | define exception2 as ( 160 | 161 | [substring] atlimit among( 162 | 'inning' 'outing' 'canning' 'herring' 'earring' 163 | 'proceed' 'exceed' 'succeed' 164 | 165 | // ... extensions possible here ... 166 | 167 | ) 168 | ) 169 | ) 170 | 171 | define exception1 as ( 172 | 173 | [substring] atlimit among( 174 | 175 | /* special changes: */ 176 | 177 | 'skis' (<-'ski') 178 | 'skies' (<-'sky') 179 | 'dying' (<-'die') 180 | 'lying' (<-'lie') 181 | 'tying' (<-'tie') 182 | 'replica' (<-'replic') 183 | 'retryable' (<- 'retriabl') 184 | 'importance' (<-'important') 185 | 186 | /* special -LY cases */ 187 | 188 | 'idly' (<-'idl') 189 | 'gently' (<-'gentl') 190 | 'ugly' (<-'ugli') 191 | 'early' (<-'earli') 192 | 'only' (<-'onli') 193 | 'singly' (<-'singl') 194 | 195 | // ... extensions possible here ... 196 | 197 | /* invariant forms: */ 198 | 199 | 'sky' 200 | 'news' 201 | 'howe' 202 | 'important' 203 | 204 | 'atlas' 'cosmos' 'bias' 'andes' // not plural forms 205 | 206 | // ... extensions possible here ... 
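    // entries with no action here are invariant: 'news', for example, would
    // otherwise lose its final 's' to the 's' rule in Step_1a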
207 | ) 208 | ) 209 | 210 | define postlude as (Y_found repeat(goto (['Y']) <-'y')) 211 | 212 | define stem as ( 213 | 214 | exception1 or 215 | not hop 3 or ( 216 | do prelude 217 | do mark_regions 218 | backwards ( 219 | 220 | do Step_1a 221 | 222 | exception2 or ( 223 | 224 | do Step_1b 225 | do Step_1c 226 | 227 | do Step_2 228 | do Step_3 229 | do Step_4 230 | 231 | do Step_5 232 | ) 233 | ) 234 | do postlude 235 | ) 236 | ) 237 | -------------------------------------------------------------------------------- /src/fts/Query.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | const {isStopWord, tokenize, stem} = require('./Stemmer.js') 3 | 4 | /** 5 | * Return true if there is a configuration of numbers in the tree that 6 | * appear in sequential order. 7 | * @param {Array<Array<number>>} tree The tree 8 | * @param {number|undefined} lastCandidate Recursive state. 9 | * @return {boolean} True if there is a configuration of numbers in the 10 | * tree that appear in sequential order. 11 | */ 12 | function haveContiguousPath(tree, lastCandidate) { 13 | if (tree.length === 0) { 14 | return true 15 | } 16 | 17 | for (const element of tree[0]) { 18 | if (lastCandidate === undefined || element === lastCandidate + 1) { 19 | if (haveContiguousPath(tree.slice(1), element)) { 20 | return true 21 | } 22 | } 23 | } 24 | 25 | return false 26 | } 27 | 28 | /** 29 | * Check if the given phraseComponents appear in contiguous positions 30 | * within the keywords map. 31 | * @param {[string]} phraseComponents List of stems that must appear sequentially. 32 | * @param {Map} keywords Keywords 33 | * @return {boolean} True if there's a contiguous configuration of phrase components. 34 | */ 35 | function haveContiguousKeywords(phraseComponents, keywords) { 36 | const path = [] 37 | for (const component of phraseComponents) { 38 | const positions = keywords.get(component) 39 | if (positions === undefined) { 40 | return false 41 | } 42 | path.push(positions) 43 | } 44 | 45 | return haveContiguousPath(path) 46 | } 47 | 48 | function processPart(part) { 49 | return tokenize(part, false) 50 | } 51 | 52 | /** A parsed search query. */ 53 | class Query { 54 | 55 | /** 56 | * Create a new query. 57 | * @param {string} queryString The query to parse 58 | */ 59 | constructor(queryString) { 60 | this.terms = new Set() 61 | this.phrases = [] 62 | this.stemmedPhrases = [] 63 | this.filter = () => true 64 | 65 | const parts = queryString.split(/((?:\s+|^)"[^"]+"(?:\s+|$))/) 66 | let inQuotes = false 67 | for (const part of parts) { 68 | inQuotes = Boolean(part.match(/^\s*"/)) 69 | 70 | if (!inQuotes) { 71 | this.addTerms(processPart(part)) 72 | } else { 73 | const phraseMatch = part.match(/\s*"([^"]*)"?\s*/) 74 | if (!phraseMatch) { 75 | // This is a phrase fragment 76 | this.addTerms(processPart(part)) 77 | continue 78 | } 79 | 80 | const phrase = phraseMatch[1].toLowerCase().trim() 81 | this.phrases.push(phrase) 82 | 83 | const phraseParts = processPart(phrase) 84 | this.stemmedPhrases.push(phraseParts.filter((term) => !isStopWord(term)).map((term) => stem(term))) 85 | this.addTerms(phraseParts) 86 | } 87 | } 88 | } 89 | 90 | /** 91 | * Return true if the exact phrases in the query appear in ANY of the fields 92 | * appearing in the match. 93 | * @param {Map} tokens Token positions 94 | * @return {boolean} True if the given match contains this query's phrases.
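 * For example, if tokens maps 'connect' to positions [4, 9] and 'dialog'
 * to [5], the phrase ['connect', 'dialog'] is contiguous because
 * positions 4 and 5 are adjacent.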
95 | */ 96 | checkPhrases(tokens) { 97 | for (const phraseTokens of this.stemmedPhrases) { 98 | if (!haveContiguousKeywords(phraseTokens, tokens)) { 99 | return false 100 | } 101 | } 102 | 103 | return true 104 | } 105 | 106 | addTerms(terms) { 107 | for (const term of terms) { 108 | this.terms.add(term) 109 | } 110 | } 111 | } 112 | 113 | exports.Query = Query 114 | -------------------------------------------------------------------------------- /src/fts/Stemmer.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | /* Derived from the following: */ 4 | /* ! 5 | * lunr.stopWordFilter 6 | * Copyright (C) 2017 Oliver Nightingale 7 | */ 8 | 9 | const Porter2 = require('./Porter2').Porter2 10 | 11 | const stopWords = new Set([ 12 | 'a', 13 | 'able', 14 | 'about', 15 | 'across', 16 | 'after', 17 | 'all', 18 | 'almost', 19 | 'also', 20 | 'am', 21 | 'among', 22 | 'an', 23 | 'and', 24 | 'any', 25 | 'are', 26 | 'as', 27 | 'at', 28 | 'be', 29 | 'because', 30 | 'been', 31 | 'but', 32 | 'by', 33 | 'can', 34 | 'cannot', 35 | 'could', 36 | 'dear', 37 | 'did', 38 | 'do', 39 | 'does', 40 | 'either', 41 | 'else', 42 | 'ever', 43 | 'every', 44 | 'for', 45 | 'from', 46 | 'got', 47 | 'had', 48 | 'has', 49 | 'have', 50 | 'he', 51 | 'her', 52 | 'hers', 53 | 'him', 54 | 'his', 55 | 'how', 56 | 'however', 57 | 'i', 58 | 'i.e.', 59 | 'if', 60 | 'important', 61 | 'in', 62 | 'into', 63 | 'is', 64 | 'it', 65 | 'its', 66 | 'just', 67 | 'may', 68 | 'me', 69 | 'might', 70 | 'most', 71 | 'must', 72 | 'my', 73 | 'neither', 74 | 'no', 75 | 'nor', 76 | 'of', 77 | 'off', 78 | 'often', 79 | 'on', 80 | 'only', 81 | 'or', 82 | 'other', 83 | 'our', 84 | 'own', 85 | 'rather', 86 | 'said', 87 | 'say', 88 | 'says', 89 | 'she', 90 | 'should', 91 | 'since', 92 | 'so', 93 | 'some', 94 | 'than', 95 | 'that', 96 | 'the', 97 | 'their', 98 | 'them', 99 | 'then', 100 | 'there', 101 | 'these', 102 | 'they', 103 | 'this', 104 | 'tis', 105 | 'to', 106 | 'too', 107 | 'twas', 108 | 'us', 109 | 'wants', 110 | 'was', 111 | 'we', 112 | 'were', 113 | 'what', 114 | 'where', 115 | 'which', 116 | 'while', 117 | 'who', 118 | 'whom', 119 | 'why', 120 | 'will', 121 | 'with', 122 | 'would', 123 | 'yet', 124 | 'you', 125 | 'your', 126 | 'e.g.']) 127 | 128 | const atomicPhraseMap = { 129 | 'ops': 'manager', 130 | 'cloud': 'manager', 131 | 'real': 'time' 132 | } 133 | const atomicPhrases = new Set(Object.entries(atomicPhraseMap).map((kv) => kv.join(' '))) 134 | 135 | const wordCache = new Map() 136 | const stemmer = new Porter2() 137 | function stem(word) { 138 | if (atomicPhrases.has(word)) { 139 | return word 140 | } 141 | 142 | let stemmed = wordCache.get(word) 143 | if (!stemmed) { 144 | stemmed = stemmer.stemWord(word) 145 | wordCache.set(word, stemmed) 146 | } 147 | 148 | return stemmed 149 | } 150 | 151 | function isStopWord(word) { 152 | return stopWords.has(word) 153 | } 154 | 155 | function tokenize(text, fuzzy) { 156 | const components = text.split(/[^\w$%.]+/).map((token) => { 157 | return token.toLocaleLowerCase().replace(/(?:^\.)|(?:\.$)/g, '') 158 | }) 159 | 160 | const tokens = [] 161 | for (let i = 0; i < components.length; i += 1) { 162 | const token = components[i] 163 | 164 | if (token == '$') { 165 | tokens.push('positional') 166 | tokens.push('operator') 167 | continue 168 | } 169 | 170 | const nextToken = components[i + 1] 171 | if (nextToken !== undefined && atomicPhraseMap[token] === nextToken) { 172 | i += 1 173 | tokens.push(`${token} ${atomicPhraseMap[token]}`) 174 | continue 
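// e.g. tokenize('cloud manager backups', false) yields
// ['cloud manager', 'backups'] rather than three separate tokens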
175 | } 176 | 177 | if (token.length > 1) { 178 | tokens.push(token) 179 | } 180 | 181 | const subtokens = token.split('.') 182 | if (fuzzy && subtokens.length > 1) { 183 | for (const subtoken of subtokens) { 184 | if (subtoken.length > 1) { 185 | tokens.push(subtoken) 186 | } 187 | } 188 | } 189 | } 190 | 191 | return tokens 192 | } 193 | 194 | exports.stem = stem 195 | exports.isStopWord = isStopWord 196 | exports.tokenize = tokenize 197 | -------------------------------------------------------------------------------- /src/fts/Trie.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | class Trie { 4 | constructor() { 5 | this.trie = new Map([[0, null]]) 6 | } 7 | 8 | insert(token, id) { 9 | let cursor = this.trie 10 | 11 | for (let i = 0; i < token.length; i += 1) { 12 | const code = token.charCodeAt(i) + 1 13 | if (!cursor.get(code)) { 14 | cursor.set(code, new Map([[0, null]])) 15 | } 16 | 17 | cursor = cursor.get(code) 18 | } 19 | 20 | if (cursor.get(0) === null) { 21 | cursor.set(0, new Set()) 22 | } 23 | 24 | cursor.get(0).add(id) 25 | } 26 | 27 | // Return Map<number, Set<string>> 28 | search(token, prefixSearch) { 29 | let cursor = this.trie 30 | for (let i = 0; i < token.length; i += 1) { 31 | const code = token.charCodeAt(i) + 1 32 | if (!cursor.get(code)) { 33 | return new Map() 34 | } 35 | 36 | cursor = cursor.get(code) 37 | } 38 | 39 | const results = new Map() 40 | 41 | if (cursor.get(0)) { 42 | for (const id of cursor.get(0)) { 43 | results.set(id, new Set([token])) 44 | } 45 | } 46 | 47 | if (!prefixSearch) { 48 | return results 49 | } 50 | 51 | const stack = [[cursor, token]] 52 | while (stack.length > 0) { 53 | const [currentNode, currentToken] = stack.pop() 54 | for (const key of currentNode.keys()) { 55 | if (key !== 0) { 56 | const nextCursor = currentNode.get(key) 57 | if (nextCursor) { 58 | stack.push([nextCursor, currentToken + String.fromCharCode(key - 1)]) 59 | } 60 | continue 61 | } 62 | 63 | if (currentNode.get(key) === null) { 64 | continue 65 | } 66 | 67 | for (const value of currentNode.get(0)) { 68 | const arr = results.get(value) 69 | if (arr) { 70 | arr.add(currentToken) 71 | } else { 72 | results.set(value, new Set([currentToken])) 73 | } 74 | } 75 | 76 | continue 77 | } 78 | } 79 | 80 | return results 81 | } 82 | } 83 | 84 | exports.Trie = Trie 85 | -------------------------------------------------------------------------------- /src/fts/fts.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const pathModule = require('path') 4 | const Query = require('./Query.js').Query 5 | const Trie = require('./Trie.js').Trie 6 | const {isStopWord, stem, tokenize} = require('./Stemmer.js') 7 | const MANDATORY = require(pathModule.join(__dirname, '../correlations.js')).MANDATORY 8 | 9 | const MAX_MATCHES = 150 10 | const LOG_4_DIVISOR = 1.0 / Math.log2(4.0) 11 | 12 | /** 13 | * Normalize URLs by chopping off trailing index.html components. 14 | * 15 | * @param {String} url The input URL. 16 | * @return {String} The normalized URL.
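 * e.g. 'https://docs.atlas.mongodb.com/compass-connection/index.html'
 * normalizes to 'https://docs.atlas.mongodb.com/compass-connection/'.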
17 | */ 18 | function normalizeURL(url) { 19 | return url.replace(/\/index\.html$/, '/') 20 | } 21 | 22 | function computeScore(match, maxRelevancyScore, maxAuthorityScore) { 23 | const normalizedRelevancyScore = match.relevancyScore / maxRelevancyScore + 1 24 | const normalizedAuthorityScore = match.authorityScore / maxAuthorityScore + 1 25 | return Math.log2(normalizedRelevancyScore) + (Math.log2(normalizedAuthorityScore) * LOG_4_DIVISOR) 26 | } 27 | 28 | /** 29 | * We want to penalize the final score of any matches that are in the bottom 30 | * standard deviation of relevancy. Return that minimum relevancy score. 31 | * @param {[Match]} matches The matches over which to compute a relevancy threshold. 32 | * @return {number} The relevancy threshold. 33 | */ 34 | function computeRelevancyThreshold(matches) { 35 | let meanScore = 0 36 | for (const match of matches) { 37 | meanScore += match.relevancyScore 38 | } 39 | meanScore /= matches.length 40 | 41 | let sum = 0 42 | for (const match of matches) { 43 | sum += (match.relevancyScore - meanScore) ** 2 44 | } 45 | 46 | return Math.sqrt((1 / (matches.length - 1) * sum)) 47 | } 48 | 49 | function capLength(array, maxLength) { 50 | return array.length > maxLength ? array.slice(0, maxLength) : array 51 | } 52 | 53 | function hits(matches, convergenceThreshold, maxIterations) { 54 | let lastAuthorityNorm = 0 55 | let lastHubNorm = 0 56 | for (let i = 0; i < maxIterations; i += 1) { 57 | let authorityNorm = 0 58 | // Update all authority scores 59 | for (const match of matches) { 60 | match.authorityScore = 0 61 | for (const incomingMatch of match.incomingNeighbors) { 62 | match.authorityScore += incomingMatch.hubScore 63 | } 64 | authorityNorm += match.authorityScore ** 2 65 | } 66 | 67 | // Normalise the authority scores 68 | authorityNorm = Math.sqrt(authorityNorm) 69 | for (const match of matches) { 70 | match.authorityScore /= authorityNorm 71 | } 72 | 73 | // Update all hub scores 74 | let hubNorm = 0 75 | for (const match of matches) { 76 | match.hubScore = 0 77 | for (const outgoingMatch of match.outgoingNeighbors) { 78 | match.hubScore += outgoingMatch.authorityScore 79 | } 80 | hubNorm += match.hubScore ** 2 81 | } 82 | 83 | // Normalise the hub scores 84 | hubNorm = Math.sqrt(hubNorm) 85 | for (const match of matches) { 86 | match.hubScore /= hubNorm 87 | } 88 | 89 | if (Math.abs(authorityNorm - lastAuthorityNorm) < convergenceThreshold && 90 | Math.abs(hubNorm - lastHubNorm) < convergenceThreshold) { 91 | break 92 | } 93 | 94 | lastAuthorityNorm = authorityNorm 95 | lastHubNorm = hubNorm 96 | } 97 | 98 | // Cut anything with zero relevancy 99 | matches = matches.filter((match) => match.relevancyScore > 0) 100 | 101 | // Compute statistics for score normalization 102 | let maxRelevancyScore = 0 103 | let maxAuthorityScore = 0 104 | const relevancyScoreThreshold = computeRelevancyThreshold(matches) 105 | for (const match of matches) { 106 | if (isNaN(match.authorityScore)) { match.authorityScore = 1e-10 } 107 | 108 | // Ignore anything with bad relevancy for the purposes of score normalization 109 | if (match.relevancyScore < relevancyScoreThreshold) { continue } 110 | 111 | if (match.relevancyScore > maxRelevancyScore) { maxRelevancyScore = match.relevancyScore } 112 | if (match.authorityScore > maxAuthorityScore) { maxAuthorityScore = match.authorityScore } 113 | } 114 | 115 | // Compute the final ranking score 116 | for (const match of matches) { 117 | match.score = computeScore(match, maxRelevancyScore, maxAuthorityScore) 118 | 119
| // Penalize anything with especially poor relevancy 120 | if (match.relevancyScore < relevancyScoreThreshold * 2.5) { 121 | match.score -= (relevancyScoreThreshold / match.relevancyScore) 122 | } 123 | } 124 | 125 | matches = matches.sort((a, b) => { 126 | if (a.score < b.score) { 127 | return 1 128 | } 129 | if (a.score > b.score) { 130 | return -1 131 | } 132 | 133 | return 0 134 | }) 135 | 136 | return capLength(matches, MAX_MATCHES) 137 | } 138 | 139 | /* Yuanhua Lv and ChengXiang Zhai. 2011. Lower-bounding term frequency 140 | * normalization. In Proceedings of the 20th ACM international 141 | * conference on Information and knowledge management (CIKM '11), Bettina 142 | * Berendt, Arjen de Vries, Wenfei Fan, Craig Macdonald, Iadh Ounis, and 143 | * Ian Ruthven (Eds.). ACM, New York, NY, USA, 7-16. DOI: https://doi.org/10.1145/2063576.2063584 144 | */ 145 | function dirichletPlus(termFrequencyInQuery, termFrequencyInDoc, 146 | termProbabilityInLanguage, docLength, queryLength) { 147 | const delta = 0.05 148 | 149 | // In the range suggested by A Study of Smoothing Methods for Language Models 150 | // Applied to Ad Hoc Information Retrieval [Zhai, Lafferty] 151 | const mu = 2000 152 | 153 | // In some fields, the query may never exist, making its probability 0. 154 | // This is... weird. Return 0 to avoid NaN since while dirichlet+ 155 | // prefers rare words, a nonexistent word should probably be ignored. 156 | if (termProbabilityInLanguage === 0) { return 0 } 157 | 158 | let term2 = Math.log2(1 + (termFrequencyInDoc / (mu * termProbabilityInLanguage))) 159 | term2 += Math.log2(1 + (delta / (mu * termProbabilityInLanguage))) 160 | 161 | const term3 = queryLength * Math.log2(mu / (docLength + mu)) 162 | 163 | return (termFrequencyInQuery * term2) + term3 164 | } 165 | 166 | class TermEntry { 167 | constructor() { 168 | this.docs = [] 169 | this.positions = new Map() 170 | this.timesAppeared = new Map() 171 | } 172 | 173 | register(propertyName, fieldName, docID) { 174 | this.docs.push(docID) 175 | this.timesAppeared.set(`${fieldName} ${propertyName}`, (this.timesAppeared.get(`${fieldName} ${propertyName}`) || 0) + 1) 176 | } 177 | 178 | getTimesAppeared(propertyName, fieldName) { 179 | return this.timesAppeared.get(`${fieldName} ${propertyName}`) || 0 180 | } 181 | 182 | addTokenPosition(docID, tokenID) { 183 | const positions = this.positions.get(docID) 184 | if (!positions) { 185 | this.positions.set(docID, [tokenID]) 186 | } else { 187 | positions.push(tokenID) 188 | } 189 | } 190 | } 191 | 192 | class DocumentEntry { 193 | constructor(propertyName, len, termFrequencies) { 194 | this.propertyName = propertyName 195 | this.len = len 196 | this.termFrequencies = termFrequencies 197 | } 198 | } 199 | 200 | class Match { 201 | constructor(docID, relevancyScore, initialTerms) { 202 | this._id = docID 203 | this.relevancyScore = relevancyScore 204 | this.terms = initialTerms 205 | 206 | this.score = 0.0 207 | this.authorityScore = 1.0 208 | this.hubScore = 1.0 209 | this.incomingNeighbors = [] 210 | this.outgoingNeighbors = [] 211 | } 212 | } 213 | 214 | class Field { 215 | constructor(name, weight) { 216 | this.name = name 217 | this.documents = new Map() 218 | this.weight = weight 219 | this.totalTokensSeen = 0 220 | 221 | this._lengthWeight = null 222 | } 223 | 224 | /** Return the inverse average number of unique terms per document. 225 | * This makes no fscking sense, but is useful as a weighting factor 226 | * in my testing.
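 * (For example, 100 documents averaging 50 unique terms each give a
 * lengthWeight of 100 / 5000 = 0.02.)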
*/ 227 | get lengthWeight() { 228 | if (!this._lengthWeight) { 229 | let nTerms = 0 230 | for (const doc of this.documents.values()) { 231 | nTerms += doc.termFrequencies.size 232 | } 233 | 234 | this._lengthWeight = this.documents.size / nTerms 235 | } 236 | 237 | return this._lengthWeight 238 | } 239 | } 240 | 241 | class FTSIndex { 242 | constructor(fields) { 243 | this.fields = fields.map((field) => new Field(field[0], field[1])) 244 | this.trie = new Trie() 245 | this.terms = new Map() 246 | this.docID = 0 247 | this.termID = 0 248 | this.documentWeights = new Map() 249 | 250 | this.linkGraph = new Map() 251 | this.inverseLinkGraph = new Map() 252 | this.urlToId = new Map() 253 | this.idToUrl = new Map() 254 | 255 | this.incomingNeighbors = [] 256 | this.outgoingNeighbors = [] 257 | 258 | this.wordCorrelations = new Map() 259 | } 260 | 261 | // word can be multiple tokens. synonym must be a single token. 262 | correlateWord(word, synonym, closeness) { 263 | word = tokenize(word, false).map((w) => stem(w)).join(' ') 264 | synonym = stem(synonym) 265 | 266 | const correlationEntry = this.wordCorrelations.get(word) 267 | if (!correlationEntry) { 268 | this.wordCorrelations.set(word, [[synonym, closeness]]) 269 | } else { 270 | correlationEntry.push([synonym, closeness]) 271 | } 272 | } 273 | 274 | collectCorrelations(terms) { 275 | const stemmedTerms = new Map(terms.map((term) => [stem(term), 1])) 276 | 277 | for (let i = 0; i < terms.length; i += 1) { 278 | const pair = [stem(terms[i])] 279 | 280 | if (i < terms.length - 1) { 281 | pair.push(`${pair[0]} ${stem(terms[i+1])}`) 282 | } 283 | 284 | for (const term of pair) { 285 | const correlations = this.wordCorrelations.get(term) 286 | if (!correlations) { continue } 287 | 288 | for (const [correlation, weight] of correlations) { 289 | const newWeight = Math.max(stemmedTerms.get(correlation) || 0, weight) 290 | stemmedTerms.set(correlation, newWeight) 291 | } 292 | } 293 | } 294 | 295 | return stemmedTerms 296 | } 297 | 298 | add(propertyName, document, onToken) { 299 | document._id = this.docID 300 | 301 | if (document.links !== undefined && document.url !== undefined) { 302 | document.url = normalizeURL(document.url) 303 | 304 | this.linkGraph.set(document.url, document.links || []) 305 | for (let href of document.links || []) { 306 | href = normalizeURL(href) 307 | let incomingLinks = this.inverseLinkGraph.get(href) 308 | if (!incomingLinks) { 309 | incomingLinks = [] 310 | this.inverseLinkGraph.set(href, incomingLinks) 311 | } 312 | 313 | incomingLinks.push(document.url) 314 | } 315 | this.urlToId.set(document.url, document._id) 316 | this.idToUrl.set(document._id, document.url) 317 | } 318 | 319 | for (const field of this.fields) { 320 | field._lengthWeight = null 321 | const termFrequencies = new Map() 322 | 323 | const text = document[field.name] 324 | if (!text) { continue } 325 | 326 | const tokens = tokenize(text, true) 327 | let numberOfTokens = 0 328 | 329 | for (let token of tokens) { 330 | onToken(token) 331 | 332 | if (isStopWord(token)) { continue } 333 | if (token.startsWith('%%')) { 334 | this.correlateWord(token.slice(2), token, 0.9) 335 | } else if (token.startsWith('$') || token.startsWith('%')) { 336 | this.correlateWord(token.slice(1), token, 0.9) 337 | } else { 338 | token = stem(token) 339 | } 340 | 341 | numberOfTokens += 1 342 | this.termID += 1 343 | 344 | let indexEntry = this.terms.get(token) 345 | if (!indexEntry) { 346 | indexEntry = new TermEntry() 347 | this.terms.set(token, indexEntry) 348 | } 349 | 350 
| const count = termFrequencies.get(token) || 0 351 | termFrequencies.set(token, count + 1) 352 | 353 | if (count === 0) { 354 | this.trie.insert(token, document._id) 355 | indexEntry.register(propertyName, field.name, document._id) 356 | } 357 | 358 | indexEntry.addTokenPosition(document._id, this.termID) 359 | } 360 | 361 | // After each field, bump by one to prevent accidental adjacency. 362 | this.termID += 1 363 | 364 | field.totalTokensSeen += numberOfTokens 365 | field.documents.set(document._id, new DocumentEntry(propertyName, numberOfTokens, termFrequencies)) 366 | } 367 | 368 | this.documentWeights.set(document._id, document.weight || 1) 369 | this.docID += 1 370 | 371 | return document._id 372 | } 373 | 374 | getNeighbors(baseSet, match) { 375 | const url = this.idToUrl.get(match._id) 376 | const links = this.linkGraph.get(url) || [] 377 | 378 | let incomingNeighbors = this.incomingNeighbors[match._id] 379 | let outgoingNeighbors = this.outgoingNeighbors[match._id] 380 | 381 | if (!incomingNeighbors) { 382 | const incomingNeighborsSet = new Set() 383 | for (const ancestorURL of this.inverseLinkGraph.get(url) || []) { 384 | const ancestorID = this.urlToId.get(ancestorURL) 385 | if (ancestorID === undefined) { continue } 386 | 387 | if (ancestorID) { 388 | incomingNeighborsSet.add(ancestorID) 389 | } 390 | } 391 | 392 | incomingNeighbors = Array.from(incomingNeighborsSet) 393 | this.incomingNeighbors[match._id] = incomingNeighbors 394 | } 395 | 396 | if (!outgoingNeighbors) { 397 | const outgoingNeighborsSet = new Set() 398 | for (const link of links) { 399 | const descendentID = this.urlToId.get(link) 400 | if (descendentID === undefined) { continue } 401 | 402 | if (descendentID) { 403 | outgoingNeighborsSet.add(descendentID) 404 | } 405 | } 406 | 407 | outgoingNeighbors = Array.from(outgoingNeighborsSet) 408 | this.outgoingNeighbors[match._id] = outgoingNeighbors 409 | } 410 | 411 | for (const neighborID of incomingNeighbors) { 412 | let newMatch = baseSet.get(neighborID) 413 | if (!newMatch) { 414 | newMatch = new Match(neighborID, 0, null) 415 | baseSet.set(neighborID, newMatch) 416 | } 417 | match.incomingNeighbors.push(newMatch) 418 | } 419 | 420 | for (const neighborID of outgoingNeighbors) { 421 | let newMatch = baseSet.get(neighborID) 422 | if (!newMatch) { 423 | newMatch = new Match(neighborID, 0, null) 424 | baseSet.set(neighborID, newMatch) 425 | } 426 | 427 | match.outgoingNeighbors.push(newMatch) 428 | } 429 | } 430 | 431 | collectMatchesFromTrie(terms) { 432 | const resultSet = [] 433 | for (const term of terms) { 434 | const matches = this.trie.search(term, true) 435 | for (const match of matches.entries()) { 436 | resultSet.push(match) 437 | } 438 | } 439 | 440 | return resultSet 441 | } 442 | 443 | search(query, useHits) { 444 | if (typeof query === 'string') { 445 | query = new Query(query) 446 | } 447 | 448 | const matchSet = new Map() 449 | const originalTerms = new Set(query.terms) 450 | const stemmedTerms = this.collectCorrelations(Array.from(query.terms)) 451 | 452 | for (const term of stemmedTerms.keys()) { 453 | const correlations = this.wordCorrelations.get(term) 454 | if (!correlations) { continue } 455 | 456 | for (const [correlation, weight] of correlations) { 457 | const newWeight = Math.max(stemmedTerms.get(correlation) || 0, weight) 458 | stemmedTerms.set(correlation, newWeight) 459 | } 460 | } 461 | 462 | const mandatoryTerms = new Set(Array.from(query.terms).filter(term => MANDATORY.has(term)).map(term => stem(term))) 463 | 464 | for (const 
tuple of this.collectMatchesFromTrie(stemmedTerms.keys())) { 465 | const [docID, terms] = tuple 466 | 467 | if (!query.filter(docID)) { continue } 468 | 469 | let match = matchSet.get(docID) 470 | if (!match) { 471 | match = new Match(docID, 0, new Set()) 472 | matchSet.set(docID, match) 473 | } 474 | 475 | for (const term of terms) { 476 | const termEntry = this.terms.get(term) 477 | 478 | let termRelevancyScore = 0 479 | for (const field of this.fields) { 480 | const docEntry = field.documents.get(docID) 481 | if (!docEntry) { continue } 482 | 483 | let termWeight = stemmedTerms.get(term) || 0.1 484 | if (mandatoryTerms.has(term)) { 485 | termWeight *= 1.5 486 | } 487 | const termFrequencyInDoc = docEntry.termFrequencies.get(term) || 0 488 | const termProbability = termEntry.getTimesAppeared(docEntry.propertyName, field.name) / Math.max(field.totalTokensSeen, 500) 489 | 490 | // Larger fields yield larger scores, but we want fields to have roughly 491 | // equal weight. field.lengthWeight is stupid, but yields good results. 492 | termRelevancyScore += dirichletPlus(termWeight, termFrequencyInDoc, termProbability, docEntry.len, 493 | originalTerms.size) * field.weight * field.lengthWeight * 494 | this.documentWeights.get(docID) 495 | } 496 | 497 | match.relevancyScore += termRelevancyScore 498 | match.terms.add(term) 499 | } 500 | } 501 | 502 | // Create a root set of the core relevant results 503 | let rootSet = Array.from(matchSet.values()) 504 | if (query.phrases.length) { 505 | rootSet = rootSet.filter((match) => { 506 | const tokens = new Map() 507 | match.terms = Array.from(match.terms) 508 | for (const term of match.terms) { 509 | const termEntry = this.terms.get(term) 510 | if (!termEntry) { return false } 511 | 512 | const positions = termEntry.positions.get(match._id) 513 | if (!positions) { return false } 514 | 515 | tokens.set(term, positions) 516 | } 517 | return query.checkPhrases(tokens) 518 | }) 519 | } 520 | 521 | if (!useHits) { 522 | rootSet = rootSet.sort((a, b) => { 523 | if (a.relevancyScore < b.relevancyScore) { 524 | return 1 525 | } 526 | if (a.relevancyScore > b.relevancyScore) { 527 | return -1 528 | } 529 | 530 | return 0 531 | }) 532 | 533 | return capLength(rootSet, MAX_MATCHES) 534 | } 535 | 536 | // Expand our root set's neighbors to create a base set: the set of all 537 | // relevant pages, as well as pages that link TO or are linked FROM those pages. 
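// In hits(), each iteration then recomputes, for every page p in the base set:
//     authority(p) = sum of hub(q) over pages q that link to p
//     hub(p)       = sum of authority(q) over pages q that p links to
// with both score vectors L2-normalized every round, stopping early once
// neither norm changes by more than the convergence threshold.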
538 | let baseSet = new Map(rootSet.map((match) => [match._id, match])) 539 | for (const match of rootSet) { 540 | this.getNeighbors(baseSet, match) 541 | } 542 | 543 | // Run HITS to re-sort our results based on authority 544 | return hits(Array.from(baseSet.values()), 0.00001, 200) 545 | } 546 | } 547 | 548 | exports.FTSIndex = FTSIndex 549 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 'use strict' 3 | 4 | const fs = require('fs') 5 | const http = require('http') 6 | const os = require('os') 7 | const pathModule = require('path') 8 | const process = require('process') 9 | const url = require('url') 10 | const util = require('util') 11 | const zlib = require('zlib') 12 | 13 | const Pool = require('./pool.js').Pool 14 | const dive = require('dive') 15 | const Logger = require('basic-logger') 16 | const S3 = require('aws-sdk/clients/s3') 17 | const Worker = require('tiny-worker') 18 | 19 | process.title = 'marian' 20 | 21 | const MAXIMUM_QUERY_LENGTH = 100 22 | 23 | // If a worker's backlog rises above this threshold, reject the request. 24 | // This prevents the server from getting bogged down for unbounded periods of time. 25 | const MAXIMUM_BACKLOG = 20 26 | const WARNING_BACKLOG = 15 27 | 28 | const STANDARD_HEADERS = { 29 | 'X-Content-Type-Options': 'nosniff', 30 | 'X-Robots-Tag': 'noindex' 31 | } 32 | 33 | const log = new Logger({ 34 | showTimestamp: true, 35 | }) 36 | 37 | /** 38 | * Find an acceptable compression format for the client, and return a compressed 39 | * version of the content if possible. Otherwise return the original input text. 40 | * 41 | * Supports Brotli and gzip. 42 | * 43 | * @param {http.IncomingMessage} req The HTTP request object. 44 | * @param {map} headers The headers object which will be used in the response. 45 | * @param {string} content The text to compress. 46 | * @return {Buffer|string} 47 | */ 48 | function compress(req, headers, content) { 49 | const acceptEncoding = (req.headers['accept-encoding'] || '').split(',').map((e) => e.trim()) 50 | if (acceptEncoding.indexOf('gzip') > -1) { 51 | headers['Content-Encoding'] = 'gzip' 52 | return util.promisify(zlib.gzip)(content) 53 | } 54 | 55 | return new Promise((resolve) => resolve(content)) 56 | } 57 | 58 | /** 59 | * If the request method does not match the method parameter, return false 60 | * and write a 405 status code. Otherwise return true. 61 | * 62 | * @param {http.IncomingMessage} req 63 | * @param {http.ServerResponse} res 64 | * @param {string} method 65 | * @return {boolean} 66 | */ 67 | function checkMethod(req, res, method) { 68 | if (req.method !== method) { 69 | res.writeHead(405, {}) 70 | res.end('') 71 | return false 72 | } 73 | 74 | return true 75 | } 76 | 77 | /** A web worker with a promise-oriented message-call interface. */ 78 | class TaskWorker { 79 | /** 80 | * Create a new TaskWorker. 81 | * @param {string} scriptPath - A path to a JS file to execute. 82 | */ 83 | constructor(scriptPath) { 84 | this.worker = new Worker(scriptPath) 85 | this.worker.onmessage = this.onmessage.bind(this) 86 | 87 | this.backlog = 0 88 | this.pending = new Map() 89 | this.messageId = 0 90 | } 91 | 92 | /** 93 | * Send a message to this TaskWorker. 94 | * @param {map} message - An object to send to the worker. 
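 * The promise resolves with the worker's reply for this messageId; a
 * typical call, mirroring Index.search below, looks like
 * worker.send({search: {queryString: 'compass', useHits: true}}).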
95 | * @return {Promise} 96 | */ 97 | send(message) { 98 | if (this.backlog > MAXIMUM_BACKLOG) { 99 | throw new Error('backlog-exceeded') 100 | } 101 | 102 | return new Promise((resolve, reject) => { 103 | const messageId = this.messageId 104 | this.messageId += 1 105 | this.backlog += 1 106 | 107 | this.worker.postMessage({message: message, messageId: messageId}) 108 | this.pending.set(messageId, [resolve, reject]) 109 | }) 110 | } 111 | 112 | /** 113 | * Handler for messages received from the worker. 114 | * @private 115 | * @param {MessageEvent} event 116 | * @return {Promise} 117 | */ 118 | onmessage(event) { 119 | const pair = this.pending.get(event.data.messageId) 120 | if (!pair) { 121 | log.error(`Got unknown message ID ${event.data.messageId}`) 122 | return 123 | } 124 | 125 | this.backlog -= 1 126 | this.pending.delete(event.data.messageId) 127 | const [resolve, reject] = pair 128 | if (event.data.error) { 129 | reject(new Error(event.data.error)) 130 | return 131 | } 132 | 133 | resolve(event.data) 134 | } 135 | } 136 | 137 | class Index { 138 | constructor(manifestSource) { 139 | this.manifestSource = manifestSource 140 | this.manifests = [] 141 | this.errors = [] 142 | 143 | this.lastSyncDate = null 144 | this.currentlyIndexing = false 145 | 146 | const MAX_WORKERS = parseInt(process.env.MAX_WORKERS) || 2 147 | this.workers = new Pool(Math.min(os.cpus().length, MAX_WORKERS), () => new TaskWorker(pathModule.join(__dirname, 'worker-searcher.js'))) 148 | 149 | // Suspend all of our workers until we have an index 150 | for (const worker of this.workers.pool) { 151 | this.workers.suspend(worker) 152 | } 153 | } 154 | 155 | getStatus() { 156 | return { 157 | manifests: this.manifests, 158 | lastSync: { 159 | errors: this.errors, 160 | finished: this.lastSyncDate ? this.lastSyncDate.toISOString() : null 161 | }, 162 | workers: this.workers.getStatus() 163 | } 164 | } 165 | 166 | search(queryString, searchProperty) { 167 | const worker = this.workers.get() 168 | const useHits = worker.backlog <= WARNING_BACKLOG 169 | 170 | return worker.send({search: { 171 | queryString: queryString, 172 | searchProperty: searchProperty, 173 | useHits: useHits 174 | }}).then((message) => message.results) 175 | } 176 | 177 | async getManifestsFromS3(bucketName, prefix) { 178 | const s3 = new S3({apiVersion: '2006-03-01'}) 179 | const result = await util.promisify(s3.makeUnauthenticatedRequest.bind(s3))('listObjectsV2', { 180 | Bucket: bucketName, 181 | Prefix: prefix 182 | }) 183 | 184 | if (result.IsTruncated) { 185 | // This would indicate something awry, since we shouldn't 186 | // ever have more than 1000 properties. And if we ever did, 187 | // everything would need to be rearchitected. 
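// (listObjectsV2 returns at most 1000 keys per response; IsTruncated
// would indicate that a continuation request is needed to fetch the rest.)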
188 | throw new Error('Got truncated response from S3') 189 | } 190 | 191 | const manifests = [] 192 | for (const bucketEntry of result.Contents) { 193 | if (bucketEntry.Size === 0) { 194 | continue 195 | } 196 | 197 | const matches = bucketEntry.Key.match(/([^/]+).json$/) 198 | if (matches === null) { 199 | this.errors.push(`Got weird filename in manifest listing: "${bucketEntry.Key}"`) 200 | continue 201 | } 202 | 203 | const searchProperty = matches[1] 204 | const data = await util.promisify(s3.makeUnauthenticatedRequest.bind(s3))('getObject', { 205 | Bucket: bucketName, 206 | Key: bucketEntry.Key 207 | }) 208 | 209 | manifests.push({ 210 | body: data.Body.toString('utf-8'), 211 | lastModified: data.LastModified, 212 | searchProperty: searchProperty 213 | }) 214 | } 215 | 216 | return manifests 217 | } 218 | 219 | getManifestsFromDirectory(prefix) { 220 | return new Promise((resolve, reject) => { 221 | const manifests = [] 222 | 223 | dive(prefix, (err, path, stats) => { 224 | if (err) { reject(err) } 225 | const matches = path.match(/([^/]+).json$/) 226 | if (!matches) { return } 227 | const searchProperty = matches[1] 228 | 229 | manifests.push({ 230 | body: fs.readFileSync(path, {encoding: 'utf-8'}), 231 | lastModified: stats.mtime, 232 | searchProperty: searchProperty 233 | }) 234 | }, () => { 235 | resolve(manifests) 236 | })}) 237 | } 238 | 239 | async getManifests() { 240 | const parsedSource = this.manifestSource.match(/((?:bucket)|(?:dir)):(.+)/) 241 | if (!parsedSource) { 242 | throw new Error('Bad manifest source') 243 | } 244 | 245 | let manifests 246 | if (parsedSource[1] === 'bucket') { 247 | const parts = parsedSource[2].split('/', 2) 248 | const bucketName = parts[0].trim() 249 | const prefix = parts[1].trim() 250 | if (!bucketName.length || !prefix.length) { 251 | throw new Error('Bad bucket manifest source') 252 | } 253 | manifests = await this.getManifestsFromS3(bucketName, prefix) 254 | } else if (parsedSource[1] === 'dir') { 255 | manifests = await this.getManifestsFromDirectory(parsedSource[2]) 256 | } else { 257 | throw new Error('Unknown manifest source protocol') 258 | } 259 | 260 | return manifests 261 | } 262 | 263 | async load() { 264 | if (this.currentlyIndexing) { 265 | throw new Error('already-indexing') 266 | } 267 | this.currentlyIndexing = true 268 | 269 | let manifests 270 | try { 271 | manifests = await this.getManifests() 272 | } catch (err) { 273 | this.currentlyIndexing = false 274 | throw err 275 | } 276 | 277 | this.errors = [] 278 | setTimeout(async () => { 279 | for (const worker of this.workers.pool) { 280 | this.workers.suspend(worker) 281 | try { 282 | await worker.send({sync: manifests}) 283 | } finally { 284 | this.workers.resume(worker) 285 | } 286 | 287 | // Ideally we would have a lastSyncDate per worker. 
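// Because each worker is suspended only while its own sync runs,
// Pool.get() keeps routing searches to the remaining workers, so a
// refresh never takes the whole pool offline at once.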
288 | this.lastSyncDate = new Date() 289 | } 290 | 291 | this.currentlyIndexing = false 292 | this.manifests = manifests.map((manifest) => manifest.searchProperty) 293 | 294 | log.info('Loaded new index') 295 | }, 1) 296 | } 297 | } 298 | 299 | class HTTPStatusException extends Error { 300 | constructor(code, result) { 301 | super(`HTTP Status ${code}`) 302 | this.code = code 303 | this.result = result 304 | Error.captureStackTrace(this, HTTPStatusException) 305 | } 306 | } 307 | 308 | function escapeHTML(unsafe) { 309 | return unsafe 310 | .replace(/&/g, '&amp;') 311 | .replace(/</g, '&lt;') 312 | .replace(/>/g, '&gt;') 313 | .replace(/"/g, '&quot;') 314 | .replace(/'/g, '&#39;') 315 | } 316 | 317 | 318 | class Marian { 319 | constructor(bucket) { 320 | this.index = new Index(bucket) 321 | 322 | // Fire-and-forget loading 323 | this.index.load().catch((err) => { 324 | this.index.errors.push(err) 325 | }) 326 | } 327 | 328 | start(port) { 329 | const server = http.createServer(async (req, res) => { 330 | try { 331 | await this.handle(req, res) 332 | } catch(err) { 333 | log.error(err) 334 | res.writeHead(500, {}) 335 | res.end('') 336 | } 337 | }) 338 | 339 | server.listen(port, () => { 340 | log.info(`Listening on port ${port}`) 341 | }) 342 | } 343 | 344 | handle(req, res) { 345 | const parsedUrl = url.parse(req.url, true) 346 | 347 | const pathname = parsedUrl.pathname.replace(/\/+$/, '') 348 | if (pathname === '/search') { 349 | if (checkMethod(req, res, 'GET')) { 350 | this.handleSearch(parsedUrl, req, res) 351 | } 352 | } else if (pathname === '/refresh') { 353 | if (checkMethod(req, res, 'POST')) { 354 | this.handleRefresh(parsedUrl, req, res) 355 | } 356 | } else if (pathname === '/status') { 357 | if (checkMethod(req, res, 'GET')) { 358 | this.handleStatus(parsedUrl, req, res) 359 | } 360 | } else if (pathname === '') { 361 | if (checkMethod(req, res, 'GET')) { 362 | this.handleUI(parsedUrl, req, res) 363 | } 364 | } else { 365 | res.writeHead(400, {}) 366 | res.end('') 367 | } 368 | } 369 | 370 | async handleRefresh(parsedUrl, req, res) { 371 | const headers = { 372 | 'Vary': 'Accept-Encoding' 373 | } 374 | Object.assign(headers, STANDARD_HEADERS) 375 | 376 | try { 377 | await this.index.load() 378 | } catch(err) { 379 | headers['Content-Type'] = 'application/json' 380 | const body = await compress(req, headers, JSON.stringify({'errors': [err]})) 381 | 382 | if (err.message === 'already-indexing') { 383 | log.warn('Index request rejected: busy') 384 | res.writeHead(200, headers) 385 | } else { 386 | res.writeHead(500, headers) 387 | } 388 | res.end(body) 389 | return 390 | } 391 | 392 | if (this.index.errors.length > 0) { 393 | headers['Content-Type'] = 'application/json' 394 | const body = await compress(req, headers, JSON.stringify({'errors': this.index.errors})) 395 | res.writeHead(200, headers) 396 | res.end(body) 397 | return 398 | } 399 | 400 | res.writeHead(200, headers) 401 | res.end('') 402 | } 403 | 404 | async handleStatus(parsedUrl, req, res) { 405 | const headers = { 406 | 'Content-Type': 'application/json', 407 | 'Vary': 'Accept-Encoding', 408 | 'Pragma': 'no-cache', 409 | 'Access-Control-Allow-Origin': '*', 410 | } 411 | Object.assign(headers, STANDARD_HEADERS) 412 | 413 | const status = this.index.getStatus() 414 | let body = JSON.stringify(status) 415 | body = await compress(req, headers, body) 416 | 417 | // If all workers are overloaded, return 503 418 | let statusCode = 200 419 | if (status.workers.filter((n) => (n <= WARNING_BACKLOG && n !== 's')).length === 0) { 420 | statusCode = 503 421 | } 422 | 423
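// (getStatus() reports 's' for a suspended worker, so a worker counts as
// available only if it is live and at or under WARNING_BACKLOG.)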
res.writeHead(statusCode, headers) 424 | res.end(body) 425 | } 426 | 427 | async fetchResults(parsedUrl, req) { 428 | if (req.headers['if-modified-since'] && this.index.lastSyncDate) { 429 | const lastSyncDateNoMilliseconds = new Date(this.index.lastSyncDate) 430 | // HTTP dates truncate the milliseconds. 431 | lastSyncDateNoMilliseconds.setMilliseconds(0) 432 | 433 | const ifModifiedSince = new Date(req.headers['if-modified-since']) 434 | if (ifModifiedSince >= lastSyncDateNoMilliseconds) { 435 | throw new HTTPStatusException(304, '') 436 | } 437 | } 438 | 439 | if (parsedUrl.query.length > MAXIMUM_QUERY_LENGTH) { 440 | throw new HTTPStatusException(400, '[]') 441 | } 442 | 443 | const query = parsedUrl.query.q 444 | if (!query) { 445 | throw new HTTPStatusException(400, '[]') 446 | } 447 | 448 | try { 449 | return await this.index.search(query, parsedUrl.query.searchProperty) 450 | } catch (err) { 451 | if (err.message === 'still-indexing' || err.message === 'backlog-exceeded' || err.message === 'pool-unavailable') { 452 | // Search index isn't yet loaded, or our backlog is out of control 453 | throw new HTTPStatusException(503, '[]') 454 | } else if (err.message === 'query-too-long') { 455 | throw new HTTPStatusException(400, '[]') 456 | } 457 | 458 | log.error(err) 459 | } 460 | } 461 | 462 | async handleSearch(parsedUrl, req, res) { 463 | const headers = { 464 | 'Content-Type': 'application/json', 465 | 'Vary': 'Accept-Encoding', 466 | 'Cache-Control': 'public,max-age=120,must-revalidate', 467 | 'Access-Control-Allow-Origin': '*', 468 | } 469 | Object.assign(headers, STANDARD_HEADERS) 470 | 471 | let results 472 | try { 473 | results = await this.fetchResults(parsedUrl, req) 474 | } catch (err) { 475 | if (err.code === undefined || err.result === undefined) { 476 | throw(err) 477 | } 478 | 479 | res.writeHead(err.code, headers) 480 | res.end(err.result) 481 | return 482 | } 483 | headers['Last-Modified'] = this.index.lastSyncDate.toUTCString() 484 | let responseBody = JSON.stringify(results) 485 | 486 | responseBody = await compress(req, headers, responseBody) 487 | res.writeHead(200, headers) 488 | res.end(responseBody) 489 | } 490 | 491 | async handleUI(parsedUrl, req, res) { 492 | const headers = { 493 | 'Content-Type': 'text/html', 494 | 'Vary': 'Accept-Encoding', 495 | 'Cache-Control': 'public,max-age=120,must-revalidate', 496 | } 497 | Object.assign(headers, STANDARD_HEADERS) 498 | 499 | const dataList = this.index.manifests.map((manifest) => encodeURIComponent(manifest)) 500 | if (dataList.length > 0) { 501 | dataList.unshift('') 502 | } 503 | 504 | const query = parsedUrl.query.q || '' 505 | const searchProperty = parsedUrl.query.searchProperty || '' 506 | let results = [] 507 | let resultError = false 508 | if (query) { 509 | try { 510 | results = (await this.fetchResults(parsedUrl, req)).results 511 | } catch (err) { 512 | if (err.code === undefined || err.result === undefined) { 513 | throw(err) 514 | } 515 | 516 | resultError = true 517 | } 518 | } 519 | 520 | const resultTextParts = results.map(result => { 521 | return `
522 | [item markup lost when this listing was rendered] 523 | ${escapeHTML(result.preview)} 524 | ` 525 | }) 526 | 527 | let responseBody = ` 528 | Marian 529 | [page markup lost when this listing was rendered: a style block, a search form whose datalist is built from dataList, an "Error fetching results" notice shown when resultError is set, the result list ${resultTextParts.join('\n')}, and an inline script]
    547 | 555 | 556 | ` 557 | 558 | responseBody = await compress(req, headers, responseBody) 559 | res.writeHead(200, headers) 560 | res.end(responseBody) 561 | } 562 | } 563 | 564 | async function main() { 565 | Logger.setLevel('info', true) 566 | 567 | const server = new Marian(process.argv[2]) 568 | server.start(8080) 569 | } 570 | 571 | main() 572 | -------------------------------------------------------------------------------- /src/pool.js: -------------------------------------------------------------------------------- 1 | /** A balancing scheduling pool. Useful primarily for making a pool of TaskWorkers. */ 2 | class Pool { 3 | /** 4 | * Create a new Pool. 5 | * @param {number} size - The size of the pool. 6 | * @param {function} f - A function returning a pool element. This element 7 | * must have a "backlog" property representing its current load. 8 | */ 9 | constructor(size, f) { 10 | if (this.size <= 0) { throw new Error('Bad pool size') } 11 | 12 | this.pool = [] 13 | this.suspended = new Set() 14 | for (let i = 0; i < size; i += 1) { 15 | this.pool.push(f()) 16 | } 17 | } 18 | 19 | suspend(element) { 20 | this.suspended.add(element) 21 | } 22 | 23 | resume(element) { 24 | this.suspended.delete(element) 25 | } 26 | 27 | /** 28 | * Return the least-loaded element of the pool. 29 | * @return {?} The least-loaded element of the pool. 30 | */ 31 | get() { 32 | const dummy = {backlog: Infinity} 33 | let min = dummy 34 | for (const element of this.pool) { 35 | if (this.suspended.has(element)) { continue } 36 | if (element.backlog < min.backlog) { 37 | min = element 38 | } 39 | } 40 | 41 | if (dummy === min) { 42 | throw new Error('pool-unavailable') 43 | } 44 | 45 | return min 46 | } 47 | 48 | getStatus() { 49 | return this.pool.map((worker) => { 50 | if (!this.suspended.has(worker)) { 51 | return worker.backlog 52 | } 53 | 54 | return 's' 55 | }) 56 | } 57 | } 58 | 59 | exports.Pool = Pool 60 | -------------------------------------------------------------------------------- /src/worker-searcher.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | require('process').title = 'marian-worker' 4 | const pathModule = require('path') 5 | 6 | const dictionary = require('dictionary-en-us') 7 | const nspell = require('nspell') 8 | const Query = require(pathModule.join(__dirname, './src/fts/Query.js')).Query 9 | const fts = require(pathModule.join(__dirname, './src/fts/fts.js')) 10 | const correlations = require(pathModule.join(__dirname, './src/correlations.js')).correlations 11 | 12 | const MAXIMUM_TERMS = 10 13 | 14 | let spelling = null 15 | let searchPropertyAliases = new Map() 16 | let index = null 17 | let documents = {} 18 | 19 | /** 20 | * Search the index, and return results within the given searchProperty. 21 | * @param {string} queryString The query string. 22 | * @param {[string]} searchProperties The properties to search. If empty, all results are returned. 23 | * @param {boolean} useHits True if HITS link analysis should be performed. 
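 * For example: search('"connect dialog" compass', ['atlas-master'], true),
 * where 'atlas-master' is a searchProperty from the loaded manifests.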
--------------------------------------------------------------------------------
/src/worker-searcher.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 | 
3 | require('process').title = 'marian-worker'
4 | const pathModule = require('path')
5 | 
6 | const dictionary = require('dictionary-en-us')
7 | const nspell = require('nspell')
8 | const Query = require(pathModule.join(__dirname, './src/fts/Query.js')).Query
9 | const fts = require(pathModule.join(__dirname, './src/fts/fts.js'))
10 | const correlations = require(pathModule.join(__dirname, './src/correlations.js')).correlations
11 | 
12 | const MAXIMUM_TERMS = 10
13 | 
14 | let spelling = null
15 | let searchPropertyAliases = new Map()
16 | let index = null
17 | let documents = {}
18 | 
19 | /**
20 |  * Search the index, and return results within the given searchProperty.
21 |  * @param {string} queryString The query string.
22 |  * @param {[string]} searchProperties The properties to search. If empty, all results are returned.
23 |  * @param {boolean} useHits True if HITS link analysis should be performed.
24 |  * @return {{results: [{title: String, preview: String, url: String}], spellingCorrections: Object}}
25 |  */
26 | function search(queryString, searchProperties, useHits) {
27 |     if (!index) {
28 |         throw new Error('still-indexing')
29 |     }
30 | 
31 |     searchProperties = searchProperties.map((property) => {
32 |         if (searchPropertyAliases.has(property)) {
33 |             return searchPropertyAliases.get(property)
34 |         }
35 | 
36 |         return property
37 |     })
38 | 
39 |     const parsedQuery = new Query(queryString)
40 |     if (parsedQuery.terms.size > MAXIMUM_TERMS) {
41 |         throw new Error('query-too-long')
42 |     }
43 | 
44 |     if (searchProperties.length) {
45 |         const properties = new Set(searchProperties)
46 |         parsedQuery.filter = (_id) => properties.has(documents[_id].searchProperty)
47 |     } else {
48 |         parsedQuery.filter = (_id) => documents[_id].includeInGlobalSearch === true
49 |     }
50 | 
51 |     let rawResults = index.search(parsedQuery, useHits)
52 | 
53 |     // If our results seem poor in quality, check if the query is misspelled
54 |     const misspelled = {}
55 |     if (spelling !== null && (rawResults.length === 0 || rawResults[0].score <= 0.6)) {
56 |         for (const term of parsedQuery.terms) {
57 |             const suggestions = spelling.suggest(term)
58 |             if (suggestions.length > 0) {
59 |                 misspelled[term] = suggestions[0]
60 |             }
61 |         }
62 |     }
63 | 
64 |     rawResults = rawResults.map((match) => {
65 |         const doc = documents[match._id]
66 |         // console.log(doc.title, match.score, match.relevancyScore, match.authorityScore)
67 |         return {
68 |             title: doc.title,
69 |             preview: doc.preview,
70 |             url: doc.url
71 |         }
72 |     })
73 | 
74 |     return {
75 |         results: rawResults,
76 |         spellingCorrections: misspelled
77 |     }
78 | }
79 | 
80 | function setupSpellingDictionary(words) {
81 |     dictionary((err, dict) => {
82 |         if (err) {
83 |             console.error(err)
84 |             return
85 |         }
86 |         const newWords = dict.dic.utf8Slice().split('\n').filter((w) => {
87 |             return words.has(w.split('/', 1)[0])
88 |         })
89 |         const newSpelling = nspell(dict.aff, newWords.join('\n'))
90 |         for (const word of words) {
91 |             newSpelling.add(word)
92 |         }
93 | 
94 |         spelling = newSpelling
95 |     })
96 | }
97 | 
98 | function sync(manifests) {
99 |     const newSearchPropertyAliases = new Map()
100 |     const newIndex = new fts.FTSIndex([
101 |         ['text', 1],
102 |         ['headings', 5],
103 |         ['title', 10],
104 |         ['tags', 10],
105 |     ])
106 | 
107 |     for (const [term, synonym, weight] of correlations) {
108 |         newIndex.correlateWord(term, synonym, weight)
109 |     }
110 | 
111 |     manifests = manifests.map((manifest) => {
112 |         manifest.body = JSON.parse(manifest.body)
113 |         let url = manifest.body.url.replace(/\/+$/, '')
114 |         const urlRoot = new URL(url)
115 |         urlRoot.pathname = urlRoot.pathname.replace(/^\/+/, '')
116 |         url = urlRoot.toString().replace(/\/+$/, '')
117 | 
118 |         for (const alias of (manifest.body.aliases || [])) {
119 |             newSearchPropertyAliases.set(alias, manifest.searchProperty)
120 |         }
121 | 
122 |         manifest.body.documents = manifest.body.documents.map((doc) => {
123 |             doc.slug = doc.slug.replace(/^\/+/, '')
124 |             doc.url = `${url}/${doc.slug}`
125 | 
126 |             return doc
127 |         })
128 | 
129 |         return {
130 |             documents: manifest.body.documents,
131 |             searchProperty: manifest.searchProperty,
132 |             includeInGlobalSearch: manifest.body.includeInGlobalSearch
133 |         }
134 |     })
135 | 
136 |     const words = new Set()
137 |     const newDocuments = Object.create(null)
138 | 
139 |     for (const manifest of manifests) {
140 |         for (const doc of manifest.documents) {
141 |             const weight = doc.weight || 1
142 |             const id = 
newIndex.add(manifest.searchProperty, { 143 | links: doc.links, 144 | url: doc.url, 145 | 146 | weight: weight, 147 | text: doc.text, 148 | tags: doc.tags, 149 | headings: (doc.headings || []).join(' '), 150 | title: doc.title}, (word) => words.add(word)) 151 | 152 | newDocuments[id] = { 153 | title: doc.title, 154 | preview: doc.preview, 155 | url: doc.url, 156 | searchProperty: manifest.searchProperty, 157 | includeInGlobalSearch: manifest.includeInGlobalSearch 158 | } 159 | } 160 | } 161 | 162 | setupSpellingDictionary(words) 163 | index = newIndex 164 | searchPropertyAliases = newSearchPropertyAliases 165 | documents = newDocuments 166 | } 167 | 168 | self.onmessage = function(event) { 169 | const message = event.data.message 170 | const messageId = event.data.messageId 171 | 172 | try { 173 | if (message.search !== undefined) { 174 | const properties = (message.search.searchProperty || '').split(',').filter((x) => x) 175 | 176 | const results = search(message.search.queryString, properties, message.search.useHits) 177 | self.postMessage({results: results, messageId: messageId}) 178 | } else if (message.sync !== undefined) { 179 | sync(message.sync) 180 | self.postMessage({ok: true, messageId: messageId}) 181 | } else { 182 | throw new Error('Unknown command') 183 | } 184 | } catch (err) { 185 | self.postMessage({error: err.message, messageId: messageId}) 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /test/integration_test.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | /* eslint-env node, mocha */ 3 | 'use strict' 4 | 5 | const assert = require('assert') 6 | const process = require('process') 7 | 8 | const testUtil = require('./util.js') 9 | 10 | describe('integration', function() { 11 | this.slow(100) 12 | 13 | let ctx = null 14 | 15 | before('starting server', function(done) { 16 | ctx = testUtil.startServer('dir:test/manifests/', done) 17 | }) 18 | 19 | let lastSync 20 | function testFunctionality() { 21 | it('should return proper /status document', async () => { 22 | const result = await testUtil.request(`http://localhost:${ctx.port}/status`) 23 | assert.strictEqual(result.response.statusCode, 200) 24 | assert.strictEqual(result.response.headers['content-type'], 'application/json') 25 | assert.ok(result.json.lastSync.finished) 26 | lastSync = result.json.lastSync.finished 27 | assert.deepStrictEqual(result.json.manifests.sort(), ['atlas-master', 'bi-connector-master']) 28 | }) 29 | 30 | it('should return proper results for a normal query', async () => { 31 | const result = await testUtil.request(`http://localhost:${ctx.port}/search?q=${encodeURIComponent('"connect dialog" compass')}`) 32 | assert.strictEqual(result.response.statusCode, 200) 33 | assert.strictEqual(result.response.headers['content-type'], 'application/json') 34 | assert.deepStrictEqual(result.json, {'results':[{'title':'Connect via Compass — MongoDB Atlas','preview':'The Connect dialog for a cluster provides the details to connect to a cluster via Compass.','url':'https://docs.atlas.mongodb.com/compass-connection/index.html'},{'title':'Connect via mongo Shell — MongoDB Atlas','preview':'The Connect dialog for a cluster provides the details to connect to a cluster via the mongo shell.','url':'https://docs.atlas.mongodb.com/mongo-shell-connection/index.html'},{'title':'Connect via Driver — MongoDB Atlas','preview':'The Connect dialog for a cluster provides the details to connect to a cluster with 
an application using a MongoDB driver.','url':'https://docs.atlas.mongodb.com/driver-connection/index.html'},{'title':'Connect to a Cluster — MongoDB Atlas','preview':'Atlas provides instructions on connecting to a cluster via the mongo shell, a MongoDB driver, or MongoDB Compass via the Atlas UI.','url':'https://docs.atlas.mongodb.com/connect-to-cluster/index.html'},{'title':'Set up VPC Peering Connection — MongoDB Atlas','preview':'For Atlas clusters deployed on Google Cloud Platform or Microsoft Azure, add the IP addresses of your GCP or Azure services to Atlas group IP whitelist to grant those services access to the cluster.','url':'https://docs.atlas.mongodb.com/security-vpc-peering/index.html'},{'title':'Connect from Tableau Desktop — MongoDB Connector for BI 2.2','preview':'The MongoDB Connector for BI is a named connector in Tableau.','url':'https://docs.mongodb.com/bi-connector/current/connect/tableau/index.html'},{'title':'Load File with mongoimport — MongoDB Atlas','preview':'You can use mongoimport to import data from a JSON or a CSV file into MongoDB Atlas cluster.','url':'https://docs.atlas.mongodb.com/import/mongoimport/index.html'},{'title':'Migrate with mongomirror — MongoDB Atlas','preview':'mongomirror is a utility for migrating data from an existing MongoDB replica set to a MongoDB Atlas replica set. mongomirror does not require you to shut down your existing replica set or applications.','url':'https://docs.atlas.mongodb.com/import/mongomirror/index.html'},{'title':'MongoDB Atlas — MongoDB Atlas','preview':'MongoDB Atlas is a cloud service for running, monitoring, and maintaining MongoDB deployments, including the provisioning of dedicated servers for the MongoDB instances. In addition, Atlas provides the ability to introspect collections, query backups, and migrate data from existing MongoDB replica set into an Atlas cluster.','url':'https://docs.atlas.mongodb.com/index.html'}],'spellingCorrections':{}}) 35 | }) 36 | 37 | // Test spelling correction 38 | it('should return spelling corrections', async () => { 39 | const result = await testUtil.request(`http://localhost:${ctx.port}/search?q=quary`) 40 | assert.strictEqual(result.response.statusCode, 200) 41 | assert.strictEqual(result.response.headers['content-type'], 'application/json') 42 | assert.deepStrictEqual(result.json.spellingCorrections, {'quary': 'query'}) 43 | }) 44 | 45 | // Test variants of searchProperty 46 | it('should properly handle searchProperty', async () => { 47 | let result = await testUtil.request(`http://localhost:${ctx.port}/search?q=aggregation`) 48 | assert.strictEqual(result.response.statusCode, 200) 49 | assert.strictEqual(result.response.headers['content-type'], 'application/json') 50 | assert.deepStrictEqual(result.json, {'results':[{'title':'Schema Configuration — MongoDB Connector for BI 2.2','preview':'Business intelligence tools connect to a data source and, given a fixed tabular schema, allow the user to visually explore their data. 
As MongoDB uses a flexible schema, these tools currently cannot use MongoDB as a native data source.','url':'https://docs.mongodb.com/bi-connector/current/schema-configuration/index.html'},{'title':'Supported SQL Functions and Operators — MongoDB Connector for BI 2.2','preview':'MongoDB Connector for BI Version 2.2 is compatible with SQL-99 SELECT statements.','url':'https://docs.mongodb.com/bi-connector/current/supported-operations/index.html'},{'title':'MongoDB Reference — MongoDB Atlas','preview':'For a comprehensive documentation of MongoDB, refer to the MongoDB Manual. The following sections in the manual provide some starting points for developing with MongoDB.','url':'https://docs.atlas.mongodb.com/mongodb-reference/index.html'},{'title':'Command Limitations in Free Tier Clusters — MongoDB Atlas','preview':'Atlas Free Tier clusters do not support all functionality available to other clusters.','url':'https://docs.atlas.mongodb.com/unsupported-commands/index.html'},{'title':'Monitor a Cluster — MongoDB Atlas','preview':'Atlas collects and displays metrics for your servers, databases, and MongoDB processes. Atlas displays three charts in the Clusters view and additional charts in the Metrics view.','url':'https://docs.atlas.mongodb.com/monitor-cluster-metrics/index.html'},{'title':'Create a Cluster — MongoDB Atlas','preview':'Atlas-managed MongoDB deployments, or “clusters”, can be either a replica set or a sharded cluster.','url':'https://docs.atlas.mongodb.com/create-new-cluster/index.html'},{'title':'Query a Backup Snapshot — MongoDB Atlas','preview':'Atlas provides queryable backups. This functionality allows you to query specific backup snapshot. You can use the queryable backups to:','url':'https://docs.atlas.mongodb.com/query-backup/index.html'},{'title':'MongoDB Connector for BI — MongoDB Connector for BI 2.2','preview':'The MongoDB Connector for Business Intelligence (BI) allows users to visualize their MongoDB Enterprise data using existing relational business intelligence tools such as Tableau, MicroStrategy, and Qlik. These tools expect to connect to a data source and find data in tabular form following a fixed schema. This presents a challenge when working with MongoDB’s flexible schema and rich, multi-dimensional documents.','url':'https://docs.mongodb.com/bi-connector/current/index.html'},{'title':'Release Notes for MongoDB Connector for BI — MongoDB Connector for BI 2.2','preview':'Supports authenticating directly against MongoDB using the new C and JDBC authentication plugins. These plugins support SCRAM-SHA-1 and PLAIN mechanisms and remove the SSL requirement for authentication. 
The authentication plugins can be found on GitHub:','url':'https://docs.mongodb.com/bi-connector/current/release-notes/index.html'},{'title':'mongodrdl — MongoDB Connector for BI 2.2','preview':'The mongodrdl command man page.','url':'https://docs.mongodb.com/bi-connector/current/reference/mongodrdl/index.html'}],'spellingCorrections':{}}) 51 | 52 | const result2 = await testUtil.request(`http://localhost:${ctx.port}/search?q=aggregation&searchProperty=atlas-master,bi-connector-master`) 53 | assert.deepStrictEqual(result.json, result2.json) 54 | 55 | result = await testUtil.request(`http://localhost:${ctx.port}/search?q=aggregation&searchProperty=bi-connector-master`) 56 | assert.strictEqual(result.response.statusCode, 200) 57 | assert.strictEqual(result.response.headers['content-type'], 'application/json') 58 | assert.deepStrictEqual(result.json, {'results':[{'title':'Schema Configuration — MongoDB Connector for BI 2.2','preview':'Business intelligence tools connect to a data source and, given a fixed tabular schema, allow the user to visually explore their data. As MongoDB uses a flexible schema, these tools currently cannot use MongoDB as a native data source.','url':'https://docs.mongodb.com/bi-connector/current/schema-configuration/index.html'},{'title':'Supported SQL Functions and Operators — MongoDB Connector for BI 2.2','preview':'MongoDB Connector for BI Version 2.2 is compatible with SQL-99 SELECT statements.','url':'https://docs.mongodb.com/bi-connector/current/supported-operations/index.html'},{'title':'MongoDB Connector for BI — MongoDB Connector for BI 2.2','preview':'The MongoDB Connector for Business Intelligence (BI) allows users to visualize their MongoDB Enterprise data using existing relational business intelligence tools such as Tableau, MicroStrategy, and Qlik. These tools expect to connect to a data source and find data in tabular form following a fixed schema. This presents a challenge when working with MongoDB’s flexible schema and rich, multi-dimensional documents.','url':'https://docs.mongodb.com/bi-connector/current/index.html'},{'title':'Release Notes for MongoDB Connector for BI — MongoDB Connector for BI 2.2','preview':'Supports authenticating directly against MongoDB using the new C and JDBC authentication plugins. These plugins support SCRAM-SHA-1 and PLAIN mechanisms and remove the SSL requirement for authentication. 
The authentication plugins can be found on GitHub:','url':'https://docs.mongodb.com/bi-connector/current/release-notes/index.html'},{'title':'mongodrdl — MongoDB Connector for BI 2.2','preview':'The mongodrdl command man page.','url':'https://docs.mongodb.com/bi-connector/current/reference/mongodrdl/index.html'}],'spellingCorrections':{}}) 59 | 60 | const result3 = await testUtil.request(`http://localhost:${ctx.port}/search?q=aggregation&searchProperty=bi-connector-alias`) 61 | assert.deepStrictEqual(result.json, result3.json) 62 | }) 63 | 64 | it('should return 304 if index hasn\'t changed', async () => { 65 | const result = await testUtil.request({ 66 | port: ctx.port, 67 | path: `/search?q=${encodeURIComponent('quary')}`, 68 | headers: { 69 | 'If-Modified-Since': new Date().toUTCString() 70 | }}) 71 | assert.strictEqual(result.response.statusCode, 304) 72 | }) 73 | 74 | it('should NOT return 304 if index has changed', async () => { 75 | const result = await testUtil.request({ 76 | port: ctx.port, 77 | path: `/search?q=${encodeURIComponent('quary')}`, 78 | headers: { 79 | 'If-Modified-Since': new Date(0).toUTCString() 80 | }}) 81 | assert.strictEqual(result.response.statusCode, 200) 82 | }) 83 | } 84 | 85 | it('should print port to stdout', () => { 86 | assert.ok(ctx.port) 87 | }) 88 | 89 | testFunctionality() 90 | 91 | it('should return 200 to /refresh', async function() { 92 | this.slow(5000) 93 | const result = await testUtil.request({ 94 | method: 'post', 95 | port: ctx.port, 96 | path: '/refresh'}) 97 | assert.strictEqual(result.response.statusCode, 200) 98 | 99 | await new Promise((resolve, reject) => { 100 | const intervalID = setInterval(async () => { 101 | const result = await testUtil.request({ 102 | port: ctx.port, 103 | path: '/status'}) 104 | 105 | try { 106 | assert.strictEqual(result.response.statusCode, 200) 107 | } catch (err) { 108 | reject(err) 109 | return 110 | } 111 | 112 | if (result.json.lastSync.finished > lastSync) { 113 | clearInterval(intervalID) 114 | resolve() 115 | } 116 | }, 100) 117 | }) 118 | }) 119 | 120 | after('shutting down', function() { 121 | process.kill(ctx.child.pid, 'SIGINT') 122 | }) 123 | }) 124 | -------------------------------------------------------------------------------- /test/regression_test.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | /* eslint-env node, mocha */ 3 | 'use strict' 4 | 5 | const assert = require('assert') 6 | const fs = require('fs') 7 | const { URL } = require('url') 8 | const process = require('process') 9 | const testUtil = require('./util.js') 10 | 11 | const MANIFEST_SOURCE = process.env.MANIFEST_SOURCE 12 | if (!MANIFEST_SOURCE) { 13 | throw new Error('Missing manifest source') 14 | } 15 | 16 | // https://en.wikipedia.org/wiki/Discounted_cumulative_gain 17 | function computeDcg(relevancies) { 18 | let result = 0 19 | for (let i = 0; i < relevancies.length; i += 1) { 20 | result += (Math.pow(2, relevancies[i]) - 1) / Math.log2(i + 2) 21 | } 22 | 23 | return result 24 | } 25 | 26 | async function search(query, port) { 27 | const result = await testUtil.request(`http://localhost:${port}/search?q=${encodeURIComponent(query)}&searchProperty=manual-current`) 28 | assert.strictEqual(result.response.statusCode, 200) 29 | assert.strictEqual(result.response.headers['content-type'], 'application/json') 30 | return result.json.results 31 | } 32 | 33 | async function computeScore(queries, port) { 34 | let total = 0.0 35 | let min = Infinity 36 | const entries 
= Object.entries(queries)
37 |     for (const [query, scores] of entries) {
38 |         const results = (await search(query, port)).slice(0, 5).map((result) => {
39 |             return new URL(result.url).pathname.replace(/^\/manual\//, '').replace(/\/(?:index\.html)?$/, '')
40 |         })
41 | 
42 |         const relevancyList = results.map((pathname) => {
43 |             if (scores[pathname] !== undefined) {
44 |                 return scores[pathname]
45 |             }
46 | 
47 |             console.error(`Unknown result: "${pathname}" for query: "${query}"`)
48 |             return 0
49 |         })
50 | 
51 |         // Sort descending: the ideal ordering puts the most relevant pages first.
52 |         const idealizedRelevancyList = Object.values(scores).filter(x => x > 0).sort((a, b) => b - a)
53 |         const dcg = computeDcg(relevancyList)
54 |         const idealizedDcg = computeDcg(idealizedRelevancyList)
55 |         const normalizedDcg = dcg / idealizedDcg
56 |         total += normalizedDcg
57 |         if (normalizedDcg < min) {
58 |             min = normalizedDcg
59 |         }
60 | 
61 |         if (normalizedDcg === 0) {
62 |             console.warn(`Nothing relevant found in the top 5 results for "${query}"`)
63 |         }
64 |     }
65 | 
66 |     return [min, total / entries.length]
67 | }
68 | 
69 | describe('regression', function() {
70 |     this.slow(120000)
71 | 
72 |     let ctx = null
73 | 
74 |     before('starting server', function(done) {
75 |         ctx = testUtil.startServer(MANIFEST_SOURCE, done)
76 |     })
77 | 
78 |     it('should be relevant for "find"', async () => {
79 |         const result = (await search('find', ctx.port))[0].url
80 |         assert.strictEqual(result, 'https://docs.mongodb.com/manual/reference/method/db.collection.find/index.html')
81 |     })
82 | 
83 |     it('should be relevant for "mongod.conf"', async () => {
84 |         const result = (await search('mongod.conf', ctx.port))[0].url
85 |         assert.strictEqual(result, 'https://docs.mongodb.com/manual/reference/configuration-options/index.html')
86 |     })
87 | 
88 |     it('should be relevant for "$in"', async () => {
89 |         const results = (await search('$in', ctx.port)).slice(0, 2).map((d) => d.url).sort()
90 |         assert.deepStrictEqual(results, [
91 |             'https://docs.mongodb.com/manual/reference/operator/aggregation/in/index.html',
92 |             'https://docs.mongodb.com/manual/reference/operator/query/in/index.html'])
93 | 
94 |         const results2 = (await search('in', ctx.port)).slice(0, 2).map((d) => d.url).sort()
95 |         assert.deepStrictEqual(results, results2)
96 |     })
97 | 
98 |     it('should not reduce relevancy score', async () => {
99 |         const queries = fs.readFileSync('test/queries.json')
100 |         const [minScore, meanScore] = await computeScore(JSON.parse(queries), ctx.port)
101 |         console.log(`Minimum nDCG@5: ${minScore}`)
102 |         console.log(`Mean nDCG@5: ${meanScore}`)
103 |         assert(meanScore > 0.56)
104 |     })
105 | 
106 |     after('shutting down', function() {
107 |         process.kill(ctx.child.pid, 'SIGINT')
108 |     })
109 | })
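110 | 
111 | // Worked example of the nDCG@5 math above (illustrative numbers only):
112 | // for relevancies [3, 1, 0], computeDcg returns
113 | //     (2^3 - 1)/log2(2) + (2^1 - 1)/log2(3) + (2^0 - 1)/log2(4)
114 | //   = 7/1 + 1/1.585 + 0 ≈ 7.63
115 | // The ideal ordering is the same here, so nDCG = 7.63/7.63 = 1.0; a query that
116 | // surfaced those pages in the order [0, 1, 3] instead would score
117 | //   (0 + 1/1.585 + 7/2) / 7.63 ≈ 0.54.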
--------------------------------------------------------------------------------
/test/test_pool.js:
--------------------------------------------------------------------------------
1 | /* eslint-env node, mocha */
2 | 'use strict'
3 | 
4 | const assert = require('assert')
5 | const Pool = require('../src/pool.js').Pool
6 | 
7 | describe('Pool', () => {
8 |     let i = 0
9 |     const pool = new Pool(3, () => {
10 |         i += 1
11 |         return {
12 |             backlog: i,
13 |             i: i
14 |         }
15 |     })
16 | 
17 |     it('Should be idempotent', () => {
18 |         assert.strictEqual(pool.get().i, 1)
19 |         assert.strictEqual(pool.get().i, 1)
20 |     })
21 | 
22 |     it('Should select the unsuspended element with the smallest backlog', () => {
23 |         assert.deepStrictEqual(pool.getStatus(), [1, 2, 3])
24 | 
25 |         pool.pool[0].backlog += 3
26 |         const x = pool.get()
27 |         assert.strictEqual(x.i, 2)
28 |         pool.suspend(x)
29 |         assert.deepStrictEqual(pool.getStatus(), [4, 's', 3])
30 |         assert.strictEqual(pool.get().i, 3)
31 |         pool.resume(x)
32 |         assert.deepStrictEqual(pool.getStatus(), [4, 2, 3])
33 |         assert.strictEqual(pool.get().i, 2)
34 | 
35 |         pool.pool[0].backlog -= 2
36 |         assert.strictEqual(pool.get().i, 1)
37 | 
38 |         pool.pool[2].backlog -= 2
39 |         assert.strictEqual(pool.get().i, 3)
40 |     })
41 | 
42 |     it('Should throw if no elements are available', () => {
43 |         for (const worker of pool.pool) {
44 |             pool.suspend(worker)
45 |         }
46 | 
47 |         assert.throws(() => {
48 |             pool.get()
49 |         }, /pool-unavailable/)
50 |     })
51 | })
--------------------------------------------------------------------------------
/test/test_query.js:
--------------------------------------------------------------------------------
1 | /* eslint-env node, mocha */
2 | 'use strict'
3 | 
4 | const assert = require('assert')
5 | const Query = require('../src/fts/Query.js').Query
6 | 
7 | describe('Query', () => {
8 |     it('should parse a single term', () => {
9 |         const query = (new Query('foo'))
10 |         assert.deepStrictEqual(query.terms, new Set(['foo']))
11 |         assert.deepStrictEqual(query.phrases, [])
12 |     })
13 | 
14 |     it('should delimit terms with any standard whitespace characters', () => {
15 |         const query = (new Query('foo \t bar'))
16 |         assert.deepStrictEqual(query.terms, new Set(['foo', 'bar']))
17 |         assert.deepStrictEqual(query.phrases, [])
18 |     })
19 | 
20 |     it('should parse multi-word phrases', () => {
21 |         const query = (new Query('foo "one phrase" bar "second phrase"'))
22 |         assert.deepStrictEqual(query.terms, new Set(['foo', 'one', 'phrase', 'bar', 'second']))
23 |         assert.deepStrictEqual(query.phrases, ['one phrase', 'second phrase'])
24 |     })
25 | 
26 |     it('should handle adjacent phrases', () => {
27 |         const query = (new Query('"introduce the" "officially supported"'))
28 |         assert.deepStrictEqual(query.terms, new Set(['introduce', 'the', 'officially', 'supported']))
29 |         assert.deepStrictEqual(query.phrases, ['introduce the', 'officially supported'])
30 |         assert.deepStrictEqual(query.stemmedPhrases, [['introduc'], ['offici', 'support']])
31 |     })
32 | 
33 |     it('should handle a phrase fragment as a single phrase', () => {
34 |         const query = (new Query('"officially supported'))
35 |         assert.deepStrictEqual(query.terms, new Set(['officially', 'supported']))
36 |         assert.deepStrictEqual(query.phrases, ['officially supported'])
37 |     })
38 | 
39 |     describe('#checkPhrases', () => {
40 |         it('should match phrases with adjacent words', () => {
41 |             const query = (new Query('"Quoth the raven"'))
42 |             const tokenPositions = new Map([
43 |                 ['quoth', [0, 5]],
44 |                 ['raven', [8, 1]]])
45 |             assert.ok(query.checkPhrases(tokenPositions))
46 |         })
47 | 
48 |         it('should refuse phrases without adjacent words', () => {
49 |             const query = (new Query('"foo bar" "Quoth the raven"'))
50 |             const tokenPositions = new Map([
51 |                 ['quoth', [0, 3]],
52 |                 ['raven', [2, 5]],
53 |                 ['foo', [6]],
54 |                 ['bar', [7]]])
55 |             assert.ok(!query.checkPhrases(tokenPositions))
56 |         })
57 |     })
58 | })
--------------------------------------------------------------------------------
/test/test_stemmer.js:
--------------------------------------------------------------------------------
1 | /* eslint-env node, mocha */
2 | 'use strict'
3 | 
4 | const assert = require('assert')
5 | const fs = require('fs')
6 | const promisify = require('util').promisify
7 | const {tokenize, stem} = require('../src/fts/Stemmer.js')
8 | 
9 | describe('Stemmer', () => {
10 |     describe('#tokenize', () => {
11 |
it('should split on whitespace', () => { 12 | assert.deepStrictEqual(tokenize('The qUick \tbrown\n\n\t fox.'), ['the', 'quick', 'brown', 'fox']) 13 | }) 14 | 15 | it('should handle code somewhat coherently', () => { 16 | assert.deepStrictEqual( 17 | tokenize('db.scores.find(\n { results: { $elemMatch: { $gte: 80, $lt: 85 } } }\n)'), 18 | ['db.scores.find', 'results', '$elemmatch', '$gte', '80', '$lt', '85']) 19 | }) 20 | 21 | it('should tokenize atomic phrases', () => { 22 | assert.deepStrictEqual( 23 | tokenize('ops manager configuration'), 24 | ['ops manager', 'configuration']) 25 | assert.strictEqual(stem('ops manager'), 'ops manager') 26 | }) 27 | 28 | it('should replace a standalone $ with "positional operator"', () => { 29 | assert.deepStrictEqual( 30 | tokenize('$ operator'), 31 | ['positional', 'operator', 'operator']) 32 | 33 | assert.deepStrictEqual( 34 | tokenize('$max operator'), 35 | ['$max', 'operator']) 36 | }) 37 | 38 | it('should pass the porter2 test vector', async function() { 39 | this.slow(250) 40 | 41 | const text = await promisify(fs.readFile)('test/stemmed-corpus.txt', {encoding: 'utf-8'}) 42 | const lines = text.split('\n') 43 | for (let line of lines) { 44 | line = line.trim() 45 | if (!line) { continue } 46 | const [word, correctStemmed] = line.split(/\s+/, 2) 47 | const stemmed = stem(word) 48 | assert.strictEqual(stemmed, correctStemmed) 49 | } 50 | }) 51 | }) 52 | }) 53 | -------------------------------------------------------------------------------- /test/test_trie.js: -------------------------------------------------------------------------------- 1 | /* eslint-env node, mocha */ 2 | 'use strict' 3 | 4 | const assert = require('assert') 5 | const Trie = require('../src/fts/Trie.js').Trie 6 | 7 | describe('Trie', () => { 8 | const trie = new Trie() 9 | 10 | it('Should be idempotent', () => { 11 | trie.insert('foobar', 0) 12 | trie.insert('foobar', 0) 13 | 14 | assert.deepStrictEqual( 15 | trie.search('foobar', true), 16 | new Map([[0, new Set(['foobar'])]])) 17 | 18 | assert.deepStrictEqual( 19 | trie.search('foobar', false), 20 | new Map([[0, new Set(['foobar'])]])) 21 | }) 22 | 23 | it('Should be additive', () => { 24 | trie.insert('foobar', 1) 25 | 26 | assert.deepStrictEqual( 27 | trie.search('foobar', true), 28 | new Map([[0, new Set(['foobar'])], [1, new Set(['foobar'])]])) 29 | 30 | assert.deepStrictEqual( 31 | trie.search('foobar', false), 32 | new Map([[0, new Set(['foobar'])], [1, new Set(['foobar'])]])) 33 | }) 34 | 35 | it('Should handle prefix matching', () => { 36 | trie.insert('foobaz', 0) 37 | 38 | assert.deepStrictEqual( 39 | trie.search('foo', true), 40 | new Map([ 41 | [0, new Set(['foobar', 'foobaz'])], 42 | [1, new Set(['foobar'])]])) 43 | 44 | assert.deepStrictEqual( 45 | trie.search('foo', false), 46 | new Map()) 47 | }) 48 | }) 49 | -------------------------------------------------------------------------------- /test/util.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | /* eslint-env node */ 3 | const child_process = require('child_process') 4 | const http = require('http') 5 | const readline = require('readline') 6 | 7 | function startServer(path, done) { 8 | let isDone = false 9 | 10 | const child = child_process.spawn('./src/index.js', [path], { 11 | stdio: [0, 'pipe', 2] 12 | }) 13 | 14 | const rl = readline.createInterface({ 15 | input: child.stdout 16 | }) 17 | 18 | const ctx = { 19 | child: child, 20 | port: 0 21 | } 22 | 23 | rl.on('line', (line) => { 24 | if (isDone) { 
return }
25 | 
26 |         const match = line.match(/Listening on port ([0-9]+)/)
27 |         if (match) {
28 |             ctx.port = parseInt(match[1])
29 |         }
30 | 
31 |         if (line.match(/Loaded new index/)) {
32 |             isDone = true
33 |             done()
34 |         } else if (line.match(/Error/)) {
35 |             throw new Error(line)
36 |         }
37 |     })
38 | 
39 |     // readline.Interface emits neither 'error' nor 'end'; surface spawn
40 |     // failures from the child process itself instead.
41 |     child.on('error', (err) => {
42 |         throw err
43 |     })
44 | 
45 |     return ctx
46 | }
47 | 
48 | function request(url) {
49 |     return new Promise((resolve, reject) => {
50 |         http.request(url, (res) => {
51 |             res.setEncoding('utf8')
52 |             let data = ''
53 | 
54 |             res.on('data', (chunk) => { data += chunk })
55 |             res.on('end', () => {
56 |                 resolve({
57 |                     response: res,
58 |                     json: data ? JSON.parse(data) : undefined
59 |                 })
60 |             })
61 |             res.on('error', (err) => {
62 |                 reject(err)
63 |             })
64 |         }).end()
65 |     })
66 | }
67 | 
68 | exports.request = request
69 | exports.startServer = startServer
--------------------------------------------------------------------------------
/tools/update_stemmer.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 | 
3 | const assert = require('assert')
4 | const { spawnSync } = require('child_process')
5 | const fs = require('fs')
6 | const process = require('process')
7 | 
8 | const PAT_CONSTRUCTOR = /constructor\s*\(\) \{([^}]+)\}/
9 | const PAT_START = /r_prelude\s*\(\)/
10 | const PAT_END_START = /^([^\n\S]*)stem\s*\(\)\s*\n/m
11 | 
12 | const source_path = process.argv[2]
13 | const output_path = process.argv[3]
14 | const result = spawnSync('snowball', [source_path, '-o', '.stemmer', '-n', 'Porter2', '-js'])
15 | if (result.status !== 0) {
16 |     throw new Error('Error running snowball')
17 | }
18 | 
19 | let oldJS = fs.readFileSync(output_path, {encoding: 'utf-8'})
20 | const updatedJS = fs.readFileSync('.stemmer.js', {encoding: 'utf-8'})
21 | fs.unlinkSync('.stemmer.js')
22 | 
23 | // Replace the constructor, containing Among definitions
24 | const newConstructor = updatedJS.match(PAT_CONSTRUCTOR)[0].replace(/\n[^\n\S]*\}[^\n\S]*$/, `
25 |     this.B_Y_found = false;
26 |     this.I_p2 = 0;
27 |     this.I_p1 = 0;
28 | }`)
29 | oldJS = oldJS.replace(PAT_CONSTRUCTOR, newConstructor)
30 | 
31 | // Replace the methods. This is... tricky.
32 | function getMethodsStartEnd(js) {
33 |     const startMatch = js.match(PAT_START)
34 |     const startIndex = startMatch.index
35 |     const endStartMatch = js.match(PAT_END_START)
36 |     const endStartIndex = endStartMatch.index
37 |     const endStartIndentation = endStartMatch[1]
38 | 
39 |     const endIndex = endStartIndex + js.slice(endStartIndex).indexOf('\n' + endStartIndentation + '}')
40 |     assert(endIndex > endStartIndex, '"stem() {}" block end not found')
41 |     return [startIndex, endIndex]
42 | }
43 | 
44 | const [oldMethodsStart, oldMethodsEnd] = getMethodsStartEnd(oldJS)
45 | const [newMethodsStart, newMethodsEnd] = getMethodsStartEnd(updatedJS)
46 | const newMethods = updatedJS.slice(newMethodsStart, newMethodsEnd)
47 | 
48 | oldJS = oldJS.slice(0, oldMethodsStart) + newMethods + oldJS.slice(oldMethodsEnd)
49 | fs.writeFileSync(output_path, oldJS)
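50 | 
51 | // Example invocation, assuming the snowball CLI is on $PATH (the paths below
52 | // are the stemmer files that live in this repository):
53 | //
54 | //     node tools/update_stemmer.js src/fts/Porter2.snowball src/fts/Porter2.js
--------------------------------------------------------------------------------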