├── .eslintignore ├── .eslintrc.json ├── .firebaserc ├── .github └── workflows │ ├── ci.yaml │ └── npm-release.yaml ├── .gitignore ├── .prettierrc.js ├── LICENSE ├── README.md ├── firebase.json ├── jest.config.js ├── jest.setup.js ├── package-lock.json ├── package.json ├── src ├── counter.ts ├── cursor.ts ├── index.spec.ts ├── index.ts ├── pagination.spec.ts ├── query.spec.ts ├── query.ts ├── sort.ts ├── tokenizer │ ├── english.ts │ ├── index.ts │ ├── japanese.ts │ ├── tokenize.ts │ └── tokneize.test.ts └── utils │ └── firestore.ts ├── testdata └── 5.en.json └── tsconfig.json /.eslintignore: -------------------------------------------------------------------------------- 1 | lib/ 2 | jest.setup.js -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./node_modules/gts/" 3 | } 4 | -------------------------------------------------------------------------------- /.firebaserc: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: Node.js CI 2 | 3 | on: 4 | push: 5 | paths: 6 | - '**.ts' 7 | - '**.js' 8 | - '*.json' 9 | 10 | jobs: 11 | build: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | strategy: 16 | matrix: 17 | node-version: [10.x, 12.x, 14.x] 18 | 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Use Node.js ${{ matrix.node-version }} 22 | uses: actions/setup-node@v1 23 | with: 24 | node-version: ${{ matrix.node-version }} 25 | - run: npm install 26 | - run: npm run build --if-present 27 | - run: npm test 28 | env: 29 | CI: true -------------------------------------------------------------------------------- /.github/workflows/npm-release.yaml: 
-------------------------------------------------------------------------------- 1 | name: Node.js Package 2 | on: 3 | release: 4 | types: [created] 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: actions/setup-node@v1 11 | with: 12 | node-version: '12.x' 13 | registry-url: 'https://registry.npmjs.org' 14 | - run: npm install 15 | - run: npm test 16 | - run: npm publish 17 | env: 18 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | lib 3 | *.log -------------------------------------------------------------------------------- /.prettierrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | ...require('gts/.prettierrc.json'), 3 | } 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Firestore Full-Text Search 2 | 3 | Firestore Full-Text Search provides a Firestore-specific full-text search function. 4 | It runs on Cloud Functions and has excellent performance. 5 | Supports simple inverted index type search. 6 | 7 | #### Usage 8 | 9 | ```bash 10 | npm install --save firestore-full-text-search 11 | ``` 12 | 13 | ```ts 14 | import admin from 'firebase-admin'; 15 | import FirestoreFullTextSearch from 'firestore-full-text-search'; 16 | 17 | admin.initializeApp({...}); 18 | const db = admin.firestore(); 19 | 20 | // Specifies the collection in which to store the inverted index. 21 | const fullTextSearch = new FirestoreFullTextSearch(db.collection('index')); 22 | 23 | 24 | // Set documents 25 | const postData: Post = { 26 | title: "What's Firestore Full-Text Search?", 27 | content: 28 | 'Firestore Full-Text Search provides a Firestore-specific full-text search function. 
It runs on Cloud Functions and has excellent performance.', 29 | created: admin.firestore.FieldValue.serverTimestamp(), 30 | }; 31 | 32 | const docRef = db.collection('posts').doc('1'); 33 | 34 | // WriteBatch is supported so that documents and search indexes can be stored atomically. 35 | const batch = db.batch(); 36 | batch.set(docRef, postData); 37 | await fullTextSearch.set('en', docRef, {batch, data: postData}); 38 | await batch.commit(); 39 | ``` 40 | 41 | ```js 42 | // Search documents 43 | const results = await fullTextSearch.search('en', 'firestore'); 44 | ``` 45 | 46 | #### ToDo 47 | 48 | - [x] English Support 49 | - [x] Japanese Support 50 | - [x] Implement Query parser 51 | - [x] Implement Delete document 52 | - [x] Sorting Support 53 | - [x] Limit Support 54 | - [x] Pagination Support 55 | - [x] OpenTelemetry Support 56 | - [ ] Browser Support (Search-Only) 57 | - [ ] Firebase Performance Monitoring Support -------------------------------------------------------------------------------- /firebase.json: -------------------------------------------------------------------------------- 1 | { 2 | "emulators": { 3 | "firestore": { 4 | "port": 5000 5 | }, 6 | "ui": { 7 | "enabled": true 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /jest.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | preset: 'ts-jest', 3 | testEnvironment: 'node', 4 | testPathIgnorePatterns: ['lib'], 5 | setupFilesAfterEnv: [`${process.cwd()}/jest.setup.js`], 6 | }; 7 | -------------------------------------------------------------------------------- /jest.setup.js: -------------------------------------------------------------------------------- 1 | jest.setTimeout(30000); 2 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2
| "name": "firestore-full-text-search", 3 | "main": "lib/index.js", 4 | "types": "lib/index.d.ts", 5 | "files": [ 6 | "README.md", 7 | "package.json", 8 | "lib" 9 | ], 10 | "keywords": [ 11 | "search", 12 | "Full Text Search", 13 | "Firebase", 14 | "firebase", 15 | "firestore" 16 | ], 17 | "version": "0.6.1", 18 | "description": "Firestore Full-text Search", 19 | "scripts": { 20 | "test": "firebase emulators:exec jest", 21 | "start:emulators": "firebase emulators:start --project test", 22 | "jest": "jest", 23 | "lint": "gts lint", 24 | "clean": "gts clean", 25 | "compile": "tsc", 26 | "fix": "gts fix", 27 | "prepare": "npm run compile", 28 | "pretest": "npm run compile", 29 | "posttest": "npm run lint" 30 | }, 31 | "repository": { 32 | "type": "git", 33 | "url": "git+https://github.com/k2wanko/firestore-full-text-search.git" 34 | }, 35 | "author": "k2wanko", 36 | "license": "Apache-2.0", 37 | "bugs": { 38 | "url": "https://github.com/k2wanko/firestore-full-text-search/issues" 39 | }, 40 | "homepage": "https://github.com/k2wanko/firestore-full-text-search#readme", 41 | "devDependencies": { 42 | "@google-cloud/firestore": "^4.8.1", 43 | "@opentelemetry/core": "^0.14.0", 44 | "@opentelemetry/metrics": "^0.14.0", 45 | "@opentelemetry/node": "^0.14.0", 46 | "@opentelemetry/tracing": "^0.14.0", 47 | "@types/jest": "^26.0.20", 48 | "@types/kuromoji": "^0.1.0", 49 | "@types/luxon": "^1.25.1", 50 | "@types/node": "^14.11.2", 51 | "firebase-tools": "^9.2.1", 52 | "gts": "^3.0.3", 53 | "jest": "^26.6.3", 54 | "ts-jest": "^26.4.4", 55 | "typescript": "^4.0.3" 56 | }, 57 | "dependencies": { 58 | "@opentelemetry/api": "^0.14.0", 59 | "firebase-admin": "^9.4.2", 60 | "firebase-functions": "^3.13.0", 61 | "kuromoji": "^0.1.2", 62 | "luxon": "^1.25.0" 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/counter.ts: -------------------------------------------------------------------------------- 1 | import type {DocumentReference} 
// ---------------------------------------------------------------------------
// src/counter.ts
// ---------------------------------------------------------------------------

/**
 * Adds `numIncrement` to the sharded counter stored beneath `ref`.
 *
 * One shard document `count/<n>` (with `0 <= n < numShards`) is picked at
 * random and its `count` field is incremented with a merge write.
 *
 * @param ref Document that owns the `count` sub-collection.
 * @param numShards Number of shard documents the writes are spread across.
 * @param numIncrement Amount to add to the chosen shard (may be negative).
 * @param options When `options.batch` is given, the write is queued on that
 *   batch instead of being sent immediately; committing is the caller's job.
 */
export async function incrementCounter(
  ref: DocumentReference,
  numShards: number,
  numIncrement: number,
  options?: {batch: WriteBatch2}
) {
  const shardIndex = Math.floor(Math.random() * numShards);
  const shardRef = ref.collection('count').doc(shardIndex.toString());
  const payload = {count: FieldValue.increment(numIncrement)};

  const pendingBatch = options?.batch;
  if (!pendingBatch) {
    // No batch supplied: perform the write right away.
    await shardRef.set(payload, {merge: true});
    return;
  }
  pendingBatch.set(shardRef, payload, {merge: true});
}

/**
 * Reads every shard document in the `count` sub-collection of `ref` and
 * returns the sum of their `count` fields.
 *
 * @param ref Document that owns the `count` sub-collection.
 * @returns The accumulated counter value (0 when there are no shards).
 */
export async function getCount(ref: DocumentReference): Promise<number> {
  const shards = await ref.collection('count').get();
  return shards.docs.reduce(
    (sum, shard) => sum + (shard.data().count as number),
    0
  );
}

// ---------------------------------------------------------------------------
// src/cursor.ts
// ---------------------------------------------------------------------------

/** Opaque pagination token: base64-encoded JSON of a {@link CursorInfo}. */
export type Cursor = string;

/** Decoded content of a {@link Cursor}. */
export interface CursorInfo {
  /** Field paths in the order they were added to the builder. */
  fields: string[];
  /** Value recorded for each entry of `fields`, keyed by field path. */
  fieldValueMap: {[key: string]: FieldValue};
}

/** Serialises `info` to JSON and encodes the result as base64. */
async function createCursor(info: CursorInfo): Promise<Cursor> {
  const json = JSON.stringify(info);
  return Buffer.from(json).toString('base64');
}

/**
 * Decodes a cursor produced by {@link CursorBuilder.build}.
 *
 * @throws SyntaxError when the base64-decoded text is not valid JSON.
 */
export async function parseCursor(cursor: Cursor): Promise<CursorInfo> {
  const json = Buffer.from(cursor, 'base64').toString();
  return JSON.parse(json) as CursorInfo;
}

/** Collects ordered (field path, value) pairs and encodes them as a cursor. */
export class CursorBuilder {
  #fields: string[];
  #fieldValueMap: {[key: string]: FieldValue};

  constructor() {
    this.#fields = [];
    this.#fieldValueMap = {};
  }

  /**
   * Registers the value for `path`.
   *
   * @param path Field path; each path may be added only once.
   * @param val Value to store for `path`.
   * @throws Error when `path` has already been added.
   */
  add(path: string, val: FieldValue) {
    // Test for an own key rather than a truthy value: a stored falsy value
    // (0, '', false) must still count as "already added", and inherited
    // Object.prototype members (e.g. "constructor") must not.
    if (Object.prototype.hasOwnProperty.call(this.#fieldValueMap, path)) {
      throw new Error(`exists path: ${path}`);
    }
    this.#fields.push(path);
    this.#fieldValueMap[path] = val;
  }

  /** Encodes everything added so far into a {@link Cursor}. */
  async build(): Promise<Cursor> {
    return createCursor({
      fields: this.#fields,
      fieldValueMap: this.#fieldValueMap,
    });
  }
}
It runs on Cloud Functions and has excellent performance.', 34 | created: admin.firestore.FieldValue.serverTimestamp(), 35 | }; 36 | 37 | const docRef = postsRef.doc('gF4lmS8gOlkAPlqGzTHh'); 38 | await docRef.set(postData); 39 | 40 | const indexRef = db.collection('index_simple'); 41 | const fullTextSearch = new FirestoreFullTextSearch(indexRef); 42 | await fullTextSearch.set('en', docRef); 43 | 44 | const word = 'search'; 45 | const wants = ['title', 'content']; 46 | for (const field of wants) { 47 | const contentRef = indexRef.doc( 48 | `/v1/words/${word}/docs/${docRef.id}.${field}` 49 | ); 50 | const contentSnap = await contentRef.get(); 51 | expect(contentSnap.exists).toBe(true); 52 | } 53 | }); 54 | 55 | it('set:batch', async () => { 56 | const db = admin.firestore(); 57 | 58 | const postsRef = db.collection('posts'); 59 | const postData: Post = { 60 | title: "What's Firebase?", 61 | content: 62 | 'Firebase helps you build and run successful apps.\n Backed by Google and loved by app development teams - from startups to global enterprises.', 63 | created: admin.firestore.FieldValue.serverTimestamp(), 64 | }; 65 | 66 | const docRef = postsRef.doc('aF7lmS8gOlkAPlqGzTHh'); 67 | 68 | const batch = db.batch(); 69 | batch.set(docRef, postData); 70 | 71 | const indexRef = db.collection('index_simple'); 72 | const fullTextSearch = new FirestoreFullTextSearch(indexRef); 73 | await fullTextSearch.set('en', docRef, {batch, data: postData}); 74 | 75 | await batch.commit(); 76 | 77 | const word = 'firebas'; 78 | const wants = ['title', 'content']; 79 | for (const field of wants) { 80 | const contentRef = indexRef.doc( 81 | `/v1/words/${word}/docs/${docRef.id}.${field}` 82 | ); 83 | const contentSnap = await contentRef.get(); 84 | expect(contentSnap.exists).toBe(true); 85 | } 86 | }); 87 | 88 | it('set:related', async () => { 89 | const db = admin.firestore(); 90 | const indexRef = db.collection('related'); 91 | const testData = [ 92 | { 93 | data: { 94 | title: "What's 
// Sorting and deletion tests. They run against the Firestore instance that
// `admin` is configured for (see the emulator host setup earlier in the file).
describe('FirestoreFullTextSearch', () => {
  const db = admin.firestore();

  // Seed three dog documents and index them into `index_dogs_sort`.
  beforeAll(async () => {
    const dogs: {[key: string]: Animal} = {
      akita: {
        type: 'dog',
        description:
          'The Akita (秋田犬, Akita-inu, Japanese pronunciation: [akʲita.inɯ]) is a large breed of dog originating from the mountainous regions of northern Japan.',
        like: 10,
      },
      corgi: {
        type: 'dog',
        description:
          'The Welsh Corgi (/ˈkɔːrɡi/[5] plural "Corgis" or occasionally the etymologically consistent "Corgwn"; /ˈkɔːrɡuːn/) is a small type of herding dog that originated in Wales.[6]',
        like: 50,
      },
      'border collie': {
        type: 'dog',
        description:
          'The Border Collie is a working and herding dog breed developed in the Anglo-Scottish border county of Northumberland, for herding livestock, especially sheep.[1]',
        like: 5,
      },
    };

    const indexRef = db.collection('index_dogs_sort');
    const fullTextSearch = new FirestoreFullTextSearch(indexRef);
    for (const [id, data] of Object.entries(dogs)) {
      // Write the document and its index entries in the same batch.
      const batch = db.batch();
      const dogRef = db.collection('dogs').doc(id);
      batch.set(dogRef, data);
      await fullTextSearch.set('en', dogRef, {
        data,
        batch,
        // Only `description` is tokenised into the index.
        indexMask: ['description'],
        // NOTE(review): presumably `like` is stored with each index entry so
        // it can be used for sorting — confirm against the set() implementation.
        fields: ['like'],
      });
      await batch.commit();
    }
  });

  it('search:sort', async () => {
    const indexRef = db.collection('index_dogs_sort');
    const fullTextSearch = new FirestoreFullTextSearch(indexRef);
    const {hits} = await fullTextSearch.search('en', 'herding');
    // Assert on the value itself so a failure reports the actual hit count.
    expect(hits.length).toBe(2);
    expect(hits[0].id).toBe('border collie');
    expect(hits[1].id).toBe('corgi');
  });

  it('delete:document', async () => {
    const postsRef = db.collection('posts');
    const postData: Post = {
      title: "What's Firestore Full-Text Search?",
      content:
        'Firestore Full-Text Search provides a Firestore-specific full-text search function. It runs on Cloud Functions and has excellent performance.',
      created: admin.firestore.FieldValue.serverTimestamp(),
    };

    const docRef = postsRef.doc('post1');
    await docRef.set(postData);

    const indexRef = db.collection('index_delete_test');
    const fullTextSearch = new FirestoreFullTextSearch(indexRef);
    await fullTextSearch.set('en', docRef);

    const word = 'search';
    const wants = ['title', 'content'];

    // Checks, for every expected field, whether the index entry for `word`
    // exists (or not) for this document.
    const expectIndexEntries = async (shouldExist: boolean) => {
      for (const field of wants) {
        const contentRef = indexRef.doc(
          `/v1/words/${word}/docs/${docRef.id}.${field}`
        );
        const contentSnap = await contentRef.get();
        expect(contentSnap.exists).toBe(shouldExist);
      }
    };

    await expectIndexEntries(true);

    await fullTextSearch.delete('en', docRef);

    await expectIndexEntries(false);
  });
});
contentRef.get(); 263 | expect(contentSnap.exists).toBe(true); 264 | } 265 | }); 266 | }); 267 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | import type { 2 | CollectionReference, 3 | DocumentData, 4 | DocumentReference, 5 | Firestore, 6 | Query, 7 | WriteBatch, 8 | } from '@google-cloud/firestore'; 9 | import {FieldValue} from '@google-cloud/firestore'; 10 | import type {LanguageID, Token} from './tokenizer'; 11 | import tokenize from './tokenizer/tokenize'; 12 | import {trace, metrics} from '@opentelemetry/api'; 13 | import {parseQuery, SearchQuery} from './query'; 14 | import {calcScore} from './sort'; 15 | import {getCount, incrementCounter} from './counter'; 16 | import {WriteBatch2} from './utils/firestore'; 17 | import {Cursor, CursorBuilder, parseCursor} from './cursor'; 18 | 19 | export type FieldEntity = { 20 | __positions: Buffer; 21 | __score: number; // tf * idf 22 | __ref: DocumentReference; 23 | }; 24 | 25 | export type WordEntity = { 26 | related: string[]; 27 | }; 28 | 29 | export type CounterEntity = { 30 | count: number; 31 | }; 32 | 33 | export type Options = { 34 | sharedCounterNum?: number; 35 | }; 36 | 37 | export type SetOptions = { 38 | batch?: WriteBatch; 39 | data?: DocumentData; 40 | indexMask?: string[]; 41 | fields?: string[]; 42 | }; 43 | 44 | export type DeleteOptions = { 45 | batch?: WriteBatch; 46 | data?: DocumentData; 47 | indexMask?: string[]; 48 | }; 49 | 50 | export type SearchOptions = { 51 | limit?: number; 52 | cursor?: Cursor; 53 | }; 54 | 55 | export type SearchResult = { 56 | hits: DocumentReference[]; 57 | total: number; 58 | cursor?: Cursor; 59 | }; 60 | 61 | export type FieldTypeEntity = { 62 | type: FieldType; 63 | }; 64 | 65 | export type FieldType = 'string' | 'array' | 'number' | 'date'; 66 | 67 | const tracer = trace.getTracer('firestore-full-text-search'); 68 | 69 | const meter 
= metrics.getMeterProvider().getMeter('firestore-full-text-search');
// Incremented by set() once per index entry (token with a non-empty
// normalized word) that is written.
const documentWriteCounter = meter.createCounter('document_write_count');
// Incremented by set() with the number of tokens produced for each field.
const documentWriteTokenCounter = meter.createCounter(
  'document_write_token_count'
);
// const searchTokenCounter = meter.createCounter('search_token_count');

// Fallback for Options.sharedCounterNum; forwarded to incrementCounter().
const defaultSharedCounterNum = 3;

// Filterable field values copied into index entries, plus the detected type
// of each field.
type IndexedFields = {
  types: {[key: string]: FieldType};
  values: {[key: string]: unknown};
};

/**
 * Reads the filterable `fields` from `data` and classifies each value as
 * 'array', 'date', 'string' or 'number'.
 *
 * Arrays are stored sorted. A copy is sorted so the caller's document data is
 * never mutated.
 *
 * @throws Error when a value is none of the supported types.
 */
function collectFieldData(fields: string[], data: DocumentData): IndexedFields {
  const types: {[key: string]: FieldType} = {};
  const values: {[key: string]: unknown} = {};
  for (const name of fields) {
    const val = data[name];
    if (Array.isArray(val)) {
      types[name] = 'array';
      values[name] = [...val].sort();
    } else if (val instanceof Date) {
      types[name] = 'date';
      values[name] = val;
    } else if (
      val instanceof FieldValue &&
      val.isEqual(FieldValue.serverTimestamp())
    ) {
      // A pending server timestamp resolves to a date on write.
      types[name] = 'date';
      values[name] = val;
    } else if (typeof val === 'string') {
      types[name] = 'string';
      values[name] = val;
    } else if (typeof val === 'number') {
      types[name] = 'number';
      values[name] = val;
    } else {
      throw new Error(`Unsupport filed type ${typeof val}`);
    }
  }
  return {types, values};
}

/**
 * Full-text search index stored below a Firestore collection.
 *
 * Everything lives under the `v1` document of the collection passed to the
 * constructor:
 *  - `v1/words/{word}`            related surface forms of a normalized word
 *  - `v1/words/{word}/docs/{id}`  one entry per (document, field) containing it
 *  - `v1/word_docs/{word}.{id}`   scored entry that search() queries
 *  - `v1/fields/{name}`           detected type of each filterable field
 */
export default class FirestoreFullTextSearch {
  #ref: CollectionReference;
  #db: Firestore;
  #wordsRef: CollectionReference;
  #wordDocsRef: CollectionReference;
  #fieldsRef: CollectionReference;
  #options?: Options;

  /**
   * @param ref     Collection that holds the index.
   * @param options Optional settings (see Options).
   */
  constructor(ref: CollectionReference, options?: Options) {
    this.#ref = ref;
    this.#db = ref.firestore;
    this.#wordsRef = ref.doc('v1').collection('words');
    this.#wordDocsRef = ref.doc('v1').collection('word_docs');
    this.#fieldsRef = ref.doc('v1').collection('fields');
    this.#options = options;
  }

  /**
   * Indexes the string fields of a document.
   *
   * @param lang    Language passed to the tokenizer.
   * @param doc     Document being indexed; fetched when `options.data` is not
   *                given.
   * @param options `data`: document contents to index instead of reading
   *                `doc`; `batch`: write batch to add the index writes to;
   *                `indexMask`: only index these field names; `fields`: field
   *                names whose values are copied into the index entries so
   *                that search() can filter on them.
   * @throws Error when the document does not exist or has no data, or when a
   *         field listed in `fields` has an unsupported type.
   */
  async set(lang: LanguageID, doc: DocumentReference, options?: SetOptions) {
    const span = tracer.startSpan('set');
    // try/finally so the span is ended even when indexing throws.
    try {
      span.setAttributes({
        index: this.#ref.path,
        doc: doc.path,
        lang,
      });
      let data = options?.data;
      if (!data) {
        const snap = await doc.get();
        if (!snap.exists) {
          throw new Error('Document does not exist.');
        }
        data = snap.data() as DocumentData; // exists checked.
      }
      if (!data) {
        throw new Error('Document is empty');
      }
      const source: DocumentData = data;

      const batch = new WriteBatch2(this.#db, {batch: options?.batch});
      const indexMask = options?.indexMask;
      const fields = options?.fields;
      const sharedCounterNum =
        this.#options?.sharedCounterNum ?? defaultSharedCounterNum;

      const allDocCount = await getCount(this.#ref.doc('v1'));

      let newDocCount = 0;
      const newWordCountMap = new Map<string, number>();
      const tokensMap = new Map<string, Token[]>();
      const targetFields = new Set<string>();
      let writeCount = 0;
      let writeTokenCount = 0;

      // Pass 1: pick the fields to index. Only string values qualify; names
      // starting with `__` are skipped.
      for (const [fieldName, value] of Object.entries(source)) {
        if (indexMask && !indexMask.includes(fieldName)) {
          continue;
        }
        if (fieldName.startsWith('__')) {
          continue;
        }
        if (typeof value !== 'string') {
          continue;
        }
        targetFields.add(fieldName);
      }

      // Pass 2: tokenize every field and find out which (word, doc, field)
      // entries are new, so the counters and the scores of pass 3 already
      // account for them.
      for (const fieldName of targetFields) {
        const value: string = source[fieldName];
        const tokens = await tokenize(lang, value);
        tokensMap.set(fieldName, tokens);
        for (const token of tokens) {
          const word = token.normalizedWord;
          if (!word) {
            continue;
          }

          const entryRef = this.#wordsRef
            .doc(word)
            .collection('docs')
            .doc(`${doc.id}.${fieldName}`);
          const entrySnap = await entryRef.get();
          if (!entrySnap.exists) {
            newDocCount = 1;
            newWordCountMap.set(word, 1);
          }
        }
      }

      // Pass 3: queue the index writes.
      // The filterable field data is the same for every token, so it is built
      // (and the field-type docs are queued) only once, on first use.
      let indexedFields: IndexedFields | undefined;
      for (const fieldName of targetFields) {
        const tokens = tokensMap.get(fieldName);
        if (!tokens) {
          throw new Error('Not found tokens');
        }
        for (const token of tokens) {
          const word = token.normalizedWord;
          if (!word) {
            continue;
          }
          const wordRef = this.#wordsRef.doc(word);
          const wordSnap = await wordRef.get();
          if (wordSnap.exists) {
            // Add this surface form to the ones already recorded.
            const wordData = wordSnap.data() as WordEntity;
            batch.set(
              wordRef,
              {
                related: Array.from(
                  new Set(wordData.related.concat([token.word]))
                ),
              },
              {merge: true}
            );
          } else {
            batch.set(wordRef, {related: [token.word]});
          }

          const wordDocCount = await getCount(wordRef);
          const docRef = wordRef
            .collection('docs')
            .doc(`${doc.id}.${fieldName}`);
          const wordDocRef = this.#wordDocsRef.doc(`${word}.${doc.id}`);
          const docData = {
            __word: word,
            __fields: Array.from(targetFields.values()),
            __positions: new Uint8Array(token.positions),
            __score: calcScore(
              token.positions.length,
              tokens.length,
              wordDocCount + (newWordCountMap.get(word) ?? 0),
              allDocCount + newDocCount
            ),
            __ref: doc,
          };
          if (fields) {
            if (!indexedFields) {
              indexedFields = collectFieldData(fields, source);
              for (const [name, type] of Object.entries(indexedFields.types)) {
                batch.set(this.#fieldsRef.doc(name), {
                  type,
                } as FieldTypeEntity);
              }
            }
            batch.set(docRef, {__ref: doc, ...indexedFields.values});
            batch.set(wordDocRef, {...docData, ...indexedFields.values});
          } else {
            batch.set(docRef, {__ref: doc});
            batch.set(wordDocRef, docData);
          }

          if (newWordCountMap.has(word)) {
            await incrementCounter(
              wordRef,
              sharedCounterNum,
              newWordCountMap.get(word) ?? 0,
              {batch}
            );
          }
          writeCount += 1;
        }

        writeTokenCount += tokens.length;
      }

      await incrementCounter(this.#ref.doc('v1'), sharedCounterNum, newDocCount, {
        batch,
      });

      await batch.commit();

      documentWriteCounter
        .bind({
          index: this.#ref.path,
          lang,
        })
        .add(writeCount);
      documentWriteTokenCounter
        .bind({
          index: this.#ref.path,
          lang,
        })
        .add(writeTokenCount);
    } finally {
      span.end();
    }
  }

  /**
   * Removes the index entries of a document.
   *
   * The document contents are tokenized again to find the entries, so `data`
   * (or the stored document) must still hold the text that was indexed.
   *
   * @param lang    Language passed to the tokenizer.
   * @param doc     Document whose entries are removed; fetched when
   *                `options.data` is not given.
   * @param options `data`, `batch` and `indexMask` as for set().
   * @throws Error when the document does not exist or has no data.
   */
  async delete(
    lang: LanguageID,
    doc: DocumentReference,
    options?: DeleteOptions
  ) {
    const span = tracer.startSpan('delete');
    // try/finally so the span is ended even when deletion throws.
    try {
      span.setAttributes({
        index: this.#ref.path,
        doc: doc.path,
        lang,
      });

      let data = options?.data;
      if (!data) {
        const snap = await doc.get();
        if (!snap.exists) {
          throw new Error('Document does not exist.');
        }
        data = snap.data() as DocumentData; // exists checked.
      }
      if (!data) {
        throw new Error('Document is empty');
      }

      const batch = new WriteBatch2(this.#db, {batch: options?.batch});
      const indexMask = options?.indexMask;
      const sharedCounterNum =
        this.#options?.sharedCounterNum ?? defaultSharedCounterNum;
      let docCount = 0;

      for (const [fieldName, value] of Object.entries(data)) {
        if (indexMask && !indexMask.includes(fieldName)) {
          continue;
        }
        if (fieldName.startsWith('__')) {
          continue;
        }
        if (typeof value !== 'string') {
          continue;
        }

        const tokens = await tokenize(lang, value);
        for (const token of tokens) {
          const word = token.normalizedWord;
          if (!word) {
            continue;
          }
          const wordRef = this.#wordsRef.doc(word);
          const docRef = wordRef
            .collection('docs')
            .doc(`${doc.id}.${fieldName}`);
          const wordDocRef = this.#wordDocsRef.doc(`${word}.${doc.id}`);

          batch.delete(docRef);
          batch.delete(wordDocRef);
          await incrementCounter(wordRef, sharedCounterNum, -1, {batch});
          // The document counter drops by one, however many tokens matched.
          docCount = 1;
        }
      }

      await incrementCounter(
        this.#ref.doc('v1'),
        sharedCounterNum,
        docCount * -1,
        {batch}
      );

      await batch.commit();
    } finally {
      span.end();
    }
  }

  /**
   * Searches the index.
   *
   * @param lang          Language passed to the tokenizer for the keywords.
   * @param stringOrQuery Query string (parsed with parseQuery) or an already
   *                      parsed SearchQuery.
   * @param options       `limit`: page size, clamped to 1..500 (default 100);
   *                      `cursor`: cursor returned by a previous call.
   * @returns `hits`: references stored in the matching index entries;
   *          `total`: sum of the per-word counters of all query tokens;
   *          `cursor`: present only when a full page was returned.
   *          A query that yields no searchable token returns no hits.
   */
  async search(
    lang: LanguageID,
    stringOrQuery: string | SearchQuery,
    options?: SearchOptions
  ): Promise<SearchResult> {
    const span = tracer.startSpan('search');
    // try/finally so the span is ended on every return path and on errors.
    try {
      span.setAttributes({
        index: this.#ref.path,
        lang,
      });

      // Names of the orderBy fields, in order; their values in the last hit
      // make up the cursor for the next page.
      const cursorQueue: string[] = [];

      const searchQuery: SearchQuery =
        typeof stringOrQuery === 'string'
          ? parseQuery(stringOrQuery)
          : stringOrQuery;

      const limit = Math.min(Math.max(options?.limit ?? 100, 1), 500);

      const fields = searchQuery.fields;
      type fieldInfo = {name: string; type: FieldType};
      let fieldInfos: fieldInfo[] | null = null;
      // getAll() needs at least one reference, so an empty list is treated
      // like "no field filters".
      if (fields && fields.length > 0) {
        const snap = await this.#db.getAll(
          ...fields.map(field => this.#fieldsRef.doc(field.name))
        );
        fieldInfos = snap.map(doc => ({name: doc.id, type: doc.data()?.type}));
      }

      const words: string[] = [];
      let total = 0;
      for (const keyword of searchQuery.keywords) {
        const tokens = await tokenize(lang, keyword);
        for (const token of tokens) {
          const word = token.normalizedWord;
          // set() never indexes an empty word, and doc('') is not a valid
          // document path.
          if (!word) {
            continue;
          }
          words.push(word);
          total += await getCount(this.#wordsRef.doc(word));
        }
      }

      // An `in` filter requires a non-empty array; without any token there
      // is nothing to look up.
      if (words.length === 0) {
        return {hits: [], total};
      }

      // NOTE(review): Firestore caps the number of values of an `in` filter;
      // queries with many tokens may be rejected - confirm the current limit.
      let query: Query = this.#wordDocsRef;
      if (words.length === 1) {
        query = query.where('__word', '==', words[0]);
      } else {
        query = query.where('__word', 'in', words);
      }

      if (fieldInfos && fields) {
        for (const info of fieldInfos) {
          const field = fields.find(f => f.name === info.name);
          if (!field) {
            continue;
          }
          switch (info.type) {
            case 'string':
              query = query.where(field.name, field.operator, field.value);
              break;
            case 'array':
              // Arrays are stored sorted by set(), so they are compared as
              // whole values. Other operators are ignored for array fields.
              switch (field.operator) {
                case '==':
                  query = query.where(field.name, 'in', [[field.value].sort()]);
                  break;
                case '!=':
                  query = query.where(field.name, 'not-in', [
                    [field.value].sort(),
                  ]);
                  break;
                default:
              }
              break;
            default:
              query = query.where(field.name, field.operator, field.value);
          }
        }
      } else {
        // Results are ranked by score only when no field filter is applied.
        query = query.orderBy('__score', 'desc');
        cursorQueue.push('__score');
      }

      const cursor = options?.cursor;
      if (cursor) {
        const info = await parseCursor(cursor);
        query = query.startAfter(
          ...info.fields.map(field => info.fieldValueMap[field])
        );
      }

      query = query.limit(limit);

      const snap = await query.get();

      if (snap.empty) {
        return {hits: [], total};
      }

      const lastVisible = snap.docs[snap.docs.length - 1];
      const cursorBuilder = new CursorBuilder();
      for (const queue of cursorQueue) {
        cursorBuilder.add(queue, lastVisible.data()[queue]);
      }

      const hits = snap.docs.map(doc => doc.data().__ref);

      return {
        hits,
        total,
        // A short page means there is nothing left to fetch.
        cursor: hits.length < limit ? undefined : await cursorBuilder.build(),
      };
    } finally {
      span.end();
    }
  }
}

--------------------------------------------------------------------------------
/src/pagination.spec.ts:
--------------------------------------------------------------------------------
import admin from 'firebase-admin';
import fs from 'fs';
import path from 'path';
import {getCount} from './counter';
import FirestoreFullTextSearch from './index';

process.env.FIRESTORE_EMULATOR_HOST =
  process.env.FIRESTORE_EMULATOR_HOST || 'localhost:5000';

admin.initializeApp({
  projectId: 'test',
});

const db = admin.firestore();
const docs = db.collection('animals');
const index = db.collection('pagination');
const fullTextSearch = new FirestoreFullTextSearch(index);

describe('pagination', () => {
  beforeAll(async () => {
    const count = await getCount(index.doc('v1'));
    if (count !== 0) {
      return;
    }

    const {items} = await new Promise((resolve, reject) => {
      fs.readFile(
        path.resolve(__dirname, '..', 'testdata', '5.en.json'),
        (err, data) => {
          if (err) {
            reject(err);
            return;
          }

resolve(JSON.parse(data.toString('utf-8'))); 36 | } 37 | ); 38 | }); 39 | for (const {title, description} of items) { 40 | const batch = db.batch(); 41 | const ref = docs.doc(title); 42 | const data = {description}; 43 | await batch.set(ref, data); 44 | await fullTextSearch.set('en', ref, {data, batch}); 45 | await batch.commit(); 46 | } 47 | }); 48 | 49 | it('basic', async () => { 50 | const {hits, total, cursor} = await fullTextSearch.search('en', 'member', { 51 | limit: 2, 52 | }); 53 | 54 | // console.log({hits: hits.map(hit => hit.id), total, cursor}); 55 | 56 | expect(hits.length).toBe(2); 57 | expect(hits.map(hit => hit.path)).toStrictEqual([ 58 | 'animals/Cattle', 59 | 'animals/Cat', 60 | ]); 61 | expect(total).toBe(3); 62 | 63 | const { 64 | hits: hits2, 65 | total: total2, 66 | cursor: cursor2, 67 | } = await fullTextSearch.search('en', 'member', { 68 | limit: 2, 69 | cursor, 70 | }); 71 | 72 | console.log({hits2: hits2.map(hit => hit.id), cursor2}); 73 | 74 | expect(hits2.length).toBe(1); 75 | expect(cursor2).toBe(undefined); 76 | expect(hits2.map(hit => hit.path)).toStrictEqual(['animals/Bird']); 77 | expect(total2).toBe(3); 78 | }); 79 | 80 | // it('startAfter', async () => { 81 | // const wordsSnap = await db 82 | // .collection('/pagination/v1/word_docs') 83 | // .where('__word', '==', 'member') 84 | // .orderBy('__score', 'desc') 85 | // .limit(2) 86 | // .get(); 87 | 88 | // const last = wordsSnap.docs[wordsSnap.docs.length - 1]; 89 | // console.log({ids: wordsSnap.docs.map(doc => doc.id)}); 90 | 91 | // const nextSnap = await db 92 | // .collection('/pagination/v1/word_docs') 93 | // .where('__word', '==', 'member') 94 | // .orderBy('__score', 'desc') 95 | // .startAfter(last) 96 | // .limit(2) 97 | // .get(); 98 | // console.log({ids: nextSnap.docs.map(doc => doc.id)}); 99 | // }); 100 | 101 | // it('startsWith', async () => { 102 | // const wordsRef = index.doc('v1').collection('words'); 103 | // const query = startsWith(wordsRef, 
FieldPath.documentId(), 'a'); 104 | // const snap = await query.get(); 105 | // console.log({size: snap.size, path: wordsRef.path}); 106 | // for (const doc of snap.docs) { 107 | // console.log(doc.id); 108 | // } 109 | // }); 110 | }); 111 | -------------------------------------------------------------------------------- /src/query.spec.ts: -------------------------------------------------------------------------------- 1 | import admin from 'firebase-admin'; 2 | import {DateTime} from 'luxon'; 3 | import FirestoreFullTextSearch from './index'; 4 | import {parseQuery, SearchQuery} from './query'; 5 | import {Post, Animal} from './index.spec'; 6 | 7 | process.env.FIRESTORE_EMULATOR_HOST = 8 | process.env.FIRESTORE_EMULATOR_HOST || 'localhost:5000'; 9 | 10 | admin.initializeApp({ 11 | projectId: 'test', 12 | }); 13 | 14 | describe('parseQuery', () => { 15 | it('nothing', () => { 16 | const res = parseQuery(''); 17 | const want: SearchQuery = { 18 | keywords: [], 19 | }; 20 | expect(res).toStrictEqual(want); 21 | }); 22 | 23 | it('simple', () => { 24 | const res = parseQuery('dog'); 25 | const want: SearchQuery = { 26 | keywords: ['dog'], 27 | }; 28 | expect(res).toStrictEqual(want); 29 | }); 30 | 31 | it('2 keywords', () => { 32 | const res = parseQuery('dog cat'); 33 | const want: SearchQuery = { 34 | keywords: ['dog', 'cat'], 35 | }; 36 | expect(res).toStrictEqual(want); 37 | }); 38 | 39 | it('has space keyword', () => { 40 | const res = parseQuery('"welsh corgi"'); 41 | const want: SearchQuery = { 42 | keywords: ['welsh corgi'], 43 | }; 44 | expect(res).toStrictEqual(want); 45 | }); 46 | 47 | it('has space keywords', () => { 48 | const res = parseQuery('"welsh corgi" "cardigan welsh corgi"'); 49 | const want: SearchQuery = { 50 | keywords: ['welsh corgi', 'cardigan welsh corgi'], 51 | }; 52 | expect(res).toStrictEqual(want); 53 | }); 54 | 55 | it('string:field-in', () => { 56 | const res = parseQuery('dog label:"welsh corgi"'); 57 | const want: SearchQuery = { 58 
| keywords: ['dog'], 59 | fields: [ 60 | {name: 'label', type: 'string', operator: '==', value: 'welsh corgi'}, 61 | ], 62 | }; 63 | expect(res).toStrictEqual(want); 64 | }); 65 | 66 | it('string:field-not-in', () => { 67 | const res = parseQuery('dog -label:"welsh corgi"'); 68 | const want: SearchQuery = { 69 | keywords: ['dog'], 70 | fields: [ 71 | {name: 'label', type: 'string', operator: '!=', value: 'welsh corgi'}, 72 | ], 73 | }; 74 | expect(res).toStrictEqual(want); 75 | }); 76 | 77 | // // @k2wanko: I can't think of a way to make it work with the current indexing mechanism. 78 | // it('string:not', () => { 79 | // const res = parseQuery('dog NOT "welsh corgi"'); 80 | // const want: SearchQuery = { 81 | // keywords: ['dog'], 82 | // fields: [ 83 | // {name: 'label', type: 'string', operator: 'NOT', value: 'welsh corgi'}, 84 | // ], 85 | // }; 86 | // expect(res).toStrictEqual(want); 87 | // }); 88 | 89 | it('number:greater-than', () => { 90 | const res = parseQuery('dog like:>10'); 91 | const want: SearchQuery = { 92 | keywords: ['dog'], 93 | fields: [{name: 'like', type: 'number', operator: '>', value: 10}], 94 | }; 95 | expect(res).toStrictEqual(want); 96 | }); 97 | 98 | it('number:greater-than-or-equal', () => { 99 | const res = parseQuery('dog like:>=10'); 100 | const want: SearchQuery = { 101 | keywords: ['dog'], 102 | fields: [{name: 'like', type: 'number', operator: '>=', value: 10}], 103 | }; 104 | expect(res).toStrictEqual(want); 105 | }); 106 | 107 | it('number:less-than', () => { 108 | const res = parseQuery('dog like:<10'); 109 | const want: SearchQuery = { 110 | keywords: ['dog'], 111 | fields: [{name: 'like', type: 'number', operator: '<', value: 10}], 112 | }; 113 | expect(res).toStrictEqual(want); 114 | }); 115 | 116 | it('number:less-than-or-equal', () => { 117 | const res = parseQuery('dog like:<=10'); 118 | const want: SearchQuery = { 119 | keywords: ['dog'], 120 | fields: [{name: 'like', type: 'number', operator: '<=', value: 10}], 121 | 
}; 122 | expect(res).toStrictEqual(want); 123 | }); 124 | 125 | it('date:greater-than', () => { 126 | const res = parseQuery('hello created:>2021-01-01'); 127 | const want: SearchQuery = { 128 | keywords: ['hello'], 129 | fields: [ 130 | { 131 | name: 'created', 132 | type: 'date', 133 | operator: '>', 134 | value: DateTime.fromISO('2021-01-01').toJSDate(), 135 | }, 136 | ], 137 | }; 138 | expect(res).toStrictEqual(want); 139 | }); 140 | 141 | it('date:greater-than-or-equal', () => { 142 | const res = parseQuery('hello created:>=2021-01-01'); 143 | const want: SearchQuery = { 144 | keywords: ['hello'], 145 | fields: [ 146 | { 147 | name: 'created', 148 | type: 'date', 149 | operator: '>=', 150 | value: DateTime.fromISO('2021-01-01').toJSDate(), 151 | }, 152 | ], 153 | }; 154 | expect(res).toStrictEqual(want); 155 | }); 156 | 157 | it('date:less-than', () => { 158 | const res = parseQuery('hello created:<2021-01-01'); 159 | const want: SearchQuery = { 160 | keywords: ['hello'], 161 | fields: [ 162 | { 163 | name: 'created', 164 | type: 'date', 165 | operator: '<', 166 | value: DateTime.fromISO('2021-01-01').toJSDate(), 167 | }, 168 | ], 169 | }; 170 | expect(res).toStrictEqual(want); 171 | }); 172 | 173 | it('date:less-than-or-equal', () => { 174 | const res = parseQuery('hello created:<=2021-01-01'); 175 | const want: SearchQuery = { 176 | keywords: ['hello'], 177 | fields: [ 178 | { 179 | name: 'created', 180 | type: 'date', 181 | operator: '<=', 182 | value: DateTime.fromISO('2021-01-01').toJSDate(), 183 | }, 184 | ], 185 | }; 186 | expect(res).toStrictEqual(want); 187 | }); 188 | }); 189 | 190 | describe('querySearch', () => { 191 | beforeAll(async () => { 192 | const db = admin.firestore(); 193 | 194 | const postsRef = db.collection('posts'); 195 | const postData: Post = { 196 | title: 'Test Post', 197 | content: 'Hello', 198 | created: DateTime.fromISO('2021-01-01').toJSDate(), 199 | label: ['draft'], 200 | }; 201 | const postData2: Post = { 202 | title: 'Test 
Post', 203 | content: 'Hello', 204 | created: DateTime.fromISO('2021-01-02').toJSDate(), 205 | label: ['published'], 206 | }; 207 | const postData3: Post = { 208 | title: 'Test Post 2', 209 | content: 'Hello World', 210 | created: DateTime.fromISO('2021-02-01').toJSDate(), 211 | label: ['published'], 212 | }; 213 | 214 | const docRef = postsRef.doc('bF7lfaw8gOlkAPlqGzTHh'); 215 | const docRef2 = postsRef.doc('cF7lfawhaOlkAPlqGzTHh'); 216 | const docRef3 = postsRef.doc('dF7lfawhaOlkAPlqGzTHh'); 217 | 218 | const batch = db.batch(); 219 | batch.set(docRef, postData); 220 | batch.set(docRef2, postData2); 221 | batch.set(docRef3, postData3); 222 | 223 | const indexRef = db.collection('index_posts'); 224 | const fullTextSearch = new FirestoreFullTextSearch(indexRef); 225 | await fullTextSearch.set('en', docRef, { 226 | batch, 227 | data: postData, 228 | indexMask: ['content'], 229 | fields: ['label', 'created'], 230 | }); 231 | await fullTextSearch.set('en', docRef2, { 232 | batch, 233 | data: postData2, 234 | indexMask: ['content'], 235 | fields: ['label', 'created'], 236 | }); 237 | await fullTextSearch.set('en', docRef3, { 238 | batch, 239 | data: postData3, 240 | indexMask: ['content'], 241 | fields: ['label', 'created'], 242 | }); 243 | 244 | await batch.commit(); 245 | }); 246 | 247 | beforeAll(async () => { 248 | const dogs: {[key: string]: Animal} = { 249 | akita: { 250 | type: 'dog', 251 | description: 252 | 'The Akita (秋田犬, Akita-inu, Japanese pronunciation: [akʲita.inɯ]) is a large breed of dog originating from the mountainous regions of northern Japan.', 253 | like: 10, 254 | }, 255 | corgi: { 256 | type: 'dog', 257 | description: 258 | 'The Welsh Corgi (/ˈkɔːrɡi/[5] plural "Corgis" or occasionally the etymologically consistent "Corgwn"; /ˈkɔːrɡuːn/) is a small type of herding dog that originated in Wales.[6]', 259 | like: 50, 260 | }, 261 | 'border collie': { 262 | type: 'dog', 263 | description: 264 | 'The Border Collie is a working and herding dog breed 
developed in the Anglo-Scottish border county of Northumberland, for herding livestock, especially sheep.[1]', 265 | like: 5, 266 | }, 267 | }; 268 | 269 | const db = admin.firestore(); 270 | const batch = db.batch(); 271 | const indexRef = db.collection('index_dogs'); 272 | const fullTextSearch = new FirestoreFullTextSearch(indexRef); 273 | for (const [id, data] of Object.entries(dogs)) { 274 | const dogRef = db.collection('dogs').doc(id); 275 | batch.set(dogRef, data); 276 | await fullTextSearch.set('en', dogRef, { 277 | data, 278 | batch, 279 | indexMask: ['description'], 280 | fields: ['like'], 281 | }); 282 | } 283 | await batch.commit(); 284 | }); 285 | 286 | it('string:field-in', async () => { 287 | const db = admin.firestore(); 288 | const indexRef = db.collection('index_posts'); 289 | const fullTextSearch = new FirestoreFullTextSearch(indexRef); 290 | const {hits} = await fullTextSearch.search('en', 'hello label:published'); 291 | expect(hits.length).toBe(2); 292 | }); 293 | 294 | it('string:field-not-in', async () => { 295 | const db = admin.firestore(); 296 | const indexRef = db.collection('index_posts'); 297 | const fullTextSearch = new FirestoreFullTextSearch(indexRef); 298 | const {hits} = await fullTextSearch.search('en', 'hello -label:published'); 299 | expect(hits.length).toBe(1); 300 | }); 301 | 302 | it('number:greater-than', async () => { 303 | const db = admin.firestore(); 304 | const indexRef = db.collection('index_dogs'); 305 | const fullTextSearch = new FirestoreFullTextSearch(indexRef); 306 | const {hits} = await fullTextSearch.search('en', 'herding like:>5'); 307 | expect(hits.length >= 1).toBe(true); 308 | expect(hits[0].id).toBe('corgi'); 309 | }); 310 | 311 | it('number:greater-than-or-equal', async () => { 312 | const db = admin.firestore(); 313 | const indexRef = db.collection('index_dogs'); 314 | const fullTextSearch = new FirestoreFullTextSearch(indexRef); 315 | const {hits} = await fullTextSearch.search('en', 'herding like:>=5'); 
316 | expect(hits.length >= 2).toBe(true); 317 | expect(hits[0].id).toBe('border collie'); 318 | expect(hits[1].id).toBe('corgi'); 319 | }); 320 | 321 | it('number:less-than', async () => { 322 | const db = admin.firestore(); 323 | const indexRef = db.collection('index_dogs'); 324 | const fullTextSearch = new FirestoreFullTextSearch(indexRef); 325 | const {hits} = await fullTextSearch.search('en', 'herding like:<10'); 326 | expect(hits.length >= 1).toBe(true); 327 | expect(hits[0].id).toBe('border collie'); 328 | }); 329 | 330 | it('number:less-than-or-equal', async () => { 331 | const db = admin.firestore(); 332 | const indexRef = db.collection('index_dogs'); 333 | const fullTextSearch = new FirestoreFullTextSearch(indexRef); 334 | const {hits} = await fullTextSearch.search('en', 'herding like:<=50'); 335 | expect(hits.length >= 2).toBe(true); 336 | expect(hits[0].id).toBe('border collie'); 337 | expect(hits[1].id).toBe('corgi'); 338 | }); 339 | 340 | it('date:greater-than', async () => { 341 | const db = admin.firestore(); 342 | const indexRef = db.collection('index_posts'); 343 | const fullTextSearch = new FirestoreFullTextSearch(indexRef); 344 | const {hits} = await fullTextSearch.search( 345 | 'en', 346 | 'hello created:>2021-01-01' 347 | ); 348 | expect(hits.length >= 2).toBe(true); 349 | expect(hits[0].id).toBe('cF7lfawhaOlkAPlqGzTHh'); 350 | expect(hits[1].id).toBe('dF7lfawhaOlkAPlqGzTHh'); 351 | }); 352 | 353 | it('date:greater-than-or-equal', async () => { 354 | const db = admin.firestore(); 355 | const indexRef = db.collection('index_posts'); 356 | const fullTextSearch = new FirestoreFullTextSearch(indexRef); 357 | const {hits} = await fullTextSearch.search( 358 | 'en', 359 | 'hello created:>=2021-01-01' 360 | ); 361 | expect(hits.length >= 3).toBe(true); 362 | expect(hits[0].id).toBe('bF7lfaw8gOlkAPlqGzTHh'); 363 | expect(hits[1].id).toBe('cF7lfawhaOlkAPlqGzTHh'); 364 | expect(hits[2].id).toBe('dF7lfawhaOlkAPlqGzTHh'); 365 | }); 366 | 367 | 
it('date:less-than', async () => {
  const db = admin.firestore();
  const indexRef = db.collection('index_posts');
  const fullTextSearch = new FirestoreFullTextSearch(indexRef);
  const {hits} = await fullTextSearch.search(
    'en',
    'hello created:<2021-01-02'
  );
  expect(hits.length === 1).toBe(true);
  expect(hits[0].id).toBe('bF7lfaw8gOlkAPlqGzTHh');
});

it('date:less-than-or-equal', async () => {
  const db = admin.firestore();
  const indexRef = db.collection('index_posts');
  const fullTextSearch = new FirestoreFullTextSearch(indexRef);
  const {hits} = await fullTextSearch.search(
    'en',
    'hello created:<=2021-01-02'
  );
  expect(hits.length === 2).toBe(true);
  expect(hits[0].id).toBe('bF7lfaw8gOlkAPlqGzTHh');
  expect(hits[1].id).toBe('cF7lfawhaOlkAPlqGzTHh');
});
});

--------------------------------------------------------------------------------
/src/query.ts:
--------------------------------------------------------------------------------
import {DateTime} from 'luxon';

export type FieldType = FieldStringType | FieldNumberType | FieldDateType;

export type FieldStringType = {
  type: 'string';
  operator: FilterOp;
  value: string;
} & FieldTypeBase;

export type FieldNumberType = {
  type: 'number';
  operator: FilterOp;
  value: number;
} & FieldTypeBase;

export type FieldDateType = {
  type: 'date';
  operator: FilterOp;
  value: Date;
} & FieldTypeBase;

export type FilterOp = '==' | '!=' | '>' | '>=' | '<' | '<=';

export type FieldTypeBase = {name: string};

export type SearchQuery = {
  keywords: string[];
  fields?: FieldType[];
};

// Narrows an operator prefix to the comparison operators accepted in front of
// numbers and dates.
function isComparisonOp(op: string): op is '>' | '>=' | '<' | '<=' {
  return op === '>' || op === '>=' || op === '<' || op === '<=';
}

// Removes the quoting of a term: every double quote is dropped, and single
// quotes are dropped only when they wrap the whole term (optionally after a
// leading '-'), so apostrophes inside words are kept.
function stripQuotes(text: string): string {
  const unquoted = text.replace(/"/g, '');
  const wrapped = /^(-?)'((?:[^'\\]|\\.)*)'$/.exec(unquoted);
  return wrapped ? wrapped[1] + wrapped[2] : unquoted;
}

/**
 * Parses a search string into keywords and field filters.
 *
 * Terms are separated by whitespace; a term may be quoted with '...' or "..."
 * to contain spaces. A term is one of:
 *  - `word` or a quoted phrase: a keyword. A quoted phrase is always a
 *    keyword, even when it contains a colon.
 *  - `name:value`: string filter with operator `==`; `-name:value` gives
 *    `!=`. The value is everything after the first colon.
 *  - `name:>N`, `name:>=N`, `name:<N`, `name:<=N`: number filter when N is a
 *    non-negative decimal number (anything else, e.g. `10abc`, is not a
 *    number), date filter when N is an ISO 8601 date.
 *
 * @param query Search string; an empty string yields no keywords.
 * @returns The keywords in order of appearance; `fields` is present only when
 *          at least one filter was found.
 */
export function parseQuery(query: string): SearchQuery {
  if (!query) {
    return {
      keywords: [],
    };
  }

  const keywords: string[] = [];
  const regex = /(\S+:'(?:[^'\\]|\\.)*')|(\S+:"(?:[^"\\]|\\.)*")|(-?"(?:[^"\\]|\\.)*")|(-?'(?:[^'\\]|\\.)*')|\S+|\S+:\S+/g;
  let fields: FieldType[] | undefined;
  let match: RegExpExecArray | null;
  while ((match = regex.exec(query)) !== null) {
    const term = match[0];
    // Groups 3 and 4 are the quoted-phrase alternatives of the pattern.
    const isQuotedPhrase = match[3] !== undefined || match[4] !== undefined;
    const separator = term.indexOf(':');
    if (isQuotedPhrase || separator === -1) {
      keywords.push(stripQuotes(term));
      continue;
    }
    if (!fields) {
      fields = [];
    }

    // Split on the first colon only, so values may contain colons themselves.
    let name = term.slice(0, separator);
    const rawValue = term.slice(separator + 1);
    let operator: FilterOp = '==';
    if (name.startsWith('-')) {
      name = name.slice(1);
      operator = '!=';
    }

    // A leading '>' / '<' (optionally followed by '=') marks a comparison.
    let cmpOp = rawValue.slice(0, 1);
    let operand = rawValue.slice(1);
    if (operand.startsWith('=')) {
      cmpOp += '=';
      operand = operand.slice(1);
    }

    if (isComparisonOp(cmpOp)) {
      // Digits with an optional fraction; dates contain '-' and never match.
      if (/^\d+(?:\.\d+)?$/.test(operand)) {
        fields.push({
          name,
          type: 'number',
          operator: cmpOp,
          value: Number(operand),
        });
        continue;
      }

      const datetime = DateTime.fromISO(operand);
      if (datetime.invalidReason === null) {
        fields.push({
          name,
          type: 'date',
          operator: cmpOp,
          value: datetime.toJSDate(),
        });
        continue;
      }
    }

    fields.push({
      name,
      type: 'string',
      operator,
      value: stripQuotes(rawValue),
    });
  }
  if (fields) {
    return {
      keywords,
      fields,
    };
  }
  return {
    keywords,
  };
}

--------------------------------------------------------------------------------
/src/sort.ts:
--------------------------------------------------------------------------------
/**
 * TF-IDF score of a word within one field of a document.
 *
 * score = (targetWordCount / totalWordCount) * idf, where idf is
 * ln(allDocCount / targetWordDocCount) when that ratio is greater than 1 and
 * 1 otherwise.
 *
 * The fallback of 1 keeps the score finite and non-negative when the counters
 * are zero or inconsistent (word counter larger than the document counter).
 *
 * @param targetWordCount    Occurrences of the word in the field.
 * @param totalWordCount     Number of tokens in the field.
 * @param targetWordDocCount Number of index entries containing the word.
 * @param allDocCount        Number of indexed documents.
 * @returns The score; 0 when the field has no tokens.
 */
export function calcScore(
  targetWordCount: number,
  totalWordCount: number,
  targetWordDocCount: number,
  allDocCount: number
): number {
  const termFrequency = targetWordCount / totalWordCount;
  // 0/0 and n/0: a field without tokens cannot score.
  if (!Number.isFinite(termFrequency)) {
    return 0;
  }

  const ratio = allDocCount / targetWordDocCount;
  const inverseDocFrequency =
    Number.isFinite(ratio) && ratio > 1 ? Math.log(ratio) : 1;
  return termFrequency * inverseDocFrequency;
}

--------------------------------------------------------------------------------
/src/tokenizer/english.ts:
--------------------------------------------------------------------------------
import type {LanguageID, Tokenizer} from './index';

const exceptions = new Map([
  ['skis', 'ski'],
  ['dying', 'die'],
  ['lying', 'lie'],
  ['tying', 'tie'],
  ['idly', 'idl'],
  ['gently', 'gentl'],
  ['ugly', 'ugli'],
  ['early', 'earli'],
  ['only', 'onli'],
  ['singly', 'singl'],
  ['sky', 'sky'],
  ['news', 'news'],
  ['howe', 'howe'],
  ['atlas', 'atlas'],
  ['cosmos', 'cosmos'],
  ['bias', 'bias'],
  ['andes', 'andes'],
]);

const exceptions1a = new Map([
  ['inning', 'inning'],
  ['outing', 'outing'],
  ['canning', 'canning'],
  ['herring', 'herring'],
  ['earring', 'earring'],
  ['proceed', 'proceed'],
  ['exceed', 'exceed'],
  ['succeed', 'succeed'],
]);

const extensions2 = new Map([
  ['ization', 'ize'],
  ['fulness', 'ful'],
  ['iveness', 'ive'],
  ['ational', 'ate'],
  ['ousness', 'ous'],
  ['tional', 'tion'],
  ['biliti', 'ble'],
  ['lessli', 'less'],
  ['entli', 'ent'],
  ['ation', 'ate'],
  ['alism', 'al'],
  ['aliti', 'al'],
  ['ousli', 'ous'],
  ['iviti', 'ive'],
  ['fulli', 'ful'],
  ['enci', 'ence'],
  ['anci', 'ance'],
  ['abli', 'able'],
  ['izer', 'ize'],
  ['ator', 'ate'],
  ['alli', 'al'],
  ['bli', 'ble'],
  ['ogi', 'og'],
  ['li', ''],
]);

// 
https://github.com/stopwords-iso/stopwords-en/blob/master/raw/snowball-tartarus.txt 62 | const stopWords = new Set([ 63 | 'i', 64 | 'me', 65 | 'my', 66 | 'myself', 67 | 'we', 68 | 'us', 69 | 'our', 70 | 'ours', 71 | 'ourselves', 72 | 'you', 73 | 'your', 74 | 'yours', 75 | 'yourself', 76 | 'yourselves', 77 | 'he', 78 | 'him', 79 | 'his', 80 | 'himself', 81 | 'she', 82 | 'her', 83 | 'hers', 84 | 'herself', 85 | 'it', 86 | 'its', 87 | 'itself', 88 | 'they', 89 | 'them', 90 | 'their', 91 | 'theirs', 92 | 'themselves', 93 | 'what', 94 | 'which', 95 | 'who', 96 | 'whom', 97 | 'this', 98 | 'that', 99 | 'these', 100 | 'those', 101 | 'am', 102 | 'is', 103 | 'are', 104 | 'was', 105 | 'were', 106 | 'be', 107 | 'been', 108 | 'being', 109 | 'have', 110 | 'has', 111 | 'had', 112 | 'having', 113 | 'do', 114 | 'does', 115 | 'did', 116 | 'doing', 117 | 'will', 118 | 'would', 119 | 'shall', 120 | 'should', 121 | 'can', 122 | 'could', 123 | 'may', 124 | 'might', 125 | 'must', 126 | 'ought', 127 | "i'm", 128 | "you're", 129 | "he's", 130 | "she's", 131 | "it's", 132 | "we're", 133 | "they're", 134 | "i've", 135 | "you've", 136 | "we've", 137 | "they've", 138 | "i'd", 139 | "you'd", 140 | "he'd", 141 | "she'd", 142 | "we'd", 143 | "they'd", 144 | "i'll", 145 | "you'll", 146 | "he'll", 147 | "she'll", 148 | "we'll", 149 | "they'll", 150 | "isn't", 151 | "aren't", 152 | "wasn't", 153 | "weren't", 154 | "hasn't", 155 | "haven't", 156 | "hadn't", 157 | "doesn't", 158 | "don't", 159 | "didn't", 160 | "won't", 161 | "wouldn't", 162 | "shan't", 163 | "shouldn't", 164 | "can't", 165 | 'cannot', 166 | "couldn't", 167 | "mustn't", 168 | "let's", 169 | "that's", 170 | "who's", 171 | "what's", 172 | "here's", 173 | "there's", 174 | "when's", 175 | "where's", 176 | "why's", 177 | "how's", 178 | "daren't", 179 | "needn't", 180 | 'doubtful', 181 | "oughtn't", 182 | "mightn't", 183 | 'a', 184 | 'an', 185 | 'the', 186 | 'and', 187 | 'but', 188 | 'if', 189 | 'or', 190 | 'because', 191 | 'as', 192 | 
'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',
  'to',
  'from',
  'up',
  'down',
  'in',
  'out',
  'on',
  'off',
  'over',
  'under',
  'again',
  'further',
  'then',
  'once',
  'here',
  'there',
  'when',
  'where',
  'why',
  'how',
  'all',
  'any',
  'both',
  'each',
  'few',
  'more',
  'most',
  'other',
  'some',
  'such',
  'no',
  'nor',
  'not',
  'only',
  'own',
  'same',
  'so',
  'than',
  'too',
  'very',
  'one',
  'every',
  'least',
  'less',
  'many',
  'now',
  'ever',
  'never',
  'say',
  'says',
  'said',
  'also',
  'get',
  'go',
  'goes',
  'just',
  'made',
  'make',
  'put',
  'see',
  'seen',
  'whether',
  'like',
  'well',
  'back',
  'even',
  'still',
  'way',
  'take',
  'since',
  'another',
  'however',
  'two',
  'three',
  'four',
  'five',
  'first',
  'second',
  'new',
  'old',
  'high',
  'long',
]);

/**
 * Tokenizer for English text: whitespace splitting plus a Porter2 (Snowball
 * "English") stemmer.
 *
 * NOTE(review): the generic arguments on the return types were lost in the
 * listing this file was recovered from (`Promise>` / `Promise`); they are
 * reconstructed here from what each method returns.
 */
export class EnglishTokenizer implements Tokenizer {
  /** Returns the language id handled by this tokenizer (`'en'`). */
  getLanguage(): LanguageID {
    return 'en';
  }

  /** Returns the module-level English stop-word set (all lowercase). */
  async getStopWords(): Promise<Set<string>> {
    return stopWords;
  }

  /**
   * Splits `content` on runs of spaces and strips trailing `.`, `,`, `:` and
   * `"` characters from every word; words that end up empty are dropped.
   */
  async splitter(content: string): Promise<string[]> {
    const words = content.trim().split(/ +/);
    return words.map(word => word.replace(/[.,:"]+$/g, '')).filter(v => !!v);
  }

  /**
   * Stems a single word.
   *
   * Implemented from the algorithm at
   * http://snowball.tartarus.org/algorithms/english/stemmer.html
   *
   * @param content The word to stem. Words shorter than three characters are
   *     returned untouched; everything else is lowercased and stripped of
   *     characters other than `a-z` and `'` before stemming.
   * @returns The lowercase stem.
   */
  async stemmer(content: string): Promise<string> {
    if (content.length < 3) {
      return content;
    }
    if (exceptions.has(content)) {
      return exceptions.get(content) ?? '';
    }

    // Fallback "match" whose capture group 1 is the empty string.
    const eRx = ['', ''];
    content = content
      .toLowerCase()
      .replace(/^'/, '')
      .replace(/[^a-z']/g, '')
      // Mark a consonant-like y (word-initial or after a vowel) as 'Y'.
      .replace(/^y|([aeiouy])y/g, '$1Y');

    // R1: start index of the region after the first non-vowel that follows a
    // vowel. The leading ' ' guarantees a match index >= 1, so `|| 1000`
    // only fires when there is no match at all.
    const prefix = /^(gener|commun|arsen)/.exec(content);
    const R1 = prefix
      ? prefix[0].length
      : (/[aeiouy][^aeiouy]/.exec(' ' + content)?.index || 1000) + 1;

    // R2: the same rule applied inside R1.
    // Fix: this previously read `?.length`, which is the length of the match
    // array (always 1) rather than the match position, so R2 collapsed to
    // R1 + 2 and later steps removed suffixes that are not inside R2.
    const R2 =
      (/[aeiouy][^aeiouy]/.exec(' ' + content.substr(R1))?.index || 1000) +
      R1 +
      1;

    // step 0
    content = content.replace(/('s'?|')$/, '');

    // step 1a
    const rx = /(?:(ss)es|(..i)(?:ed|es)|(us)|(ss)|(.ie)(?:d|s))$/;
    if (rx.test(content)) {
      content = content.replace(rx, '$1$2$3$4$5');
    } else {
      content = content.replace(/([aeiouy].+)s$/, '$1');
    }

    if (exceptions1a.has(content)) {
      return exceptions1a.get(content) ?? '';
    }

    // step 1b
    const s1 = (/(eedly|eed)$/.exec(content) || eRx)[1],
      s2 = (/(?:[aeiouy].*)(ingly|edly|ing|ed)$/.exec(content) || eRx)[1];

    if (s1.length > s2.length) {
      if (content.indexOf(s1, R1) >= 0) {
        content = content.substr(0, content.length - s1.length) + 'ee';
      }
    } else if (s2.length > s1.length) {
      content = content.substr(0, content.length - s2.length);
      if (/(at|bl|iz)$/.test(content)) {
        content += 'e';
      } else if (/(bb|dd|ff|gg|mm|nn|pp|rr|tt)$/.test(content)) {
        content = content.substr(0, content.length - 1);
      } else if (
        !content.substr(R1) &&
        /([^aeiouy][aeiouy][^aeiouywxY]|^[aeiouy][^aeiouy]|^[aeiouy])$/.test(
          content
        )
      ) {
        content += 'e';
      }
    }

    // step 1c
    content = content.replace(/(.[^aeiouy])[yY]$/, '$1i');

    // step 2
    const sfx = /(ization|fulness|iveness|ational|ousness|tional|biliti|lessli|entli|ation|alism|aliti|ousli|iviti|fulli|enci|anci|abli|izer|ator|alli|bli|l(ogi)|[cdeghkmnrt](li))$/.exec(
      content
    );
    if (sfx) {
      // Prefer the inner captures ('li' / 'ogi') over the full alternative.
      const sfx2 = sfx[3] || sfx[2] || sfx[1];
      if (content.indexOf(sfx2, R1) >= 0) {
        content =
          content.substr(0, content.length - sfx2.length) +
          (extensions2.get(sfx2) ?? '');
      }
    }

    // step 3
    const sfx3 = (/(ational|tional|alize|icate|iciti|ative|ical|ness|ful)$/.exec(
      content
    ) || eRx)[1];
    // Fix: the guard used to test `sfx` (the step 2 match). That skipped
    // step 3 whenever step 2 had not matched, and when step 2 had matched but
    // step 3 had not, `indexOf('', R1)` succeeded and the literal text
    // "undefined" was appended to the word.
    if (sfx3 && content.indexOf(sfx3, R1) >= 0) {
      content = `${content.substr(0, content.length - sfx3.length)}${new Map([
        ['ational', 'ate'],
        ['tional', 'tion'],
        ['alize', 'al'],
        ['icate', 'ic'],
        ['iciti', 'ic'],
        // 'ative' is only deleted when it lies inside R2.
        ['ative', content.indexOf('ative', R2) >= 0 ? '' : 'ative'],
        ['ical', 'ic'],
        ['ness', ''],
        ['ful', ''],
      ]).get(sfx3)}`;
    }

    // step 4
    const sfx4 = /(ement|ance|ence|able|ible|ment|ant|ent|ism|ate|iti|ous|ive|ize|[st](ion)|al|er|ic)$/.exec(
      content
    );
    if (sfx4) {
      const sfx5 = sfx4[2] || sfx4[1];
      if (content.indexOf(sfx5, R2) >= 0) {
        content = content.substr(0, content.length - sfx5.length);
      }
    }

    // step 5
    if (content.substr(-1) === 'e') {
      if (
        content.substr(R2) ||
        (content.substr(R1) &&
          !/([^aeiouy][aeiouy][^aeiouywxY]|^[aeiouy][^aeiouy])e$/.test(content))
      ) {
        content = content.substr(0, content.length - 1);
      }
    } else if (content.substr(-2) === 'll' && content.indexOf('l', R2) >= 0) {
      content = content.substr(0, content.length - 1);
    }

    // Drops the temporary 'Y' markers introduced during normalization.
    return content.toLowerCase();
  }
}

--------------------------------------------------------------------------------
/src/tokenizer/index.ts:
--------------------------------------------------------------------------------
export type LanguageID = English | Japanese;
export type English = 'en';
export type Japanese = 'ja';

/**
 * Language-specific text processing used by `tokenize`.
 *
 * NOTE(review): the generic arguments of the `Promise` return types were lost
 * in the listing; they are reconstructed from the implementations.
 */
export interface Tokenizer {
  /** Language id this tokenizer handles. */
  getLanguage(): LanguageID;
  /** Words that must not be indexed. */
  getStopWords(): Promise<Set<string>>;
  /** Splits raw text into words. */
  splitter(content: string): Promise<string[]>;
  /** Normalizes a single word to its stem. */
  stemmer(content: string): Promise<string>;
}

export type Token = {
  word: string;
  normalizedWord: string;
  positions: number[];
};

--------------------------------------------------------------------------------
/src/tokenizer/japanese.ts:
--------------------------------------------------------------------------------
import type {LanguageID, Tokenizer} from './index';
import path from 'path';
import kuromoji from 'kuromoji';

const stopWords = new Set([
  'あそこ',
  'あっ',
  'あの',
  'あのかた',
  'あの人',
  'あり',
  'あります',
  'ある',
  'あれ',
  'い',
'いう',
  'います',
  'いる',
  'う',
  'うち',
  'え',
  'お',
  'および',
  'おり',
  'おります',
  'か',
  'かつて',
  'から',
  'が',
  'き',
  'ここ',
  'こちら',
  'こと',
  'この',
  'これ',
  'これら',
  'さ',
  'さらに',
  'し',
  'しかし',
  'する',
  'ず',
  'せ',
  'せる',
  'そこ',
  'そして',
  'その',
  'その他',
  'その後',
  'それ',
  'それぞれ',
  'それで',
  'た',
  'ただし',
  'たち',
  'ため',
  'たり',
  'だ',
  'だっ',
  'だれ',
  'つ',
  'て',
  'で',
  'でき',
  'できる',
  'です',
  'では',
  'でも',
  'と',
  'という',
  'といった',
  'とき',
  'ところ',
  'として',
  'とともに',
  'とも',
  'と共に',
  'どこ',
  'どの',
  'な',
  'ない',
  'なお',
  'なかっ',
  'ながら',
  'なく',
  'なっ',
  'など',
  'なに',
  'なら',
  'なり',
  'なる',
  'なん',
  'に',
  'において',
  'における',
  'について',
  'にて',
  'によって',
  'により',
  'による',
  'に対して',
  'に対する',
  'に関する',
  'の',
  'ので',
  'のみ',
  'は',
  'ば',
  'へ',
  'ほか',
  'ほとんど',
  'ほど',
  'ます',
  'また',
  'または',
  'まで',
  'も',
  'もの',
  'ものの',
  'や',
  'よう',
  'より',
  'ら',
  'られ',
  'られる',
  'れ',
  'れる',
  'を',
  'ん',
  '何',
  '及び',
  '彼',
  '彼女',
  '我々',
  '特に',
  '私',
  '私達',
  '貴方',
  '貴方方',
]);

/**
 * Tokenizer for Japanese text backed by kuromoji.
 *
 * NOTE(review): the generic arguments of the kuromoji and `Promise` types
 * were lost in the listing; `kuromoji.IpadicFeatures` is assumed because the
 * code reads `pos` and `surface_form` from the tokens — confirm against the
 * kuromoji typings in use.
 */
export class JapaneseTokenizer implements Tokenizer {
  #builder: kuromoji.TokenizerBuilder<kuromoji.IpadicFeatures>;
  // Built on the first `splitter` call and reused afterwards.
  #tokenizer?: kuromoji.Tokenizer<kuromoji.IpadicFeatures>;

  constructor() {
    this.#builder = kuromoji.builder({
      dicPath: path.resolve(__dirname, '../../node_modules/kuromoji/dict'),
    });
  }

  /** Returns the language id handled by this tokenizer (`'ja'`). */
  getLanguage(): LanguageID {
    return 'ja';
  }

  /** Returns the module-level Japanese stop-word set. */
  async getStopWords(): Promise<Set<string>> {
    return stopWords;
  }

  /**
   * Tokenizes `content` with kuromoji and returns the surface forms,
   * excluding tokens whose part of speech is '助詞' or '記号' and the
   * literal '.' token.
   *
   * Rejects with the builder's error when the dictionary cannot be loaded.
   */
  async splitter(content: string): Promise<string[]> {
    const tokenizer = await new Promise<
      kuromoji.Tokenizer<kuromoji.IpadicFeatures>
    >((resolve, reject) => {
      if (this.#tokenizer) {
        return resolve(this.#tokenizer);
      }
      this.#builder.build((err, tokenizer) => {
        if (err) {
          reject(err);
          return;
        }
        this.#tokenizer = tokenizer;
        resolve(tokenizer);
      });
    });
    const res = tokenizer.tokenize(content);
    return res
      .filter(token => token.pos !== '助詞')
      .filter(token => token.pos !== '記号')
      .filter(token => token.surface_form !== '.')
      .map(token => token.surface_form);
  }

  /** Japanese words are not stemmed; the input is returned unchanged. */
  async stemmer(content: string): Promise<string> {
    return content;
  }
}

--------------------------------------------------------------------------------
/src/tokenizer/tokenize.ts:
--------------------------------------------------------------------------------
import type {Tokenizer, LanguageID, Token} from './index';
import {EnglishTokenizer} from './english';
import {JapaneseTokenizer} from './japanese';

/**
 * Splits `text` into words, drops stop words, stems the rest and records at
 * which positions every stem occurs.
 *
 * Positions count kept (non-stop) words only. The returned array has one
 * entry per kept word, in order; entries that share a stem share the same
 * `positions` array and carry the last surface form seen for that stem.
 *
 * @param lang Language of `text`.
 * @param text Raw text to tokenize.
 * @returns One token per kept word.
 * @throws Error when `lang` is not a supported language id.
 */
export default async function tokenize(
  lang: LanguageID,
  text: string
): Promise<Token[]> {
  let tokenizer: Tokenizer;
  switch (lang) {
    case 'en':
      tokenizer = new EnglishTokenizer();
      break;
    case 'ja':
      tokenizer = new JapaneseTokenizer();
      break;
    default:
      throw new Error(`Unsupport language: ${lang}`);
  }
  const words = await tokenizer.splitter(text);
  // Loop-invariant: fetch the stop-word set once instead of once per word.
  const stopWords = await tokenizer.getStopWords();

  const wordToPositions = new Map<
    string,
    {word: string; positions: number[]}
  >();
  let index = 0;
  for (const word of words) {
    const lowerWord = word.toLowerCase();
    // The stop-word lists are lowercase while the stemmer already receives
    // the lowercased word; checking the lowercased form too keeps capitalized
    // stop words (e.g. a sentence-initial "The") out of the index.
    if (stopWords.has(word) || stopWords.has(lowerWord)) {
      continue;
    }

    const stemWord = await tokenizer.stemmer(lowerWord);
    const known = wordToPositions.get(stemWord);
    if (known) {
      wordToPositions.set(stemWord, {
        word,
        positions: known.positions.concat(index),
      });
    } else {
      wordToPositions.set(stemWord, {word, positions: [index]});
    }
    index++;
  }

  // Every index in [0, index) is assigned below because each kept word
  // contributed exactly one position.
  const res: Token[] = new Array(index);
  for (const [stemWord, {word, positions}] of wordToPositions) {
    for (const pos of positions) {
      res[pos] = {
        word,
        normalizedWord: stemWord,
        positions,
      };
    }
  }

  return res;
}

--------------------------------------------------------------------------------
/src/tokenizer/tokneize.test.ts:
--------------------------------------------------------------------------------
import tokeneize from './tokenize';
import type {Token} from './index';

describe('tokeneize', () => {
  it('english', async () => {
    const word =
      "Node.js is a JavaScript runtime built on Chrome's V8 JavaScript engine.";
    const wants: Token[] = [
      {word: 'Node.js', normalizedWord: 'nodej', positions: [0]},
      {word: 'JavaScript', normalizedWord: 'javascript', positions: [1, 6]},
      {word: 'runtime', normalizedWord: 'runtim', positions: [2]},
      {word: 'built', normalizedWord: 'built', positions: [3]},
      {word: "Chrome's", normalizedWord: 'chrome', positions: [4]},
      {word: 'V8', normalizedWord: 'v8', positions: [5]},
      {word: 'JavaScript', normalizedWord: 'javascript', positions: [1, 6]},
      {word: 'engine', normalizedWord: 'engin', positions: [7]},
    ];
    const res = await tokeneize('en', word);
    for (const i in res) {
      const [token, want] = [res[i], wants[i]];
      expect(token.normalizedWord).toBe(want.normalizedWord);
      expect(token.word).toBe(want.word);
      expect(token.positions).toStrictEqual(want.positions);
    }
  });

  it('japanese', async () => {
    const word =
      'Node.js は、Chrome の V8 JavaScript エンジン で動作する JavaScript 環境です。';
    const wants: Token[] = [
      {word: 'Node', normalizedWord: 'node', positions: [0]},
      {word: 'js', normalizedWord: 'js', positions: [1]},
      {word:
'Chrome', normalizedWord: 'chrome', positions: [2]},
      {word: 'V', normalizedWord: 'v', positions: [3]},
      {word: '8', normalizedWord: '8', positions: [4]},
      {
        word: 'JavaScript',
        normalizedWord: 'javascript',
        positions: [5, 8],
      },
      {word: 'エンジン', normalizedWord: 'エンジン', positions: [6]},
      {word: '動作', normalizedWord: '動作', positions: [7]},
      {
        word: 'JavaScript',
        normalizedWord: 'javascript',
        positions: [5, 8],
      },
      {word: '環境', normalizedWord: '環境', positions: [9]},
    ];

    const res = await tokeneize('ja', word);
    for (const i in res) {
      const [token, want] = [res[i], wants[i]];
      expect(token.normalizedWord).toBe(want.normalizedWord);
      expect(token.word).toBe(want.word);
      expect(token.positions).toStrictEqual(want.positions);
    }
  });
});

--------------------------------------------------------------------------------
/src/utils/firestore.ts:
--------------------------------------------------------------------------------
import type {
  Firestore,
  WriteBatch,
  DocumentReference,
  SetOptions,
  Precondition,
  WriteResult,
  Query,
  CollectionReference,
  FieldPath,
} from '@google-cloud/firestore';

export type WriteBatch2Options = {
  batch?: WriteBatch;
};

// NOTE(review): the generic parameters of the types below (and of the
// signatures further down) were lost in the listing this file was recovered
// from; they are reconstructed from usage — confirm against the repository.
type WriteData<T> = WriteCreateData<T> | WriteSetData<T> | WriteDeleteData;
type WriteCreateData<T> = {
  type: 'create';
  data: Partial<T>;
};
type WriteSetData<T> = {
  type: 'set';
  data: Partial<T>;
  options?: SetOptions;
};
type WriteDeleteData = {
  type: 'delete';
  precondition?: Precondition;
};

// Maximum number of writes put into a single Firestore batch.
const MAX_WRITES_PER_BATCH = 500;

/**
 * Returns a copy of `arr` flattened by up to `d` levels of nesting.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function flatDeep(arr: Array<any>, d = 1): Array<any> {
  return d > 0
    ? arr.reduce(
        (acc, val) =>
          acc.concat(Array.isArray(val) ? flatDeep(val, d - 1) : val),
        []
      )
    : arr.slice();
}

/**
 * Collects document writes and, on `commit()`, spreads them over as many
 * Firestore batches as needed (at most 500 writes each).
 *
 * Writes are keyed by document reference object, so a later write registered
 * with the same reference object replaces the earlier one.
 */
export class WriteBatch2 {
  #db: Firestore;
  #externalBatch: WriteBatch | null;
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  #writeDocumentMap = new Map<DocumentReference<any>, WriteData<any>>();
  #commited = false;

  /**
   * @param db Firestore instance used to create batches.
   * @param options `batch`: a caller-owned batch. It is used (and left
   *     uncommitted) only when at most 499 writes are pending at commit time.
   */
  constructor(db: Firestore, options?: WriteBatch2Options) {
    this.#db = db;
    this.#externalBatch = options?.batch ?? null;
    this.#commited = false;
  }

  /** Registers a create for `documentRef`. Returns `this` for chaining. */
  create<T>(documentRef: DocumentReference<T>, data: T): WriteBatch2 {
    this.#writeDocumentMap.set(documentRef, {type: 'create', data});
    return this;
  }

  /** Registers a set for `documentRef`. Returns `this` for chaining. */
  set<T>(
    documentRef: DocumentReference<T>,
    data: Partial<T>,
    options?: SetOptions
  ): WriteBatch2 {
    this.#writeDocumentMap.set(documentRef, {type: 'set', data, options});
    return this;
  }

  /** Registers a delete for `documentRef`. Returns `this` for chaining. */
  delete(
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    documentRef: DocumentReference<any>,
    precondition?: Precondition
  ): WriteBatch2 {
    this.#writeDocumentMap.set(documentRef, {type: 'delete', precondition});
    return this;
  }

  /**
   * Applies all registered writes.
   *
   * - At most 499 writes and an external batch was supplied: the writes are
   *   added to that batch, nothing is committed here and `[]` is returned;
   *   the owner of the external batch is expected to commit it.
   * - Otherwise: the writes are split into internal batches of at most 500
   *   writes, all batches are committed concurrently and their results are
   *   returned as one flat array. An external batch, if any, is not used.
   *
   * @throws Error when called more than once.
   */
  async commit(): Promise<WriteResult[]> {
    if (this.#commited) {
      throw new Error('commited');
    }
    this.#commited = true;
    const isSmallDocs = this.#writeDocumentMap.size <= 499;
    let currentBatch = isSmallDocs
      ? this.#externalBatch ?? this.#db.batch()
      : this.#db.batch();
    const batchs: WriteBatch[] = [currentBatch];
    let writesInBatch = 0;
    for (const [ref, data] of this.#writeDocumentMap) {
      // Fix: the rotation used to be `i % 500 === 0`, checked after the write
      // with `i` starting at 0, so a new batch was opened right after the
      // very first write. The first batch (including a caller-supplied
      // external batch) then held a single write and the rest were committed
      // separately. Rotate only once the current batch is actually full.
      if (writesInBatch === MAX_WRITES_PER_BATCH) {
        currentBatch = this.#db.batch();
        batchs.push(currentBatch);
        writesInBatch = 0;
      }

      switch (data.type) {
        case 'create':
          currentBatch.create(ref, data.data);
          break;
        case 'set':
          if (data.options) {
            currentBatch.set(ref, data.data, data.options);
          } else {
            currentBatch.set(ref, data.data);
          }
          break;
        case 'delete':
          currentBatch.delete(ref, data.precondition);
          break;
      }
      writesInBatch++;
    }

    // With <= 499 writes no rotation happens, so every write went into the
    // external batch; committing it is left to its owner.
    if (isSmallDocs && this.#externalBatch) {
      return [];
    }

    const results = await Promise.all(batchs.map(batch => batch.commit()));
    return flatDeep(results);
  }
}

/**
 * Narrows `query` to documents whose `fieldPath` string starts with `value`,
 * ordered by that field.
 *
 * The upper bound is `value` with the UTF-16 code unit of its last character
 * incremented by one.
 * NOTE(review): an empty `value` gives an upper bound of '\u0000', so the
 * range does not behave as "match everything" — confirm callers never pass
 * an empty prefix.
 */
export function startsWith(
  query: Query | CollectionReference,
  fieldPath: string | FieldPath,
  value: string
) {
  const start = value.slice(0, value.length - 1);
  const end = value.slice(value.length - 1, value.length);
  const v = `${start}${String.fromCharCode(end.charCodeAt(0) + 1)}`;
  return query
    .where(fieldPath, '>=', value)
    .where(fieldPath, '<', v)
    .orderBy(fieldPath);
}

--------------------------------------------------------------------------------
/testdata/5.en.json:
--------------------------------------------------------------------------------
{
  "reference": "https://en.wikipedia.org/",
  "items": [
    {
      "title": "Dog",
      "description": "The dog (Canis familiaris when considered a distinct species or Canis lupus familiaris when considered a subspecies of the wolf) is a domesticated carnivore of the family Canidae.
It is part of the wolf-like canids, and is the most widely abundant terrestrial carnivore. The dog and the extant gray wolf are sister taxa as modern wolves are not closely related to the wolves that were first domesticated, which implies that the direct ancestor of the dog is extinct. The dog was the first species to be domesticated, and has been selectively bred over millennia for various behaviors, sensory capabilities, and physical attributes.Their long association with humans has led dogs to be uniquely attuned to human behavior, and they can thrive on a starch-rich diet that would be inadequate for other canids. Dogs vary widely in shape, size, and colors. They perform many roles for humans, such as hunting, herding, pulling loads, protection, assisting police and military, companionship, and, more recently, aiding disabled people, and therapeutic roles. This influence on human society has given them the sobriquet of \"man's best friend.\"" 7 | }, 8 | { 9 | "title": "Cat", 10 | "description": "The cat (Felis catus) is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae and is often referred to as the domestic cat to distinguish it from the wild members of the family. A cat can either be a house cat, a farm cat or a feral cat; the latter ranges freely and avoids human contact. Domestic cats are valued by humans for companionship and their ability to hunt rodents. About 60 cat breeds are recognized by various cat registries.The cat is similar in anatomy to the other felid species: it has a strong flexible body, quick reflexes, sharp teeth and retractable claws adapted to killing small prey. Its night vision and sense of smell are well developed. Cat communication includes vocalizations like meowing, purring, trilling, hissing, growling and grunting as well as cat-specific body language. A predator that is most active at dawn and dusk, the cat is a solitary hunter but a social species. 
It can hear sounds too faint or too high in frequency for human ears, such as those made by mice and other small mammals. It secretes and perceives pheromones.Female domestic cats can have kittens from spring to late autumn, with litter sizes often ranging from two to five kittens. Domestic cats are bred and shown at events as registered pedigreed cats, a hobby known as cat fancy. Failure to control breeding of pet cats by spaying and neutering, as well as abandonment of pets, resulted in large numbers of feral cats worldwide, contributing to the extinction of entire bird, mammal, and reptile species, and evoking population control.Cats were first domesticated in the Near East around 7500 BC. It was long thought that cat domestication was initiated in ancient Egypt, as since around 3100 BC veneration was given to cats in ancient Egypt. As of 2017, the domestic cat was the second-most popular pet in the United States, with 95 million cats owned. In the United Kingdom, around 7.3 million cats lived in more than 4.8 million households as of 2019." 11 | }, 12 | { 13 | "title": "Horse", 14 | "description": "The horse (Equus ferus caballus) is one of two extant subspecies of Equus ferus. It is an odd-toed ungulate mammal belonging to the taxonomic family Equidae. The horse has evolved over the past 45 to 55 million years from a small multi-toed creature, Eohippus, into the large, single-toed animal of today. Humans began domesticating horses around 4000 BC, and their domestication is believed to have been widespread by 3000 BC. Horses in the subspecies caballus are domesticated, although some domesticated populations live in the wild as feral horses. These feral populations are not true wild horses, as this term is used to describe horses that have never been domesticated, such as the endangered Przewalski's horse, a separate subspecies, and the only remaining true wild horse. 
There is an extensive, specialized vocabulary used to describe equine-related concepts, covering everything from anatomy to life stages, size, colors, markings, breeds, locomotion, and behavior.\nHorses are adapted to run, allowing them to quickly escape predators, possessing an excellent sense of balance and a strong fight-or-flight response. Related to this need to flee from predators in the wild is an unusual trait: horses are able to sleep both standing up and lying down, with younger horses tending to sleep significantly more than adults. Female horses, called mares, carry their young for approximately 11 months, and a young horse, called a foal, can stand and run shortly following birth. Most domesticated horses begin training under a saddle or in a harness between the ages of two and four. They reach full adult development by age five, and have an average lifespan of between 25 and 30 years.\nHorse breeds are loosely divided into three categories based on general temperament: spirited \"hot bloods\" with speed and endurance; \"cold bloods\", such as draft horses and some ponies, suitable for slow, heavy work; and \"warmbloods\", developed from crosses between hot bloods and cold bloods, often focusing on creating breeds for specific riding purposes, particularly in Europe. There are more than 300 breeds of horse in the world today, developed for many different uses.\nHorses and humans interact in a wide variety of sport competitions and non-competitive recreational pursuits, as well as in working activities such as police work, agriculture, entertainment, and therapy. Horses were historically used in warfare, from which a wide variety of riding and driving techniques developed, using many different styles of equipment and methods of control. Many products are derived from horses, including meat, milk, hide, hair, bone, and pharmaceuticals extracted from the urine of pregnant mares. 
Humans provide domesticated horses with food, water, and shelter, as well as attention from specialists such as veterinarians and farriers.\n\n" 15 | }, 16 | { 17 | "title": "Bird", 18 | "description": "Birds are a group of warm-blooded vertebrates constituting the class Aves , characterised by feathers, toothless beaked jaws, the laying of hard-shelled eggs, a high metabolic rate, a four-chambered heart, and a strong yet lightweight skeleton. Birds live worldwide and range in size from the 5.5 cm (2.2 in) bee hummingbird to the 2.8 m (9 ft 2 in) ostrich. There are about ten thousand living species, more than half of which are passerine, or \"perching\" birds. Birds have wings whose development varies according to species; the only known groups without wings are the extinct moa and elephant birds. Wings, which evolved from forelimbs, gave birds the ability to fly, although further evolution has led to the loss of flight in some birds, including ratites, penguins, and diverse endemic island species. The digestive and respiratory systems of birds are also uniquely adapted for flight. Some bird species of aquatic environments, particularly seabirds and some waterbirds, have further evolved for swimming.\nBirds are a group of feathered theropod dinosaurs, and constitute the only living dinosaurs. Likewise, birds are considered reptiles in the modern cladistic sense of the term, and their closest living relatives are the crocodilians. Birds are descendants of the primitive avialans (whose members include Archaeopteryx) which first appeared about 160 million years ago (mya) in China. According to DNA evidence, modern birds (Neornithes) evolved in the Middle to Late Cretaceous, and diversified dramatically around the time of the Cretaceous–Paleogene extinction event 66 mya, which killed off the pterosaurs and all non-avian dinosaurs.\nMany social species pass on knowledge across generations, which is considered a form of culture. 
Birds are social, communicating with visual signals, calls, and songs, and participating in such behaviours as cooperative breeding and hunting, flocking, and mobbing of predators. The vast majority of bird species are socially (but not necessarily sexually) monogamous, usually for one breeding season at a time, sometimes for years, but rarely for life. Other species have breeding systems that are polygynous (one male with many females) or, rarely, polyandrous (one female with many males). Birds produce offspring by laying eggs which are fertilised through sexual reproduction. They are usually laid in a nest and incubated by the parents. Most birds have an extended period of parental care after hatching.\nMany species of birds are economically important as food for human consumption and raw material in manufacturing, with domesticated and undomesticated birds being important sources of eggs, meat, and feathers. Songbirds, parrots, and other species are popular as pets. Guano (bird excrement) is harvested for use as a fertiliser. Birds figure throughout human culture. About 120 to 130 species have become extinct due to human activity since the 17th century, and hundreds more before then. Human activity threatens about 1,200 bird species with extinction, though efforts are underway to protect them. Recreational birdwatching is an important part of the ecotourism industry.\n\n" 19 | }, 20 | { 21 | "title": "Cattle", 22 | "description": "Cattle, or cows (female) and bulls (male), are the most common type of large domesticated ungulates. They are a prominent modern member of the subfamily Bovinae, are the most widespread species of the genus Bos, and are most commonly classified collectively as Bos taurus.\nCattle are commonly raised as livestock for meat (beef or veal, see beef cattle), for milk (see dairy cattle), and for hides, which are used to make leather. 
They are used as riding animals and draft animals (oxen or bullocks, which pull carts, plows and other implements). Another product of cattle is their dung, which can be used to create manure or fuel. In some regions, such as parts of India, cattle have significant religious meaning. Cattle, mostly small breeds such as the Miniature Zebu, are also kept as pets.\nAround 10,500 years ago, cattle were domesticated from as few as 80 progenitors in central Anatolia, the Levant and Western Iran. According to the Food and Agriculture Organization (FAO), there are approximately 1.5 billion cattle in the world as of 2018. In 2009, cattle became one of the first livestock animals to have a fully mapped genome." 23 | } 24 | ] 25 | } -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./node_modules/gts/tsconfig-google.json", 3 | "compilerOptions": { 4 | "rootDir": "src", 5 | "outDir": "lib", 6 | "allowSyntheticDefaultImports": true, 7 | "esModuleInterop": true, 8 | }, 9 | "include": [ 10 | "src/**/*.ts", 11 | "test/**/*.ts" 12 | ], 13 | "exclude": [ 14 | "src/**/*.test.ts", 15 | "src/**/*.spec.ts" 16 | ] 17 | } --------------------------------------------------------------------------------