├── .eslintignore
├── .eslintrc.json
├── .firebaserc
├── .github
    └── workflows
    │   ├── ci.yaml
    │   └── npm-release.yaml
├── .gitignore
├── .prettierrc.js
├── LICENSE
├── README.md
├── firebase.json
├── jest.config.js
├── jest.setup.js
├── package-lock.json
├── package.json
├── src
    ├── counter.ts
    ├── cursor.ts
    ├── index.spec.ts
    ├── index.ts
    ├── pagination.spec.ts
    ├── query.spec.ts
    ├── query.ts
    ├── sort.ts
    ├── tokenizer
    │   ├── english.ts
    │   ├── index.ts
    │   ├── japanese.ts
    │   ├── tokenize.ts
    │   └── tokneize.test.ts
    └── utils
    │   └── firestore.ts
├── testdata
    └── 5.en.json
└── tsconfig.json


/.eslintignore:
--------------------------------------------------------------------------------
1 | lib/
2 | jest.setup.js


--------------------------------------------------------------------------------
/.eslintrc.json:
--------------------------------------------------------------------------------
1 | {
2 |   "extends": "./node_modules/gts/"
3 | }
4 | 


--------------------------------------------------------------------------------
/.firebaserc:
--------------------------------------------------------------------------------
1 | {}
2 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
 1 | name: Node.js CI
 2 | 
 3 | on:
 4 |   push:
 5 |     paths:
 6 |     - '**.ts'
 7 |     - '**.js'
 8 |     - '*.json'
 9 | 
10 | jobs:
11 |   build:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     strategy:
16 |       matrix:
17 |         node-version: [10.x, 12.x, 14.x]
18 | 
19 |     steps:
20 |     - uses: actions/checkout@v2
21 |     - name: Use Node.js ${{ matrix.node-version }}
22 |       uses: actions/setup-node@v1
23 |       with:
24 |         node-version: ${{ matrix.node-version }}
25 |     - run: npm install
26 |     - run: npm run build --if-present
27 |     - run: npm test
28 |       env:
29 |         CI: true


--------------------------------------------------------------------------------
/.github/workflows/npm-release.yaml:
--------------------------------------------------------------------------------
 1 | name: Node.js Package
 2 | on:
 3 |   release:
 4 |     types: [created]
 5 | jobs:
 6 |   build:
 7 |     runs-on: ubuntu-latest
 8 |     steps:
 9 |     - uses: actions/checkout@v2
10 |     - uses: actions/setup-node@v1
11 |       with:
12 |         node-version: '12.x'
13 |         registry-url: 'https://registry.npmjs.org'
14 |     - run: npm install
15 |     - run: npm test
16 |     - run: npm publish
17 |       env:
18 |         NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | lib
3 | *.log


--------------------------------------------------------------------------------
/.prettierrc.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 |   ...require('gts/.prettierrc.json'),
3 | }
4 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Firestore Full-Text Search
 2 | 
 3 | Firestore Full-Text Search provides a Firestore-specific full-text search function.  
 4 | It runs on Cloud Functions and has excellent performance.  
 5 | It supports simple inverted-index search.
 6 | 
 7 | #### Usage
 8 | 
 9 | ```bash
10 | npm install --save firestore-full-text-search
11 | ```
12 | 
13 | ```ts
14 | import admin from 'firebase-admin';
15 | import FirestoreFullTextSearch from 'firestore-full-text-search';
16 | 
17 | admin.initializeApp({...});
18 | const db = admin.firestore();
19 | 
20 | // Specifies the collection in which to store the inverted index.
21 | const fullTextSearch = new FirestoreFullTextSearch(db.collection('index'));
22 | 
23 | 
24 | // Set documents
25 | const postData: Post = {
26 |     title: "What's Firestore Full-Text Search?",
27 |     content:
28 |     'Firestore Full-Text Search provides a Firestore-specific full-text search function. It runs on Cloud Functions and has excellent performance.',
29 |     created: admin.firestore.FieldValue.serverTimestamp(),
30 | };
31 | 
32 | const docRef = db.collection('posts').doc('1');
33 | 
34 | // WriteBatch is supported so that documents and search indexes can be stored atomically.
35 | const batch = db.batch();
36 | batch.set(docRef, postData);
37 | await fullTextSearch.set('en', docRef, {batch, data: postData});
38 | await batch.commit();
39 | ```
40 | 
41 | ```js
42 | // Search documents
43 | const {hits} = await fullTextSearch.search('en', 'firestore');
44 | ```
45 | 
46 | #### ToDo
47 | 
48 | - [x] English Support
49 | - [x] Japanese Support
50 | - [x] Implement Query parser
51 | - [x] Implement Delete document 
52 | - [x] Sorting Support
53 | - [x] Limit Support
54 | - [x] Pagination Support
55 | - [x] OpenTelemetry Support
56 | - [ ] Browser Support (Search-Only)
57 | - [ ] Firebase Performance Monitoring Support


--------------------------------------------------------------------------------
/firebase.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "emulators": {
 3 |     "firestore": {
 4 |       "port": 5000
 5 |     },
 6 |     "ui": {
 7 |       "enabled": true
 8 |     }
 9 |   }
10 | }
11 | 


--------------------------------------------------------------------------------
/jest.config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 |   preset: 'ts-jest',
3 |   testEnvironment: 'node',
4 |   testPathIgnorePatterns: ['lib'],
5 |   setupFilesAfterEnv: [`${process.cwd()}/jest.setup.js`],
6 | };
7 | 


--------------------------------------------------------------------------------
/jest.setup.js:
--------------------------------------------------------------------------------
1 | jest.setTimeout(30000);
2 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "firestore-full-text-search",
 3 |   "main": "lib/index.js",
 4 |   "types": "lib/index.d.ts",
 5 |   "files": [
 6 |     "README.md",
 7 |     "package.json",
 8 |     "lib"
 9 |   ],
10 |   "keywords": [
11 |     "search",
12 |     "Full Text Search",
13 |     "Firebase",
14 |     "firebase",
15 |     "firestore"
16 |   ],
17 |   "version": "0.6.1",
18 |   "description": "Firestore Full-text Search",
19 |   "scripts": {
20 |     "test": "firebase emulators:exec jest",
21 |     "start:emulators": "firebase emulators:start --project test",
22 |     "jest": "jest",
23 |     "lint": "gts lint",
24 |     "clean": "gts clean",
25 |     "compile": "tsc",
26 |     "fix": "gts fix",
27 |     "prepare": "npm run compile",
28 |     "pretest": "npm run compile",
29 |     "posttest": "npm run lint"
30 |   },
31 |   "repository": {
32 |     "type": "git",
33 |     "url": "git+https://github.com/k2wanko/firestore-full-text-search.git"
34 |   },
35 |   "author": "k2wanko",
36 |   "license": "Apache-2.0",
37 |   "bugs": {
38 |     "url": "https://github.com/k2wanko/firestore-full-text-search/issues"
39 |   },
40 |   "homepage": "https://github.com/k2wanko/firestore-full-text-search#readme",
41 |   "devDependencies": {
42 |     "@google-cloud/firestore": "^4.8.1",
43 |     "@opentelemetry/core": "^0.14.0",
44 |     "@opentelemetry/metrics": "^0.14.0",
45 |     "@opentelemetry/node": "^0.14.0",
46 |     "@opentelemetry/tracing": "^0.14.0",
47 |     "@types/jest": "^26.0.20",
48 |     "@types/kuromoji": "^0.1.0",
49 |     "@types/luxon": "^1.25.1",
50 |     "@types/node": "^14.11.2",
51 |     "firebase-tools": "^9.2.1",
52 |     "gts": "^3.0.3",
53 |     "jest": "^26.6.3",
54 |     "ts-jest": "^26.4.4",
55 |     "typescript": "^4.0.3"
56 |   },
57 |   "dependencies": {
58 |     "@opentelemetry/api": "^0.14.0",
59 |     "firebase-admin": "^9.4.2",
60 |     "firebase-functions": "^3.13.0",
61 |     "kuromoji": "^0.1.2",
62 |     "luxon": "^1.25.0"
63 |   }
64 | }
65 | 


--------------------------------------------------------------------------------
/src/counter.ts:
--------------------------------------------------------------------------------
 1 | import type {DocumentReference} from '@google-cloud/firestore';
 2 | import {FieldValue} from '@google-cloud/firestore';
 3 | import {WriteBatch2} from './utils/firestore';
 4 | 
 5 | // Adds `numIncrement` to one shard document of the `count` subcollection of
 6 | // `ref`; the shard id is floor(random * numShards) rendered as a string.
 7 | // With `options.batch` the write goes through that batch; else it is awaited.
 8 | export async function incrementCounter(
 9 |   ref: DocumentReference,
10 |   numShards: number,
11 |   numIncrement: number,
12 |   options?: {batch: WriteBatch2}
13 | ) {
14 |   const slot = Math.floor(Math.random() * numShards);
15 |   const target = ref.collection('count').doc(`${slot}`);
16 |   const pending = options?.batch;
17 |   const delta = {count: FieldValue.increment(numIncrement)};
18 |   if (!pending) {
19 |     await target.set(delta, {merge: true});
20 |     return;
21 |   }
22 |   pending.set(target, delta, {merge: true});
23 | }
24 | 
25 | // Sums the `count` field of every document in `ref`'s `count` subcollection.
26 | export async function getCount(ref: DocumentReference): Promise<number> {
27 |   const {docs} = await ref.collection('count').get();
28 |   return docs.reduce(
29 |     (sum, shard) => sum + (shard.data().count as number),
30 |     0
31 |   );
32 | }
33 | 


--------------------------------------------------------------------------------
/src/cursor.ts:
--------------------------------------------------------------------------------
 1 | import type {FieldValue} from '@google-cloud/firestore';
 2 | export type Cursor = string; // base64 text as produced by createCursor
 3 | // Payload that createCursor serializes and parseCursor returns.
 4 | export interface CursorInfo {
 5 |   fields: string[]; // field paths; CursorBuilder appends them in add() order
 6 |   fieldValueMap: {[key: string]: FieldValue}; // value stored for each path
 7 | }
 8 | // Encodes `info` as the base64 form of its UTF-8 JSON text.
 9 | async function createCursor(info: CursorInfo): Promise<Cursor> {
10 |   return Buffer.from(JSON.stringify(info), 'utf8').toString('base64');
11 | }
12 | // Decodes a cursor (base64 -> text -> JSON); the result is cast, not validated.
13 | export async function parseCursor(cursor: Cursor): Promise<CursorInfo> {
14 |   const json = Buffer.from(cursor, 'base64').toString();
15 |   return JSON.parse(json) as CursorInfo;
16 | }
17 | /** Collects (path, value) pairs in order and encodes them as a Cursor. */
18 | export class CursorBuilder {
19 |   #fields: string[] = [];
20 |   // No prototype, so inherited keys like "constructor" never look present.
21 |   #fieldValueMap: {[key: string]: FieldValue} = Object.create(null);
22 | 
23 |   /** Registers `val` under `path`; throws if `path` was already added. */
24 |   add(path: string, val: FieldValue) {
25 |     if (path in this.#fieldValueMap) {
26 |       throw new Error(`exits path: ${path}`);
27 |     }
28 |     this.#fields.push(path);
29 |     this.#fieldValueMap[path] = val;
30 |   }
31 | 
32 |   /** Encodes the collected fields and values via `createCursor`. */
33 |   async build(): Promise<Cursor> {
34 |     return createCursor({
35 |       fields: this.#fields,
36 |       fieldValueMap: this.#fieldValueMap,
37 |     });
38 |   }
39 | }
40 | 


--------------------------------------------------------------------------------
/src/index.spec.ts:
--------------------------------------------------------------------------------
  1 | import admin from 'firebase-admin';
  2 | import FirestoreFullTextSearch, {WordEntity} from './index';
  3 | import type {FieldValue} from '@google-cloud/firestore';
  4 | // Default to the local Firestore emulator unless a host is already set.
  5 | process.env.FIRESTORE_EMULATOR_HOST =
  6 |   process.env.FIRESTORE_EMULATOR_HOST || 'localhost:5000';
  7 | 
  8 | admin.initializeApp({
  9 |   projectId: 'test',
 10 | });
 11 | // Shape of the blog-post fixtures written by the tests below.
 12 | export type Post = {
 13 |   title: string;
 14 |   content: string;
 15 |   created: Date | FieldValue;
 16 |   label?: string[];
 17 | };
 18 | // Shape of the dog fixtures indexed in the search:sort setup.
 19 | export type Animal = {
 20 |   type: string;
 21 |   description: string;
 22 |   like: number;
 23 | };
 24 | // English indexing/search tests; search:* cases read what set:* cases indexed.
 25 | describe('FirestoreFullTextSearch:english', () => {
 26 |   it('set:simple', async () => {
 27 |     const db = admin.firestore();
 28 |     // Write the source post first; set() below is given only its reference.
 29 |     const postsRef = db.collection('posts');
 30 |     const postData: Post = {
 31 |       title: "What's Firestore Full-Text Search?",
 32 |       content:
 33 |         'Firestore Full-Text Search provides a Firestore-specific full-text search function. It runs on Cloud Functions and has excellent performance.',
 34 |       created: admin.firestore.FieldValue.serverTimestamp(),
 35 |     };
 36 | 
 37 |     const docRef = postsRef.doc('gF4lmS8gOlkAPlqGzTHh');
 38 |     await docRef.set(postData);
 39 | 
 40 |     const indexRef = db.collection('index_simple');
 41 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
 42 |     await fullTextSearch.set('en', docRef);
 43 |     // Expect one index entry per field (title, content) under the word "search".
 44 |     const word = 'search';
 45 |     const wants = ['title', 'content'];
 46 |     for (const field of wants) {
 47 |       const contentRef = indexRef.doc(
 48 |         `/v1/words/${word}/docs/${docRef.id}.${field}`
 49 |       );
 50 |       const contentSnap = await contentRef.get();
 51 |       expect(contentSnap.exists).toBe(true);
 52 |     }
 53 |   });
 54 |   // Same check, but the post write and set() are given the same WriteBatch.
 55 |   it('set:batch', async () => {
 56 |     const db = admin.firestore();
 57 | 
 58 |     const postsRef = db.collection('posts');
 59 |     const postData: Post = {
 60 |       title: "What's Firebase?",
 61 |       content:
 62 |         'Firebase helps you build and run successful apps.\n Backed by Google and loved by app development teams - from startups to global enterprises.',
 63 |       created: admin.firestore.FieldValue.serverTimestamp(),
 64 |     };
 65 | 
 66 |     const docRef = postsRef.doc('aF7lmS8gOlkAPlqGzTHh');
 67 | 
 68 |     const batch = db.batch();
 69 |     batch.set(docRef, postData);
 70 | 
 71 |     const indexRef = db.collection('index_simple');
 72 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
 73 |     await fullTextSearch.set('en', docRef, {batch, data: postData});
 74 | 
 75 |     await batch.commit();
 76 |     // 'firebas' is presumably the stemmed token for "Firebase" — TODO confirm.
 77 |     const word = 'firebas';
 78 |     const wants = ['title', 'content'];
 79 |     for (const field of wants) {
 80 |       const contentRef = indexRef.doc(
 81 |         `/v1/words/${word}/docs/${docRef.id}.${field}`
 82 |       );
 83 |       const contentSnap = await contentRef.get();
 84 |       expect(contentSnap.exists).toBe(true);
 85 |     }
 86 |   });
 87 |   // Index the same ref three times with titles differing only in letter case.
 88 |   it('set:related', async () => {
 89 |     const db = admin.firestore();
 90 |     const indexRef = db.collection('related');
 91 |     const testData = [
 92 |       {
 93 |         data: {
 94 |           title: "What's JavaScript",
 95 |         },
 96 |       },
 97 |       {
 98 |         data: {
 99 |           title: "What's Javascript",
100 |         },
101 |       },
102 |       {
103 |         data: {
104 |           title: "What's javascript",
105 |         },
106 |       },
107 |     ];
108 | 
109 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
110 |     for (const {data} of testData) {
111 |       await fullTextSearch.set('en', db.doc('post/1'), {data});
112 |     }
113 |     // The word document should list every original spelling under `related`.
114 |     const snap = await db.doc('/related/v1/words/javascript').get();
115 |     const data = snap.data() as WordEntity;
116 |     expect(data.related.sort()).toStrictEqual(
117 |       ['JavaScript', 'Javascript', 'javascript'].sort()
118 |     );
119 |   });
120 |   // Of the two posts indexed above, only the set:simple one says "firestore".
121 |   it('search:simple', async () => {
122 |     const db = admin.firestore();
123 |     const indexRef = db.collection('index_simple');
124 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
125 |     const {hits} = await fullTextSearch.search('en', 'firestore');
126 |     expect(hits.length).toBe(1);
127 |     expect(hits[0].id).toBe('gF4lmS8gOlkAPlqGzTHh');
128 |   });
129 |   // The two-word query is expected to return both indexed posts.
130 |   it('search:double-keywords', async () => {
131 |     const db = admin.firestore();
132 |     const indexRef = db.collection('index_simple');
133 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
134 |     const {hits} = await fullTextSearch.search('en', 'firebase firestore');
135 | 
136 |     expect(hits.length).toBe(2);
137 |   });
138 | 
139 |   it('search:nothing', async () => {
140 |     const db = admin.firestore();
141 |     const indexRef = db.collection('index_simple');
142 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
143 |     const {hits} = await fullTextSearch.search('en', 'nothing');
144 |     expect(hits.length).toBe(0);
145 |   });
146 | });
147 | // Sort and delete tests; each one seeds its own index collection.
148 | describe('FirestoreFullTextSearch', () => {
149 |   const db = admin.firestore();
150 | 
151 |   beforeAll(async () => {
152 |     const dogs: {[key: string]: Animal} = {
153 |       akita: {
154 |         type: 'dog',
155 |         description:
156 |           'The Akita (秋田犬, Akita-inu, Japanese pronunciation: [akʲita.inɯ]) is a large breed of dog originating from the mountainous regions of northern Japan.',
157 |         like: 10,
158 |       },
159 |       corgi: {
160 |         type: 'dog',
161 |         description:
162 |           'The Welsh Corgi (/ˈkɔːrɡi/[5] plural "Corgis" or occasionally the etymologically consistent "Corgwn"; /ˈkɔːrɡuːn/) is a small type of herding dog that originated in Wales.[6]',
163 |         like: 50,
164 |       },
165 |       'border collie': {
166 |         type: 'dog',
167 |         description:
168 |           'The Border Collie is a working and herding dog breed developed in the Anglo-Scottish border county of Northumberland, for herding livestock, especially sheep.[1]',
169 |         like: 5,
170 |       },
171 |     };
172 |     // One batch per dog: the dog document plus whatever set() adds to the batch.
173 |     const indexRef = db.collection('index_dogs_sort');
174 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
175 |     for (const [id, data] of Object.entries(dogs)) {
176 |       const batch = db.batch();
177 |       const dogRef = db.collection('dogs').doc(id);
178 |       batch.set(dogRef, data);
179 |       await fullTextSearch.set('en', dogRef, {
180 |         data,
181 |         batch,
182 |         indexMask: ['description'],
183 |         fields: ['like'],
184 |       });
185 |       await batch.commit();
186 |     }
187 |   });
188 | 
189 |   it('search:sort', async () => {
190 |     const indexRef = db.collection('index_dogs_sort');
191 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
192 |     const {hits} = await fullTextSearch.search('en', 'herding');
193 |     expect(hits.length).toBe(2);
194 |     expect(hits[0].id).toBe('border collie');
195 |     expect(hits[1].id).toBe('corgi');
196 |     // NOTE(review): id order and ascending `like` both yield this sequence.
197 |   });
198 | 
199 |   it('delete:document', async () => {
200 |     const postsRef = db.collection('posts');
201 |     const postData: Post = {
202 |       title: "What's Firestore Full-Text Search?",
203 |       content:
204 |         'Firestore Full-Text Search provides a Firestore-specific full-text search function. It runs on Cloud Functions and has excellent performance.',
205 |       created: admin.firestore.FieldValue.serverTimestamp(),
206 |     };
207 | 
208 |     const docRef = postsRef.doc('post1');
209 |     await docRef.set(postData);
210 | 
211 |     const indexRef = db.collection('index_delete_test');
212 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
213 |     await fullTextSearch.set('en', docRef);
214 |     // Precondition: both field entries for "search" exist before deletion.
215 |     const word = 'search';
216 |     const wants = ['title', 'content'];
217 |     for (const field of wants) {
218 |       const contentRef = indexRef.doc(
219 |         `/v1/words/${word}/docs/${docRef.id}.${field}`
220 |       );
221 |       const contentSnap = await contentRef.get();
222 |       expect(contentSnap.exists).toBe(true);
223 |     }
224 | 
225 |     await fullTextSearch.delete('en', docRef);
226 |     // Postcondition: delete() removed those same entries.
227 |     for (const field of wants) {
228 |       const contentRef = indexRef.doc(
229 |         `/v1/words/${word}/docs/${docRef.id}.${field}`
230 |       );
231 |       const contentSnap = await contentRef.get();
232 |       expect(contentSnap.exists).toBe(false);
233 |     }
234 |   });
235 | });
236 | 
// Indexing with the Japanese tokenizer: after `set('ja', ...)` the index must
// hold an entry for the word 'パフォーマンス' taken from the `content` field.
describe('FirestoreFullTextSearch:japanese', () => {
  it('set:simple', async () => {
    const firestore = admin.firestore();

    const post: Post = {
      title: 'Firestore Full-Text Searchとは?',
      content:
        'Firestore Full-Text Search は、Firestoreに特化した全文検索機能を提供します。Cloud Functions上で動作し、優れたパフォーマンスを発揮します。',
      created: admin.firestore.FieldValue.serverTimestamp(),
    };
    const postRef = firestore.collection('posts').doc('gF4lmS8gOlkAPlqGzTHh');
    await postRef.set(post);

    const indexRef = firestore.collection('index_ja');
    const search = new FirestoreFullTextSearch(indexRef);
    await search.set('ja', postRef);

    const word = 'パフォーマンス';
    const expectedFields = ['content'];
    for (const field of expectedFields) {
      const entryPath = `/v1/words/${word}/docs/${postRef.id}.${field}`;
      const entry = await indexRef.doc(entryPath).get();
      expect(entry.exists).toBe(true);
    }
  });
});
267 | 


--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
  1 | import type {
  2 |   CollectionReference,
  3 |   DocumentData,
  4 |   DocumentReference,
  5 |   Firestore,
  6 |   Query,
  7 |   WriteBatch,
  8 | } from '@google-cloud/firestore';
  9 | import {FieldValue} from '@google-cloud/firestore';
 10 | import type {LanguageID, Token} from './tokenizer';
 11 | import tokenize from './tokenizer/tokenize';
 12 | import {trace, metrics} from '@opentelemetry/api';
 13 | import {parseQuery, SearchQuery} from './query';
 14 | import {calcScore} from './sort';
 15 | import {getCount, incrementCounter} from './counter';
 16 | import {WriteBatch2} from './utils/firestore';
 17 | import {Cursor, CursorBuilder, parseCursor} from './cursor';
 18 | 
 19 | export type FieldEntity = {
 20 |   __positions: Buffer;
 21 |   __score: number; // tf * idf
 22 |   __ref: DocumentReference;
 23 | };
 24 | 
 25 | export type WordEntity = {
 26 |   related: string[];
 27 | };
 28 | 
 29 | export type CounterEntity = {
 30 |   count: number;
 31 | };
 32 | 
 33 | export type Options = {
 34 |   sharedCounterNum?: number;
 35 | };
 36 | 
 37 | export type SetOptions = {
 38 |   batch?: WriteBatch;
 39 |   data?: DocumentData;
 40 |   indexMask?: string[];
 41 |   fields?: string[];
 42 | };
 43 | 
 44 | export type DeleteOptions = {
 45 |   batch?: WriteBatch;
 46 |   data?: DocumentData;
 47 |   indexMask?: string[];
 48 | };
 49 | 
 50 | export type SearchOptions = {
 51 |   limit?: number;
 52 |   cursor?: Cursor;
 53 | };
 54 | 
 55 | export type SearchResult = {
 56 |   hits: DocumentReference[];
 57 |   total: number;
 58 |   cursor?: Cursor;
 59 | };
 60 | 
 61 | export type FieldTypeEntity = {
 62 |   type: FieldType;
 63 | };
 64 | 
 65 | export type FieldType = 'string' | 'array' | 'number' | 'date';
 66 | 
 67 | const tracer = trace.getTracer('firestore-full-text-search');
 68 | 
 69 | const meter = metrics.getMeterProvider().getMeter('firestore-full-text-search');
 70 | const documentWriteCounter = meter.createCounter('document_write_count');
 71 | const documentWriteTokenCounter = meter.createCounter(
 72 |   'document_write_token_count'
 73 | );
 74 | // const searchTokenCounter = meter.createCounter('search_token_count');
 75 | 
 76 | const defaultSharedCounterNum = 3;
 77 | 
 78 | export default class FirestoreFullTextSearch {
 79 |   #ref: CollectionReference;
 80 |   #db: Firestore;
 81 |   #wordsRef: CollectionReference;
 82 |   #wordDocsRef: CollectionReference;
 83 |   #fieldsRef: CollectionReference;
 84 |   #options?: Options;
 85 | 
 86 |   constructor(ref: CollectionReference, options?: Options) {
 87 |     this.#ref = ref;
 88 |     this.#db = ref.firestore;
 89 |     this.#wordsRef = ref.doc('v1').collection('words');
 90 |     this.#wordDocsRef = ref.doc('v1').collection('word_docs');
 91 |     this.#fieldsRef = ref.doc('v1').collection('fields');
 92 |     this.#options = options;
 93 |   }
 94 | 
 95 |   async set(lang: LanguageID, doc: DocumentReference, options?: SetOptions) {
 96 |     const span = tracer.startSpan('set');
 97 |     span.setAttributes({
 98 |       index: this.#ref.path,
 99 |       doc: doc.path,
100 |       lang,
101 |     });
102 |     let data = options?.data;
103 |     if (!data) {
104 |       const snap = await doc.get();
105 |       if (!snap.exists) {
106 |         throw new Error('Document does not exist.');
107 |       }
108 |       data = snap.data() as DocumentData; // exists checked.
109 |     }
110 | 
111 |     const _data = data;
112 |     if (!_data) {
113 |       throw new Error('Document is empty');
114 |     }
115 | 
116 |     const batch = new WriteBatch2(this.#db, {batch: options?.batch});
117 |     const indexMask = options?.indexMask;
118 |     const fields = options?.fields;
119 | 
120 |     const allDocCount = await getCount(this.#ref.doc('v1'));
121 | 
122 |     let newDocCount = 0;
123 |     const newWordCountMap = new Map<string, number>();
124 |     const tokensMap = new Map<string, Token[]>();
125 |     const targetFields = new Set<string>();
126 |     let writeCount = 0;
127 |     let writeTokenCount = 0;
128 |     for (const [fieldName, value] of Object.entries(data)) {
129 |       if (indexMask) {
130 |         if (!indexMask.includes(fieldName)) {
131 |           continue;
132 |         }
133 |       }
134 | 
135 |       if (fieldName.startsWith('__')) {
136 |         continue;
137 |       }
138 | 
139 |       if (typeof value !== 'string') {
140 |         continue;
141 |       }
142 |       targetFields.add(fieldName);
143 |     }
144 | 
145 |     for (const fieldName of targetFields) {
146 |       const value = data[fieldName];
147 |       if (typeof value !== 'string') {
148 |         continue;
149 |       }
150 |       const tokens = await tokenize(lang, value);
151 |       tokensMap.set(fieldName, tokens);
152 |       for (const token of tokens) {
153 |         const word = token.normalizedWord;
154 |         if (!word) {
155 |           continue;
156 |         }
157 | 
158 |         const wordRef = this.#wordsRef.doc(word);
159 |         const docRef = wordRef.collection('docs').doc(`${doc.id}.${fieldName}`);
160 |         const res = await docRef.get();
161 |         if (!res.exists) {
162 |           newDocCount = 1;
163 |           newWordCountMap.set(word, 1);
164 |         }
165 |       }
166 |     }
167 | 
168 |     for (const fieldName of targetFields) {
169 |       const value = data[fieldName];
170 |       if (typeof value !== 'string') {
171 |         continue;
172 |       }
173 | 
174 |       const tokens = tokensMap.get(fieldName);
175 |       if (!tokens) {
176 |         throw new Error('Not found tokens');
177 |       }
178 |       for (const token of tokens) {
179 |         const word = token.normalizedWord;
180 |         if (!word) {
181 |           continue;
182 |         }
183 |         const wordRef = this.#wordsRef.doc(word);
184 |         const wordSnap = await wordRef.get();
185 |         if (wordSnap.exists) {
186 |           const wordData = wordSnap.data() as WordEntity;
187 |           batch.set(
188 |             wordRef,
189 |             {
190 |               related: Array.from(
191 |                 new Set(wordData.related.concat([token.word])).keys()
192 |               ),
193 |             },
194 |             {merge: true}
195 |           );
196 |         } else {
197 |           batch.set(wordRef, {related: [token.word]});
198 |         }
199 | 
200 |         const wordDocCount = await getCount(wordRef);
201 |         const docRef = wordRef.collection('docs').doc(`${doc.id}.${fieldName}`);
202 |         const wordDocRef = this.#wordDocsRef.doc(`${word}.${doc.id}`);
203 |         const docData = {
204 |           __word: word,
205 |           __fields: Array.from(targetFields.values()),
206 |           __positions: new Uint8Array(token.positions),
207 |           __score: calcScore(
208 |             token.positions.length,
209 |             tokens.length,
210 |             wordDocCount + (newWordCountMap.get(word) ?? 0),
211 |             allDocCount + newDocCount
212 |           ),
213 |           __ref: doc,
214 |         };
215 |         if (fields) {
216 |           const fieldTypes: {[key: string]: FieldType} = {};
217 |           const fieldData: {[key: string]: unknown} = {};
218 |           const _fieldData = fields.reduce((p, name) => {
219 |             const val = _data[name];
220 |             if (Array.isArray(val)) {
221 |               fieldTypes[name] = 'array';
222 |               p[name] = val.sort();
223 |             } else {
224 |               if (val instanceof Date) {
225 |                 fieldTypes[name] = 'date';
226 |                 p[name] = val;
227 |               } else if (
228 |                 val instanceof FieldValue &&
229 |                 val.isEqual(FieldValue.serverTimestamp())
230 |               ) {
231 |                 fieldTypes[name] = 'date';
232 |                 p[name] = val;
233 |               } else {
234 |                 switch (typeof val) {
235 |                   case 'string':
236 |                     fieldTypes[name] = 'string';
237 |                     p[name] = _data[name];
238 |                     break;
239 |                   case 'number':
240 |                     fieldTypes[name] = 'number';
241 |                     p[name] = _data[name];
242 |                     break;
243 |                   default:
244 |                     throw new Error(`Unsupport filed type ${typeof val}`);
245 |                 }
246 |               }
247 |             }
248 |             return p;
249 |           }, fieldData);
250 |           for (const [name, type] of Object.entries(fieldTypes)) {
251 |             batch.set(this.#fieldsRef.doc(name), {
252 |               type,
253 |             } as FieldTypeEntity);
254 |           }
255 |           batch.set(docRef, {...{__ref: doc}, ..._fieldData});
256 |           batch.set(wordDocRef, {...docData, ..._fieldData});
257 |         } else {
258 |           batch.set(docRef, {__ref: doc});
259 |           batch.set(wordDocRef, docData);
260 |         }
261 | 
262 |         if (newWordCountMap.has(word)) {
263 |           await incrementCounter(
264 |             wordRef,
265 |             this.#options?.sharedCounterNum ?? defaultSharedCounterNum,
266 |             newWordCountMap.get(word) ?? 0,
267 |             {batch}
268 |           );
269 |         }
270 |         writeCount += 1;
271 |       }
272 | 
273 |       writeTokenCount += tokens.length;
274 |     }
275 | 
276 |     await incrementCounter(
277 |       this.#ref.doc('v1'),
278 |       this.#options?.sharedCounterNum ?? defaultSharedCounterNum,
279 |       newDocCount,
280 |       {batch}
281 |     );
282 | 
283 |     await batch.commit();
284 | 
285 |     documentWriteCounter
286 |       .bind({
287 |         index: this.#ref.path,
288 |         lang,
289 |       })
290 |       .add(writeCount);
291 |     documentWriteTokenCounter
292 |       .bind({
293 |         index: this.#ref.path,
294 |         lang,
295 |       })
296 |       .add(writeTokenCount);
297 |     span.end();
298 |   }
299 | 
300 |   async delete(
301 |     lang: LanguageID,
302 |     doc: DocumentReference,
303 |     options?: DeleteOptions
304 |   ) {
305 |     const span = tracer.startSpan('delete');
306 |     span.setAttributes({
307 |       index: this.#ref.path,
308 |       doc: doc.path,
309 |       lang,
310 |     });
311 | 
312 |     let data = options?.data;
313 |     if (!data) {
314 |       const snap = await doc.get();
315 |       if (!snap.exists) {
316 |         throw new Error('Document does not exist.');
317 |       }
318 |       data = snap.data() as DocumentData; // exists checked.
319 |     }
320 | 
321 |     const _data = data;
322 |     if (!_data) {
323 |       throw new Error('Document is empty');
324 |     }
325 | 
326 |     const batch = new WriteBatch2(this.#db, {batch: options?.batch});
327 |     const indexMask = options?.indexMask;
328 |     let docCount = 0;
329 | 
330 |     for (const [fieldName, vaule] of Object.entries(data)) {
331 |       if (indexMask) {
332 |         if (!indexMask.includes(fieldName)) {
333 |           continue;
334 |         }
335 |       }
336 | 
337 |       if (fieldName.startsWith('__')) {
338 |         continue;
339 |       }
340 | 
341 |       if (typeof vaule !== 'string') {
342 |         continue;
343 |       }
344 | 
345 |       const tokens = await tokenize(lang, vaule);
346 |       for (const token of tokens) {
347 |         const word = token.normalizedWord;
348 |         if (!word) {
349 |           continue;
350 |         }
351 |         const wordRef = this.#wordsRef.doc(word);
352 |         const docRef = wordRef.collection('docs').doc(`${doc.id}.${fieldName}`);
353 |         const wordDocRef = this.#wordDocsRef.doc(`${word}.${doc.id}`);
354 | 
355 |         batch.delete(docRef);
356 |         batch.delete(wordDocRef);
357 |         await incrementCounter(
358 |           wordRef,
359 |           this.#options?.sharedCounterNum ?? defaultSharedCounterNum,
360 |           -1,
361 |           {batch}
362 |         );
363 |         docCount = 1;
364 |       }
365 |     }
366 | 
367 |     await incrementCounter(
368 |       this.#ref.doc('v1'),
369 |       this.#options?.sharedCounterNum ?? defaultSharedCounterNum,
370 |       docCount * -1,
371 |       {batch}
372 |     );
373 | 
374 |     await batch.commit();
375 | 
376 |     span.end();
377 |   }
378 | 
379 |   async search(
380 |     lang: LanguageID,
381 |     stringOrQuery: string | SearchQuery,
382 |     options?: SearchOptions
383 |   ): Promise<SearchResult> {
384 |     const span = tracer.startSpan('search');
385 |     span.setAttributes({
386 |       index: this.#ref.path,
387 |       lang,
388 |     });
389 | 
390 |     const cursorQueue: string[] = [];
391 | 
392 |     let searchQuery: SearchQuery;
393 |     if (typeof stringOrQuery === 'string') {
394 |       searchQuery = parseQuery(stringOrQuery);
395 |     } else {
396 |       searchQuery = stringOrQuery;
397 |     }
398 | 
399 |     let limit = options?.limit ?? 100;
400 |     if (limit < 1) {
401 |       limit = 1;
402 |     } else if (limit > 500) {
403 |       limit = 500;
404 |     }
405 | 
406 |     const fields = searchQuery?.fields;
407 |     type fieldInfo = {name: string; type: FieldType};
408 |     let fieldInfos: fieldInfo[] | null = null;
409 |     if (fields) {
410 |       const snap = await this.#db.getAll(
411 |         ...fields.map(field => this.#fieldsRef.doc(field.name))
412 |       );
413 |       fieldInfos = snap.map(doc => ({name: doc.id, type: doc.data()?.type}));
414 |     }
415 | 
416 |     const words: string[] = [];
417 |     let total = 0;
418 |     for (const keyword of searchQuery.keywords) {
419 |       const tokens = await tokenize(lang, keyword);
420 |       for (const token of tokens) {
421 |         words.push(token.normalizedWord);
422 |         const wordRef = this.#wordsRef.doc(token.normalizedWord);
423 |         const count = await getCount(wordRef);
424 |         if (count === 0) {
425 |           continue;
426 |         }
427 |         total += count;
428 |       }
429 |     }
430 | 
431 |     let query: Query = this.#wordDocsRef;
432 |     if (words.length === 1) {
433 |       query = query.where('__word', '==', words[0]);
434 |     } else {
435 |       query = query.where('__word', 'in', words);
436 |     }
437 | 
438 |     if (fieldInfos) {
439 |       for (const info of fieldInfos) {
440 |         if (!fields) {
441 |           continue;
442 |         }
443 |         const field = fields.find(f => f.name === info.name);
444 |         if (!field) {
445 |           continue;
446 |         }
447 |         switch (info.type) {
448 |           case 'string':
449 |             query = query.where(field.name, field.operator, field.value);
450 |             break;
451 |           case 'array':
452 |             switch (field.operator) {
453 |               case '==':
454 |                 query = query.where(field.name, 'in', [[field.value].sort()]);
455 |                 break;
456 |               case '!=':
457 |                 query = query.where(field.name, 'not-in', [
458 |                   [field.value].sort(),
459 |                 ]);
460 |                 break;
461 |               default:
462 |             }
463 |             break;
464 |           default:
465 |             query = query.where(field.name, field.operator, field.value);
466 |         }
467 |       }
468 |     } else {
469 |       query = query.orderBy('__score', 'desc');
470 |       cursorQueue.push('__score');
471 |     }
472 | 
473 |     const cursor = options?.cursor;
474 |     if (cursor) {
475 |       const info = await parseCursor(cursor);
476 |       query = query.startAfter(
477 |         ...info.fields.map(field => info.fieldValueMap[field])
478 |       );
479 |     }
480 | 
481 |     if (limit !== undefined) {
482 |       query = query.limit(limit);
483 |     }
484 | 
485 |     const snap = await query.get();
486 | 
487 |     if (snap.empty) {
488 |       return {hits: [], total};
489 |     }
490 | 
491 |     const lastVisible = snap.docs[snap.docs.length - 1];
492 |     const cursorBuilder = new CursorBuilder();
493 |     for (const queue of cursorQueue) {
494 |       cursorBuilder.add(queue, lastVisible.data()[queue]);
495 |     }
496 | 
497 |     const hits = snap.docs.map(doc => doc.data().__ref);
498 | 
499 |     return {
500 |       hits,
501 |       total,
502 |       cursor: hits.length < limit ? undefined : await cursorBuilder.build(),
503 |     };
504 |   }
505 | }
506 | 


--------------------------------------------------------------------------------
/src/pagination.spec.ts:
--------------------------------------------------------------------------------
  1 | import admin from 'firebase-admin';
  2 | import fs from 'fs';
  3 | import path from 'path';
  4 | import {getCount} from './counter';
  5 | import FirestoreFullTextSearch from './index';
  6 | 
  7 | process.env.FIRESTORE_EMULATOR_HOST =
  8 |   process.env.FIRESTORE_EMULATOR_HOST || 'localhost:5000';
  9 | 
 10 | admin.initializeApp({
 11 |   projectId: 'test',
 12 | });
 13 | 
 14 | const db = admin.firestore();
 15 | const docs = db.collection('animals');
 16 | const index = db.collection('pagination');
 17 | const fullTextSearch = new FirestoreFullTextSearch(index);
 18 | 
 19 | describe('pagination', () => {
 20 |   beforeAll(async () => {
 21 |     const count = await getCount(index.doc('v1'));
 22 |     if (count !== 0) {
 23 |       return;
 24 |     }
 25 | 
 26 |     const {items} = await new Promise((resolve, reject) => {
 27 |       fs.readFile(
 28 |         path.resolve(__dirname, '..', 'testdata', '5.en.json'),
 29 |         (err, data) => {
 30 |           if (err) {
 31 |             reject(err);
 32 |             return;
 33 |           }
 34 | 
 35 |           resolve(JSON.parse(data.toString('utf-8')));
 36 |         }
 37 |       );
 38 |     });
 39 |     for (const {title, description} of items) {
 40 |       const batch = db.batch();
 41 |       const ref = docs.doc(title);
 42 |       const data = {description};
 43 |       await batch.set(ref, data);
 44 |       await fullTextSearch.set('en', ref, {data, batch});
 45 |       await batch.commit();
 46 |     }
 47 |   });
 48 | 
 49 |   it('basic', async () => {
 50 |     const {hits, total, cursor} = await fullTextSearch.search('en', 'member', {
 51 |       limit: 2,
 52 |     });
 53 | 
 54 |     // console.log({hits: hits.map(hit => hit.id), total, cursor});
 55 | 
 56 |     expect(hits.length).toBe(2);
 57 |     expect(hits.map(hit => hit.path)).toStrictEqual([
 58 |       'animals/Cattle',
 59 |       'animals/Cat',
 60 |     ]);
 61 |     expect(total).toBe(3);
 62 | 
 63 |     const {
 64 |       hits: hits2,
 65 |       total: total2,
 66 |       cursor: cursor2,
 67 |     } = await fullTextSearch.search('en', 'member', {
 68 |       limit: 2,
 69 |       cursor,
 70 |     });
 71 | 
 72 |     console.log({hits2: hits2.map(hit => hit.id), cursor2});
 73 | 
 74 |     expect(hits2.length).toBe(1);
 75 |     expect(cursor2).toBe(undefined);
 76 |     expect(hits2.map(hit => hit.path)).toStrictEqual(['animals/Bird']);
 77 |     expect(total2).toBe(3);
 78 |   });
 79 | 
 80 |   // it('startAfter', async () => {
 81 |   //   const wordsSnap = await db
 82 |   //     .collection('/pagination/v1/word_docs')
 83 |   //     .where('__word', '==', 'member')
 84 |   //     .orderBy('__score', 'desc')
 85 |   //     .limit(2)
 86 |   //     .get();
 87 | 
 88 |   //   const last = wordsSnap.docs[wordsSnap.docs.length - 1];
 89 |   //   console.log({ids: wordsSnap.docs.map(doc => doc.id)});
 90 | 
 91 |   //   const nextSnap = await db
 92 |   //     .collection('/pagination/v1/word_docs')
 93 |   //     .where('__word', '==', 'member')
 94 |   //     .orderBy('__score', 'desc')
 95 |   //     .startAfter(last)
 96 |   //     .limit(2)
 97 |   //     .get();
 98 |   //   console.log({ids: nextSnap.docs.map(doc => doc.id)});
 99 |   // });
100 | 
101 |   // it('startsWith', async () => {
102 |   //   const wordsRef = index.doc('v1').collection('words');
103 |   //   const query = startsWith(wordsRef, FieldPath.documentId(), 'a');
104 |   //   const snap = await query.get();
105 |   //   console.log({size: snap.size, path: wordsRef.path});
106 |   //   for (const doc of snap.docs) {
107 |   //     console.log(doc.id);
108 |   //   }
109 |   // });
110 | });
111 | 


--------------------------------------------------------------------------------
/src/query.spec.ts:
--------------------------------------------------------------------------------
  1 | import admin from 'firebase-admin';
  2 | import {DateTime} from 'luxon';
  3 | import FirestoreFullTextSearch from './index';
  4 | import {parseQuery, SearchQuery} from './query';
  5 | import {Post, Animal} from './index.spec';
  6 | 
  7 | process.env.FIRESTORE_EMULATOR_HOST =
  8 |   process.env.FIRESTORE_EMULATOR_HOST || 'localhost:5000';
  9 | 
 10 | admin.initializeApp({
 11 |   projectId: 'test',
 12 | });
 13 | 
 14 | describe('parseQuery', () => {
 15 |   it('nothing', () => {
 16 |     const res = parseQuery('');
 17 |     const want: SearchQuery = {
 18 |       keywords: [],
 19 |     };
 20 |     expect(res).toStrictEqual(want);
 21 |   });
 22 | 
 23 |   it('simple', () => {
 24 |     const res = parseQuery('dog');
 25 |     const want: SearchQuery = {
 26 |       keywords: ['dog'],
 27 |     };
 28 |     expect(res).toStrictEqual(want);
 29 |   });
 30 | 
 31 |   it('2 keywords', () => {
 32 |     const res = parseQuery('dog cat');
 33 |     const want: SearchQuery = {
 34 |       keywords: ['dog', 'cat'],
 35 |     };
 36 |     expect(res).toStrictEqual(want);
 37 |   });
 38 | 
 39 |   it('has space keyword', () => {
 40 |     const res = parseQuery('"welsh corgi"');
 41 |     const want: SearchQuery = {
 42 |       keywords: ['welsh corgi'],
 43 |     };
 44 |     expect(res).toStrictEqual(want);
 45 |   });
 46 | 
 47 |   it('has space keywords', () => {
 48 |     const res = parseQuery('"welsh corgi" "cardigan welsh corgi"');
 49 |     const want: SearchQuery = {
 50 |       keywords: ['welsh corgi', 'cardigan welsh corgi'],
 51 |     };
 52 |     expect(res).toStrictEqual(want);
 53 |   });
 54 | 
 55 |   it('string:field-in', () => {
 56 |     const res = parseQuery('dog label:"welsh corgi"');
 57 |     const want: SearchQuery = {
 58 |       keywords: ['dog'],
 59 |       fields: [
 60 |         {name: 'label', type: 'string', operator: '==', value: 'welsh corgi'},
 61 |       ],
 62 |     };
 63 |     expect(res).toStrictEqual(want);
 64 |   });
 65 | 
 66 |   it('string:field-not-in', () => {
 67 |     const res = parseQuery('dog -label:"welsh corgi"');
 68 |     const want: SearchQuery = {
 69 |       keywords: ['dog'],
 70 |       fields: [
 71 |         {name: 'label', type: 'string', operator: '!=', value: 'welsh corgi'},
 72 |       ],
 73 |     };
 74 |     expect(res).toStrictEqual(want);
 75 |   });
 76 | 
 77 |   // // @k2wanko: I can't think of a way to make it work with the current indexing mechanism.
 78 |   //   it('string:not', () => {
 79 |   //     const res = parseQuery('dog NOT "welsh corgi"');
 80 |   //     const want: SearchQuery = {
 81 |   //       keywords: ['dog'],
 82 |   //       fields: [
 83 |   //         {name: 'label', type: 'string', operator: 'NOT', value: 'welsh corgi'},
 84 |   //       ],
 85 |   //     };
 86 |   //     expect(res).toStrictEqual(want);
 87 |   //   });
 88 | 
 89 |   it('number:greater-than', () => {
 90 |     const res = parseQuery('dog like:>10');
 91 |     const want: SearchQuery = {
 92 |       keywords: ['dog'],
 93 |       fields: [{name: 'like', type: 'number', operator: '>', value: 10}],
 94 |     };
 95 |     expect(res).toStrictEqual(want);
 96 |   });
 97 | 
 98 |   it('number:greater-than-or-equal', () => {
 99 |     const res = parseQuery('dog like:>=10');
100 |     const want: SearchQuery = {
101 |       keywords: ['dog'],
102 |       fields: [{name: 'like', type: 'number', operator: '>=', value: 10}],
103 |     };
104 |     expect(res).toStrictEqual(want);
105 |   });
106 | 
107 |   it('number:less-than', () => {
108 |     const res = parseQuery('dog like:<10');
109 |     const want: SearchQuery = {
110 |       keywords: ['dog'],
111 |       fields: [{name: 'like', type: 'number', operator: '<', value: 10}],
112 |     };
113 |     expect(res).toStrictEqual(want);
114 |   });
115 | 
116 |   it('number:less-than-or-equal', () => {
117 |     const res = parseQuery('dog like:<=10');
118 |     const want: SearchQuery = {
119 |       keywords: ['dog'],
120 |       fields: [{name: 'like', type: 'number', operator: '<=', value: 10}],
121 |     };
122 |     expect(res).toStrictEqual(want);
123 |   });
124 | 
125 |   it('date:greater-than', () => {
126 |     const res = parseQuery('hello created:>2021-01-01');
127 |     const want: SearchQuery = {
128 |       keywords: ['hello'],
129 |       fields: [
130 |         {
131 |           name: 'created',
132 |           type: 'date',
133 |           operator: '>',
134 |           value: DateTime.fromISO('2021-01-01').toJSDate(),
135 |         },
136 |       ],
137 |     };
138 |     expect(res).toStrictEqual(want);
139 |   });
140 | 
141 |   it('date:greater-than-or-equal', () => {
142 |     const res = parseQuery('hello created:>=2021-01-01');
143 |     const want: SearchQuery = {
144 |       keywords: ['hello'],
145 |       fields: [
146 |         {
147 |           name: 'created',
148 |           type: 'date',
149 |           operator: '>=',
150 |           value: DateTime.fromISO('2021-01-01').toJSDate(),
151 |         },
152 |       ],
153 |     };
154 |     expect(res).toStrictEqual(want);
155 |   });
156 | 
157 |   it('date:less-than', () => {
158 |     const res = parseQuery('hello created:<2021-01-01');
159 |     const want: SearchQuery = {
160 |       keywords: ['hello'],
161 |       fields: [
162 |         {
163 |           name: 'created',
164 |           type: 'date',
165 |           operator: '<',
166 |           value: DateTime.fromISO('2021-01-01').toJSDate(),
167 |         },
168 |       ],
169 |     };
170 |     expect(res).toStrictEqual(want);
171 |   });
172 | 
173 |   it('date:less-than-or-equal', () => {
174 |     const res = parseQuery('hello created:<=2021-01-01');
175 |     const want: SearchQuery = {
176 |       keywords: ['hello'],
177 |       fields: [
178 |         {
179 |           name: 'created',
180 |           type: 'date',
181 |           operator: '<=',
182 |           value: DateTime.fromISO('2021-01-01').toJSDate(),
183 |         },
184 |       ],
185 |     };
186 |     expect(res).toStrictEqual(want);
187 |   });
188 | });
189 | 
190 | describe('querySearch', () => {
191 |   beforeAll(async () => {
192 |     const db = admin.firestore();
193 | 
194 |     const postsRef = db.collection('posts');
195 |     const postData: Post = {
196 |       title: 'Test Post',
197 |       content: 'Hello',
198 |       created: DateTime.fromISO('2021-01-01').toJSDate(),
199 |       label: ['draft'],
200 |     };
201 |     const postData2: Post = {
202 |       title: 'Test Post',
203 |       content: 'Hello',
204 |       created: DateTime.fromISO('2021-01-02').toJSDate(),
205 |       label: ['published'],
206 |     };
207 |     const postData3: Post = {
208 |       title: 'Test Post 2',
209 |       content: 'Hello World',
210 |       created: DateTime.fromISO('2021-02-01').toJSDate(),
211 |       label: ['published'],
212 |     };
213 | 
214 |     const docRef = postsRef.doc('bF7lfaw8gOlkAPlqGzTHh');
215 |     const docRef2 = postsRef.doc('cF7lfawhaOlkAPlqGzTHh');
216 |     const docRef3 = postsRef.doc('dF7lfawhaOlkAPlqGzTHh');
217 | 
218 |     const batch = db.batch();
219 |     batch.set(docRef, postData);
220 |     batch.set(docRef2, postData2);
221 |     batch.set(docRef3, postData3);
222 | 
223 |     const indexRef = db.collection('index_posts');
224 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
225 |     await fullTextSearch.set('en', docRef, {
226 |       batch,
227 |       data: postData,
228 |       indexMask: ['content'],
229 |       fields: ['label', 'created'],
230 |     });
231 |     await fullTextSearch.set('en', docRef2, {
232 |       batch,
233 |       data: postData2,
234 |       indexMask: ['content'],
235 |       fields: ['label', 'created'],
236 |     });
237 |     await fullTextSearch.set('en', docRef3, {
238 |       batch,
239 |       data: postData3,
240 |       indexMask: ['content'],
241 |       fields: ['label', 'created'],
242 |     });
243 | 
244 |     await batch.commit();
245 |   });
246 | 
247 |   beforeAll(async () => {
248 |     const dogs: {[key: string]: Animal} = {
249 |       akita: {
250 |         type: 'dog',
251 |         description:
252 |           'The Akita (秋田犬, Akita-inu, Japanese pronunciation: [akʲita.inɯ]) is a large breed of dog originating from the mountainous regions of northern Japan.',
253 |         like: 10,
254 |       },
255 |       corgi: {
256 |         type: 'dog',
257 |         description:
258 |           'The Welsh Corgi (/ˈkɔːrɡi/[5] plural "Corgis" or occasionally the etymologically consistent "Corgwn"; /ˈkɔːrɡuːn/) is a small type of herding dog that originated in Wales.[6]',
259 |         like: 50,
260 |       },
261 |       'border collie': {
262 |         type: 'dog',
263 |         description:
264 |           'The Border Collie is a working and herding dog breed developed in the Anglo-Scottish border county of Northumberland, for herding livestock, especially sheep.[1]',
265 |         like: 5,
266 |       },
267 |     };
268 | 
269 |     const db = admin.firestore();
270 |     const batch = db.batch();
271 |     const indexRef = db.collection('index_dogs');
272 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
273 |     for (const [id, data] of Object.entries(dogs)) {
274 |       const dogRef = db.collection('dogs').doc(id);
275 |       batch.set(dogRef, data);
276 |       await fullTextSearch.set('en', dogRef, {
277 |         data,
278 |         batch,
279 |         indexMask: ['description'],
280 |         fields: ['like'],
281 |       });
282 |     }
283 |     await batch.commit();
284 |   });
285 | 
286 |   it('string:field-in', async () => {
287 |     const db = admin.firestore();
288 |     const indexRef = db.collection('index_posts');
289 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
290 |     const {hits} = await fullTextSearch.search('en', 'hello label:published');
291 |     expect(hits.length).toBe(2);
292 |   });
293 | 
294 |   it('string:field-not-in', async () => {
295 |     const db = admin.firestore();
296 |     const indexRef = db.collection('index_posts');
297 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
298 |     const {hits} = await fullTextSearch.search('en', 'hello -label:published');
299 |     expect(hits.length).toBe(1);
300 |   });
301 | 
302 |   it('number:greater-than', async () => {
303 |     const db = admin.firestore();
304 |     const indexRef = db.collection('index_dogs');
305 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
306 |     const {hits} = await fullTextSearch.search('en', 'herding like:>5');
307 |     expect(hits.length >= 1).toBe(true);
308 |     expect(hits[0].id).toBe('corgi');
309 |   });
310 | 
311 |   it('number:greater-than-or-equal', async () => {
312 |     const db = admin.firestore();
313 |     const indexRef = db.collection('index_dogs');
314 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
315 |     const {hits} = await fullTextSearch.search('en', 'herding like:>=5');
316 |     expect(hits.length >= 2).toBe(true);
317 |     expect(hits[0].id).toBe('border collie');
318 |     expect(hits[1].id).toBe('corgi');
319 |   });
320 | 
321 |   it('number:less-than', async () => {
322 |     const db = admin.firestore();
323 |     const indexRef = db.collection('index_dogs');
324 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
325 |     const {hits} = await fullTextSearch.search('en', 'herding like:<10');
326 |     expect(hits.length >= 1).toBe(true);
327 |     expect(hits[0].id).toBe('border collie');
328 |   });
329 | 
330 |   it('number:less-than-or-equal', async () => {
331 |     const db = admin.firestore();
332 |     const indexRef = db.collection('index_dogs');
333 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
334 |     const {hits} = await fullTextSearch.search('en', 'herding like:<=50');
335 |     expect(hits.length >= 2).toBe(true);
336 |     expect(hits[0].id).toBe('border collie');
337 |     expect(hits[1].id).toBe('corgi');
338 |   });
339 | 
340 |   it('date:greater-than', async () => {
341 |     const db = admin.firestore();
342 |     const indexRef = db.collection('index_posts');
343 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
344 |     const {hits} = await fullTextSearch.search(
345 |       'en',
346 |       'hello created:>2021-01-01'
347 |     );
348 |     expect(hits.length >= 2).toBe(true);
349 |     expect(hits[0].id).toBe('cF7lfawhaOlkAPlqGzTHh');
350 |     expect(hits[1].id).toBe('dF7lfawhaOlkAPlqGzTHh');
351 |   });
352 | 
353 |   it('date:greater-than-or-equal', async () => {
354 |     const db = admin.firestore();
355 |     const indexRef = db.collection('index_posts');
356 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
357 |     const {hits} = await fullTextSearch.search(
358 |       'en',
359 |       'hello created:>=2021-01-01'
360 |     );
361 |     expect(hits.length >= 3).toBe(true);
362 |     expect(hits[0].id).toBe('bF7lfaw8gOlkAPlqGzTHh');
363 |     expect(hits[1].id).toBe('cF7lfawhaOlkAPlqGzTHh');
364 |     expect(hits[2].id).toBe('dF7lfawhaOlkAPlqGzTHh');
365 |   });
366 | 
367 |   it('date:less-than', async () => {
368 |     const db = admin.firestore();
369 |     const indexRef = db.collection('index_posts');
370 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
371 |     const {hits} = await fullTextSearch.search(
372 |       'en',
373 |       'hello created:<2021-01-02'
374 |     );
375 |     expect(hits.length === 1).toBe(true);
376 |     expect(hits[0].id).toBe('bF7lfaw8gOlkAPlqGzTHh');
377 |   });
378 | 
379 |   it('date:less-than-or-equal', async () => {
380 |     const db = admin.firestore();
381 |     const indexRef = db.collection('index_posts');
382 |     const fullTextSearch = new FirestoreFullTextSearch(indexRef);
383 |     const {hits} = await fullTextSearch.search(
384 |       'en',
385 |       'hello created:<=2021-01-02'
386 |     );
387 |     expect(hits.length === 2).toBe(true);
388 |     expect(hits[0].id).toBe('bF7lfaw8gOlkAPlqGzTHh');
389 |     expect(hits[1].id).toBe('cF7lfawhaOlkAPlqGzTHh');
390 |   });
391 | });
392 | 


--------------------------------------------------------------------------------
/src/query.ts:
--------------------------------------------------------------------------------
  1 | import {DateTime} from 'luxon';
  2 | 
  3 | export type FieldType = FieldStringType | FieldNumberType | FieldDateType;
  4 | 
  5 | export type FieldStringType = {
  6 |   type: 'string';
  7 |   operator: FilterOp;
  8 |   value: string;
  9 | } & FieldTypeBase;
 10 | 
 11 | export type FieldNumberType = {
 12 |   type: 'number';
 13 |   operator: FilterOp;
 14 |   value: number;
 15 | } & FieldTypeBase;
 16 | 
 17 | export type FieldDateType = {
 18 |   type: 'date';
 19 |   operator: FilterOp;
 20 |   value: Date;
 21 | } & FieldTypeBase;
 22 | 
 23 | export type FilterOp = '==' | '!=' | '>' | '>=' | '<' | '<=';
 24 | 
 25 | export type FieldTypeBase = {name: string};
 26 | 
 27 | export type SearchQuery = {
 28 |   keywords: string[];
 29 |   fields?: FieldType[];
 30 | };
 31 | 
 32 | export function parseQuery(query: string): SearchQuery {
 33 |   if (!query) {
 34 |     return {
 35 |       keywords: [],
 36 |     };
 37 |   }
 38 | 
 39 |   const keywords: string[] = [];
 40 |   const regex = /(\S+:'(?:[^'\\]|\\.)*')|(\S+:"(?:[^"\\]|\\.)*")|(-?"(?:[^"\\]|\\.)*")|(-?'(?:[^'\\]|\\.)*')|\S+|\S+:\S+/g;
 41 |   let fields: FieldType[] | undefined;
 42 |   let match;
 43 |   while ((match = regex.exec(query)) !== null) {
 44 |     const term = match[0];
 45 |     if (!term.includes(':')) {
 46 |       keywords.push(term.replace(/"/g, ''));
 47 |       continue;
 48 |     }
 49 |     if (!fields) {
 50 |       fields = [];
 51 |     }
 52 | 
 53 |     let [name, value] = term.split(':');
 54 |     let operator: FilterOp = '==';
 55 |     if (name.startsWith('-')) {
 56 |       name = name.slice(1, name.length);
 57 |       operator = '!=';
 58 |     }
 59 | 
 60 |     let [numOp, numValOrDateOrStr] = [
 61 |       value.slice(0, 1),
 62 |       value.slice(1, value.length),
 63 |     ];
 64 |     if (numValOrDateOrStr.startsWith('=')) {
 65 |       numOp += '=';
 66 |       numValOrDateOrStr = numValOrDateOrStr.slice(1, numValOrDateOrStr.length);
 67 |     }
 68 |     const numberVal = Number.parseInt(numValOrDateOrStr);
 69 |     if (!Number.isNaN(numberVal) && !numValOrDateOrStr.includes('-')) {
 70 |       switch (numOp) {
 71 |         case '>':
 72 |         case '<':
 73 |         case '>=':
 74 |         case '<=':
 75 |           fields.push({
 76 |             name,
 77 |             type: 'number',
 78 |             operator: numOp,
 79 |             value: numberVal,
 80 |           });
 81 |           continue;
 82 |         default:
 83 |       }
 84 |     }
 85 | 
 86 |     const datetime = DateTime.fromISO(numValOrDateOrStr);
 87 |     if (datetime.invalidReason === null) {
 88 |       switch (numOp) {
 89 |         case '>':
 90 |         case '<':
 91 |         case '>=':
 92 |         case '<=':
 93 |           fields.push({
 94 |             name,
 95 |             type: 'date',
 96 |             operator: numOp,
 97 |             value: datetime.toJSDate(),
 98 |           });
 99 |           continue;
100 |         default:
101 |       }
102 |     }
103 | 
104 |     value = value.replace(/"/g, '');
105 |     fields.push({
106 |       name,
107 |       type: 'string',
108 |       operator,
109 |       value,
110 |     });
111 |   }
112 |   if (fields) {
113 |     return {
114 |       keywords,
115 |       fields,
116 |     };
117 |   }
118 |   return {
119 |     keywords,
120 |   };
121 | }
122 | 


--------------------------------------------------------------------------------
/src/sort.ts:
--------------------------------------------------------------------------------
 1 | export function calcScore(
 2 |   targetWordCount: number,
 3 |   totalWordCount: number,
 4 |   targetWordDocCount: number,
 5 |   allDocCount: number
 6 | ): number {
 7 |   return (
 8 |     (targetWordCount / totalWordCount) *
 9 |     (Math.log(allDocCount / targetWordDocCount) || 1)
10 |   );
11 | }
12 | 


--------------------------------------------------------------------------------
/src/tokenizer/english.ts:
--------------------------------------------------------------------------------
  1 | import type {LanguageID, Tokenizer} from './index';
  2 | 
  3 | const exceptions = new Map([
  4 |   ['skis', 'ski'],
  5 |   ['dying', 'die'],
  6 |   ['lying', 'lie'],
  7 |   ['tying', 'tie'],
  8 |   ['idly', 'idl'],
  9 |   ['gently', 'gentl'],
 10 |   ['ugly', 'ugli'],
 11 |   ['early', 'earli'],
 12 |   ['only', 'onli'],
 13 |   ['singly', 'singl'],
 14 |   ['sky', 'sky'],
 15 |   ['news', 'news'],
 16 |   ['howe', 'howe'],
 17 |   ['atlas', 'atlas'],
 18 |   ['cosmos', 'cosmos'],
 19 |   ['bias', 'bias'],
 20 |   ['andes', 'andes'],
 21 | ]);
 22 | 
 23 | const exceptions1a = new Map([
 24 |   ['inning', 'inning'],
 25 |   ['outing', 'outing'],
 26 |   ['canning', 'canning'],
 27 |   ['herring', 'herring'],
 28 |   ['earring', 'earring'],
 29 |   ['proceed', 'proceed'],
 30 |   ['exceed', 'exceed'],
 31 |   ['succeed', 'succeed'],
 32 | ]);
 33 | 
 34 | const extensions2 = new Map([
 35 |   ['ization', 'ize'],
 36 |   ['fulness', 'ful'],
 37 |   ['iveness', 'ive'],
 38 |   ['ational', 'ate'],
 39 |   ['ousness', 'ous'],
 40 |   ['tional', 'tion'],
 41 |   ['biliti', 'ble'],
 42 |   ['lessli', 'less'],
 43 |   ['entli', 'ent'],
 44 |   ['ation', 'ate'],
 45 |   ['alism', 'al'],
 46 |   ['aliti', 'al'],
 47 |   ['ousli', 'ous'],
 48 |   ['iviti', 'ive'],
 49 |   ['fulli', 'ful'],
 50 |   ['enci', 'ence'],
 51 |   ['anci', 'ance'],
 52 |   ['abli', 'able'],
 53 |   ['izer', 'ize'],
 54 |   ['ator', 'ate'],
 55 |   ['alli', 'al'],
 56 |   ['bli', 'ble'],
 57 |   ['ogi', 'og'],
 58 |   ['li', ''],
 59 | ]);
 60 | 
 61 | // https://github.com/stopwords-iso/stopwords-en/blob/master/raw/snowball-tartarus.txt
 62 | const stopWords = new Set<string>([
 63 |   'i',
 64 |   'me',
 65 |   'my',
 66 |   'myself',
 67 |   'we',
 68 |   'us',
 69 |   'our',
 70 |   'ours',
 71 |   'ourselves',
 72 |   'you',
 73 |   'your',
 74 |   'yours',
 75 |   'yourself',
 76 |   'yourselves',
 77 |   'he',
 78 |   'him',
 79 |   'his',
 80 |   'himself',
 81 |   'she',
 82 |   'her',
 83 |   'hers',
 84 |   'herself',
 85 |   'it',
 86 |   'its',
 87 |   'itself',
 88 |   'they',
 89 |   'them',
 90 |   'their',
 91 |   'theirs',
 92 |   'themselves',
 93 |   'what',
 94 |   'which',
 95 |   'who',
 96 |   'whom',
 97 |   'this',
 98 |   'that',
 99 |   'these',
100 |   'those',
101 |   'am',
102 |   'is',
103 |   'are',
104 |   'was',
105 |   'were',
106 |   'be',
107 |   'been',
108 |   'being',
109 |   'have',
110 |   'has',
111 |   'had',
112 |   'having',
113 |   'do',
114 |   'does',
115 |   'did',
116 |   'doing',
117 |   'will',
118 |   'would',
119 |   'shall',
120 |   'should',
121 |   'can',
122 |   'could',
123 |   'may',
124 |   'might',
125 |   'must',
126 |   'ought',
127 |   "i'm",
128 |   "you're",
129 |   "he's",
130 |   "she's",
131 |   "it's",
132 |   "we're",
133 |   "they're",
134 |   "i've",
135 |   "you've",
136 |   "we've",
137 |   "they've",
138 |   "i'd",
139 |   "you'd",
140 |   "he'd",
141 |   "she'd",
142 |   "we'd",
143 |   "they'd",
144 |   "i'll",
145 |   "you'll",
146 |   "he'll",
147 |   "she'll",
148 |   "we'll",
149 |   "they'll",
150 |   "isn't",
151 |   "aren't",
152 |   "wasn't",
153 |   "weren't",
154 |   "hasn't",
155 |   "haven't",
156 |   "hadn't",
157 |   "doesn't",
158 |   "don't",
159 |   "didn't",
160 |   "won't",
161 |   "wouldn't",
162 |   "shan't",
163 |   "shouldn't",
164 |   "can't",
165 |   'cannot',
166 |   "couldn't",
167 |   "mustn't",
168 |   "let's",
169 |   "that's",
170 |   "who's",
171 |   "what's",
172 |   "here's",
173 |   "there's",
174 |   "when's",
175 |   "where's",
176 |   "why's",
177 |   "how's",
178 |   "daren't",
179 |   "needn't",
180 |   'doubtful',
181 |   "oughtn't",
182 |   "mightn't",
183 |   'a',
184 |   'an',
185 |   'the',
186 |   'and',
187 |   'but',
188 |   'if',
189 |   'or',
190 |   'because',
191 |   'as',
192 |   'until',
193 |   'while',
194 |   'of',
195 |   'at',
196 |   'by',
197 |   'for',
198 |   'with',
199 |   'about',
200 |   'against',
201 |   'between',
202 |   'into',
203 |   'through',
204 |   'during',
205 |   'before',
206 |   'after',
207 |   'above',
208 |   'below',
209 |   'to',
210 |   'from',
211 |   'up',
212 |   'down',
213 |   'in',
214 |   'out',
215 |   'on',
216 |   'off',
217 |   'over',
218 |   'under',
219 |   'again',
220 |   'further',
221 |   'then',
222 |   'once',
223 |   'here',
224 |   'there',
225 |   'when',
226 |   'where',
227 |   'why',
228 |   'how',
229 |   'all',
230 |   'any',
231 |   'both',
232 |   'each',
233 |   'few',
234 |   'more',
235 |   'most',
236 |   'other',
237 |   'some',
238 |   'such',
239 |   'no',
240 |   'nor',
241 |   'not',
242 |   'only',
243 |   'own',
244 |   'same',
245 |   'so',
246 |   'than',
247 |   'too',
248 |   'very',
249 |   'one',
250 |   'every',
251 |   'least',
252 |   'less',
253 |   'many',
254 |   'now',
255 |   'ever',
256 |   'never',
257 |   'say',
258 |   'says',
259 |   'said',
260 |   'also',
261 |   'get',
262 |   'go',
263 |   'goes',
264 |   'just',
265 |   'made',
266 |   'make',
267 |   'put',
268 |   'see',
269 |   'seen',
270 |   'whether',
271 |   'like',
272 |   'well',
273 |   'back',
274 |   'even',
275 |   'still',
276 |   'way',
277 |   'take',
278 |   'since',
279 |   'another',
280 |   'however',
281 |   'two',
282 |   'three',
283 |   'four',
284 |   'five',
285 |   'first',
286 |   'second',
287 |   'new',
288 |   'old',
289 |   'high',
290 |   'long',
291 | ]);
292 | 
/**
 * Tokenizer for English text: whitespace splitting, the module's stop-word
 * list, and a stemmer following the Snowball English algorithm.
 */
export class EnglishTokenizer implements Tokenizer {
  getLanguage(): LanguageID {
    return 'en';
  }

  /** Returns the shared English stop-word set (all entries are lower-case). */
  async getStopWords(): Promise<Set<string>> {
    return stopWords;
  }

  /**
   * Splits text into words on any run of whitespace (spaces, tabs, line
   * breaks), strips trailing `.`, `,`, `:` and `"` from each word, and drops
   * words that end up empty.
   */
  async splitter(content: string): Promise<string[]> {
    return content
      .trim()
      .split(/\s+/)
      .map(word => word.replace(/[.,:"]+$/g, ''))
      .filter(word => word.length > 0);
  }

  /**
   * Reduces a word to its stem.
   *
   * Words shorter than three characters are returned untouched; words listed
   * in `exceptions` return their mapped value. Otherwise the word is
   * lower-cased, stripped of everything except a-z and apostrophes, and run
   * through steps 0-5 of the algorithm.
   *
   * Implemented from the algorithm at
   * http://snowball.tartarus.org/algorithms/english/stemmer.html
   *
   * @param content A single word.
   * @returns The lower-case stem.
   */
  async stemmer(content: string): Promise<string> {
    if (content.length < 3) {
      return content;
    }
    if (exceptions.has(content)) {
      return exceptions.get(content) ?? '';
    }

    // Fallback "match" whose capture group is the empty string.
    const eRx = ['', ''];
    // Consonant-like y (word-initial or after a vowel) is marked as 'Y'.
    content = content
      .toLowerCase()
      .replace(/^'/, '')
      .replace(/[^a-z']/g, '')
      .replace(/^y|([aeiouy])y/g, '$1Y');

    // R1: offset just past the first non-vowel that follows a vowel, with
    // fixed offsets for three known prefixes. The input is padded with a space
    // so a real match never has index 0 and `|| 1000` only fires on no match.
    const prefix = /^(gener|commun|arsen)/.exec(content);
    const R1 = prefix
      ? prefix[0].length
      : (/[aeiouy][^aeiouy]/.exec(' ' + content)?.index || 1000) + 1;

    // R2: the same rule applied inside R1. This must use the match index;
    // the array length of a match with no capture groups is always 1.
    const R2 =
      (/[aeiouy][^aeiouy]/.exec(' ' + content.slice(R1))?.index || 1000) +
      R1 +
      1;

    // step 0
    content = content.replace(/('s'?|')$/, '');

    // step 1a
    const rx = /(?:(ss)es|(..i)(?:ed|es)|(us)|(ss)|(.ie)(?:d|s))$/;
    if (rx.test(content)) {
      content = content.replace(rx, '$1$2$3$4$5');
    } else {
      content = content.replace(/([aeiouy].+)s$/, '$1');
    }

    if (exceptions1a.has(content)) {
      return exceptions1a.get(content) ?? '';
    }

    // step 1b
    const s1 = (/(eedly|eed)$/.exec(content) || eRx)[1],
      s2 = (/(?:[aeiouy].*)(ingly|edly|ing|ed)$/.exec(content) || eRx)[1];

    if (s1.length > s2.length) {
      if (content.indexOf(s1, R1) >= 0) {
        content = content.slice(0, content.length - s1.length) + 'ee';
      }
    } else if (s2.length > s1.length) {
      content = content.slice(0, content.length - s2.length);
      if (/(at|bl|iz)$/.test(content)) {
        content += 'e';
      } else if (/(bb|dd|ff|gg|mm|nn|pp|rr|tt)$/.test(content)) {
        content = content.slice(0, content.length - 1);
      } else if (
        !content.slice(R1) &&
        /([^aeiouy][aeiouy][^aeiouywxY]|^[aeiouy][^aeiouy]|^[aeiouy])$/.test(
          content
        )
      ) {
        content += 'e';
      }
    }

    // step 1c
    content = content.replace(/(.[^aeiouy])[yY]$/, '$1i');

    // step 2
    const sfx = /(ization|fulness|iveness|ational|ousness|tional|biliti|lessli|entli|ation|alism|aliti|ousli|iviti|fulli|enci|anci|abli|izer|ator|alli|bli|l(ogi)|[cdeghkmnrt](li))$/.exec(
      content
    );
    if (sfx) {
      const sfx2 = sfx[3] || sfx[2] || sfx[1];
      if (content.indexOf(sfx2, R1) >= 0) {
        content =
          content.slice(0, content.length - sfx2.length) +
          (extensions2.get(sfx2) ?? '');
      }
    }

    // step 3 -- guarded by its own match. An empty `sfx3` must never reach
    // the lookup below: there is no entry for '' and the word would get the
    // text "undefined" appended.
    const sfx3 = (/(ational|tional|alize|icate|iciti|ative|ical|ness|ful)$/.exec(
      content
    ) || eRx)[1];
    if (sfx3 && content.indexOf(sfx3, R1) >= 0) {
      const step3 = new Map([
        ['ational', 'ate'],
        ['tional', 'tion'],
        ['alize', 'al'],
        ['icate', 'ic'],
        ['iciti', 'ic'],
        // "ative" is deleted only when it lies in R2.
        ['ative', content.indexOf('ative', R2) >= 0 ? '' : 'ative'],
        ['ical', 'ic'],
        ['ness', ''],
        ['ful', ''],
      ]);
      content =
        content.slice(0, content.length - sfx3.length) +
        (step3.get(sfx3) ?? '');
    }

    // step 4
    const sfx4 = /(ement|ance|ence|able|ible|ment|ant|ent|ism|ate|iti|ous|ive|ize|[st](ion)|al|er|ic)$/.exec(
      content
    );
    if (sfx4) {
      const sfx5 = sfx4[2] || sfx4[1];
      if (content.indexOf(sfx5, R2) >= 0) {
        content = content.slice(0, content.length - sfx5.length);
      }
    }

    // step 5
    if (content.slice(-1) === 'e') {
      if (
        content.slice(R2) ||
        (content.slice(R1) &&
          !/([^aeiouy][aeiouy][^aeiouywxY]|^[aeiouy][^aeiouy])e$/.test(content))
      ) {
        content = content.slice(0, content.length - 1);
      }
    } else if (content.slice(-2) === 'll' && content.indexOf('l', R2) >= 0) {
      content = content.slice(0, content.length - 1);
    }

    return content.toLowerCase();
  }
}
435 | 


--------------------------------------------------------------------------------
/src/tokenizer/index.ts:
--------------------------------------------------------------------------------
 1 | export type LanguageID = English | Japanese;
 2 | export type English = 'en';
 3 | export type Japanese = 'ja';
 4 | 
 5 | export interface Tokenizer {
 6 |   getLanguage(): LanguageID;
 7 |   getStopWords(): Promise<Set<string>>;
 8 |   splitter(content: string): Promise<string[]>;
 9 |   stemmer(content: string): Promise<string>;
10 | }
11 | 
12 | export type Token = {
13 |   word: string;
14 |   normalizedWord: string;
15 |   positions: number[];
16 | };
17 | 


--------------------------------------------------------------------------------
/src/tokenizer/japanese.ts:
--------------------------------------------------------------------------------
  1 | import type {LanguageID, Tokenizer} from './index';
  2 | import path from 'path';
  3 | import kuromoji from 'kuromoji';
  4 | 
  5 | const stopWords = new Set<string>([
  6 |   'あそこ',
  7 |   'あっ',
  8 |   'あの',
  9 |   'あのかた',
 10 |   'あの人',
 11 |   'あり',
 12 |   'あります',
 13 |   'ある',
 14 |   'あれ',
 15 |   'い',
 16 |   'いう',
 17 |   'います',
 18 |   'いる',
 19 |   'う',
 20 |   'うち',
 21 |   'え',
 22 |   'お',
 23 |   'および',
 24 |   'おり',
 25 |   'おります',
 26 |   'か',
 27 |   'かつて',
 28 |   'から',
 29 |   'が',
 30 |   'き',
 31 |   'ここ',
 32 |   'こちら',
 33 |   'こと',
 34 |   'この',
 35 |   'これ',
 36 |   'これら',
 37 |   'さ',
 38 |   'さらに',
 39 |   'し',
 40 |   'しかし',
 41 |   'する',
 42 |   'ず',
 43 |   'せ',
 44 |   'せる',
 45 |   'そこ',
 46 |   'そして',
 47 |   'その',
 48 |   'その他',
 49 |   'その後',
 50 |   'それ',
 51 |   'それぞれ',
 52 |   'それで',
 53 |   'た',
 54 |   'ただし',
 55 |   'たち',
 56 |   'ため',
 57 |   'たり',
 58 |   'だ',
 59 |   'だっ',
 60 |   'だれ',
 61 |   'つ',
 62 |   'て',
 63 |   'で',
 64 |   'でき',
 65 |   'できる',
 66 |   'です',
 67 |   'では',
 68 |   'でも',
 69 |   'と',
 70 |   'という',
 71 |   'といった',
 72 |   'とき',
 73 |   'ところ',
 74 |   'として',
 75 |   'とともに',
 76 |   'とも',
 77 |   'と共に',
 78 |   'どこ',
 79 |   'どの',
 80 |   'な',
 81 |   'ない',
 82 |   'なお',
 83 |   'なかっ',
 84 |   'ながら',
 85 |   'なく',
 86 |   'なっ',
 87 |   'など',
 88 |   'なに',
 89 |   'なら',
 90 |   'なり',
 91 |   'なる',
 92 |   'なん',
 93 |   'に',
 94 |   'において',
 95 |   'における',
 96 |   'について',
 97 |   'にて',
 98 |   'によって',
 99 |   'により',
100 |   'による',
101 |   'に対して',
102 |   'に対する',
103 |   'に関する',
104 |   'の',
105 |   'ので',
106 |   'のみ',
107 |   'は',
108 |   'ば',
109 |   'へ',
110 |   'ほか',
111 |   'ほとんど',
112 |   'ほど',
113 |   'ます',
114 |   'また',
115 |   'または',
116 |   'まで',
117 |   'も',
118 |   'もの',
119 |   'ものの',
120 |   'や',
121 |   'よう',
122 |   'より',
123 |   'ら',
124 |   'られ',
125 |   'られる',
126 |   'れ',
127 |   'れる',
128 |   'を',
129 |   'ん',
130 |   '何',
131 |   '及び',
132 |   '彼',
133 |   '彼女',
134 |   '我々',
135 |   '特に',
136 |   '私',
137 |   '私達',
138 |   '貴方',
139 |   '貴方方',
140 | ]);
141 | 
/**
 * Tokenizer for Japanese text backed by kuromoji. Words are not stemmed.
 */
export class JapaneseTokenizer implements Tokenizer {
  #builder: kuromoji.TokenizerBuilder<kuromoji.IpadicFeatures>;
  // In-flight or finished dictionary load. Caching the promise rather than
  // the tokenizer lets concurrent splitter() calls share a single build.
  #tokenizer?: Promise<kuromoji.Tokenizer<kuromoji.IpadicFeatures>>;

  constructor() {
    // Locate the dictionary through module resolution instead of a fixed
    // relative path, so it is found wherever the package manager placed
    // kuromoji (nested or hoisted).
    const kuromojiRoot = path.dirname(
      require.resolve('kuromoji/package.json')
    );
    this.#builder = kuromoji.builder({
      dicPath: path.join(kuromojiRoot, 'dict'),
    });
  }

  getLanguage(): LanguageID {
    return 'ja';
  }

  /** Returns the shared Japanese stop-word set. */
  async getStopWords(): Promise<Set<string>> {
    return stopWords;
  }

  /**
   * Tokenizes text with kuromoji and returns the surface form of every token
   * except those whose part of speech is '助詞' or '記号' and the literal '.'.
   *
   * Rejects with the builder's error when the dictionary cannot be loaded.
   */
  async splitter(content: string): Promise<string[]> {
    const tokenizer = await this.loadTokenizer();
    return tokenizer
      .tokenize(content)
      .filter(
        token =>
          token.pos !== '助詞' &&
          token.pos !== '記号' &&
          token.surface_form !== '.'
      )
      .map(token => token.surface_form);
  }

  /** Identity: no stemming is applied. */
  async stemmer(content: string): Promise<string> {
    return content;
  }

  /** Builds the kuromoji tokenizer on first use and reuses it afterwards. */
  private loadTokenizer(): Promise<
    kuromoji.Tokenizer<kuromoji.IpadicFeatures>
  > {
    if (!this.#tokenizer) {
      this.#tokenizer = new Promise((resolve, reject) => {
        this.#builder.build((err, tokenizer) => {
          if (err) {
            // Forget the failed attempt so that a later call can retry.
            this.#tokenizer = undefined;
            reject(err);
            return;
          }
          resolve(tokenizer);
        });
      });
    }
    return this.#tokenizer;
  }
}
188 | 


--------------------------------------------------------------------------------
/src/tokenizer/tokenize.ts:
--------------------------------------------------------------------------------
 1 | import type {Tokenizer, LanguageID, Token} from './index';
 2 | import {EnglishTokenizer} from './english';
 3 | import {JapaneseTokenizer} from './japanese';
 4 | 
 5 | export default async function tokenize(
 6 |   lang: LanguageID,
 7 |   text: string
 8 | ): Promise<Token[]> {
 9 |   let tokeneizer: Tokenizer | null = null;
10 |   switch (lang) {
11 |     case 'en':
12 |       tokeneizer = new EnglishTokenizer();
13 |       break;
14 |     case 'ja':
15 |       tokeneizer = new JapaneseTokenizer();
16 |       break;
17 |     default:
18 |       throw new Error(`Unsupport language: ${lang}`);
19 |   }
20 |   const words = await tokeneizer.splitter(text);
21 | 
22 |   const wordToPositions = new Map<
23 |     string,
24 |     {word: string; positions: number[]}
25 |   >();
26 |   let index = 0;
27 |   for (const word of words) {
28 |     if ((await tokeneizer.getStopWords()).has(word)) {
29 |       continue;
30 |     }
31 | 
32 |     const stemWord = await tokeneizer.stemmer(word.toLowerCase());
33 |     if (wordToPositions.has(stemWord)) {
34 |       wordToPositions.set(stemWord, {
35 |         word,
36 |         positions: wordToPositions.get(stemWord)?.positions.concat(index) ?? [],
37 |       });
38 |     } else {
39 |       wordToPositions.set(stemWord, {word, positions: [index]});
40 |     }
41 |     index++;
42 |   }
43 | 
44 |   const res: Token[] = new Array(index);
45 |   for (const [stemWord, {word, positions}] of wordToPositions) {
46 |     for (const pos of positions) {
47 |       res[pos] = {
48 |         word,
49 |         normalizedWord: stemWord,
50 |         positions,
51 |       };
52 |     }
53 |   }
54 | 
55 |   return res;
56 | }
57 | 


--------------------------------------------------------------------------------
/src/tokenizer/tokneize.test.ts:
--------------------------------------------------------------------------------
 1 | import tokeneize from './tokenize';
 2 | import type {Token} from './index';
 3 | 
 4 | describe('tokeneize', () => {
 5 |   it('english', async () => {
 6 |     const word =
 7 |       "Node.js is a JavaScript runtime built on Chrome's V8 JavaScript engine.";
 8 |     const wants: Token[] = [
 9 |       {word: 'Node.js', normalizedWord: 'nodej', positions: [0]},
10 |       {word: 'JavaScript', normalizedWord: 'javascript', positions: [1, 6]},
11 |       {word: 'runtime', normalizedWord: 'runtim', positions: [2]},
12 |       {word: 'built', normalizedWord: 'built', positions: [3]},
13 |       {word: "Chrome's", normalizedWord: 'chrome', positions: [4]},
14 |       {word: 'V8', normalizedWord: 'v8', positions: [5]},
15 |       {word: 'JavaScript', normalizedWord: 'javascript', positions: [1, 6]},
16 |       {word: 'engine', normalizedWord: 'engin', positions: [7]},
17 |     ];
18 |     const res = await tokeneize('en', word);
19 |     for (const i in res) {
20 |       const [token, want] = [res[i], wants[i]];
21 |       expect(token.normalizedWord).toBe(want.normalizedWord);
22 |       expect(token.word).toBe(want.word);
23 |       expect(token.positions).toStrictEqual(want.positions);
24 |     }
25 |   });
26 | 
27 |   it('japanese', async () => {
28 |     const word =
29 |       'Node.js は、Chrome の V8 JavaScript エンジン で動作する JavaScript 環境です。';
30 |     const wants: Token[] = [
31 |       {word: 'Node', normalizedWord: 'node', positions: [0]},
32 |       {word: 'js', normalizedWord: 'js', positions: [1]},
33 |       {word: 'Chrome', normalizedWord: 'chrome', positions: [2]},
34 |       {word: 'V', normalizedWord: 'v', positions: [3]},
35 |       {word: '8', normalizedWord: '8', positions: [4]},
36 |       {
37 |         word: 'JavaScript',
38 |         normalizedWord: 'javascript',
39 |         positions: [5, 8],
40 |       },
41 |       {word: 'エンジン', normalizedWord: 'エンジン', positions: [6]},
42 |       {word: '動作', normalizedWord: '動作', positions: [7]},
43 |       {
44 |         word: 'JavaScript',
45 |         normalizedWord: 'javascript',
46 |         positions: [5, 8],
47 |       },
48 |       {word: '環境', normalizedWord: '環境', positions: [9]},
49 |     ];
50 | 
51 |     const res = await tokeneize('ja', word);
52 |     for (const i in res) {
53 |       const [token, want] = [res[i], wants[i]];
54 |       expect(token.normalizedWord).toBe(want.normalizedWord);
55 |       expect(token.word).toBe(want.word);
56 |       expect(token.positions).toStrictEqual(want.positions);
57 |     }
58 |   });
59 | });
60 | 


--------------------------------------------------------------------------------
/src/utils/firestore.ts:
--------------------------------------------------------------------------------
  1 | import type {
  2 |   Firestore,
  3 |   WriteBatch,
  4 |   DocumentReference,
  5 |   SetOptions,
  6 |   Precondition,
  7 |   WriteResult,
  8 |   Query,
  9 |   CollectionReference,
 10 |   FieldPath,
 11 | } from '@google-cloud/firestore';
 12 | 
 13 | export type WriteBatch2Options = {
 14 |   batch?: WriteBatch;
 15 | };
 16 | 
 17 | type WriteData<T> = WriteCreateData<T> | WriteSetData<T> | WriteDeleteData;
 18 | type WriteCreateData<T> = {
 19 |   type: 'create';
 20 |   data: Partial<T>;
 21 | };
 22 | type WriteSetData<T> = {
 23 |   type: 'set';
 24 |   data: Partial<T>;
 25 |   options?: SetOptions;
 26 | };
 27 | type WriteDeleteData = {
 28 |   type: 'delete';
 29 |   precondition?: Precondition;
 30 | };
 31 | 
 32 | // eslint-disable-next-line @typescript-eslint/no-explicit-any
 33 | function flatDeep(arr: Array<any>, d = 1): Array<any> {
 34 |   return d > 0
 35 |     ? arr.reduce(
 36 |         (acc, val) =>
 37 |           acc.concat(Array.isArray(val) ? flatDeep(val, d - 1) : val),
 38 |         []
 39 |       )
 40 |     : arr.slice();
 41 | }
 42 | 
 43 | // Split more than 500 document writes.
 44 | export class WriteBatch2 {
 45 |   #db: Firestore;
 46 |   #externalBatch: WriteBatch | null;
 47 |   #writeDocumentMap = new Map<DocumentReference, WriteData<unknown>>();
 48 |   #commited = false;
 49 | 
 50 |   constructor(db: Firestore, options?: WriteBatch2Options) {
 51 |     this.#db = db;
 52 |     this.#externalBatch = options?.batch ?? null;
 53 |     this.#commited = false;
 54 |   }
 55 | 
 56 |   create<T>(documentRef: DocumentReference<T>, data: T): WriteBatch2 {
 57 |     this.#writeDocumentMap.set(documentRef, {type: 'create', data});
 58 |     return this;
 59 |   }
 60 | 
 61 |   set<T>(
 62 |     documentRef: DocumentReference<T>,
 63 |     data: Partial<T>,
 64 |     options?: SetOptions
 65 |   ): WriteBatch2 {
 66 |     this.#writeDocumentMap.set(documentRef, {type: 'set', data, options});
 67 |     return this;
 68 |   }
 69 | 
 70 |   delete(
 71 |     // eslint-disable-next-line @typescript-eslint/no-explicit-any
 72 |     documentRef: DocumentReference<any>,
 73 |     precondition?: Precondition
 74 |   ): WriteBatch2 {
 75 |     this.#writeDocumentMap.set(documentRef, {type: 'delete', precondition});
 76 |     return this;
 77 |   }
 78 | 
 79 |   async commit(): Promise<WriteResult[]> {
 80 |     if (this.#commited) {
 81 |       throw new Error('commited');
 82 |     }
 83 |     this.#commited = true;
 84 |     const isSmallDocs = this.#writeDocumentMap.size <= 499;
 85 |     let currentBatch = isSmallDocs
 86 |       ? this.#externalBatch ?? this.#db.batch()
 87 |       : this.#db.batch();
 88 |     const batchs: WriteBatch[] = [currentBatch];
 89 |     let i = 0;
 90 |     for (const [ref, data] of this.#writeDocumentMap) {
 91 |       switch (data.type) {
 92 |         case 'create':
 93 |           currentBatch.create(ref, data.data);
 94 |           break;
 95 |         case 'set':
 96 |           if (data.options) {
 97 |             currentBatch.set(ref, data.data, data.options);
 98 |           } else {
 99 |             currentBatch.set(ref, data.data);
100 |           }
101 |           break;
102 |         case 'delete':
103 |           currentBatch.delete(ref, data.precondition);
104 |           break;
105 |       }
106 | 
107 |       if (i % 500 === 0) {
108 |         currentBatch = this.#db.batch();
109 |         batchs.push(currentBatch);
110 |       }
111 | 
112 |       i++;
113 |     }
114 | 
115 |     if (isSmallDocs && this.#externalBatch && batchs.length === 1) {
116 |       return [];
117 |     }
118 | 
119 |     if (isSmallDocs && this.#externalBatch) {
120 |       batchs.shift();
121 |     }
122 | 
123 |     const results = await Promise.all(batchs.map(batch => batch.commit()));
124 |     return flatDeep(results);
125 |   }
126 | }
127 | 
/**
 * Narrows `query` to documents whose `fieldPath` value starts with `value`,
 * ordered by `fieldPath`.
 *
 * The prefix match is expressed as the range `[value, upper)`, where `upper`
 * is `value` with its last UTF-16 code unit incremented by one.
 *
 * @param query Query or collection to narrow.
 * @param fieldPath Field to match against.
 * @param value Prefix to search for. An empty prefix applies only the lower
 *     bound (`>= ''`), because there is no last character to increment.
 * @return The narrowed, ordered query.
 */
export function startsWith(
  query: Query | CollectionReference,
  fieldPath: string | FieldPath,
  value: string
) {
  if (value.length === 0) {
    // Without this guard, charCodeAt(0) on '' is NaN and the upper bound
    // becomes '\u0000', which restricts the result to the empty string only.
    return query.where(fieldPath, '>=', value).orderBy(fieldPath);
  }

  const head = value.slice(0, value.length - 1);
  const lastCode = value.charCodeAt(value.length - 1);
  // NOTE(review): String.fromCharCode wraps modulo 0x10000, so a prefix ending
  // in '\uffff' still yields an upper bound ending in '\u0000' — not handled.
  const upper = `${head}${String.fromCharCode(lastCode + 1)}`;
  return query
    .where(fieldPath, '>=', value)
    .where(fieldPath, '<', upper)
    .orderBy(fieldPath);
}
141 | 


--------------------------------------------------------------------------------
/testdata/5.en.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "reference": "https://en.wikipedia.org/",
 3 |     "items": [
 4 |         {
 5 |             "title": "Dog",
 6 |             "description": "The dog (Canis familiaris when considered a distinct species or Canis lupus familiaris when considered a subspecies of the wolf) is a domesticated carnivore of the family Canidae. It is part of the wolf-like canids, and is the most widely abundant terrestrial carnivore. The dog and the extant gray wolf are sister taxa as modern wolves are not closely related to the wolves that were first domesticated, which implies that the direct ancestor of the dog is extinct. The dog was the first species to be domesticated, and has been selectively bred over millennia for various behaviors, sensory capabilities, and physical attributes.Their long association with humans has led dogs to be uniquely attuned to human behavior, and they can thrive on a starch-rich diet that would be inadequate for other canids. Dogs vary widely in shape, size, and colors. They perform many roles for humans, such as hunting, herding, pulling loads, protection, assisting police and military, companionship, and, more recently, aiding disabled people, and therapeutic roles. This influence on human society has given them the sobriquet of \"man's best friend.\""
 7 |         },
 8 |         {
 9 |             "title": "Cat",
10 |             "description": "The cat (Felis catus) is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae and is often referred to as the domestic cat to distinguish it from the wild members of the family. A cat can either be a house cat, a farm cat or a feral cat; the latter ranges freely and avoids human contact. Domestic cats are valued by humans for companionship and their ability to hunt rodents. About 60 cat breeds are recognized by various cat registries.The cat is similar in anatomy to the other felid species: it has a strong flexible body, quick reflexes, sharp teeth and retractable claws adapted to killing small prey. Its night vision and sense of smell are well developed. Cat communication includes vocalizations like meowing, purring, trilling, hissing, growling and grunting as well as cat-specific body language. A predator that is most active at dawn and dusk, the cat is a solitary hunter but a social species. It can hear sounds too faint or too high in frequency for human ears, such as those made by mice and other small mammals. It secretes and perceives pheromones.Female domestic cats can have kittens from spring to late autumn, with litter sizes often ranging from two to five kittens. Domestic cats are bred and shown at events as registered pedigreed cats, a hobby known as cat fancy. Failure to control breeding of pet cats by spaying and neutering, as well as abandonment of pets, resulted in large numbers of feral cats worldwide, contributing to the extinction of entire bird, mammal, and reptile species, and evoking population control.Cats were first domesticated in the Near East around 7500 BC. It was long thought that cat domestication was initiated in ancient Egypt, as since around 3100 BC veneration was given to cats in ancient Egypt. As of 2017, the domestic cat was the second-most popular pet in the United States, with 95 million cats owned. 
In the United Kingdom, around 7.3 million cats lived in more than 4.8 million households as of 2019."
11 |         },
12 |         {
13 |             "title": "Horse",
14 |             "description": "The horse (Equus ferus caballus) is one of two extant subspecies of Equus ferus. It is an odd-toed ungulate mammal belonging to the taxonomic family Equidae. The horse has evolved over the past 45 to 55 million years from a small multi-toed creature, Eohippus, into the large, single-toed animal of today. Humans began domesticating horses around 4000 BC, and their domestication is believed to have been widespread by 3000 BC. Horses in the subspecies caballus are domesticated, although some domesticated populations live in the wild as feral horses. These feral populations are not true wild horses, as this term is used to describe horses that have never been domesticated, such as the endangered Przewalski's horse, a separate subspecies, and the only remaining true wild horse. There is an extensive, specialized vocabulary used to describe equine-related concepts, covering everything from anatomy to life stages, size, colors, markings, breeds, locomotion, and behavior.\nHorses are adapted to run, allowing them to quickly escape predators, possessing an excellent sense of balance and a strong fight-or-flight response. Related to this need to flee from predators in the wild is an unusual trait: horses are able to sleep both standing up and lying down, with younger horses tending to sleep significantly more than adults. Female horses, called mares, carry their young for approximately 11 months, and a young horse, called a foal, can stand and run shortly following birth. Most domesticated horses begin training under a saddle or in a harness between the ages of two and four. 
They reach full adult development by age five, and have an average lifespan of between 25 and 30 years.\nHorse breeds are loosely divided into three categories based on general temperament: spirited \"hot bloods\" with speed and endurance; \"cold bloods\", such as draft horses and some ponies, suitable for slow, heavy work; and \"warmbloods\", developed from crosses between hot bloods and cold bloods, often focusing on creating breeds for specific riding purposes, particularly in Europe. There are more than 300 breeds of horse in the world today, developed for many different uses.\nHorses and humans interact in a wide variety of sport competitions and non-competitive recreational pursuits, as well as in working activities such as police work, agriculture, entertainment, and therapy. Horses were historically used in warfare, from which a wide variety of riding and driving techniques developed, using many different styles of equipment and methods of control. Many products are derived from horses, including meat, milk, hide, hair, bone, and pharmaceuticals extracted from the urine of pregnant mares. Humans provide domesticated horses with food, water, and shelter, as well as attention from specialists such as veterinarians and farriers.\n\n"
15 |         },
16 |         {
17 |             "title": "Bird",
18 |             "description": "Birds are a group of warm-blooded vertebrates constituting the class Aves , characterised by feathers, toothless beaked jaws, the laying of hard-shelled eggs, a high metabolic rate, a four-chambered heart, and a strong yet lightweight skeleton. Birds live worldwide and range in size from the 5.5 cm (2.2 in) bee hummingbird to the 2.8 m (9 ft 2 in) ostrich. There are about ten thousand living species, more than half of which are passerine, or \"perching\" birds. Birds have wings whose development varies according to species; the only known groups without wings are the extinct moa and elephant birds. Wings, which evolved from forelimbs, gave birds the ability to fly, although further evolution has led to the loss of flight in some birds, including ratites, penguins, and diverse endemic island species. The digestive and respiratory systems of birds are also uniquely adapted for flight. Some bird species of aquatic environments, particularly seabirds and some waterbirds, have further evolved for swimming.\nBirds are a group of feathered theropod dinosaurs, and constitute the only living dinosaurs. Likewise, birds are considered reptiles in the modern cladistic sense of the term, and their closest living relatives are the crocodilians. Birds are descendants of the primitive avialans (whose members include Archaeopteryx) which first appeared about 160 million years ago (mya) in China. According to DNA evidence, modern birds (Neornithes) evolved in the Middle to Late Cretaceous, and diversified dramatically around the time of the Cretaceous–Paleogene extinction event 66 mya, which killed off the pterosaurs and all non-avian dinosaurs.\nMany social species pass on knowledge across generations, which is considered a form of culture. Birds are social, communicating with visual signals, calls, and songs, and participating in such behaviours as cooperative breeding and hunting, flocking, and mobbing of predators. 
The vast majority of bird species are socially (but not necessarily sexually) monogamous, usually for one breeding season at a time, sometimes for years, but rarely for life. Other species have breeding systems that are polygynous (one male with many females) or, rarely, polyandrous (one female with many males). Birds produce offspring by laying eggs which are fertilised through sexual reproduction. They are usually laid in a nest and incubated by the parents. Most birds have an extended period of parental care after hatching.\nMany species of birds are economically important as food for human consumption and raw material in manufacturing, with domesticated and undomesticated birds being important sources of eggs, meat, and feathers. Songbirds, parrots, and other species are popular as pets. Guano (bird excrement) is harvested for use as a fertiliser. Birds figure throughout human culture. About 120 to 130 species have become extinct due to human activity since the 17th century, and hundreds more before then. Human activity threatens about 1,200 bird species with extinction, though efforts are underway to protect them. Recreational birdwatching is an important part of the ecotourism industry.\n\n"
19 |         },
20 |         {
21 |             "title": "Cattle",
22 |             "description": "Cattle, or cows (female) and bulls (male), are the most common type of large domesticated ungulates. They are a prominent modern member of the subfamily Bovinae, are the most widespread species of the genus Bos, and are most commonly classified collectively as Bos taurus.\nCattle are commonly raised as livestock for meat (beef or veal, see beef cattle), for milk (see dairy cattle), and for hides, which are used to make leather. They are used as riding animals and draft animals (oxen or bullocks, which pull carts, plows and other implements). Another product of cattle is their dung, which can be used to create manure or fuel. In some regions, such as parts of India, cattle have significant religious meaning. Cattle, mostly small breeds such as the Miniature Zebu, are also kept as pets.\nAround 10,500 years ago, cattle were domesticated from as few as 80 progenitors in central Anatolia, the Levant and Western Iran. According to the Food and Agriculture Organization (FAO), there are approximately 1.5 billion cattle in the world as of 2018. In 2009, cattle became one of the first livestock animals to have a fully mapped genome."
23 |         }
24 |     ]
25 | }


--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "extends": "./node_modules/gts/tsconfig-google.json",
 3 |   "compilerOptions": {
 4 |     "rootDir": "src",
 5 |     "outDir": "lib",
 6 |     "allowSyntheticDefaultImports": true,
 7 |     "esModuleInterop": true,
 8 |   },
 9 |   "include": [
10 |     "src/**/*.ts",
11 |     "test/**/*.ts"
12 |   ],
13 |   "exclude": [
14 |     "src/**/*.test.ts",
15 |     "src/**/*.spec.ts"
16 |   ]
17 | }


--------------------------------------------------------------------------------