13 |
20 | libsearch is the core text search algorithm that I've polished and
21 | reused over the years across many of my personal
22 | projects for fast and simple full-text
23 | search, packaged into a small single-purpose JavaScript library.
24 |
For how libsearch works, how to import and use it in your own project, and
25 | canonical documentation, check out the GitHub repository
26 | page.
27 |
9
28 |
29 | To turn every potential query into a regular expression, we need to be able
30 | to escape characters that are significant in RegExp.
31 |
/**
 * Escapes every character in `text` that is significant in a RegExp so that
 * the result can be embedded in a pattern and match `text` literally.
 *
 * @param text - Raw text (e.g. one word of a search query).
 * @returns `text` with each RegExp metacharacter prefixed by a backslash.
 */
function escapeForRegExp(text: string): string {
    // `$&` inserts the matched character itself. The pattern has no capture
    // group, so `$1` would be emitted literally and corrupt the expression.
    return text.replace(/[.*+?^${}[\]()|\\]/g, '\\$&');
}
34 |
35 | Utility function for sorting an array by some predicate, rather than a
36 | comparator function. This implementation assumes by(it) is very cheap.
37 |
18function sortBy<T>(items: T[], by: (_it: T) => any): T[] {
38 | 19 return items.sort((a, b) => {
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 | The search function takes:
52 |
53 | items, the list of items to search
54 | query, the search query text
55 | by, which is a predicate function that takes an item from the items
56 | list and returns the string that should be matched with the query
57 | options, a dictionary of options:
58 |
59 |
Options include
60 |
61 | caseSensitive, whether matching is case-sensitive (false by default)
62 | mode, which is 'word', 'prefix', or 'autocomplete' ('autocomplete' by
63 | default), determining the way in which partial matches are processed
64 |
65 |
43export function search<T>(items: T[], query: string, by: (_it: T) => string = x => String(x), {
66 | 44 caseSensitive = false,
67 | 45 mode = 'autocomplete',
68 |
69 | 47 caseSensitive?: boolean;
70 | 48 mode?: 'word' | 'prefix' | 'autocomplete';
71 |
72 | countMatches counts the number of times regexp occurs in the string
73 | s. We need this information for ranking, where documents that mention
74 | the keyword more times (relative to the length of the document's
75 | text) are ranked higher.
76 |
54 function countMatches(s: string, regexp: RegExp): number {
77 |
78 | 56 while (regexp.exec(s) !== null) {
79 |
80 |
81 |
82 |
83 |
84 | We chunk up the query string into a list of "words", each of which will
85 | become a regular expression filter.
86 |
64 const words = query
87 |
88 |
89 | 67 .filter(s => s !== '');
90 |
91 | Short-circuit if the search query is empty -- return the original list.
92 | This is a sensible default because in most apps this corresponds to the
93 | "home view" of the list, where a search has not been performed.
94 |
72 if (words.length === 0) {
95 |
96 |
97 |
98 | For every word in the search query, we're going to keep track of every
99 | document's TF-IDF value in this map, and aggregate them together by the
100 | end for sorting.
101 |
79 const tfidf = new Map<T, number>();
102 |
103 | Iterate through every word in the query and progressively filter down
104 | items to just the documents that match every query word.
105 |
83 const results = words.reduce((results, word, i) => {
106 | 84 const isLastWord = i + 1 === words.length;
107 | 85 const regexp = new RegExp(
108 |
109 | 87 + escapeForRegExp(word)
110 | 88 + ((mode === 'autocomplete' && isLastWord) || mode === 'prefix' ? '' : '($|\\W)'),
111 | The 'u' flag for Unicode used to be used here, but was removed
112 | because it was (1) across-the-board too slow, and removing it
113 | made a statistically significant speed improvement, and (2)
114 | caused at least Chrome to have strange performance cliffs in
115 | unpredictable ways where certain RegExp operations would take
116 | 10s of ms.
117 |
95 caseSensitive ? 'mg' : 'img'
118 |
119 | 97 return results.filter(result => {
120 | 98 const text = by(result);
121 | 99 const count = countMatches(text, regexp);
122 |
123 |
124 |
125 | Compute the TF-IDF value for this word, and add it to this
126 | result's TF-IDF value so far.
127 |
105 tfidf.set(
128 |
129 | 107 (tfidf.get(result) || 0)
130 | 108 + (count / text.length * Math.log(items.length / results.length))
131 |
132 |
133 |
134 |
135 |
136 | Sort the results list by our ranking metric, TF-IDF.
137 |
115 return sortBy(results, result => tfidf.get(result));
138 |
139 |
140 |
141 |
142 |
13 |
20 | 1import {strict as assert} from 'node:assert';
21 | 2import {search} from '../dist/search.js';
22 |
23 | 4const item = name => ({name});
24 |
25 | Most of the tests operate on this pre-set list of items to search
26 |
7const ITEMS = [
27 |
28 |
29 |
30 |
31 | 12 item('linus is a person'),
32 |
33 |
34 |
35 | 16describe('basic search', () => {
36 | 17 it('search empty array', () => {
37 | 18 assert.deepEqual(search([], 'query', x => x.name), []);
38 |
39 |
40 | 21 it('search with empty query', () => {
41 | 22 assert.deepEqual(search(ITEMS, '', x => x.name), ITEMS);
42 |
43 |
44 | 25 it('search with 1 letter returns correct result', () => {
45 | 26 assert.deepEqual(search(ITEMS, 'l', x => x.name), [
46 |
47 |
48 | 29 item('linus is a person'),
49 |
50 |
51 |
52 | 33 it('search does not match from middle of words', () => {
53 | 34 assert.deepEqual(search(ITEMS, 'w', x => x.name), []);
54 |
55 |
56 | 37 it('multi-word search returns correct result', () => {
57 | 38 assert.deepEqual(search(ITEMS, 'linus lee', x => x.name), [
58 |
59 |
60 |
61 |
62 | 43 it('searching words out of order returns correct result', () => {
63 | 44 assert.deepEqual(search(ITEMS, 'lee linus', x => x.name), [
64 |
65 |
66 |
67 |
68 | 49 it('search works even if the last query word is incomplete', () => {
69 | 50 assert.deepEqual(search(ITEMS, 'linus le', x => x.name), [
70 |
71 |
72 |
73 |
74 | 55 it('search query may contain newlines, tabs, and multiple consecutive spaces', () => {
75 | 56 assert.deepEqual(search(ITEMS, ' linus\t is\nperson\t', x => x.name), [
76 | 57 item('linus is a person'),
77 |
78 |
79 |
80 | 61 it('correctly implements TF-IDF ranking', () => {
81 | In this example, "mango" has much higher IDF (is a higher-signal
82 | word) in the corpus than "apple", which appears in nearly every
83 | document. Therefore, documents that mention "mango" more times
84 | (relative to the length of the document) should rank higher.
85 |
66 assert.deepEqual(
86 |
87 |
88 | 69 item('mango mango mango apple'),
89 | 70 item('mango apple mango apple'),
90 | 71 item('apple mango apple mango apple mango apple mango'),
91 | 72 item('apple apple apple apple apple apple apple apple mango'),
92 |
93 | 74 item('apple apple apple'),
94 | 75 item('mango mango mango'),
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 | 87 ], 'apple mango', x => x.name),
107 |
108 | 89 item('mango mango mango apple'),
109 | 90 item('mango apple mango apple'),
110 | 91 item('apple mango apple mango apple mango apple mango'),
111 | 92 item('apple apple apple apple apple apple apple apple mango'),
112 |
113 |
114 |
115 |
116 |
117 | 98describe('custom search-by predicates', () => {
118 | 99 it('default predicate is provided as x => x', () => {
119 |
120 |
121 |
122 |
123 | 104 'university of california',
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 | 112 it('accepts and uses a custom predicate', () => {
132 | 113 assert.deepEqual(search(ITEMS, 'sunil ee', x => x.name.split('').reverse().join('')), [
133 |
134 |
135 |
136 |
137 |
138 | 119describe('search modes', () => {
139 | 120 it('in mode: word, search does not match if any words are incomplete', () => {
140 | 121 assert.deepEqual(search(ITEMS, 'linu lee', x => x.name, {mode: 'word'}), []);
141 |
142 |
143 | 124 it('in mode: prefix, every query word may be incomplete', () => {
144 | 125 assert.deepEqual(search(ITEMS, 'linu le', x => x.name, {mode: 'prefix'}), [
145 |
146 |
147 |
148 |
149 | 130 it('in mode: autocomplete, only the last query word may be incomplete', () => {
150 | 131 assert.deepEqual(search(ITEMS, 'linus le', x => x.name, {mode: 'autocomplete'}), [
151 |
152 |
153 | 134 assert.deepEqual(search(ITEMS, 'linu le', x => x.name, {mode: 'autocomplete'}), []);
154 |
155 |
156 |
157 | 138describe('case sensitivity', () => {
158 | 139 it('caseSensitive: true omits non-matching results', () => {
159 | 140 assert.deepEqual(search(ITEMS, 'l', x => x.name, {caseSensitive: true}), [
160 |
161 | 142 item('linus is a person'),
162 |
163 |
164 |
165 | 146 it('caseSensitive: false includes case-insensitive results', () => {
166 | 147 assert.deepEqual(search(ITEMS, 'l', x => x.name, {caseSensitive: false}), [
167 |
168 |
169 | 150 item('linus is a person'),
170 |
171 |
172 |
173 |
174 |
175 |
176 |