├── LICENSE ├── README.md ├── webidx.js └── webidx.pl /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2024, Gavin Brown 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # webidx 2 | 3 | webidx is a client-side search engine for static websites. 4 | 5 | The search functionality is implemented in [webidx.js](webidx.js), which uses [sql.js](https://github.com/sql-js/sql.js) to provide an interface to an SQLite database. 6 | 7 | The database is generated by a simple Perl script ([webidx.pl](webidx.pl)) that should be run as part of the site build pipeline (eg after `jekyll build` or whatever). The SQLite database should then be published alongside the static content. 8 | 9 | You can see a live demo of it [here](https://gavinbrown.xyz/webidx-demo/). 10 | 11 | ## How to use it 12 | 13 | 1. use [webidx.pl](webidx.pl) to generate the index: 14 | 15 | ``` 16 | $ /path/to/webidx.pl -x index.html -x archives.html --xP secret_files -o https://example.com -z . ./index.db 17 | ``` 18 | 19 | You can run `webidx.pl --help` to see all the available command-line options. 20 | 21 | 2. Include [sql.js](https://cdnjs.com/libraries/sql.js), [pako](https://cdnjs.com/libraries/pako) and [webidx.js](webidx.js) in your web page: 22 | 23 | ```html 24 | 25 | 26 | 27 | ``` 28 | 29 | 3. Create a search form: 30 | 31 | ```html 32 |
35 | ``` 36 | 37 | When the user hits the return key in the search box, a modal dialog will pop up containing search results! 38 | 39 | The object that's passed to `window.webidx.search()` can have the following properties: 40 | 41 | * `dbfile`: URL of the SQLite database file 42 | * `query`: search query 43 | * `resultCallback`: a callback which is passed an array of search results. Each result is an object with the `title` and `url` properties. If not defined, a modal dialog will be displayed. 44 | * `errorCallback`: a callback which is passed any error string as an argument. 45 | * `titleSuffix`: a string to be removed from the end of page titles. 46 | * `titlePrefix`: a string to be removed from the beginning of page titles. 47 | -------------------------------------------------------------------------------- /webidx.js: -------------------------------------------------------------------------------- 1 | window.webidx = {}; 2 | const webidx = window.webidx; 3 | 4 | webidx.search = async function (params) { 5 | if (!webidx.sql) { 6 | // 7 | // initialise sql.js 8 | // 9 | webidx.sql = await window.initSqlJs({locateFile: file => `https://sql.js.org/dist/${file}`}); 10 | } 11 | 12 | if (webidx.hasOwnProperty("db")) { 13 | webidx.displayResults(webidx.query(params.query), params); 14 | 15 | } else { 16 | webidx.loadDB(params); 17 | 18 | } 19 | }; 20 | 21 | webidx.loadDB = function (params) { 22 | const xhr = new XMLHttpRequest(); 23 | 24 | xhr.open("GET", params.dbfile); 25 | xhr.timeout = params.timeout ?? 
5000; 26 | xhr.responseType = "arraybuffer"; 27 | 28 | xhr.ontimeout = function() { 29 | if (params.hasOwnProperty("errorCallback")) { 30 | params.errorCallback("Unable to load index, please refresh the page."); 31 | } 32 | }; 33 | 34 | xhr.onload = function() { 35 | webidx.initializeDB(this.response); 36 | const results = webidx.query(params.query); 37 | webidx.displayResults(results, params); 38 | }; 39 | 40 | xhr.send(); 41 | }; 42 | 43 | webidx.initializeDB = function (arrayBuffer) { 44 | webidx.db = new webidx.sql.Database(window.pako.inflate(new Uint8Array(arrayBuffer))); 45 | }; 46 | 47 | webidx.query = function (query) { 48 | // 49 | // search results 50 | // 51 | let pages = []; 52 | 53 | // 54 | // split the search term into words 55 | // 56 | const words = query.trim().toLowerCase().split(" "); 57 | 58 | let queryBuffer = []; 59 | for (var i = 0 ; i < words.length ; i++) { 60 | queryBuffer.push(`SELECT page_id,SUM(hits) AS hits FROM \`index\`,words WHERE (word_id=words.id AND word=:word${i}) GROUP BY page_id`); 61 | } 62 | 63 | const sth = webidx.db.prepare( 64 | "SELECT pages.*,page_id,SUM(hits) AS hits FROM (" 65 | + queryBuffer.join(" UNION ") 66 | + ") JOIN pages ON pages.id=page_id GROUP BY page_id ORDER BY hits DESC" 67 | ); 68 | 69 | sth.bind(words); 70 | 71 | while (sth.step()) { 72 | pages.push(sth.getAsObject()); 73 | } 74 | 75 | return pages; 76 | }; 77 | 78 | webidx.regExpQuote = function (str) { 79 | return str.replace(/[/\-\\^$*+?.()|[\]{}]/g, "\\$&"); 80 | }; 81 | 82 | webidx.displayResults = function (pages, params) { 83 | var callback = params.resultCallback ?? 
webidx.displayDialog; 84 | callback(pages, params); 85 | }; 86 | 87 | webidx.displayDialog = function (pages, params) { 88 | var dialog = document.createElement("dialog"); 89 | dialog.classList.add("webidx-results-dialog") 90 | 91 | dialog.appendChild(document.createElement("h2")).appendChild(document.createTextNode("Search Results")); 92 | 93 | if (pages.length < 1) { 94 | dialog.appendChild(document.createElement("p")).appendChild(document.createTextNode("Nothing found.")); 95 | 96 | } else { 97 | var ul = dialog.appendChild(document.createElement("ul")); 98 | 99 | pages.forEach(function(page) { 100 | var titleText = page.title; 101 | 102 | if (params.titleSuffix) { 103 | titleText = titleText.replace(new RegExp(webidx.regExpQuote(params.titleSuffix)+"$"), ""); 104 | } 105 | 106 | if (params.titlePrefix) { 107 | titleText = titleText.replace(new RegExp("^" + webidx.regExpQuote(params.titleSuffix)), ""); 108 | } 109 | 110 | var li = ul.appendChild(document.createElement("li")); 111 | var a = li.appendChild(document.createElement("a")); 112 | a.setAttribute("href", page.url); 113 | a.appendChild(document.createTextNode(titleText)); 114 | li.appendChild(document.createElement("br")); 115 | 116 | var span = li.appendChild(document.createElement("span")); 117 | span.classList.add("webidx-page-url"); 118 | span.appendChild(document.createTextNode(page.url)); 119 | }); 120 | } 121 | 122 | var form = dialog.appendChild(document.createElement("form")); 123 | form.setAttribute("method", "dialog"); 124 | 125 | var button = form.appendChild(document.createElement("button")); 126 | button.setAttribute("autofocus", true); 127 | button.appendChild(document.createTextNode("Close")); 128 | 129 | document.body.appendChild(dialog); 130 | 131 | dialog.addEventListener("close", function() { 132 | dialog.parentNode.removeChild(dialog); 133 | }); 134 | 135 | dialog.showModal(); 136 | dialog.scrollTop = 0; 137 | }; 138 | 
--------------------------------------------------------------------------------
/webidx.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
# webidx.pl - build an SQLite search index from a tree of static HTML files,
# for consumption by webidx.js.
use Cwd qw(abs_path);
use Getopt::Long qw(:config bundling auto_version auto_help);
use DBD::SQLite;
use DBI;
use File::Basename qw(basename);
use File::Glob qw(:bsd_glob);
use HTML::Parser;
use IPC::Open2;
use IO::File;
use List::Util qw(uniq none any);
use feature qw(say);
use open qw(:encoding(utf8));
use strict;
use utf8;
use vars qw($VERSION);

$VERSION = 0.02;

#
# parse command line options
#
my (@exclude, @excludePattern, $compress, $origin);
GetOptions(
    'exclude|x=s'         => \@exclude,
    'excludePattern|xP=s' => \@excludePattern,
    'compress|z'          => \$compress,
    'origin|o=s'          => \$origin,
) or die();

# resolve excluded files to absolute paths so they match scanned filenames
@exclude = map { abs_path($_) } @exclude;

#
# determine the source directory and the database filename
# (defaults: current directory, and webidx.db inside it)
#
my $dir = abs_path(shift(@ARGV) || '.');
my $dbfile = abs_path(shift(@ARGV) || $dir.'/webidx.db');

#
# initialise the database, replacing any previous index
#
if (-e $dbfile) {
    unlink($dbfile);
}
my $db = DBI->connect('dbi:SQLite:dbname='.$dbfile, '', '', {
    'PrintError' => 1,
    'RaiseError' => 1,
    'AutoCommit' => 0,
});

#
# a list of words we want to exclude
#
my @common = qw(be and of a in to it i for he she on do at but from that not by or as can who get if my as up so me the are we was is);

#
# this is a map of filename => page title
#
my $titles = {};

#
# this is a map of word => { page => hit count }
#
my $index = {};

#
# scan the source directory
#

say 'scanning ', $dir;

scan_directory($dir);

#
# generate the database
#

say 'finished scan, generating index';
$db->do(qq{BEGIN});

$db->do(qq{CREATE TABLE `pages` (`id` INTEGER PRIMARY KEY, `url` TEXT, `title` TEXT)});
$db->do(qq{CREATE TABLE `words` (`id` INTEGER PRIMARY KEY, `word` TEXT)});
$db->do(qq{CREATE TABLE `index` (`id` INTEGER PRIMARY KEY, `word_id` INT, `page_id` INT, `hits` INT)});

my $word_sth  = $db->prepare(qq{INSERT INTO `words` (`word`) VALUES (?)});
my $page_sth  = $db->prepare(qq{INSERT INTO `pages` (`url`, `title`) VALUES (?, ?)});
my $index_sth = $db->prepare(qq{INSERT INTO `index` (`word_id`, `page_id`, `hits`) VALUES (?, ?, ?)});

# caches of word => row id and page URL => row id, so each word/page is
# inserted at most once
my $word_ids = {};
my $page_ids = {};

#
# for each word...
#
foreach my $word (keys(%{$index})) {

    #
    # insert an entry into the words table (if one doesn't already exist)
    #
    if (!defined($word_ids->{$word})) {
        $word_sth->execute($word);
        # NOTE(review): argument-less last_insert_id requires DBI >= 1.642;
        # older DBIs need last_insert_id(undef, undef, undef, undef) - confirm
        # the minimum supported DBI version
        $word_ids->{$word} = $db->last_insert_id;
    }

    #
    # for each page...
    #
    foreach my $page (keys(%{$index->{$word}})) {
        my $hits = $index->{$word}->{$page};

        #
        # clean up the page title by removing leading and trailing whitespace.
        # BUG FIX: default to the empty string so pages with no <title> don't
        # insert NULL (which would crash page.title.replace() in webidx.js)
        #
        my $title = $titles->{$page} // '';
        $title =~ s/^[ \s\t\r\n]+//g;
        $title =~ s/[ \s\t\r\n]+$//g;

        #
        # remove the directory prefix to get the site-relative path.
        # BUG FIX: $dir must be quoted with \Q...\E, otherwise regex
        # metacharacters in the path (eg "c++", "site(1)") corrupt the match
        #
        $page =~ s/^\Q$dir\E//;

        #
        # prepend the origin
        #
        $page = $origin.$page if ($origin);

        #
        # insert an entry into the pages table (if one doesn't already exist)
        #
        if (!defined($page_ids->{$page})) {
            $page_sth->execute($page, $title);
            $page_ids->{$page} = $db->last_insert_id;
        }

        #
        # insert an index entry
        #
        $index_sth->execute($word_ids->{$word}, $page_ids->{$page}, $hits) || die();
    }
}

$db->do(qq{COMMIT});

$db->disconnect;

if ($compress) {
    say 'compressing database...';
open2(undef, undef, qw(gzip -f -9), $dbfile); 149 | } 150 | 151 | say 'done'; 152 | 153 | exit; 154 | 155 | # 156 | # reads the contents of a directory: all HTML files are indexed, all directories 157 | # are scanned recursively. symlinks to directories are *not* followed 158 | # 159 | sub scan_directory { 160 | my $dir = shift; 161 | 162 | foreach my $file (map { abs_path($_) } bsd_glob(sprintf('%s/*', $dir))) { 163 | if (-d $file) { 164 | 165 | next if (any { $file =~ m/\Q$_/i } @excludePattern); 166 | 167 | # 168 | # directory, scan it 169 | # 170 | scan_directory($file); 171 | 172 | } elsif ($file =~ /\.html?$/i) { 173 | # 174 | # HTML file, index it 175 | # 176 | index_html($file); 177 | 178 | } 179 | } 180 | } 181 | 182 | # 183 | # index an HTML file 184 | # 185 | sub index_html { 186 | my $file = shift; 187 | 188 | return if (any { $_ eq $file } @exclude) || (any { $file =~ m/\Q$_/i } @excludePattern); 189 | 190 | my $currtag; 191 | my $text; 192 | my $noindex; 193 | my $parser = HTML::Parser->new( 194 | # 195 | # text handler 196 | # 197 | 'text_h' => [sub { 198 | if ('title' eq $currtag) { 199 | # 200 | #