├── .changeset ├── README.md └── config.json ├── .gitignore ├── .vscode └── settings.json ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── app ├── cdk.context.json ├── cdk.json ├── package.json └── src │ ├── app.ts │ └── test-data-stack │ └── index.ts ├── cspell.json ├── data.json ├── doc ├── api.md ├── diagram.png └── index-config.md ├── examples └── getting-started │ ├── README.md │ ├── cdk.json │ ├── package.json │ ├── src │ └── app.ts │ └── tsconfig.json ├── integration-test ├── package.json └── src │ └── index-test-data.ts ├── package.json ├── packages ├── pathery-cdk │ ├── CHANGELOG.md │ ├── README.md │ ├── package.json │ ├── src │ │ ├── config.ts │ │ ├── index.ts │ │ ├── pathery-dashboard.ts │ │ ├── pathery-stack.ts │ │ └── rust-function.ts │ └── tsconfig.json └── pathery │ ├── Cargo.toml │ └── src │ ├── bin │ ├── async-delete-worker.rs │ ├── batch-index.rs │ ├── delete-doc.rs │ ├── index-writer-worker.rs │ ├── post-index.rs │ ├── query-index-partition-fn.rs │ ├── query-index.rs │ └── stats-index.rs │ ├── directory.rs │ ├── function │ ├── mod.rs │ └── query_index_partition │ │ ├── client.rs │ │ └── mod.rs │ ├── index.rs │ ├── lambda │ ├── mod.rs │ └── sqs.rs │ ├── lib.rs │ ├── pagination.rs │ ├── schema.rs │ ├── search_doc.rs │ ├── serialize │ ├── compressed_json.rs │ └── mod.rs │ ├── service │ ├── doc.rs │ ├── index │ │ ├── batch_index.rs │ │ ├── mod.rs │ │ ├── post_index.rs │ │ ├── query_index.rs │ │ └── stats_index.rs │ └── mod.rs │ ├── store │ ├── document.rs │ └── mod.rs │ ├── util.rs │ └── worker │ ├── async_delete │ ├── client.rs │ ├── job.rs │ └── mod.rs │ ├── index_writer │ ├── client.rs │ ├── job.rs │ └── mod.rs │ └── mod.rs ├── pnpm-lock.yaml ├── pnpm-workspace.yaml ├── rustfmt.toml └── turbo.json /.changeset/README.md: -------------------------------------------------------------------------------- 1 | # Changesets 2 | 3 | Hello and welcome! This folder has been automatically generated by `@changesets/cli`, a build tool that works 4 | with multi-package repos, or single-package repos to help you version and publish your code. 
You can 5 | find the full documentation for it [in our repository](https://github.com/changesets/changesets) 6 | 7 | We have a quick list of common questions to get you started engaging with this project in 8 | [our documentation](https://github.com/changesets/changesets/blob/main/docs/common-questions.md) 9 | -------------------------------------------------------------------------------- /.changeset/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://unpkg.com/@changesets/config@2.2.0/schema.json", 3 | "changelog": "@changesets/cli/changelog", 4 | "commit": true, 5 | "fixed": [], 6 | "linked": [], 7 | "access": "public", 8 | "baseBranch": "main", 9 | "updateInternalDependencies": "patch", 10 | "ignore": ["app", "integration-test"] 11 | } 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dev-env.json 3 | 4 | .turbo 5 | .pathery 6 | 7 | cdk.out 8 | target 9 | cdk-outputs.json 10 | lib 11 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.formatOnSave": true, 3 | "rust-analyzer.rustfmt.extraArgs": ["+nightly"] 4 | } 5 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "packages/pathery" 4 | ] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Tyler van Hensbergen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pathery :fire: Serverless Search :fire: 2 | 3 | [![npm version](https://badge.fury.io/js/@pathery%2Fcdk.svg)](https://badge.fury.io/js/@pathery%2Fcdk) 4 | 5 | Pathery is a **serverless search service** built on AWS using Rust, CDK and [Tantivy][tantivy]. It uses AWS managed serverless offerings – DynamoDB, EFS, Lambda, SQS, and API Gateway – to the maximum extent possible. 
6 | 7 | **:bell: WARNING:** This is currently a work in progress and not ready for production usage. 8 | 9 | ## Features 10 | 11 | - **🔥 Fast full-text search**. Built on Rust to limit AWS Lambda cold start overhead. 12 | - **🥰 Simple REST API**. A [simple REST API][api-docs] to make search as easy as possible. 13 | - **👍 Easy to install**. Ships as a CDK Component, making it easy to [get started][get-started]. 14 | - **💵 Usage based infra**. No long running servers, only pay for what you use. 15 | - **🔼 Built for AWS**. Leans on AWS managed services to limit maintenance burden and maximize scalability. 16 | - Document store: [DynamoDB](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Introduction.html) 17 | - Index store: [Elastic File System (EFS)](https://docs.aws.amazon.com/efs/latest/ug/whatisefs.html) 18 | - Index writer & handler: [Lambda](https://docs.aws.amazon.com/lambda/latest/dg/welcome.html) 19 | - Index queue: [SQS](https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/welcome.html) 20 | - API: [API Gateway](https://docs.aws.amazon.com/apigateway/latest/developerguide/welcome.html) 21 | 22 | ## Getting Started 23 | 24 | Check out the [getting started guide][get-started] to deploy Pathery into your AWS account using CDK. 25 | 26 | [tantivy]: https://github.com/quickwit-oss/tantivy 27 | [get-started]: ./examples/getting-started/ 28 | [api-docs]: ./doc/api.md 29 | 30 | ## Architecture 31 | 32 | Follow along with the Dev Log: 33 | 34 | - [Pathery Dev Log #1: Performant Serverless Queries Without a Cluster](https://tvanhens.substack.com/p/pathery-dev-log-1-performant-serverless) 35 | - [Pathery Dev Log #2: Indexing and the Document Store](https://tvanhens.substack.com/p/pathery-dev-log-2-indexing-and-the) 36 | 37 | ![diagram](/doc/diagram.png) 38 | -------------------------------------------------------------------------------- /app/cdk.context.json: -------------------------------------------------------------------------------- 1 | { 2 | "availability-zones:account=117773642559:region=us-east-1": [ 3 | "us-east-1a", 4 | "us-east-1b", 5 | "us-east-1c", 6 | "us-east-1d", 7 | "us-east-1e", 8 | "us-east-1f" 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /app/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "ts-node --swc src/app.ts", 3 | "profile": "pathery-dev" 4 | } 5 | -------------------------------------------------------------------------------- /app/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "app", 3 | "private": true, 4 | "version": "0.0.0", 5 | "description": "", 6 | "main": "index.js", 7 | "scripts": { 8 | "build": "cdk synth", 9 | "deploy": "cdk deploy --all -O cdk-outputs.json --require-approval never --method direct" 10 | }, 11 | "dependencies": { 12 | "@pathery/cdk": "workspace:*", 13 | "aws-sdk": "^2.1262.0", 14 | "axios": "^1.2.0", 15 | "esbuild": "^0.15.16" 16 | }, 17 | "keywords": [], 18 | "author": "", 19 | "license": "ISC" 20 | } 21 | -------------------------------------------------------------------------------- /app/src/app.ts: -------------------------------------------------------------------------------- 1 | import { App } from "aws-cdk-lib"; 2 | import { PatheryStack } from "@pathery/cdk"; 3 | import { TestDataStack } from "./test-data-stack"; 4 | 5 | const app = new App(); 6 | 7 | const pathery = new PatheryStack(app, "pathery-dev", { 8 | 
config: { 9 | indexes: [ 10 | { 11 | prefix: "test-index-v1", 12 | fields: [ 13 | { 14 | name: "author", 15 | flags: ["TEXT"], 16 | kind: "text", 17 | }, 18 | { 19 | name: "song", 20 | flags: ["TEXT"], 21 | kind: "text", 22 | }, 23 | { 24 | name: "genre", 25 | flags: ["STRING"], 26 | kind: "text", 27 | }, 28 | { 29 | name: "releaseDate", 30 | flags: ["INDEXED"], 31 | kind: "i64", 32 | }, 33 | ], 34 | }, 35 | ], 36 | }, 37 | }); 38 | 39 | new TestDataStack(app, "pathery-test-data"); 40 | -------------------------------------------------------------------------------- /app/src/test-data-stack/index.ts: -------------------------------------------------------------------------------- 1 | import { CfnOutput, Stack } from "aws-cdk-lib"; 2 | import { Bucket } from "aws-cdk-lib/aws-s3"; 3 | import { Construct } from "constructs"; 4 | 5 | export class TestDataStack extends Stack { 6 | constructor(scope: Construct, id: string) { 7 | super(scope, id); 8 | 9 | const dataBucket = new Bucket(this, "DataBucket"); 10 | 11 | new CfnOutput(this, "DataBucketName", { 12 | value: dataBucket.bucketName, 13 | }); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /cspell.json: -------------------------------------------------------------------------------- 1 | // cSpell Settings 2 | { 3 | "version": "0.2", 4 | "language": "en", 5 | "words": [ 6 | "chrono", 7 | "Hensbergen", 8 | "Mmap", 9 | "Pathery", 10 | "Pirsig", 11 | "Runtimes", 12 | "upsert", 13 | "thiserror" 14 | ], 15 | "flagWords": [], 16 | "ignorePaths": ["**/node_modules/**", "target/**", "lib/**"] 17 | } 18 | -------------------------------------------------------------------------------- /data.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "title": "Zen and the Art of Motorcycle Maintenance", 4 | "author": "Robert Pirsig" 5 | }, 6 | { 7 | "title": "One Flew Over the Cuckoo's Nest", 8 | "author": "Ken Kesey" 9 | } 10 | ] 11 | -------------------------------------------------------------------------------- /doc/api.md: -------------------------------------------------------------------------------- 1 | # API Docs 2 | 3 | ## General 4 | 5 | The base url is the url of the API gateway and is emitted on installation via CDK. 6 | 7 | **Example** 8 | 9 | ``` 10 | https://.execute-api.us-east-1.amazonaws.com/prod 11 | ``` 12 | 13 | ## Index Operations 14 | 15 | ### Index a Document 16 | 17 | `POST /index/{index_id}` 18 | 19 | Indexes a document so that the document is searchable. 20 | A document can optionally provide an `__id` field to set the document id. 21 | If no `__id` is provided one is generated and returned. 22 | Indexing a document with an `__id` will upsert any previously indexed data with the provided `__id`. 23 | 24 | #### Parameters 25 | 26 | - `__id` - (optional) the document id to use for the document 27 | 28 | #### Examples 29 | 30 | **Basic Indexing** 31 | 32 | Request: 33 | 34 | ```bash 35 | http https://.execute-api.us-east-1.amazonaws.com/prod/index/book-index-1 \ 36 | author="Robert M. Pirsig" \ 37 | title="Zen and the Art of Motorcycle Maintenance" 38 | ``` 39 | 40 | Response: 41 | 42 | ```json 43 | { 44 | "__id": "b7c8aee4-9656-47a3-8217-df1b71056a83", 45 | "updated_at": "2022-11-14T21:17:58.824791120+00:00" 46 | } 47 | ``` 48 | 49 | **Providing an `\_\_id`** 50 | 51 | Request: 52 | 53 | ```bash 54 | http https://.execute-api.us-east-1.amazonaws.com/prod/index/book-index-1 \ 55 | author="Robert M. 
Pirsig" \ 56 | title="Zen and the Art of Motorcycle Maintenance" \ 57 | __id=zen 58 | ``` 59 | 60 | Response: 61 | 62 | ```json 63 | { 64 | "__id": "zen", 65 | "updated_at": "2022-11-14T21:17:58.824791120+00:00" 66 | } 67 | ``` 68 | 69 | ### Query a Document 70 | 71 | `POST /index/{index_id}/query` 72 | 73 | Query an index with a provided search string. 74 | 75 | #### Parameters 76 | 77 | - `query` - a query string to search against the index 78 | 79 | #### Examples 80 | 81 | **Simple Full Text Search** 82 | 83 | Request: 84 | 85 | ```bash 86 | http https://.execute-api.us-east-1.amazonaws.com/prod/index/book-index-1/query \ 87 | query="zen art" 88 | ``` 89 | 90 | Response: 91 | 92 | ```json 93 | { 94 | "matches": [ 95 | { 96 | "doc": { 97 | "__id": "ebf5c0a0-ca14-4471-bc21-5259d7898df3", 98 | "title": "Zen and the Art of Motorcycle Maintenance" 99 | }, 100 | "score": 0.57536423, 101 | "snippets": { 102 | "title": "Zen and the Art of Motorcycle Maintenance" 103 | } 104 | } 105 | ] 106 | } 107 | ``` 108 | 109 | ### Delete a Document 110 | 111 | `DELETE /index/{index_id}/doc/{doc_id}` 112 | 113 | Delete a document from an index such that it is no longer searchable. 114 | 115 | #### Examples 116 | 117 | **Simple Full Text Search** 118 | 119 | Request: 120 | 121 | ```bash 122 | http DELETE https://.execute-api.us-east-1.amazonaws.com/prod/index/book-index-1/doc/b7c8aee4-9656-47a3-8217-df1b71056a83 123 | ``` 124 | 125 | Response: 126 | 127 | ```json 128 | { 129 | "__id": "b7c8aee4-9656-47a3-8217-df1b71056a83", 130 | "deleted_at": "2022-11-14T21:30:04.845814727+00:00" 131 | } 132 | ``` 133 | -------------------------------------------------------------------------------- /doc/diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvanhens/pathery/f3781d37df2c71f1debfcd5a68afa5a5a899f65b/doc/diagram.png -------------------------------------------------------------------------------- /doc/index-config.md: -------------------------------------------------------------------------------- 1 | # Coming Soon 2 | -------------------------------------------------------------------------------- /examples/getting-started/README.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | This guide will walk you through: 4 | 5 | 1. Project Setup 6 | 1. Pathery Deployment 7 | 1. Writing documents to an index 8 | 1. Querying an index 9 | 10 | ## Project Setup 11 | 12 | Pathery ships as a CDK Construct and requires TypeScript and AWS CDK to be installed. 13 | The minimum set of dependencies is shown below: 14 | 15 | **package.json** 16 | 17 | ```json 18 | { 19 | "name": "getting-started", 20 | "version": "0.0.0", 21 | "description": "", 22 | "main": "index.js", 23 | "scripts": { 24 | "deploy": "cdk deploy" 25 | }, 26 | "keywords": [], 27 | "author": "", 28 | "license": "ISC", 29 | "dependencies": { 30 | "@pathery/cdk": "^0.0.4", 31 | "@swc/core": "^1.3.14", 32 | "@types/node": "^18.11.9", 33 | "aws-cdk": "^2.50.0", 34 | "aws-cdk-lib": "^2.50.0", 35 | "constructs": "^10.1.155", 36 | "ts-node": "^10.9.1", 37 | "typescript": "^4.8.4" 38 | } 39 | } 40 | ``` 41 | 42 | Running `npm install` will install the required dependencies. 43 | Next, you can configure your first index pattern in `src/app.ts`. 44 | Index patterns define the field configuration for indexes that start with the given prefix. 
45 | 46 | In the example below, any index that starts with the name `book-index-v1-` will have the fields `author` and `title` indexed. 47 | You can read more about index configuration in the [index configuration guide][index-config]. 48 | 49 | **src/app.ts** 50 | 51 | ```typescript 52 | import { App } from "aws-cdk-lib"; 53 | import { PatheryStack } from "@pathery/cdk"; 54 | 55 | const app = new App(); 56 | 57 | new PatheryStack(app, "pathery-dev", { 58 | config: { 59 | indexes: [ 60 | { 61 | // Indexes starting with this prefix will use this config 62 | prefix: "book-index-v1-", 63 | fields: [ 64 | { 65 | // Index the field title 66 | name: "title", 67 | flags: ["STORED", "TEXT"], 68 | kind: "text", 69 | }, 70 | { 71 | // Index the field author 72 | name: "author", 73 | flags: ["STORED", "TEXT"], 74 | kind: "text", 75 | }, 76 | ], 77 | }, 78 | ], 79 | }, 80 | }); 81 | ``` 82 | 83 | Lastly, CDK needs to know where our CDK app is declared, so we include a `cdk.json`: 84 | 85 | **cdk.json** 86 | 87 | ```json 88 | { 89 | "app": "ts-node --swc src/app.ts" 90 | } 91 | ``` 92 | 93 | This is the minimum amount of setup required. Now we can deploy our Pathery search service. 94 | 95 | ## Deployment 96 | 97 | To deploy the project, run `npm run deploy`. 98 | 99 | If everything worked, you should see an output that looks like the one below: 100 | 101 | ```bash 102 | ✅ pathery-dev 103 | 104 | ✨ Deployment time: 55.94s 105 | 106 | Outputs: 107 | arn:aws:cloudformation:us-east-1:117773642559:stack/pathery-dev/f1c49c40-60b3-11ed-b19f-0e7f8a5bfcb7 108 | pathery-dev.ApiKeyOutput = 109 | pathery-dev.PatheryApiEndpointB5297505 = https://.execute-api.us-east-1.amazonaws.com/prod/ 110 | Stack ARN: 111 | 112 | ✨ Total time: 58.13s 113 | ``` 114 | 115 | Note the output called `pathery-dev.PatheryApiEndpointB5297505`; this is the URL to your search API. 116 | Let's save it to your shell environment for the next step by running: 117 | 118 | ```bash 119 | export PATHERY_ENDPOINT= 120 | ``` 121 | 122 | This endpoint is authenticated using an API key that gets automatically generated. 123 | Copy the ID on the right-hand side of the output `.ApiKeyOutput = ` and paste it into the line below for ``: 124 | 125 | ```bash 126 | export PATHERY_KEY="$(aws apigateway get-api-key --include-value --api-key --query value --output text)" 127 | ``` 128 | 129 | [index-config]: ../../doc/index-config.md 130 | 131 | ## Indexing a Document 132 | 133 | To index an example document, run: 134 | 135 | ```bash 136 | curl -X POST ${PATHERY_ENDPOINT}index/book-index-v1-test \ 137 | -H 'Content-Type: application/json' \ 138 | -H "x-api-key: ${PATHERY_KEY}" \ 139 | -d '{"title": "Zen and the Art of Motorcycle Maintenance", "author": "Robert Pirsig"}' 140 | ``` 141 | 142 | > **❕ Note** 143 | > 144 | > Our index name is `book-index-v1-test`. 145 | > The `book-index-v1-` prefix is required so that the index name matches the prefix in our configuration. 146 | > 147 | > **If you try to post to an index which does not match a configuration prefix, the request will fail.** 148 | 149 | If indexing is successful, you should see: 150 | 151 | ```json 152 | { 153 | "__id": "7a309cda-1314-4e0a-a97d-02ce2c5e24c7", 154 | "updated_at": "2022-11-17T17:49:28.835542383+00:00" 155 | } 156 | ``` 157 | 158 | Now we're ready to query our index.
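If you have many documents, the API also exposes a batch endpoint at `POST /index/{index_id}/batch`. A minimal sketch, assuming the batch endpoint accepts a JSON array of documents in the request body (this mirrors how this repository's integration test uploads batches; both book entries are just sample payloads):

```bash
# Index two sample documents in a single request.
curl -X POST ${PATHERY_ENDPOINT}index/book-index-v1-test/batch \
  -H 'Content-Type: application/json' \
  -H "x-api-key: ${PATHERY_KEY}" \
  -d '[{"title": "Zen and the Art of Motorcycle Maintenance", "author": "Robert Pirsig"}, {"title": "Lila: An Inquiry into Morals", "author": "Robert Pirsig"}]'
```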
159 | 160 | ## Querying an Index 161 | 162 | To query our index we can use a request like the one below: 163 | 164 | ```bash 165 | curl -X POST ${PATHERY_ENDPOINT}index/book-index-v1-test/query \ 166 | -H 'Content-Type: application/json' \ 167 | -H "x-api-key: ${PATHERY_KEY}" \ 168 | -d '{"query": "zen art pirsig"}' 169 | ``` 170 | 171 | You should see a response like the one below, note the matching search terms are highlighted in the `snippets` of the response: 172 | 173 | ```json 174 | { 175 | "matches": [ 176 | { 177 | "doc": { 178 | "__id": "7a309cda-1314-4e0a-a97d-02ce2c5e24c7", 179 | "author": "Robert Pirsig", 180 | "title": "Zen and the Art of Motorcycle Maintenance" 181 | }, 182 | "snippets": { 183 | "title": "Zen and the Art of Motorcycle Maintenance", 184 | "author": "Robert Pirsig" 185 | }, 186 | "score": 0.86304635 187 | } 188 | ] 189 | } 190 | ``` 191 | -------------------------------------------------------------------------------- /examples/getting-started/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "ts-node --swc src/app.ts" 3 | } 4 | -------------------------------------------------------------------------------- /examples/getting-started/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "getting-started", 3 | "version": "0.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "deploy": "cdk deploy" 8 | }, 9 | "keywords": [], 10 | "author": "", 11 | "license": "ISC", 12 | "dependencies": { 13 | "@pathery/cdk": "^0.0.4", 14 | "@swc/core": "^1.3.14", 15 | "@types/node": "^18.11.9", 16 | "aws-cdk": "^2.50.0", 17 | "aws-cdk-lib": "^2.50.0", 18 | "constructs": "^10.1.155", 19 | "ts-node": "^10.9.1", 20 | "typescript": "^4.8.4" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /examples/getting-started/src/app.ts: -------------------------------------------------------------------------------- 1 | import { App } from "aws-cdk-lib"; 2 | import { PatheryStack } from "@pathery/cdk"; 3 | 4 | const app = new App(); 5 | 6 | new PatheryStack(app, "pathery-dev", { 7 | config: { 8 | indexes: [ 9 | { 10 | prefix: "book-index-v1-", 11 | fields: [ 12 | { 13 | name: "title", 14 | flags: ["STORED", "TEXT"], 15 | kind: "text", 16 | }, 17 | { 18 | name: "author", 19 | flags: ["STORED", "TEXT"], 20 | kind: "text", 21 | }, 22 | ], 23 | }, 24 | ], 25 | }, 26 | }); 27 | -------------------------------------------------------------------------------- /examples/getting-started/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | /* Visit https://aka.ms/tsconfig to read more about this file */ 4 | 5 | /* Projects */ 6 | // "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */ 7 | // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */ 8 | // "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */ 9 | // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */ 10 | // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. 
*/ 11 | // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */ 12 | 13 | /* Language and Environment */ 14 | "target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ 15 | // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ 16 | // "jsx": "preserve", /* Specify what JSX code is generated. */ 17 | // "experimentalDecorators": true, /* Enable experimental support for TC39 stage 2 draft decorators. */ 18 | // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */ 19 | // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */ 20 | // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */ 21 | // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */ 22 | // "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */ 23 | // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */ 24 | // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */ 25 | // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */ 26 | 27 | /* Modules */ 28 | "module": "commonjs", /* Specify what module code is generated. */ 29 | // "rootDir": "./", /* Specify the root folder within your source files. */ 30 | // "moduleResolution": "node", /* Specify how TypeScript looks up a file from a given module specifier. */ 31 | // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */ 32 | // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */ 33 | // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */ 34 | // "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */ 35 | // "types": [], /* Specify type package names to be included without being referenced in a source file. */ 36 | // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ 37 | // "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */ 38 | // "resolveJsonModule": true, /* Enable importing .json files. */ 39 | // "noResolve": true, /* Disallow 'import's, 'require's or ''s from expanding the number of files TypeScript should add to a project. */ 40 | 41 | /* JavaScript Support */ 42 | // "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */ 43 | // "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */ 44 | // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */ 45 | 46 | /* Emit */ 47 | // "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */ 48 | // "declarationMap": true, /* Create sourcemaps for d.ts files. */ 49 | // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. 
*/ 50 | // "sourceMap": true, /* Create source map files for emitted JavaScript files. */ 51 | // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */ 52 | // "outDir": "./", /* Specify an output folder for all emitted files. */ 53 | // "removeComments": true, /* Disable emitting comments. */ 54 | // "noEmit": true, /* Disable emitting files from a compilation. */ 55 | // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */ 56 | // "importsNotUsedAsValues": "remove", /* Specify emit/checking behavior for imports that are only used for types. */ 57 | // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */ 58 | // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */ 59 | // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ 60 | // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */ 61 | // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */ 62 | // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */ 63 | // "newLine": "crlf", /* Set the newline character for emitting files. */ 64 | // "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */ 65 | // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */ 66 | // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */ 67 | // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */ 68 | // "declarationDir": "./", /* Specify the output directory for generated declaration files. */ 69 | // "preserveValueImports": true, /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */ 70 | 71 | /* Interop Constraints */ 72 | // "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */ 73 | // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */ 74 | "esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */ 75 | // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */ 76 | "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */ 77 | 78 | /* Type Checking */ 79 | "strict": true, /* Enable all strict type-checking options. */ 80 | // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */ 81 | // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */ 82 | // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */ 83 | // "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */ 84 | // "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. 
*/ 85 | // "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */ 86 | // "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */ 87 | // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */ 88 | // "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */ 89 | // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */ 90 | // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */ 91 | // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */ 92 | // "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */ 93 | // "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */ 94 | // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */ 95 | // "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */ 96 | // "allowUnusedLabels": true, /* Disable error reporting for unused labels. */ 97 | // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */ 98 | 99 | /* Completeness */ 100 | // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ 101 | "skipLibCheck": true /* Skip type checking all .d.ts files. */ 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /integration-test/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "integration-test", 3 | "version": "1.0.0", 4 | "private": true, 5 | "description": "", 6 | "main": "index.js", 7 | "scripts": { 8 | "test": "echo \"Error: no test specified\" && exit 1" 9 | }, 10 | "keywords": [], 11 | "author": "", 12 | "license": "ISC", 13 | "dependencies": { 14 | "@faker-js/faker": "^7.6.0", 15 | "aws-sdk": "^2.1262.0", 16 | "axios": "^1.2.0" 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /integration-test/src/index-test-data.ts: -------------------------------------------------------------------------------- 1 | import * as AWS from "aws-sdk"; 2 | import http, { AxiosError } from "axios"; 3 | import { faker } from "@faker-js/faker"; 4 | 5 | const maxBatch = 20_000; 6 | const batchSize = 25; 7 | const patheryEndpoint = 8 | "https://nlztni8cx5.execute-api.us-east-1.amazonaws.com/prod/"; 9 | const index_id = "test-index-v1-3"; 10 | const apiKeyId = "7xyag5xp0d"; 11 | 12 | const api = new AWS.APIGateway(); 13 | 14 | const s3 = new AWS.S3(); 15 | 16 | export async function getApiKey() { 17 | const response = await api 18 | .getApiKey({ 19 | apiKey: apiKeyId, 20 | includeValue: true, 21 | }) 22 | .promise(); 23 | 24 | const value = response.value; 25 | 26 | if (!value) { 27 | throw new Error("Could not get API key value"); 28 | } 29 | 30 | return value; 31 | } 32 | 33 | async function uploadBatch(apiKey: string, batch: any[]) { 34 | const batchUrl = `${patheryEndpoint}index/${index_id}/batch`; 35 | 36 | try { 37 | await http.post(batchUrl, batch, { 38 | headers: { 39 | "Content-Type": "application/json", 40 | "X-Api-Key": apiKey, 41 | }, 42 | }); 43 | return { status: "OK" as const }; 44 | } catch (err) { 45 | if (err instanceof 
AxiosError) { 46 | if (!err.response) { 47 | console.error(err); 48 | process.exit(1); 49 | } 50 | 51 | const message: string = err.response.data.message; 52 | const code = err.response.status; 53 | 54 | if (code !== 500) { 55 | console.error(err); 56 | process.exit(1); 57 | } 58 | 59 | console.log(`[${code}] ${message}`); 60 | 61 | return { status: "Error" as const, code, message }; 62 | } 63 | } 64 | } 65 | 66 | export async function* batchGenerator() { 67 | let batchNum = 1; 68 | 69 | let batch: unknown[] = []; 70 | 71 | while (true) { 72 | if (batchNum > maxBatch) { 73 | return batch; 74 | } 75 | const next = { 76 | author: faker.name.fullName(), 77 | song: faker.music.songName(), 78 | genre: faker.music.genre(), 79 | releaseDate: faker.date.past().getTime(), 80 | }; 81 | 82 | batch.push(next); 83 | 84 | if (batch.length >= batchSize) { 85 | console.log(`Uploading batch #${batchNum++}`); 86 | 87 | yield batch; 88 | 89 | batch = []; 90 | } 91 | } 92 | } 93 | 94 | async function startUploader( 95 | apiKey: string, 96 | batches: AsyncGenerator 97 | ) { 98 | for await (const batch of batches) { 99 | let attempts = 0; 100 | while (true) { 101 | if (attempts >= 3) { 102 | process.exit(1); 103 | } 104 | 105 | attempts++; 106 | 107 | const result = await uploadBatch(apiKey, batch); 108 | 109 | if (result?.status === "OK") { 110 | break; 111 | } 112 | 113 | console.log("Backing off..."); 114 | 115 | await new Promise((resolve) => { 116 | setTimeout(resolve, 2000); 117 | }); 118 | } 119 | } 120 | } 121 | 122 | export async function doIndex(numUploader: number) { 123 | const apiKey = await getApiKey(); 124 | 125 | const batches = batchGenerator(); 126 | 127 | const uploaderList: Promise[] = []; 128 | 129 | for (let i = 0; i < numUploader; i++) { 130 | uploaderList.push(startUploader(apiKey, batches)); 131 | } 132 | 133 | await Promise.all(uploaderList); 134 | 135 | console.log("Done"); 136 | } 137 | 138 | doIndex(10); 139 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pathery", 3 | "private": true, 4 | "version": "1.0.0", 5 | "description": "", 6 | "main": "index.js", 7 | "scripts": { 8 | "sso:dev": "aws sso login --profile=pathery-dev", 9 | "build:lambda": "cargo lambda build --release --arm64", 10 | "build": "turbo run build", 11 | "check:spell": "cspell '**/*.{md,ts,rs}'", 12 | "check": "npm run check:spell", 13 | "version": "changeset version", 14 | "publish": "pnpm publish -r --access public", 15 | "release": "npm run build && pnpm run publish", 16 | "deploy:example": "turbo run deploy" 17 | }, 18 | "keywords": [], 19 | "author": "", 20 | "license": "ISC", 21 | "dependencies": { 22 | "@changesets/cli": "^2.25.2", 23 | "@swc/core": "^1.3.14", 24 | "@types/node": "^18.11.9", 25 | "aws-cdk": "^2.50.0", 26 | "aws-cdk-lib": "^2.50.0", 27 | "constructs": "^10.1.155", 28 | "cspell": "^6.14.3", 29 | "depcheck": "^1.4.3", 30 | "ts-node": "^10.9.1", 31 | "turbo": "^1.6.3", 32 | "typescript": "^4.8.4" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /packages/pathery-cdk/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # @pathery/cdk 2 | 3 | ## 0.2.5 4 | 5 | ### Patch Changes 6 | 7 | - a97b5b2: feat: enable multithreading for query partition lambda 8 | 9 | ## 0.2.4 10 | 11 | ### Patch Changes 12 | 13 | - 371ab81: fix: run requests in parallel 14 | 15 | ## 
0.2.3 16 | 17 | ### Patch Changes 18 | 19 | - 3b2b99b: fix: stats endpoint missing env var 20 | 21 | ## 0.2.2 22 | 23 | ### Patch Changes 24 | 25 | - 6e4ad31: Feature: automatically fan out queries as index grows 26 | - 3142bbf: feat: add pagination via pagination token 27 | 28 | ## 0.2.1 29 | 30 | ### Patch Changes 31 | 32 | - f4f3868: fix: run deletes async on delay queue 33 | 34 | ## 0.2.0 35 | 36 | ### Minor Changes 37 | 38 | - b19295c: Compressed stored document representation. 39 | 40 | ## 0.1.1 41 | 42 | ### Patch Changes 43 | 44 | - 61cd70b: Fix: documents were not serializing to writer queue correctly. 45 | - 22598b6: Feature: Allow query handler memory size to be specified via CDK construct. 46 | - ea2676c: Add json field type to schema config. 47 | - 534908c: Fix: 404 error for missing index config 48 | - 576d352: Feature: Partition queries using the optional with_partition body param. 49 | - 534908c: Improvement: Use DynamoDB for original document storage. 50 | - 534908c: Fix: allow empty body for delete doc request 51 | - 653cd03: Feature: Add date field type. 52 | - 61cd70b: Feature: add i64 as index field type 53 | 54 | ## 0.1.0 55 | 56 | ### Minor Changes 57 | 58 | - 9ee82b6: Add API key authorization and generate default key. 59 | 60 | ### Patch Changes 61 | 62 | - 83cb85c: Allow IndexWriter config to be specified. 63 | 64 | ## 0.0.9 65 | 66 | ### Patch Changes 67 | 68 | - 03f647a: Add batch index endpoint 69 | 70 | ## 0.0.8 71 | 72 | ### Patch Changes 73 | 74 | - 38a8116: Fix: incorrect dashboard naming 75 | 76 | ## 0.0.7 77 | 78 | ### Patch Changes 79 | 80 | - 903af06: Add basic dashboard with errors and writer stats 81 | 82 | ## 0.0.6 83 | 84 | ### Patch Changes 85 | 86 | - bf29fa3: Fixes https://github.com/tvanhens/pathery/issues/1 87 | 88 | ## 0.0.5 89 | 90 | ### Patch Changes 91 | 92 | - 255c378: Add the STRING flag for text fields to enable exact-only matching. 93 | 94 | ## 0.0.4 95 | 96 | ### Patch Changes 97 | 98 | - 8220a12: Improve package docs, keywords and description. 99 | 100 | ## 0.0.3 101 | 102 | ### Patch Changes 103 | 104 | - c1e6d24: Add readme to package. 105 | 106 | ## 0.0.2 107 | 108 | ### Patch Changes 109 | 110 | - 1e8060d: Move configuration into construct props. 111 | -------------------------------------------------------------------------------- /packages/pathery-cdk/README.md: -------------------------------------------------------------------------------- 1 | # Pathery Serverless Search CDK Construct 2 | 3 | [![npm version](https://badge.fury.io/js/@pathery%2Fcdk.svg)](https://badge.fury.io/js/@pathery%2Fcdk) 4 | 5 | Pathery Search is a serverless search solution built on AWS. 6 | 7 | For more information, visit the [project page][project-page]. 8 | 9 | Pathery CDK is an [AWS CDK][aws-cdk] construct that packages the infrastructure required to deploy Pathery Search. 
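A minimal usage sketch, adapted from the example apps in this repository (field kinds and flags follow the `FieldConfig` types in `src/config.ts`):

```typescript
import { App } from "aws-cdk-lib";
import { PatheryStack } from "@pathery/cdk";

const app = new App();

// Indexes whose names begin with "book-index-v1-" use this field configuration.
new PatheryStack(app, "pathery-dev", {
  config: {
    indexes: [
      {
        prefix: "book-index-v1-",
        fields: [
          { name: "title", flags: ["TEXT"], kind: "text" },
          { name: "author", flags: ["TEXT"], kind: "text" },
        ],
      },
    ],
  },
});
```

Running `cdk deploy` on an app containing this stack provisions the search API and its supporting AWS resources.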
10 | 11 | [aws-cdk]: https://github.com/aws/aws-cdk 12 | [project-page]: https://github.com/tvanhens/pathery 13 | -------------------------------------------------------------------------------- /packages/pathery-cdk/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@pathery/cdk", 3 | "publishConfig": { 4 | "access": "public" 5 | }, 6 | "version": "0.2.5", 7 | "license": "MIT", 8 | "description": "AWS CDK Construct for Pathery Serverless Search.", 9 | "keywords": [ 10 | "aws", 11 | "cdk", 12 | "serverless", 13 | "search", 14 | "full-text" 15 | ], 16 | "author": "Tyler van Hensbergen", 17 | "main": "lib/index.js", 18 | "types": "lib/index.d.ts", 19 | "files": [ 20 | "target", 21 | "lib" 22 | ], 23 | "scripts": { 24 | "build:pack-deps": "mkdir -p target && cp -r ../../target/lambda/* target", 25 | "build": "npm run build:pack-deps && tsc" 26 | }, 27 | "peerDependencies": { 28 | "aws-cdk-lib": "^2.50.0", 29 | "constructs": "^10.1.155" 30 | }, 31 | "directories": { 32 | "lib": "lib" 33 | }, 34 | "devDependencies": {}, 35 | "repository": { 36 | "type": "git", 37 | "url": "git+https://github.com/tvanhens/pathery.git" 38 | }, 39 | "bugs": { 40 | "url": "https://github.com/tvanhens/pathery/issues" 41 | }, 42 | "homepage": "https://github.com/tvanhens/pathery#readme" 43 | } 44 | -------------------------------------------------------------------------------- /packages/pathery-cdk/src/config.ts: -------------------------------------------------------------------------------- 1 | export interface FieldConfig { 2 | /** 3 | * The name of the field to index. 4 | * 5 | * This must match the object key name of objects being indexed. 6 | */ 7 | name: string; 8 | 9 | /** 10 | * The kind of field. 11 | * 12 | * Kind descriptions: 13 | * 14 | * `text` - Indexes field values as `string`. 15 | * 16 | * `date` - Indexes field values as ints but serialized as ISO 80601 strings in transit. 17 | */ 18 | kind: K; 19 | 20 | /** 21 | * Flags to add additional indexing capabilities. 22 | * 23 | * Flag descriptions: 24 | * 25 | * 26 | * `TEXT` - (only for `text`) Marks this field for full-text indexing. 27 | * 28 | * `STRING` - (only for `text`) Marks this field for exact-string indexing. 29 | * 30 | * `INDEXED` - (only for `date`) Marks this field for ordered search indexing. 31 | */ 32 | flags: Flags[]; 33 | } 34 | 35 | export type TextFieldConfig = FieldConfig<"text", "STRING" | "TEXT" | "FAST">; 36 | 37 | export type DateFieldConfig = FieldConfig<"date", "INDEXED" | "FAST">; 38 | 39 | export type IntegerFieldConfig = FieldConfig<"i64", "INDEXED" | "FAST">; 40 | 41 | export type JsonFieldConfig = FieldConfig<"json", "TEXT">; 42 | 43 | export type IndexFieldConfig = 44 | | TextFieldConfig 45 | | DateFieldConfig 46 | | IntegerFieldConfig 47 | | JsonFieldConfig; 48 | 49 | export interface IndexConfig { 50 | /** 51 | * Prefix matcher for index name. 52 | * 53 | * Indexes that start with `prefix` will use the fields schema and configuration specified in this object. 54 | * 55 | * For example: 56 | * 57 | * ```ts 58 | * { prefix: `book-index-`, ... } 59 | * ``` 60 | * 61 | * will cause indexes named `book-index-1` and `book-index-foo` to match. 62 | */ 63 | prefix: string; 64 | 65 | /** 66 | * List of field configurations for the index. 67 | * 68 | * Documents must have fields that match the fields specified in this configuration in order to be indexed. 69 | * Fields which are not included in the list of fields will be ignored. 
70 | * 71 | * @example 72 | * String text field config: 73 | * 74 | * ```ts 75 | * { 76 | * name: "isbn", 77 | * kind: "text", 78 | * // Note "STRING" here which indexes the field as one string (e.g. no splitting). 79 | * flags: ["STRING"] 80 | * } 81 | * ``` 82 | * 83 | * @example 84 | * Full-text text field config: 85 | * 86 | * ```ts 87 | * { 88 | * name: "description", 89 | * kind: "text", 90 | * // Note "TEXT" flag which indexes the field as a full-text field splitting on characters such as spaces. 91 | * flags: ["TEXT"] 92 | * } 93 | * ``` 94 | */ 95 | fields: IndexFieldConfig[]; 96 | } 97 | 98 | export interface PatheryConfig { 99 | /** 100 | * List of index configurations. 101 | */ 102 | indexes: IndexConfig[]; 103 | } 104 | -------------------------------------------------------------------------------- /packages/pathery-cdk/src/index.ts: -------------------------------------------------------------------------------- 1 | import { PatheryStack } from "./pathery-stack"; 2 | 3 | export { PatheryStack }; 4 | -------------------------------------------------------------------------------- /packages/pathery-cdk/src/pathery-dashboard.ts: -------------------------------------------------------------------------------- 1 | import { Construct } from "constructs"; 2 | import { 3 | Column, 4 | Dashboard, 5 | GraphWidget, 6 | LogQueryWidget, 7 | MathExpression, 8 | Row, 9 | Shading, 10 | TextWidget, 11 | } from "aws-cdk-lib/aws-cloudwatch"; 12 | import { RustFunction } from "./rust-function"; 13 | import { Duration, Stack } from "aws-cdk-lib"; 14 | 15 | export interface PatheryDashboardProps { 16 | indexWriterWorker: RustFunction; 17 | } 18 | 19 | export class PatheryDashboard extends Construct { 20 | constructor(scope: Construct, id: string, props: PatheryDashboardProps) { 21 | super(scope, id); 22 | 23 | let stack = Stack.of(this); 24 | 25 | const dashboard = new Dashboard(this, "Resource", { 26 | dashboardName: `Pathery-${stack.stackName}`, 27 | }); 28 | 29 | let functions = stack.node 30 | .findAll() 31 | .filter((c): c is RustFunction => c instanceof RustFunction); 32 | 33 | let successRate = new MathExpression({ 34 | expression: "100 - ((errors / invocations) * 100)", 35 | period: Duration.minutes(1), 36 | usingMetrics: { 37 | errors: props.indexWriterWorker.metricErrors({ 38 | statistic: "sum", 39 | }), 40 | invocations: props.indexWriterWorker.metricInvocations({ 41 | statistic: "sum", 42 | }), 43 | }, 44 | color: "#72bf6a", 45 | label: "Success Rate", 46 | }); 47 | 48 | dashboard.addWidgets( 49 | new LogQueryWidget({ 50 | title: "Errors", 51 | logGroupNames: functions.map((f) => f.logGroup.logGroupName), 52 | queryLines: [ 53 | "fields @timestamp, @log, fields.message", 54 | "filter level = 'ERROR'", 55 | ], 56 | width: 24, 57 | }), 58 | new Column( 59 | new TextWidget({ 60 | markdown: "# IndexWriterWorker", 61 | width: 24, 62 | height: 1, 63 | }), 64 | new Row( 65 | new GraphWidget({ 66 | liveData: true, 67 | title: "IndexWriterWorker Execution", 68 | width: 12, 69 | left: [ 70 | props.indexWriterWorker.metricDuration({ 71 | period: Duration.minutes(1), 72 | statistic: "max", 73 | }), 74 | ], 75 | leftYAxis: { 76 | min: 0, 77 | label: "Latency (ms)", 78 | showUnits: false, 79 | }, 80 | right: [successRate], 81 | rightYAxis: { 82 | min: 0, 83 | max: 100, 84 | label: "Success Rate (%)", 85 | showUnits: false, 86 | }, 87 | leftAnnotations: [ 88 | { 89 | value: 90 | (props.indexWriterWorker.timeout?.toMilliseconds() ?? 
3000) * 91 | 0.75, 92 | fill: Shading.ABOVE, 93 | color: "#e6b400", 94 | label: "Timeout Warning", 95 | }, 96 | { 97 | value: 98 | props.indexWriterWorker.timeout?.toMilliseconds() ?? 3000, 99 | fill: Shading.ABOVE, 100 | color: "#f44336", 101 | label: "Timeout", 102 | }, 103 | ], 104 | }) 105 | ) 106 | ) 107 | ); 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /packages/pathery-cdk/src/pathery-stack.ts: -------------------------------------------------------------------------------- 1 | import { 2 | Stack, 3 | aws_lambda, 4 | CfnOutput, 5 | Duration, 6 | StackProps, 7 | } from "aws-cdk-lib"; 8 | import { 9 | ApiKey, 10 | EndpointType, 11 | LambdaIntegration, 12 | RestApi, 13 | } from "aws-cdk-lib/aws-apigateway"; 14 | import { 15 | GatewayVpcEndpointAwsService, 16 | InterfaceVpcEndpointAwsService, 17 | SubnetType, 18 | Vpc, 19 | } from "aws-cdk-lib/aws-ec2"; 20 | import { FileSystem } from "aws-cdk-lib/aws-efs"; 21 | import { Function, LayerVersion } from "aws-cdk-lib/aws-lambda"; 22 | import { Architecture, Code, Runtime } from "aws-cdk-lib/aws-lambda"; 23 | import { SqsEventSource } from "aws-cdk-lib/aws-lambda-event-sources"; 24 | import { IQueue, Queue } from "aws-cdk-lib/aws-sqs"; 25 | import { Construct } from "constructs"; 26 | import { PatheryConfig } from "./config"; 27 | import * as fs from "fs"; 28 | import { RustFunction } from "./rust-function"; 29 | import { PatheryDashboard } from "./pathery-dashboard"; 30 | import { 31 | AttributeType, 32 | BillingMode, 33 | ITable, 34 | Table, 35 | } from "aws-cdk-lib/aws-dynamodb"; 36 | 37 | export interface PatheryStackProps extends StackProps { 38 | config: PatheryConfig; 39 | 40 | /** 41 | * IndexWriter configuration overrides. 42 | */ 43 | indexWriter?: { 44 | /** 45 | * IndexWriter Lambda memorySize. 46 | * 47 | * @default 2048 48 | */ 49 | memorySize?: number; 50 | 51 | /** 52 | * IndexWriter Lambda timeout duration. 53 | * 54 | * @default Duration.minutes(1) 55 | */ 56 | timeout?: Duration; 57 | }; 58 | 59 | /** 60 | * QueryHandler configuration overrides. 61 | */ 62 | queryHandler?: { 63 | /** 64 | * IndexWriter Lambda memorySize. 
65 | * 66 | * @default 3008 67 | */ 68 | memorySize?: number; 69 | }; 70 | } 71 | 72 | export class PatheryStack extends Stack { 73 | readonly apiKey: ApiKey; 74 | 75 | readonly apiGateway: RestApi; 76 | 77 | private readonly table: ITable; 78 | 79 | private indexWriterQueue: IQueue; 80 | 81 | private deleteQueue: IQueue; 82 | 83 | constructor(scope: Construct, id: string, props: PatheryStackProps) { 84 | super(scope, id, props); 85 | 86 | this.table = new Table(this, "DataTable", { 87 | billingMode: BillingMode.PAY_PER_REQUEST, 88 | partitionKey: { 89 | name: "pk", 90 | type: AttributeType.STRING, 91 | }, 92 | sortKey: { 93 | name: "sk", 94 | type: AttributeType.STRING, 95 | }, 96 | timeToLiveAttribute: "__ttl", 97 | }); 98 | 99 | this.deleteQueue = new Queue(this, "DeleteQueue", { 100 | deliveryDelay: Duration.minutes(15), 101 | visibilityTimeout: Duration.minutes(2), 102 | }); 103 | 104 | this.indexWriterQueue = new Queue(this, "IndexWriterQueue", { 105 | fifo: true, 106 | contentBasedDeduplication: true, 107 | }); 108 | 109 | const vpc = new Vpc(this, "Vpc", { 110 | subnetConfiguration: [ 111 | { 112 | cidrMask: 28, 113 | name: "isolated", 114 | subnetType: SubnetType.PRIVATE_ISOLATED, 115 | }, 116 | ], 117 | }); 118 | vpc.addGatewayEndpoint("S3Endpoint", { 119 | service: GatewayVpcEndpointAwsService.S3, 120 | }); 121 | vpc.addGatewayEndpoint("DynamoEndpoint", { 122 | service: GatewayVpcEndpointAwsService.DYNAMODB, 123 | }); 124 | const sqsEndpoint = vpc.addInterfaceEndpoint("SqsGateway", { 125 | service: InterfaceVpcEndpointAwsService.SQS, 126 | }); 127 | sqsEndpoint.connections.allowDefaultPortFromAnyIpv4(); 128 | const lambdaEndpoint = vpc.addInterfaceEndpoint("LambdaEndpoint", { 129 | service: InterfaceVpcEndpointAwsService.LAMBDA, 130 | }); 131 | lambdaEndpoint.connections.allowDefaultPortFromAnyIpv4(); 132 | 133 | const efs = new FileSystem(this, "Filesystem", { 134 | vpc, 135 | }); 136 | 137 | let accessPoint = efs.addAccessPoint("ReadWrite", { 138 | createAcl: { 139 | ownerGid: "1001", 140 | ownerUid: "1001", 141 | permissions: "750", 142 | }, 143 | posixUser: { 144 | uid: "1001", 145 | gid: "1001", 146 | }, 147 | path: "/pathery-data", 148 | }); 149 | 150 | fs.mkdirSync(".pathery/layer/pathery", { recursive: true }); 151 | fs.writeFileSync( 152 | ".pathery/layer/pathery/config.json", 153 | JSON.stringify(props.config) 154 | ); 155 | let configLayer = new LayerVersion(this, "config-layer", { 156 | code: Code.fromAsset(".pathery/layer"), 157 | compatibleArchitectures: [Architecture.ARM_64], 158 | compatibleRuntimes: [Runtime.PROVIDED_AL2], 159 | }); 160 | 161 | const postIndex = new RustFunction(this, "post-index"); 162 | postIndex.addLayers(configLayer); 163 | this.indexWriterProducer(postIndex); 164 | 165 | const batchIndex = new RustFunction(this, "batch-index"); 166 | batchIndex.addLayers(configLayer); 167 | this.indexWriterProducer(batchIndex); 168 | 169 | const queryIndexPartition = new RustFunction( 170 | this, 171 | "query-index-partition-fn", 172 | { 173 | memorySize: props.queryHandler?.memorySize ?? 
3008, 174 | timeout: Duration.seconds(5), 175 | vpc, 176 | vpcSubnets: { 177 | subnets: vpc.isolatedSubnets, 178 | }, 179 | filesystem: aws_lambda.FileSystem.fromEfsAccessPoint( 180 | accessPoint, 181 | "/mnt/pathery-data" 182 | ), 183 | } 184 | ); 185 | queryIndexPartition.addLayers(configLayer); 186 | this.table.grantReadData(queryIndexPartition); 187 | queryIndexPartition.addEnvironment("DATA_TABLE_NAME", this.table.tableName); 188 | queryIndexPartition.addEnvironment( 189 | "ASYNC_DELETE_QUEUE_URL", 190 | this.deleteQueue.queueUrl 191 | ); 192 | 193 | const queryIndex = new RustFunction(this, "query-index", { 194 | memorySize: props.queryHandler?.memorySize ?? 3008, 195 | timeout: Duration.seconds(5), 196 | vpc, 197 | vpcSubnets: { 198 | subnets: vpc.isolatedSubnets, 199 | }, 200 | filesystem: aws_lambda.FileSystem.fromEfsAccessPoint( 201 | accessPoint, 202 | "/mnt/pathery-data" 203 | ), 204 | }); 205 | queryIndex.addLayers(configLayer); 206 | this.table.grantReadData(queryIndex); 207 | queryIndex.addEnvironment("DATA_TABLE_NAME", this.table.tableName); 208 | queryIndex.addEnvironment( 209 | "ASYNC_DELETE_QUEUE_URL", 210 | this.deleteQueue.queueUrl 211 | ); 212 | queryIndexPartition.grantInvoke(queryIndex); 213 | queryIndex.addEnvironment( 214 | "QUERY_INDEX_PARTITION_NAME", 215 | queryIndexPartition.functionName 216 | ); 217 | 218 | const statsIndex = new RustFunction(this, "stats-index", { 219 | vpc, 220 | vpcSubnets: { 221 | subnets: vpc.isolatedSubnets, 222 | }, 223 | filesystem: aws_lambda.FileSystem.fromEfsAccessPoint( 224 | accessPoint, 225 | "/mnt/pathery-data" 226 | ), 227 | }); 228 | statsIndex.addLayers(configLayer); 229 | // FIXME: This doesn't actually get used but is required to be 230 | // set because of some tangled internal dependencies. 
231 | statsIndex.addEnvironment( 232 | "ASYNC_DELETE_QUEUE_URL", 233 | this.deleteQueue.queueUrl 234 | ); 235 | 236 | const deleteDoc = new RustFunction(this, "delete-doc"); 237 | deleteDoc.addLayers(configLayer); 238 | this.indexWriterProducer(deleteDoc); 239 | 240 | const api = new RestApi(this, "PatheryApi", { 241 | restApiName: id, 242 | endpointConfiguration: { 243 | types: [EndpointType.REGIONAL], 244 | }, 245 | defaultMethodOptions: { 246 | apiKeyRequired: true, 247 | }, 248 | }); 249 | 250 | this.apiGateway = api; 251 | 252 | const apiKey = new ApiKey(this, "DefaultApiKey", {}); 253 | 254 | const plan = api.addUsagePlan("DefaultPlan", { 255 | apiStages: [ 256 | { 257 | api, 258 | stage: api.deploymentStage, 259 | }, 260 | ], 261 | }); 262 | 263 | plan.addApiKey(apiKey); 264 | 265 | this.apiKey = apiKey; 266 | 267 | const indexRoute = api.root.addResource("index"); 268 | 269 | const indexSingleRoute = indexRoute.addResource("{index_id}"); 270 | 271 | indexSingleRoute.addMethod("POST", new LambdaIntegration(postIndex)); 272 | 273 | const queryActionRoute = indexSingleRoute.addResource("query"); 274 | 275 | queryActionRoute.addMethod("POST", new LambdaIntegration(queryIndex)); 276 | 277 | const statsActionRoute = indexSingleRoute.addResource("stats"); 278 | 279 | statsActionRoute.addMethod("GET", new LambdaIntegration(statsIndex)); 280 | 281 | const batchIndexRoute = indexSingleRoute.addResource("batch"); 282 | 283 | batchIndexRoute.addMethod("POST", new LambdaIntegration(batchIndex)); 284 | 285 | const documentRoute = indexSingleRoute.addResource("doc"); 286 | 287 | const documentSingleRoute = documentRoute.addResource("{doc_id}"); 288 | 289 | documentSingleRoute.addMethod("DELETE", new LambdaIntegration(deleteDoc)); 290 | 291 | const indexWriterWorker = new RustFunction(this, "index-writer-worker", { 292 | memorySize: props.indexWriter?.memorySize ?? 2048, 293 | timeout: props.indexWriter?.timeout ?? Duration.minutes(1), 294 | vpc, 295 | vpcSubnets: { 296 | subnets: vpc.isolatedSubnets, 297 | }, 298 | filesystem: aws_lambda.FileSystem.fromEfsAccessPoint( 299 | accessPoint, 300 | "/mnt/pathery-data" 301 | ), 302 | }); 303 | indexWriterWorker.addLayers(configLayer); 304 | indexWriterWorker.addEventSource( 305 | new SqsEventSource(this.indexWriterQueue, { 306 | batchSize: 10, 307 | }) 308 | ); 309 | this.table.grantReadWriteData(indexWriterWorker); 310 | indexWriterWorker.addEnvironment("DATA_TABLE_NAME", this.table.tableName); 311 | this.deleteQueue.grantSendMessages(indexWriterWorker); 312 | indexWriterWorker.addEnvironment( 313 | "ASYNC_DELETE_QUEUE_URL", 314 | this.deleteQueue.queueUrl 315 | ); 316 | 317 | const asyncDeleteWorker = new RustFunction(this, "async-delete-worker", { 318 | memorySize: 2048, 319 | timeout: props.indexWriter?.timeout ?? 
Duration.minutes(1), 320 | vpc, 321 | vpcSubnets: { 322 | subnets: vpc.isolatedSubnets, 323 | }, 324 | filesystem: aws_lambda.FileSystem.fromEfsAccessPoint( 325 | accessPoint, 326 | "/mnt/pathery-data" 327 | ), 328 | }); 329 | asyncDeleteWorker.addLayers(configLayer); 330 | asyncDeleteWorker.addEventSource( 331 | new SqsEventSource(this.deleteQueue, { 332 | batchSize: 10, 333 | }) 334 | ); 335 | 336 | new PatheryDashboard(this, "Dashboard", { 337 | indexWriterWorker, 338 | }); 339 | 340 | new CfnOutput(this, "ApiKeyOutput", { 341 | value: apiKey.keyId, 342 | }); 343 | } 344 | 345 | private indexWriterProducer(lambda: Function) { 346 | this.table.grantWriteData(lambda); 347 | lambda.addEnvironment("DATA_TABLE_NAME", this.table.tableName); 348 | 349 | this.indexWriterQueue.grantSendMessages(lambda); 350 | lambda.addEnvironment( 351 | "INDEX_WRITER_QUEUE_URL", 352 | this.indexWriterQueue.queueUrl 353 | ); 354 | } 355 | } 356 | -------------------------------------------------------------------------------- /packages/pathery-cdk/src/rust-function.ts: -------------------------------------------------------------------------------- 1 | import { 2 | Architecture, 3 | Code, 4 | Function, 5 | FunctionProps, 6 | Runtime, 7 | } from "aws-cdk-lib/aws-lambda"; 8 | import { RetentionDays } from "aws-cdk-lib/aws-logs"; 9 | import { Construct } from "constructs"; 10 | import * as path from "path"; 11 | 12 | export class RustFunction extends Function { 13 | constructor(scope: Construct, id: string, props?: Partial) { 14 | const lambdaAssetPath = path.join(__dirname, "..", "target", id); 15 | super(scope, id, { 16 | ...props, 17 | code: Code.fromAsset(lambdaAssetPath), 18 | handler: "default", 19 | runtime: Runtime.PROVIDED_AL2, 20 | architecture: Architecture.ARM_64, 21 | logRetention: RetentionDays.THREE_DAYS, 22 | }); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /packages/pathery-cdk/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es2016", 4 | "module": "commonjs", 5 | "esModuleInterop": true, 6 | "forceConsistentCasingInFileNames": true, 7 | "strict": true, 8 | "skipLibCheck": true, 9 | "outDir": "lib", 10 | "declaration": true, 11 | "baseUrl": "./src", 12 | "declarationMap": true 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /packages/pathery/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | edition = "2021" 3 | name = "pathery" 4 | version = "0.1.0" 5 | 6 | [dependencies] 7 | anyhow = "1.0.66" 8 | async-trait = "0.1.58" 9 | aws-config = "0.51.0" 10 | aws-sdk-dynamodb = "0.21.0" 11 | aws-sdk-lambda = "0.21.0" 12 | aws-sdk-sqs = "0.21.0" 13 | aws-smithy-types = "0.51.0" 14 | aws_lambda_events = "0.7.2" 15 | base64 = "0.21.0" 16 | chrono = "0.4.23" 17 | http = "0.2.8" 18 | lambda_http = {version = "0.7", default-features = false, features = ["apigw_rest"]} 19 | lambda_runtime = "0.7" 20 | serde = {version = "1.0.147", features = ["derive"]} 21 | serde_dynamo = {version = "4", features = ["aws-sdk-dynamodb+0_21"]} 22 | serde_json = "1.0.87" 23 | tantivy = {version = "0.18.1"} 24 | tantivy-common = "0.3.0" 25 | thiserror = "1.0.37" 26 | tokio = {version = "1", features = ["full"]} 27 | tracing = {version = "0.1", features = ["log"]} 28 | tracing-subscriber = {version = "0.3", default-features = false, features = ["fmt", "json", "std"]} 29 | uuid = 
"1.2.1" 30 | zstd = "0.12.3" 31 | -------------------------------------------------------------------------------- /packages/pathery/src/bin/async-delete-worker.rs: -------------------------------------------------------------------------------- 1 | use pathery::lambda; 2 | use pathery::lambda::lambda_runtime::{run, service_fn}; 3 | use pathery::lambda::sqs; 4 | use pathery::worker::async_delete::handle_event; 5 | 6 | #[tokio::main] 7 | async fn main() -> Result<(), sqs::Error> { 8 | lambda::init_tracing(); 9 | 10 | run(service_fn(|event| handle_event(event))).await 11 | } 12 | -------------------------------------------------------------------------------- /packages/pathery/src/bin/batch-index.rs: -------------------------------------------------------------------------------- 1 | use pathery::service::index::BatchIndexService; 2 | use pathery::service::start_service; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<(), lambda_http::Error> { 6 | let service = BatchIndexService::create().await; 7 | 8 | start_service(&service).await 9 | } 10 | -------------------------------------------------------------------------------- /packages/pathery/src/bin/delete-doc.rs: -------------------------------------------------------------------------------- 1 | use pathery::service::doc::DeleteDocService; 2 | use pathery::service::start_service; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<(), lambda_http::Error> { 6 | let service = DeleteDocService::create().await; 7 | 8 | start_service(&service).await 9 | } 10 | -------------------------------------------------------------------------------- /packages/pathery/src/bin/index-writer-worker.rs: -------------------------------------------------------------------------------- 1 | use pathery::index::LambdaIndexLoader; 2 | use pathery::lambda; 3 | use pathery::lambda::lambda_runtime::{run, service_fn}; 4 | use pathery::lambda::sqs; 5 | use pathery::store::document::DDBDocumentStore; 6 | use pathery::worker::index_writer::handle_event; 7 | 8 | #[tokio::main] 9 | async fn main() -> Result<(), sqs::Error> { 10 | lambda::init_tracing(); 11 | 12 | let document_store = DDBDocumentStore::create(None).await; 13 | let index_loader = LambdaIndexLoader::create().await; 14 | 15 | run(service_fn(|event| { 16 | handle_event(&document_store, &index_loader, event) 17 | })) 18 | .await 19 | } 20 | -------------------------------------------------------------------------------- /packages/pathery/src/bin/post-index.rs: -------------------------------------------------------------------------------- 1 | use pathery::service::index::PostIndexService; 2 | use pathery::service::start_service; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<(), lambda_http::Error> { 6 | let service = PostIndexService::create().await; 7 | 8 | start_service(&service).await 9 | } 10 | -------------------------------------------------------------------------------- /packages/pathery/src/bin/query-index-partition-fn.rs: -------------------------------------------------------------------------------- 1 | use pathery::function::query_index_partition::handle_event; 2 | use pathery::index::LambdaIndexLoader; 3 | use pathery::lambda; 4 | use pathery::lambda::lambda_runtime::{run, service_fn}; 5 | use pathery::lambda::sqs; 6 | 7 | #[tokio::main] 8 | async fn main() -> Result<(), sqs::Error> { 9 | lambda::init_tracing(); 10 | 11 | let index_loader = LambdaIndexLoader::create().await; 12 | 13 | run(service_fn(|event| handle_event(&index_loader, event))).await 14 | } 15 | 
-------------------------------------------------------------------------------- /packages/pathery/src/bin/query-index.rs: -------------------------------------------------------------------------------- 1 | use pathery::service::index::QueryIndexService; 2 | use pathery::service::start_service; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<(), lambda_http::Error> { 6 | let service = QueryIndexService::create().await; 7 | 8 | start_service(&service).await 9 | } 10 | -------------------------------------------------------------------------------- /packages/pathery/src/bin/stats-index.rs: -------------------------------------------------------------------------------- 1 | use pathery::service::index::StatsIndexService; 2 | use pathery::service::start_service; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<(), lambda_http::Error> { 6 | let service = StatsIndexService::create().await; 7 | 8 | start_service(&service).await 9 | } 10 | -------------------------------------------------------------------------------- /packages/pathery/src/directory.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::path::{Path, PathBuf}; 3 | use std::sync::Arc; 4 | 5 | use tantivy::directory::error::OpenDirectoryError; 6 | use tantivy::directory::{DirectoryLock, MmapDirectory}; 7 | use tantivy::Directory; 8 | use tokio::runtime::Handle; 9 | 10 | use crate::pagination::SegmentMeta; 11 | use crate::worker::async_delete::client::AsyncDeleteClient; 12 | use crate::worker::async_delete::job::AsyncDeleteJob; 13 | 14 | struct NoopLockGuard; 15 | 16 | /// Directory that wraps MmapDirectory without using a lockfile. 17 | /// 18 | /// Using a FIFO SQS queue for orchestrating indexing removes the need for a lockfile. 19 | #[derive(Clone, Debug)] 20 | pub struct PatheryDirectory { 21 | directory_path: PathBuf, 22 | 23 | segments: Option>, 24 | 25 | inner: MmapDirectory, 26 | 27 | async_delete_client: Arc, 28 | 29 | handle: Handle, 30 | } 31 | 32 | impl PatheryDirectory { 33 | pub fn open
<P>
( 34 | directory_path: P, 35 | async_delete_client: &Arc, 36 | segments: Option>, 37 | ) -> Result 38 | where 39 | P: AsRef, 40 | { 41 | Ok(PatheryDirectory { 42 | directory_path: directory_path.as_ref().to_owned(), 43 | segments, 44 | inner: MmapDirectory::open(directory_path)?, 45 | async_delete_client: Arc::clone(async_delete_client), 46 | handle: Handle::try_current().unwrap(), 47 | }) 48 | } 49 | } 50 | 51 | impl Directory for PatheryDirectory { 52 | fn get_file_handle( 53 | &self, 54 | path: &std::path::Path, 55 | ) -> Result, tantivy::directory::error::OpenReadError> 56 | { 57 | self.inner.get_file_handle(path) 58 | } 59 | 60 | fn delete(&self, path: &std::path::Path) -> Result<(), tantivy::directory::error::DeleteError> { 61 | let path = self.directory_path.join(path.to_path_buf()); 62 | let job = AsyncDeleteJob::fs_delete(path); 63 | self.handle 64 | .block_on(self.async_delete_client.submit_job(job)) 65 | .expect("Message should queue successfully"); 66 | Ok(()) 67 | } 68 | 69 | fn exists( 70 | &self, 71 | path: &std::path::Path, 72 | ) -> Result { 73 | self.inner.exists(path) 74 | } 75 | 76 | fn open_write( 77 | &self, 78 | path: &std::path::Path, 79 | ) -> Result { 80 | self.inner.open_write(path) 81 | } 82 | 83 | fn atomic_read( 84 | &self, 85 | path: &std::path::Path, 86 | ) -> Result, tantivy::directory::error::OpenReadError> { 87 | let result = self.inner.atomic_read(path)?; 88 | 89 | // check that we are returning meta.json 90 | if path == Path::new("meta.json") { 91 | if let Some(segments) = &self.segments { 92 | let mut meta: HashMap = 93 | serde_json::from_slice(&result[..]).expect("meta.json should be parsable"); 94 | 95 | // let segments = meta 96 | // .get("segments") 97 | // .and_then(|s| s.as_array()) 98 | // .expect("segments should be set"); 99 | 100 | // let filtered_segments: Vec<_> = segments 101 | // .iter() 102 | // .enumerate() 103 | // .filter(|(idx, _)| (idx + self.partition_n) % self.total_partitions == 0) 104 | // .map(|(_, v)| v.to_owned()) 105 | // .collect(); 106 | let segments = serde_json::to_value(segments).unwrap(); 107 | 108 | meta.insert(String::from("segments"), segments); 109 | 110 | return Ok(serde_json::to_vec(&meta).expect("meta.json should serialize")); 111 | } 112 | } 113 | 114 | Ok(result) 115 | } 116 | 117 | fn atomic_write(&self, path: &std::path::Path, data: &[u8]) -> std::io::Result<()> { 118 | self.inner.atomic_write(path, data) 119 | } 120 | 121 | fn sync_directory(&self) -> std::io::Result<()> { 122 | self.inner.sync_directory() 123 | } 124 | 125 | fn watch( 126 | &self, 127 | watch_callback: tantivy::directory::WatchCallback, 128 | ) -> tantivy::Result { 129 | self.inner.watch(watch_callback) 130 | } 131 | 132 | fn acquire_lock( 133 | &self, 134 | _lock: &tantivy::directory::Lock, 135 | ) -> Result { 136 | Ok(DirectoryLock::from(Box::new(NoopLockGuard))) 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /packages/pathery/src/function/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod query_index_partition; 2 | -------------------------------------------------------------------------------- /packages/pathery/src/function/query_index_partition/client.rs: -------------------------------------------------------------------------------- 1 | use aws_smithy_types::Blob; 2 | 3 | use super::{PartitionQueryResponse, QueryRequest}; 4 | use crate::pagination::SegmentMeta; 5 | use crate::util; 6 | 7 | pub struct 
LambdaQueryIndexPartitionClient { 8 | function_name: String, 9 | 10 | client: aws_sdk_lambda::Client, 11 | } 12 | 13 | impl LambdaQueryIndexPartitionClient { 14 | pub async fn create() -> LambdaQueryIndexPartitionClient { 15 | let sdk_config = aws_config::load_from_env().await; 16 | let function_name = util::require_env("QUERY_INDEX_PARTITION_NAME"); 17 | 18 | LambdaQueryIndexPartitionClient { 19 | function_name, 20 | client: aws_sdk_lambda::Client::new(&sdk_config), 21 | } 22 | } 23 | 24 | pub async fn query_partition( 25 | &self, 26 | index_id: String, 27 | query: String, 28 | offset: usize, 29 | partition_n: usize, 30 | segments: Vec, 31 | ) -> PartitionQueryResponse { 32 | // TODO: Error handling and retries 33 | let request = self.client.invoke(); 34 | let request = request.function_name(&self.function_name); 35 | let input = QueryRequest { 36 | index_id, 37 | query, 38 | offset, 39 | partition_n, 40 | segments, 41 | }; 42 | let input = serde_json::to_vec(&input).expect("should serialize"); 43 | let input = Blob::new(input); 44 | let request = request.payload(input); 45 | let response = tokio::spawn(request.send()); 46 | let response = response.await.unwrap().expect("should succeed"); 47 | 48 | let payload = response.payload().expect("payload should exist"); 49 | let payload = payload.to_owned().into_inner(); 50 | let payload: PartitionQueryResponse = 51 | serde_json::from_slice(&payload).expect("payload should parse"); 52 | payload 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /packages/pathery/src/function/query_index_partition/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod client; 2 | 3 | use lambda_runtime::{Error, LambdaEvent}; 4 | use serde::{Deserialize, Serialize}; 5 | use tantivy::collector::TopDocs; 6 | use tantivy::query::QueryParser; 7 | use tantivy::schema::{Field, FieldType}; 8 | use tantivy::{DocAddress, Score}; 9 | 10 | use crate::index::IndexLoader; 11 | use crate::pagination::SegmentMeta; 12 | use crate::service::ServiceError; 13 | use crate::store::document::SearchDocRef; 14 | 15 | #[derive(Serialize, Deserialize, Debug)] 16 | pub struct QueryRequest { 17 | pub index_id: String, 18 | pub query: String, 19 | pub offset: usize, 20 | pub partition_n: usize, 21 | pub segments: Vec, 22 | } 23 | 24 | #[derive(Serialize, Deserialize, Debug, PartialEq)] 25 | pub struct PartitionSearchHit { 26 | pub doc_ref: SearchDocRef, 27 | pub score: f32, 28 | pub partition_n: usize, 29 | } 30 | 31 | #[derive(Serialize, Deserialize, Debug, PartialEq)] 32 | pub struct PartitionQueryResponse { 33 | pub matches: Vec, 34 | } 35 | 36 | pub async fn handle_event( 37 | index_loader: &dyn IndexLoader, 38 | event: LambdaEvent, 39 | ) -> Result { 40 | let body = event.payload; 41 | let index_id = body.index_id; 42 | 43 | let mut index = index_loader.load_index(&index_id, Some(body.segments))?; 44 | 45 | index.set_default_multithread_executor().unwrap(); 46 | 47 | let reader = index.reader().expect("Reader should load"); 48 | 49 | let searcher = reader.searcher(); 50 | 51 | let schema = index.schema(); 52 | 53 | let query_parser = QueryParser::for_index( 54 | &index, 55 | schema 56 | .fields() 57 | .filter_map(|(field, entry)| { 58 | if !entry.is_indexed() { 59 | return None; 60 | } 61 | match entry.field_type() { 62 | FieldType::Str(_) => Some(field), 63 | _ => None, 64 | } 65 | }) 66 | .collect::>(), 67 | ); 68 | 69 | let query = query_parser 70 | .parse_query(&body.query) 71 | 
.map_err(|err| ServiceError::invalid_request(&err.to_string()))?; 72 | 73 | let collector = TopDocs::with_limit(10).and_offset(body.offset); 74 | 75 | let top_docs: Vec<(Score, DocAddress)> = searcher 76 | .search(&query, &collector) 77 | .expect("search should succeed"); 78 | 79 | let matches: Vec<_> = top_docs 80 | .into_iter() 81 | .map(|(score, address)| { 82 | let document = searcher.doc(address).expect("doc should exist"); 83 | 84 | let named_doc = schema.to_named_doc(&document); 85 | 86 | let stored_ref = SearchDocRef::from(named_doc); 87 | 88 | PartitionSearchHit { 89 | doc_ref: stored_ref, 90 | score, 91 | partition_n: body.partition_n, 92 | } 93 | }) 94 | .collect(); 95 | 96 | if matches.len() == 0 { 97 | return Ok(PartitionQueryResponse { matches: vec![] }); 98 | } 99 | 100 | Ok(PartitionQueryResponse { matches }) 101 | } 102 | 103 | // #[cfg(test)] 104 | // mod tests { 105 | // use super::*; 106 | // use crate::test_utils::*; 107 | 108 | // fn test_service(ctx: &TestContext) -> QueryIndexService { 109 | // QueryIndexService { 110 | // document_store: Box::new(ctx.document_store().clone()), 111 | // index_loader: Box::new(ctx.index_loader().clone()), 112 | // } 113 | // } 114 | 115 | // #[tokio::test] 116 | // async fn query_default_response() { 117 | // let ctx = setup() 118 | // .with_documents( 119 | // "test", 120 | // vec![json!({ 121 | // "__id": "foobar", 122 | // "title": "hello", 123 | // "author": "world" 124 | // })], 125 | // ) 126 | // .await; 127 | 128 | // let service = test_service(&ctx); 129 | 130 | // let request = ServiceRequest::create(QueryRequest { 131 | // query: "hello".into(), 132 | // with_partition: None, 133 | // }) 134 | // .with_path_param("index_id", "test"); 135 | 136 | // let response = service.handle_request(request).await.unwrap(); 137 | 138 | // assert_eq!( 139 | // QueryResponse { 140 | // matches: vec![SearchHit { 141 | // doc: json::json!({ 142 | // "__id": ["foobar"], 143 | // "title": ["hello"], 144 | // "author": ["world"], 145 | // }), 146 | // score: 0.28768212, 147 | // snippets: json::json!({ 148 | // "title": "hello" 149 | // }) 150 | // }] 151 | // }, 152 | // response 153 | // ); 154 | // } 155 | 156 | // #[tokio::test] 157 | // async fn query_document_with_un_indexed_fields() { 158 | // let ctx = setup() 159 | // .with_documents( 160 | // "test", 161 | // vec![json!({ 162 | // "__id": "foobar", 163 | // "title": "hello", 164 | // "meta": "world" 165 | // })], 166 | // ) 167 | // .await; 168 | 169 | // let service = test_service(&ctx); 170 | 171 | // let request = ServiceRequest::create(QueryRequest { 172 | // query: "hello".into(), 173 | // with_partition: None, 174 | // }) 175 | // .with_path_param("index_id", "test"); 176 | 177 | // let response = service.handle_request(request).await.unwrap(); 178 | 179 | // assert_eq!(1, response.matches.len()); 180 | // } 181 | 182 | // #[tokio::test] 183 | // async fn query_document_with_json_field() { 184 | // let ctx = setup() 185 | // .with_documents( 186 | // "test", 187 | // vec![json!({ 188 | // "__id": "foobar", 189 | // "title": "hello", 190 | // "props": { 191 | // "foo": "bar" 192 | // } 193 | // })], 194 | // ) 195 | // .await; 196 | 197 | // let service = test_service(&ctx); 198 | 199 | // let request = ServiceRequest::create(QueryRequest { 200 | // query: "props.foo:bar".into(), 201 | // with_partition: None, 202 | // }) 203 | // .with_path_param("index_id", "test"); 204 | 205 | // let response = service.handle_request(request).await.unwrap(); 206 | 207 | // assert_eq!(1, 
response.matches.len()); 208 | // } 209 | // } 210 | -------------------------------------------------------------------------------- /packages/pathery/src/index.rs: -------------------------------------------------------------------------------- 1 | use std::fs; 2 | use std::path::Path; 3 | use std::sync::Arc; 4 | 5 | use tantivy::merge_policy::DefaultMergePolicy; 6 | use tantivy::schema::Field; 7 | use tantivy::{Index, IndexWriter}; 8 | 9 | use crate::directory::PatheryDirectory; 10 | use crate::pagination::SegmentMeta; 11 | use crate::schema::{SchemaLoader, SchemaProvider}; 12 | use crate::service::ServiceError; 13 | use crate::worker::async_delete::client::{AsyncDeleteClient, LambdaAsyncDeleteClient}; 14 | 15 | pub trait IndexLoader: Send + Sync { 16 | fn load_index( 17 | &self, 18 | index_id: &str, 19 | segments: Option>, 20 | ) -> Result; 21 | } 22 | 23 | pub struct LambdaIndexLoader { 24 | schema_loader: SchemaProvider, 25 | 26 | async_delete_client: Arc, 27 | } 28 | 29 | impl LambdaIndexLoader { 30 | pub async fn create() -> Self { 31 | let async_delete_client = LambdaAsyncDeleteClient::create(None).await; 32 | let async_delete_client = Arc::new(async_delete_client); 33 | 34 | Self { 35 | schema_loader: SchemaProvider::lambda(), 36 | async_delete_client, 37 | } 38 | } 39 | } 40 | 41 | impl IndexLoader for LambdaIndexLoader { 42 | fn load_index( 43 | &self, 44 | index_id: &str, 45 | segments: Option>, 46 | ) -> Result { 47 | let directory_path = format!("/mnt/pathery-data/{index_id}"); 48 | 49 | let mut index = if let Ok(existing_dir) = 50 | PatheryDirectory::open(&directory_path, &self.async_delete_client, segments) 51 | { 52 | Index::open(existing_dir).expect("Index should be openable") 53 | } else { 54 | fs::create_dir(&directory_path).expect("Directory should be creatable"); 55 | let schema = self.schema_loader.load_schema(index_id)?; 56 | Index::create_in_dir(Path::new(&directory_path), schema) 57 | .expect("Index should be creatable") 58 | }; 59 | 60 | index 61 | .set_default_multithread_executor() 62 | .expect("default multithread executor should succeed"); 63 | 64 | Ok(index) 65 | } 66 | } 67 | 68 | pub trait IndexExt { 69 | fn default_writer(&self) -> IndexWriter; 70 | 71 | fn id_field(&self) -> Field; 72 | } 73 | 74 | impl IndexExt for Index { 75 | fn default_writer(&self) -> IndexWriter { 76 | let writer = self 77 | .writer(100_000_000) 78 | .expect("Writer should be available"); 79 | 80 | let mut merge_policy = DefaultMergePolicy::default(); 81 | merge_policy.set_max_docs_before_merge(10_000); 82 | 83 | writer.set_merge_policy(Box::new(merge_policy)); 84 | 85 | writer 86 | } 87 | 88 | fn id_field(&self) -> Field { 89 | self.schema() 90 | .get_field("__id") 91 | .expect("__id field should exist") 92 | } 93 | } 94 | 95 | #[cfg(test)] 96 | pub mod test_util { 97 | use std::collections::HashMap; 98 | use std::sync::{Arc, Mutex}; 99 | 100 | use super::*; 101 | 102 | #[derive(Debug)] 103 | pub struct TestIndexLoader { 104 | schema_loader: SchemaProvider, 105 | 106 | table: Arc>>, 107 | } 108 | 109 | impl Clone for TestIndexLoader { 110 | fn clone(&self) -> Self { 111 | Self { 112 | schema_loader: self.schema_loader.clone(), 113 | table: self.table.clone(), 114 | } 115 | } 116 | } 117 | 118 | impl IndexLoader for TestIndexLoader { 119 | fn load_index( 120 | &self, 121 | index_id: &str, 122 | _segments: Option>, 123 | ) -> Result { 124 | let mut table = self.table.lock().unwrap(); 125 | 126 | let entry = (*table).entry(index_id.into()); 127 | 128 | let schema = 
self.schema_loader.load_schema(index_id)?; 129 | 130 | let index = entry.or_insert_with(|| Index::create_in_ram(schema)); 131 | 132 | Ok(index.clone()) 133 | } 134 | } 135 | 136 | impl TestIndexLoader { 137 | pub fn create(schema_loader: SchemaProvider) -> Self { 138 | TestIndexLoader { 139 | schema_loader, 140 | table: Arc::new(Mutex::new(HashMap::new())), 141 | } 142 | } 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /packages/pathery/src/lambda/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod sqs; 2 | 3 | pub use lambda_runtime::Error; 4 | pub use {lambda_runtime, tracing}; 5 | 6 | pub fn init_tracing() { 7 | tracing_subscriber::fmt() 8 | .json() 9 | .with_target(false) 10 | .without_time() 11 | .init(); 12 | } 13 | -------------------------------------------------------------------------------- /packages/pathery/src/lambda/sqs.rs: -------------------------------------------------------------------------------- 1 | use aws_lambda_events::event::sqs; 2 | pub use lambda_runtime::Error; 3 | use lambda_runtime::LambdaEvent; 4 | 5 | pub type SqsEvent = LambdaEvent; 6 | -------------------------------------------------------------------------------- /packages/pathery/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod directory; 2 | pub mod function; 3 | pub mod index; 4 | pub mod lambda; 5 | pub mod pagination; 6 | pub mod schema; 7 | pub mod search_doc; 8 | pub mod serialize; 9 | pub mod service; 10 | pub mod store; 11 | pub mod util; 12 | pub mod worker; 13 | 14 | pub(crate) use serde_json as json; 15 | 16 | #[cfg(test)] 17 | pub mod test_utils { 18 | pub use serde_json as json; 19 | pub use serde_json::json; 20 | 21 | use crate::index::test_util::TestIndexLoader; 22 | use crate::schema::{SchemaLoader, SchemaProvider}; 23 | use crate::search_doc::SearchDoc; 24 | use crate::store::document::test_util::TestDocumentStore; 25 | use crate::store::document::DocumentStore; 26 | use crate::worker::index_writer::client::test_utils::TestIndexWriterClient; 27 | use crate::worker::index_writer::client::IndexWriterClient; 28 | use crate::worker::index_writer::job::Job; 29 | 30 | pub struct TestContext { 31 | schema_loader: SchemaProvider, 32 | 33 | document_store: TestDocumentStore, 34 | 35 | writer_client: TestIndexWriterClient, 36 | 37 | index_loader: TestIndexLoader, 38 | } 39 | 40 | impl TestContext { 41 | pub async fn with_documents(self, index_id: &str, docs: Vec) -> TestContext { 42 | let schema = self.schema_loader.load_schema(index_id).unwrap(); 43 | let documents: Vec<_> = docs 44 | .into_iter() 45 | .map(|value| SearchDoc::from_json(&schema, value).unwrap()) 46 | .collect(); 47 | let doc_refs = self.document_store.save_documents(documents).await.unwrap(); 48 | let mut job = Job::create(index_id); 49 | for doc_ref in doc_refs { 50 | job.index_doc(doc_ref); 51 | } 52 | self.writer_client().submit_job(job).await.unwrap(); 53 | self 54 | } 55 | 56 | pub fn schema_loader(&self) -> &SchemaProvider { 57 | &self.schema_loader 58 | } 59 | 60 | pub fn document_store(&self) -> &TestDocumentStore { 61 | &self.document_store 62 | } 63 | 64 | pub fn writer_client(&self) -> &TestIndexWriterClient { 65 | &self.writer_client 66 | } 67 | 68 | pub fn index_loader(&self) -> &TestIndexLoader { 69 | &self.index_loader 70 | } 71 | } 72 | 73 | pub fn setup() -> TestContext { 74 | let config = json!({ 75 | "indexes": [ 76 | { 77 | "prefix": "test", 78 | 
"fields": [ 79 | { 80 | "name": "title", 81 | "kind": "text", 82 | "flags": ["TEXT"] 83 | }, 84 | { 85 | "name": "author", 86 | "kind": "text", 87 | "flags": ["TEXT"] 88 | }, 89 | { 90 | "name": "isbn", 91 | "kind": "text", 92 | "flags": ["STRING"] 93 | }, 94 | { 95 | "name": "date_added", 96 | "kind": "date", 97 | "flags": ["INDEXED", "FAST"] 98 | }, 99 | { 100 | "name": "meta", 101 | "kind": "text", 102 | "flags": [] 103 | }, 104 | { 105 | "name": "year", 106 | "kind": "i64", 107 | "flags": ["INDEXED"] 108 | }, 109 | { 110 | "name": "props", 111 | "kind": "json", 112 | "flags": ["TEXT"] 113 | } 114 | ] 115 | } 116 | ] 117 | }); 118 | 119 | let schema_loader = SchemaProvider::from_json(config); 120 | 121 | let index_loader = TestIndexLoader::create(schema_loader.clone()); 122 | 123 | let document_store = TestDocumentStore::create(); 124 | 125 | TestContext { 126 | schema_loader, 127 | writer_client: TestIndexWriterClient::create( 128 | index_loader.clone(), 129 | document_store.clone(), 130 | ), 131 | document_store, 132 | index_loader, 133 | } 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /packages/pathery/src/pagination.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | 3 | use base64::Engine; 4 | use serde::{Deserialize, Serialize}; 5 | use serde_json::Value; 6 | 7 | #[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] 8 | pub struct SegmentMeta { 9 | segment_id: String, 10 | 11 | #[serde(flatten)] 12 | extra: HashMap, 13 | } 14 | 15 | #[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] 16 | pub struct PaginationToken { 17 | query: String, 18 | segments: Vec, 19 | partition_state: Vec, 20 | } 21 | 22 | impl PaginationToken { 23 | pub fn new(query: T, total_partitions: usize) -> PaginationToken 24 | where T: Into { 25 | let mut partition_state: Vec = vec![]; 26 | partition_state.resize(total_partitions, 0); 27 | PaginationToken { 28 | query: query.into(), 29 | segments: vec![], 30 | partition_state, 31 | } 32 | } 33 | 34 | pub fn import_segments_json(&mut self, segments_json: Value) { 35 | let segments: Vec = serde_json::from_value(segments_json).unwrap(); 36 | self.segments = segments; 37 | } 38 | 39 | pub fn segments_for_partition(&self, n: usize) -> Vec { 40 | self.segments 41 | .iter() 42 | .enumerate() 43 | .filter(|(idx, _)| (idx + n) % self.partition_state.len() == 0) 44 | .map(|(_, x)| x.clone()) 45 | .collect() 46 | } 47 | 48 | pub fn inc_offset(&mut self, partition_n: usize) { 49 | let value = self.partition_state.get_mut(partition_n).unwrap(); 50 | *value = *value + 1; 51 | } 52 | 53 | pub fn get_offset(&self, partition_n: usize) -> usize { 54 | *self.partition_state.get(partition_n).unwrap() 55 | } 56 | 57 | pub fn get_query(&self) -> String { 58 | self.query.to_string() 59 | } 60 | 61 | pub fn serialize(&self) -> String { 62 | let json = serde_json::to_vec(self).expect("should serialize to json"); 63 | let compressed = zstd::encode_all(json.as_slice(), 20).expect("should encode"); 64 | base64::engine::general_purpose::STANDARD.encode(compressed) 65 | } 66 | 67 | pub fn parse(from: T) -> PaginationToken 68 | where T: Into { 69 | let decoded = base64::engine::general_purpose::STANDARD 70 | .decode(from.into()) 71 | .unwrap(); 72 | let decompressed = zstd::decode_all(decoded.as_slice()).unwrap(); 73 | serde_json::from_slice(&decompressed).unwrap() 74 | } 75 | } 76 | 77 | #[cfg(test)] 78 | mod tests { 79 | use serde_json::json; 80 | 81 | 
use super::PaginationToken; 82 | 83 | #[test] 84 | fn test_round_trip() { 85 | let mut token = PaginationToken::new("foobar", 2); 86 | token.import_segments_json(json!([ 87 | { 88 | "segment_id": "abc123", 89 | "foo": "bar" 90 | } 91 | ])); 92 | 93 | let token_str = token.serialize(); 94 | let parsed = PaginationToken::parse(token_str); 95 | 96 | println!("{:?}", parsed); 97 | 98 | assert_eq!(token, parsed); 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /packages/pathery/src/schema.rs: -------------------------------------------------------------------------------- 1 | use std::fs; 2 | 3 | use serde::{Deserialize, Serialize}; 4 | use serde_json as json; 5 | use tantivy::schema::{self, DocParsingError, Field, NumericOptions, Schema, TextOptions}; 6 | use thiserror::Error; 7 | 8 | use crate::service::ServiceError; 9 | 10 | #[derive(Serialize, Deserialize, Debug, Clone)] 11 | pub enum TextFieldOption { 12 | TEXT, 13 | STRING, 14 | FAST, 15 | } 16 | 17 | #[derive(Serialize, Deserialize, Debug, Clone)] 18 | pub enum NumericFieldOption { 19 | INDEXED, 20 | FAST, 21 | } 22 | 23 | #[derive(Serialize, Deserialize, Debug, Clone)] 24 | pub enum JsonFieldOption { 25 | TEXT, 26 | } 27 | 28 | #[derive(Serialize, Deserialize, Debug, Clone)] 29 | #[serde(tag = "kind")] 30 | pub enum FieldConfig { 31 | #[serde(rename = "text")] 32 | TextFieldConfig { 33 | name: String, 34 | flags: Vec, 35 | }, 36 | #[serde(rename = "date")] 37 | DateFieldConfig { 38 | name: String, 39 | flags: Vec, 40 | }, 41 | #[serde(rename = "i64")] 42 | IntegerFieldConfig { 43 | name: String, 44 | flags: Vec, 45 | }, 46 | #[serde(rename = "json")] 47 | JsonFieldConfig { 48 | name: String, 49 | flags: Vec, 50 | }, 51 | } 52 | 53 | #[derive(Serialize, Deserialize, Debug, Clone)] 54 | pub struct IndexConfig { 55 | prefix: String, 56 | fields: Vec, 57 | } 58 | 59 | #[derive(Serialize, Deserialize, Debug, Clone)] 60 | pub struct PatheryConfig { 61 | indexes: Vec, 62 | } 63 | 64 | pub trait SchemaLoader: Send + Sync { 65 | fn load_schema(&self, index_id: &str) -> Result; 66 | } 67 | 68 | #[derive(Error, Debug)] 69 | pub enum IndexDocError { 70 | #[error("Expected JSON object")] 71 | NotJsonObject, 72 | #[error("Request JSON object is empty")] 73 | EmptyDoc, 74 | #[error("Error parsing JSON object document")] 75 | DocParsingError(DocParsingError), 76 | } 77 | 78 | fn numeric_field_options(flags: &Vec) -> NumericOptions { 79 | flags 80 | .iter() 81 | .fold(NumericOptions::default(), |acc, opt| match opt { 82 | NumericFieldOption::INDEXED => acc | schema::INDEXED, 83 | NumericFieldOption::FAST => acc | schema::FAST, 84 | }) 85 | } 86 | 87 | pub trait SchemaExt { 88 | fn id_field(&self) -> Field; 89 | } 90 | 91 | impl SchemaExt for Schema { 92 | fn id_field(&self) -> Field { 93 | self.get_field("__id") 94 | .expect("__id field should be present") 95 | } 96 | } 97 | 98 | #[derive(Clone, Debug)] 99 | pub struct SchemaProvider { 100 | config: PatheryConfig, 101 | } 102 | 103 | impl SchemaProvider { 104 | pub fn lambda() -> Self { 105 | let config_path = "/opt/pathery/config.json"; 106 | let content = fs::read_to_string(config_path).expect("config should exist"); 107 | let config: PatheryConfig = json::from_str(&content).expect("config should parse"); 108 | 109 | SchemaProvider { config } 110 | } 111 | 112 | pub fn from_json(config: json::Value) -> Self { 113 | let config = json::from_value(config).expect("config should parse"); 114 | Self { config } 115 | } 116 | } 117 | 118 | impl 
SchemaLoader for SchemaProvider { 119 | fn load_schema(&self, index_id: &str) -> Result { 120 | let config = self 121 | .config 122 | .indexes 123 | .iter() 124 | .find(|config| index_id.starts_with(&config.prefix)) 125 | .ok_or_else(|| { 126 | ServiceError::not_found(&format!("Schema for index [{}] not found", index_id)) 127 | })?; 128 | 129 | let mut schema = Schema::builder(); 130 | 131 | for field in &config.fields { 132 | match &field { 133 | FieldConfig::TextFieldConfig { name, flags } => { 134 | let field_opts = 135 | flags 136 | .iter() 137 | .fold(TextOptions::default(), |acc, opt| match opt { 138 | TextFieldOption::TEXT => acc | schema::TEXT, 139 | TextFieldOption::STRING => acc | schema::STRING, 140 | TextFieldOption::FAST => acc | schema::FAST, 141 | }); 142 | schema.add_text_field(name, field_opts); 143 | } 144 | FieldConfig::DateFieldConfig { name, flags } => { 145 | schema.add_date_field(name, numeric_field_options(flags)); 146 | } 147 | FieldConfig::IntegerFieldConfig { name, flags } => { 148 | schema.add_i64_field(name, numeric_field_options(flags)); 149 | } 150 | FieldConfig::JsonFieldConfig { name, flags } => { 151 | let field_opts = 152 | flags 153 | .iter() 154 | .fold(TextOptions::default(), |acc, opt| match opt { 155 | JsonFieldOption::TEXT => acc | schema::TEXT, 156 | }); 157 | schema.add_json_field(name, field_opts); 158 | } 159 | } 160 | } 161 | 162 | // Add system schema fields 163 | 164 | // __id is the document id used for uniqueness 165 | schema.add_text_field("__id", schema::STRING | schema::STORED); 166 | 167 | Ok(schema.build()) 168 | } 169 | } 170 | 171 | #[cfg(test)] 172 | mod tests { 173 | use serde_json::json; 174 | 175 | use super::*; 176 | 177 | #[test] 178 | fn parse_test_config() { 179 | let config = json!({ 180 | "indexes": [{ 181 | "prefix": "book-index-v1-", 182 | "fields": [ 183 | { 184 | "name": "title", 185 | "flags": ["TEXT"], 186 | "kind": "text", 187 | }, 188 | { 189 | "name": "author", 190 | "flags": ["STRING"], 191 | "kind": "text", 192 | }, 193 | { 194 | "name": "date_added", 195 | "flags": ["INDEXED", "FAST"], 196 | "kind": "date", 197 | }, 198 | { 199 | "name": "year", 200 | "flags": ["INDEXED", "FAST"], 201 | "kind": "i64", 202 | }, 203 | { 204 | "name": "meta", 205 | "flags": ["TEXT"], 206 | "kind": "json" 207 | } 208 | ], 209 | }] 210 | }); 211 | 212 | serde_json::from_value::(config).expect("should not throw"); 213 | } 214 | 215 | #[test] 216 | fn serialize_schema() { 217 | let mut schema = Schema::builder(); 218 | 219 | schema.add_text_field("title", schema::STORED | schema::TEXT); 220 | schema.add_text_field("author", schema::STORED | schema::STRING); 221 | schema.add_date_field( 222 | "created_date", 223 | schema::STORED | schema::INDEXED | schema::FAST, 224 | ); 225 | 226 | let schema = schema.build(); 227 | 228 | println!("{}", json::to_string_pretty(&schema).expect("ok")); 229 | } 230 | } 231 | -------------------------------------------------------------------------------- /packages/pathery/src/search_doc.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | use serde_json::{json, Map, Value}; 3 | use tantivy::schema::{DocParsingError, Schema}; 4 | use tantivy::Document; 5 | use thiserror::Error; 6 | 7 | use crate::serialize::compressed_json; 8 | use crate::util; 9 | 10 | #[derive(Debug, Error, PartialEq, Eq)] 11 | pub enum SearchDocError { 12 | #[error("json value is not an object")] 13 | NotAnObject, 14 | 15 | #[error("invalid type for __id, 
expected string")] 16 | InvalidIdType, 17 | 18 | #[error("{0}")] 19 | SchemaValidationError(String), 20 | 21 | #[error("cannot index empty document")] 22 | EmptyDocument, 23 | } 24 | 25 | impl From for SearchDocError { 26 | fn from(err: DocParsingError) -> Self { 27 | SearchDocError::SchemaValidationError(err.to_string()) 28 | } 29 | } 30 | 31 | #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] 32 | pub struct DDBKey { 33 | pub pk: String, 34 | pub sk: String, 35 | } 36 | 37 | impl From for DDBKey { 38 | fn from(id: SearchDocId) -> Self { 39 | DDBKey { 40 | pk: format!("document|{}", id.0), 41 | sk: format!("document|{}", id.0), 42 | } 43 | } 44 | } 45 | 46 | #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)] 47 | #[serde(transparent)] 48 | pub struct SearchDocId(String); 49 | 50 | impl From for SearchDocId { 51 | fn from(key: DDBKey) -> Self { 52 | let doc_id = key 53 | .pk 54 | .split("|") 55 | .nth(1) 56 | .expect("key should be formatted correctly"); 57 | 58 | Self(doc_id.into()) 59 | } 60 | } 61 | 62 | impl SearchDocId { 63 | pub fn parse(id: &str) -> SearchDocId { 64 | SearchDocId(id.into()) 65 | } 66 | 67 | pub fn id(&self) -> &str { 68 | &self.0 69 | } 70 | } 71 | 72 | #[derive(Debug, Serialize, Deserialize, Clone)] 73 | pub struct SearchDoc { 74 | id: SearchDocId, 75 | #[serde(with = "compressed_json")] 76 | content: Map, 77 | } 78 | 79 | impl SearchDoc { 80 | /// Converts a JSON value into a SearchDoc if the document is valid according to the schema. 81 | /// Also generate an `__id` if no `__id` is present. 82 | pub fn from_json(schema: &Schema, json_value: Value) -> Result { 83 | let mut json_object = match json_value { 84 | Value::Object(obj) => obj, 85 | _ => return Err(SearchDocError::NotAnObject), 86 | }; 87 | 88 | let id = json_object 89 | .entry("__id") 90 | .or_insert_with(|| json!(util::generate_id())) 91 | .as_str() 92 | .ok_or_else(|| SearchDocError::InvalidIdType)? 93 | .to_string(); 94 | 95 | // Validate the document against the provided schema. 
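// json_object_to_doc returns a DocParsingError on any type mismatch, which the
// From<DocParsingError> impl above converts into SearchDocError::SchemaValidationError.
// Fields that are not declared in the schema are dropped silently rather than rejected,
// which is why the field_values() length check below treats a document whose only
// surviving value is the generated __id as empty.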
96 | let document = schema.json_object_to_doc(json_object.clone())?; 97 | 98 | if document.field_values().len() <= 1 { 99 | return Err(SearchDocError::EmptyDocument); 100 | } 101 | 102 | Ok(SearchDoc { 103 | id: SearchDocId(id), 104 | content: json_object, 105 | }) 106 | } 107 | 108 | pub fn id(&self) -> &SearchDocId { 109 | &self.id 110 | } 111 | 112 | pub fn document(&self, schema: &Schema) -> Document { 113 | schema 114 | .json_object_to_doc(self.content.clone()) 115 | .expect("should succeed since from_json validates") 116 | } 117 | } 118 | 119 | #[cfg(test)] 120 | mod tests { 121 | use tantivy::schema; 122 | 123 | use super::*; 124 | 125 | fn setup() -> Schema { 126 | let mut schema = Schema::builder(); 127 | schema.add_text_field("__id", schema::STRING); 128 | schema.add_text_field("name", schema::STRING); 129 | schema.build() 130 | } 131 | 132 | #[test] 133 | fn from_json_generates_id() { 134 | let schema = setup(); 135 | let value = json!({ 136 | "name": "world" 137 | }); 138 | 139 | let search_doc = SearchDoc::from_json(&schema, value).unwrap(); 140 | 141 | assert!(search_doc.id.0.len() > 0); 142 | } 143 | 144 | #[test] 145 | fn from_json_uses_id_when_exists() { 146 | let schema = setup(); 147 | let id = util::generate_id(); 148 | let value = json!({ "__id": id, "name": "world" }); 149 | 150 | let search_doc = SearchDoc::from_json(&schema, value).unwrap(); 151 | 152 | assert_eq!(id, search_doc.id.0); 153 | } 154 | 155 | #[test] 156 | fn from_json_returns_validation_error_when_schema_does_not_match() { 157 | let schema = setup(); 158 | let value = json!({ "name": 1234 }); 159 | 160 | let search_doc = SearchDoc::from_json(&schema, value).unwrap_err(); 161 | 162 | assert_eq!( 163 | SearchDocError::SchemaValidationError( 164 | "The field '\"name\"' could not be parsed: TypeError { expected: \"a string\", \ 165 | json: Number(1234) }" 166 | .into() 167 | ), 168 | search_doc, 169 | ); 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /packages/pathery/src/serialize/compressed_json.rs: -------------------------------------------------------------------------------- 1 | use serde::de::Visitor; 2 | use serde::{Deserializer, Serializer}; 3 | use serde_json::{Map, Value}; 4 | 5 | pub fn serialize(input: &Map, serializer: S) -> Result 6 | where S: Serializer { 7 | let json_bytes = serde_json::to_vec(input).unwrap(); 8 | let encoded_bytes = zstd::encode_all(json_bytes.as_slice(), 0).unwrap(); 9 | serializer.serialize_bytes(&encoded_bytes) 10 | } 11 | 12 | struct CompressedJsonVisitor; 13 | 14 | impl<'de> Visitor<'de> for CompressedJsonVisitor { 15 | type Value = Map; 16 | 17 | fn expecting(&self, _formatter: &mut std::fmt::Formatter) -> std::fmt::Result { 18 | todo!() 19 | } 20 | 21 | fn visit_bytes(self, v: &[u8]) -> Result 22 | where E: serde::de::Error { 23 | let decoded_bytes = zstd::decode_all(v).unwrap(); 24 | let deserialized = serde_json::from_slice(&decoded_bytes).unwrap(); 25 | Ok(deserialized) 26 | } 27 | } 28 | 29 | pub fn deserialize<'de, D>(deserializer: D) -> Result, D::Error> 30 | where D: Deserializer<'de> { 31 | deserializer.deserialize_bytes(CompressedJsonVisitor) 32 | } 33 | 34 | #[cfg(test)] 35 | mod tests { 36 | use std::collections::HashMap; 37 | 38 | use serde::{Deserialize, Serialize}; 39 | use serde_dynamo::{self, AttributeValue}; 40 | use serde_json::{json, Map, Value}; 41 | 42 | #[derive(Serialize, Deserialize, Debug, PartialEq, Clone)] 43 | struct MyType { 44 | #[serde(with = "super")] 45 | inner: Map, 46 | } 47 
| 48 | #[test] 49 | fn test_round_trip() { 50 | let init = MyType { 51 | inner: json!({ 52 | "hello": "world" 53 | }) 54 | .as_object() 55 | .unwrap() 56 | .to_owned(), 57 | }; 58 | 59 | let serialized: HashMap = 60 | serde_dynamo::to_item(init.clone()).unwrap(); 61 | let deserialized: MyType = serde_dynamo::from_item(serialized).unwrap(); 62 | 63 | assert_eq!(init, deserialized); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /packages/pathery/src/serialize/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod compressed_json; 2 | -------------------------------------------------------------------------------- /packages/pathery/src/service/doc.rs: -------------------------------------------------------------------------------- 1 | use async_trait::async_trait; 2 | use serde::{Deserialize, Serialize}; 3 | use serde_json as json; 4 | 5 | use super::{ServiceHandler, ServiceRequest, ServiceResponse}; 6 | use crate::search_doc::SearchDocId; 7 | use crate::worker::index_writer::client::{IndexWriterClient, LambdaIndexWriterClient}; 8 | use crate::worker::index_writer::job::Job; 9 | 10 | #[derive(Serialize, Deserialize, Debug)] 11 | pub struct PathParams { 12 | index_id: String, 13 | doc_id: String, 14 | } 15 | 16 | #[derive(Serialize)] 17 | pub struct DeleteDocResponse { 18 | pub job_id: String, 19 | } 20 | 21 | pub struct DeleteDocService { 22 | client: Box, 23 | } 24 | 25 | #[async_trait] 26 | impl ServiceHandler for DeleteDocService { 27 | async fn handle_request( 28 | &self, 29 | request: ServiceRequest, 30 | ) -> ServiceResponse { 31 | let index_id = request.path_param("index_id")?; 32 | let doc_id = request.path_param("doc_id")?; 33 | 34 | let mut job = Job::create(&index_id); 35 | 36 | job.delete_doc(SearchDocId::parse(&doc_id)); 37 | 38 | let job_id = self.client.submit_job(job).await?; 39 | 40 | Ok(DeleteDocResponse { job_id }) 41 | } 42 | } 43 | 44 | impl DeleteDocService { 45 | pub async fn create() -> Self { 46 | let client = LambdaIndexWriterClient::create(None).await; 47 | 48 | DeleteDocService { 49 | client: Box::new(client), 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /packages/pathery/src/service/index/batch_index.rs: -------------------------------------------------------------------------------- 1 | use async_trait::async_trait; 2 | use serde::Serialize; 3 | 4 | use crate::json; 5 | use crate::schema::{SchemaLoader, SchemaProvider}; 6 | use crate::search_doc::SearchDoc; 7 | use crate::service::{ServiceError, ServiceHandler, ServiceRequest, ServiceResponse}; 8 | use crate::store::document::{DDBDocumentStore, DocumentStore}; 9 | use crate::worker::index_writer::client::{IndexWriterClient, LambdaIndexWriterClient}; 10 | use crate::worker::index_writer::job::Job; 11 | 12 | #[derive(Serialize)] 13 | pub struct BatchIndexResponse { 14 | pub job_id: String, 15 | } 16 | 17 | pub struct BatchIndexService { 18 | schema_loader: Box, 19 | 20 | document_store: Box, 21 | 22 | index_writer: Box, 23 | } 24 | 25 | #[async_trait] 26 | impl ServiceHandler, BatchIndexResponse> for BatchIndexService { 27 | async fn handle_request( 28 | &self, 29 | request: ServiceRequest>, 30 | ) -> ServiceResponse { 31 | let body = request.body()?; 32 | 33 | let index_id = request.path_param("index_id")?; 34 | 35 | let schema = self.schema_loader.load_schema(&index_id)?; 36 | 37 | let mut job = Job::create(&index_id); 38 | 39 | let documents = body 40 | 
.into_iter() 41 | .map(|value| SearchDoc::from_json(&schema, value)) 42 | .collect::>(); 43 | 44 | let error = documents 45 | .iter() 46 | .enumerate() 47 | .filter_map(|(idx, result)| result.as_ref().err().map(|err| (idx, err))) 48 | .collect::>(); 49 | 50 | if let Some((idx, error)) = error.first() { 51 | return Err(ServiceError::invalid_request(&format!( 52 | "Error parsing document (path: [{}]): {}", 53 | idx, 54 | error.to_string() 55 | ))); 56 | } 57 | 58 | let documents = documents 59 | .into_iter() 60 | .filter_map(Result::ok) 61 | .collect::>(); 62 | 63 | let doc_refs = self.document_store.save_documents(documents).await?; 64 | 65 | for doc_ref in doc_refs { 66 | job.index_doc(doc_ref) 67 | } 68 | 69 | let job_id = self.index_writer.submit_job(job).await?; 70 | 71 | Ok(BatchIndexResponse { job_id }) 72 | } 73 | } 74 | 75 | impl BatchIndexService { 76 | pub async fn create() -> Self { 77 | let document_store = DDBDocumentStore::create(None).await; 78 | let writer_client = LambdaIndexWriterClient::create(None).await; 79 | let schema_loader = SchemaProvider::lambda(); 80 | 81 | BatchIndexService { 82 | document_store: Box::new(document_store), 83 | index_writer: Box::new(writer_client), 84 | schema_loader: Box::new(schema_loader), 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /packages/pathery/src/service/index/mod.rs: -------------------------------------------------------------------------------- 1 | mod batch_index; 2 | mod post_index; 3 | mod query_index; 4 | mod stats_index; 5 | 6 | pub use batch_index::BatchIndexService; 7 | pub use post_index::PostIndexService; 8 | pub use query_index::QueryIndexService; 9 | pub use stats_index::StatsIndexService; 10 | -------------------------------------------------------------------------------- /packages/pathery/src/service/index/post_index.rs: -------------------------------------------------------------------------------- 1 | use async_trait::async_trait; 2 | use serde::Serialize; 3 | 4 | use crate::schema::{SchemaLoader, SchemaProvider}; 5 | use crate::search_doc::SearchDoc; 6 | use crate::service::{ServiceError, ServiceHandler, ServiceRequest, ServiceResponse}; 7 | use crate::store::document::{DDBDocumentStore, DocumentStore}; 8 | use crate::worker::index_writer::client::{IndexWriterClient, LambdaIndexWriterClient}; 9 | use crate::worker::index_writer::job::Job; 10 | use crate::{json, util}; 11 | 12 | #[derive(Serialize, Debug)] 13 | pub struct PostIndexResponse { 14 | pub job_id: String, 15 | pub updated_at: String, 16 | } 17 | 18 | pub struct PostIndexService { 19 | schema_loader: Box, 20 | 21 | document_store: Box, 22 | 23 | writer_client: Box, 24 | } 25 | 26 | #[async_trait] 27 | impl ServiceHandler for PostIndexService { 28 | async fn handle_request( 29 | &self, 30 | request: ServiceRequest, 31 | ) -> ServiceResponse { 32 | let body = request.body()?; 33 | 34 | let index_id = request.path_param("index_id")?; 35 | 36 | let schema = self.schema_loader.load_schema(&index_id)?; 37 | 38 | let document = SearchDoc::from_json(&schema, body) 39 | .map_err(|err| ServiceError::invalid_request(&err.to_string()))?; 40 | 41 | let doc_refs = self.document_store.save_documents(vec![document]).await?; 42 | 43 | let mut job = Job::create(&index_id); 44 | 45 | for doc_ref in doc_refs { 46 | job.index_doc(doc_ref); 47 | } 48 | 49 | let job_id = self.writer_client.submit_job(job).await?; 50 | 51 | Ok(PostIndexResponse { 52 | job_id, 53 | updated_at: util::timestamp(), 54 | }) 55 | } 56 | } 
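A note for callers of this endpoint: __id is optional in the request body. SearchDoc::from_json (see search_doc.rs above) generates an id when none is supplied and keeps the caller's value when one is present, and __id is the uniqueness key the schema builder adds to every index, so clients that need stable document identities should send their own. A small sketch of the two accepted request shapes; the field values are invented:

use serde_json::json;

fn main() {
    // The service generates __id (via util::generate_id()) before indexing:
    let create = json!({
        "title": "Zen and the Art of Motorcycle Maintenance",
        "author": "Robert Pirsig"
    });

    // The caller supplies __id, the uniqueness key declared in schema.rs:
    let with_id = json!({
        "__id": "book-42", // hypothetical id
        "title": "Zen and the Art of Motorcycle Maintenance",
        "author": "Robert Pirsig"
    });

    println!("{create}\n{with_id}");
}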
57 | 58 | impl PostIndexService { 59 | pub async fn create() -> Self { 60 | let document_store = DDBDocumentStore::create(None).await; 61 | let writer_client = LambdaIndexWriterClient::create(None).await; 62 | let schema_loader = SchemaProvider::lambda(); 63 | 64 | PostIndexService { 65 | document_store: Box::new(document_store), 66 | writer_client: Box::new(writer_client), 67 | schema_loader: Box::new(schema_loader), 68 | } 69 | } 70 | } 71 | 72 | #[cfg(test)] 73 | mod tests { 74 | use super::*; 75 | use crate::test_utils::*; 76 | 77 | pub fn test_service() -> PostIndexService { 78 | let ctx = setup(); 79 | 80 | let schema_loader = Box::new(ctx.schema_loader().clone()); 81 | let document_store = Box::new(ctx.document_store().clone()); 82 | let writer_client = Box::new(ctx.writer_client().clone()); 83 | 84 | PostIndexService { 85 | schema_loader, 86 | document_store, 87 | writer_client, 88 | } 89 | } 90 | 91 | #[tokio::test] 92 | async fn post_index_doc_with_no_id() { 93 | let service = test_service(); 94 | 95 | let doc = json::json!({ 96 | "title": "Zen and the Art of Motorcycle Maintenance", 97 | "author": "Robert Pirsig", 98 | "date_added": "2022-11-23T18:24:40Z", 99 | "isbn": "0060589469" 100 | }); 101 | 102 | let request = ServiceRequest::create(doc).with_path_param("index_id", "test"); 103 | 104 | service.handle_request(request).await.unwrap(); 105 | } 106 | 107 | #[tokio::test] 108 | async fn post_index_non_object() { 109 | let service = test_service(); 110 | 111 | let doc = json::json!([]); 112 | 113 | let request = ServiceRequest::create(doc).with_path_param("index_id", "test"); 114 | 115 | let response = service.handle_request(request).await.unwrap_err(); 116 | 117 | assert_eq!(400, response.status()); 118 | assert_eq!("json value is not an object", response.message()); 119 | } 120 | 121 | #[tokio::test] 122 | async fn post_index_value_that_does_not_match_schema() { 123 | let service = test_service(); 124 | 125 | let doc = json::json!({"title": 1}); 126 | 127 | let request = ServiceRequest::create(doc).with_path_param("index_id", "test"); 128 | 129 | let response = service.handle_request(request).await.unwrap_err(); 130 | 131 | assert_eq!(400, response.status()); 132 | assert_eq!( 133 | "The field '\"title\"' could not be parsed: TypeError { expected: \"a string\", json: \ 134 | Number(1) }", 135 | response.message() 136 | ); 137 | } 138 | 139 | #[tokio::test] 140 | async fn post_index_field_that_does_not_exist() { 141 | let service = test_service(); 142 | 143 | let doc = json::json!({ 144 | "foobar": "baz", 145 | }); 146 | 147 | let request = ServiceRequest::create(doc).with_path_param("index_id", "test"); 148 | 149 | let response = service.handle_request(request).await.unwrap_err(); 150 | 151 | // Empty because the non-existent field does not explicitly trigger a failure - it just 152 | // doesn't get indexed. 
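// SearchDoc::from_json is therefore left with only the generated __id, returns
// SearchDocError::EmptyDocument, and the service surfaces that as the 400 below.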
153 | assert_eq!(400, response.status()); 154 | assert_eq!("cannot index empty document", response.message()); 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /packages/pathery/src/service/index/query_index.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::sync::Arc; 3 | 4 | use async_trait::async_trait; 5 | use serde::{Deserialize, Serialize}; 6 | use tantivy::query::QueryParser; 7 | use tantivy::schema::{Field, FieldType}; 8 | use tantivy::{Index, SnippetGenerator, TantivyError}; 9 | use tracing::info; 10 | 11 | use crate::function::query_index_partition::client::LambdaQueryIndexPartitionClient; 12 | use crate::function::query_index_partition::PartitionSearchHit; 13 | use crate::index::{IndexExt, IndexLoader, LambdaIndexLoader}; 14 | use crate::json; 15 | use crate::pagination::PaginationToken; 16 | use crate::service::{ServiceError, ServiceHandler, ServiceRequest, ServiceResponse}; 17 | use crate::store::document::{DDBDocumentStore, DocumentStore}; 18 | 19 | #[derive(Serialize, Deserialize, Debug)] 20 | pub struct QueryRequest { 21 | pub query: String, 22 | pub pagination_token: Option, 23 | } 24 | 25 | #[derive(Serialize, Deserialize, Debug, PartialEq)] 26 | pub struct SearchHit { 27 | pub doc: json::Value, 28 | pub snippets: json::Value, 29 | pub score: f32, 30 | } 31 | 32 | #[derive(Serialize, Deserialize, Debug, PartialEq)] 33 | pub struct QueryResponse { 34 | pub matches: Vec, 35 | pub pagination_token: Option, 36 | } 37 | 38 | pub struct QueryIndexService { 39 | index_loader: Box, 40 | 41 | document_store: Box, 42 | 43 | query_index_paritition_client: Arc, 44 | } 45 | 46 | #[async_trait] 47 | impl ServiceHandler for QueryIndexService { 48 | async fn handle_request( 49 | &self, 50 | request: ServiceRequest, 51 | ) -> ServiceResponse { 52 | let body = request.body()?; 53 | 54 | let index_id = request.path_param("index_id")?; 55 | 56 | let index = self.index_loader.load_index(&index_id, None)?; 57 | 58 | let metas = index.load_metas().unwrap(); 59 | let num_docs: u32 = metas.segments.iter().map(|seg| seg.num_docs()).sum(); 60 | info!("Doc count: {}", num_docs); 61 | 62 | let total_partitions = (num_docs / 60_000) + 1; 63 | info!("Total partitions: {}", total_partitions); 64 | 65 | let mut pagination_token = match body.pagination_token { 66 | Some(token) => PaginationToken::parse(token), 67 | None => { 68 | let mut pagination_token = 69 | PaginationToken::new(&body.query, total_partitions as usize); 70 | let metas = index.load_metas().unwrap(); 71 | let segments = metas.segments; 72 | let segments_json = serde_json::to_value(segments).unwrap(); 73 | pagination_token.import_segments_json(segments_json); 74 | pagination_token 75 | } 76 | }; 77 | 78 | let requests: Vec<_> = (0..total_partitions) 79 | .map(|partition_n| { 80 | let query_client = Arc::clone(&self.query_index_paritition_client); 81 | let index_id = index_id.clone(); 82 | let ro_token = pagination_token.clone(); 83 | 84 | tokio::spawn(async move { 85 | query_client 86 | .query_partition( 87 | index_id.clone(), 88 | ro_token.get_query(), 89 | ro_token.get_offset(partition_n as usize), 90 | partition_n as usize, 91 | ro_token.segments_for_partition(partition_n as usize), 92 | ) 93 | .await 94 | }) 95 | }) 96 | .collect(); 97 | 98 | let mut matches: Vec = Vec::new(); 99 | 100 | for request in requests { 101 | let mut response = request.await.unwrap(); 102 | let response = 
response.matches.as_mut(); 103 | matches.append(response); 104 | } 105 | 106 | matches.sort_by(|a, b| b.score.total_cmp(&a.score)); 107 | matches.truncate(10); 108 | 109 | for match_one in &matches { 110 | pagination_token.inc_offset(match_one.partition_n) 111 | } 112 | 113 | println!("{}", serde_json::to_string(&pagination_token).unwrap()); 114 | 115 | if matches.len() == 0 { 116 | return Ok(QueryResponse { 117 | matches: vec![], 118 | pagination_token: None, 119 | }); 120 | } 121 | 122 | let retrieved_matches = self 123 | .document_store 124 | .get_documents( 125 | matches 126 | .iter() 127 | .map(|one_match| one_match.doc_ref.clone()) 128 | .collect(), 129 | ) 130 | .await 131 | .unwrap(); 132 | 133 | let snippet_index = Index::create_in_ram(index.schema()); 134 | let mut snippet_writer = snippet_index.default_writer(); 135 | let snippet_reader = snippet_index.reader().unwrap(); 136 | let snippet_schema = snippet_index.schema(); 137 | 138 | let query_parser = QueryParser::for_index( 139 | &snippet_index, 140 | snippet_schema 141 | .fields() 142 | .filter_map(|(field, entry)| { 143 | if !entry.is_indexed() { 144 | return None; 145 | } 146 | match entry.field_type() { 147 | FieldType::Str(_) => Some(field), 148 | _ => None, 149 | } 150 | }) 151 | .collect::>(), 152 | ); 153 | 154 | let query = query_parser 155 | .parse_query(&body.query) 156 | .map_err(|err| ServiceError::invalid_request(&err.to_string()))?; 157 | 158 | let matches = retrieved_matches 159 | .iter() 160 | .zip(matches) 161 | .map(|(search_doc, one_match)| { 162 | let document = search_doc.document(&snippet_schema); 163 | let named_doc = snippet_schema.to_named_doc(&document); 164 | snippet_writer.add_document(document.clone()).unwrap(); 165 | snippet_writer.commit().unwrap(); 166 | snippet_reader.reload().unwrap(); 167 | let snippet_searcher = snippet_reader.searcher(); 168 | 169 | let snippets: HashMap = document 170 | .field_values() 171 | .iter() 172 | .filter_map(|field_value| { 173 | // Only text fields are supported for snippets 174 | let text = field_value.value().as_text()?; 175 | 176 | let generator = match SnippetGenerator::create( 177 | &snippet_searcher, 178 | &query, 179 | field_value.field(), 180 | ) { 181 | Ok(generator) => Some(generator), 182 | // InvalidArgument is returned when field is not indexed 183 | Err(TantivyError::InvalidArgument(_)) => None, 184 | Err(err) => panic!("{}", err.to_string()), 185 | }?; 186 | 187 | let snippet = generator.snippet(text).to_html(); 188 | 189 | if snippet.is_empty() { 190 | None 191 | } else { 192 | Some(( 193 | snippet_schema.get_field_name(field_value.field()).into(), 194 | snippet, 195 | )) 196 | } 197 | }) 198 | .collect(); 199 | 200 | SearchHit { 201 | score: one_match.score, 202 | doc: json::to_value(named_doc).expect("named doc should serialize"), 203 | snippets: json::to_value(snippets).expect("snippets should serialize"), 204 | } 205 | }) 206 | .collect(); 207 | 208 | Ok(QueryResponse { 209 | matches, 210 | pagination_token: Some(pagination_token.serialize()), 211 | }) 212 | } 213 | } 214 | 215 | impl QueryIndexService { 216 | pub async fn create() -> QueryIndexService { 217 | let document_store = DDBDocumentStore::create(None).await; 218 | let index_loader = LambdaIndexLoader::create(); 219 | 220 | QueryIndexService { 221 | document_store: Box::new(document_store), 222 | index_loader: Box::new(index_loader.await), 223 | query_index_paritition_client: Arc::new( 224 | LambdaQueryIndexPartitionClient::create().await, 225 | ), 226 | } 227 | } 228 | } 229 | 230 | 
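To make the paging mechanics above concrete, here is a small sketch of how the opaque pagination_token evolves between requests, using only the PaginationToken API from pagination.rs. The query string, partition count, and hit distribution are invented, and the first-page call to import_segments_json is omitted:

use pathery::pagination::PaginationToken;

fn main() {
    // First page: one offset slot per partition (total_partitions = num_docs / 60_000 + 1).
    let mut token = PaginationToken::new("title:hello", 2);

    // Suppose page one returned 7 hits from partition 0 and 3 from partition 1;
    // handle_request bumps the matching slot once per returned match.
    for _ in 0..7 {
        token.inc_offset(0);
    }
    for _ in 0..3 {
        token.inc_offset(1);
    }
    let opaque = token.serialize(); // JSON -> zstd -> base64, handed back to the caller

    // Follow-up request: the caller echoes the token and each partition query
    // resumes from its own offset.
    let token = PaginationToken::parse(opaque);
    assert_eq!(7, token.get_offset(0));
    assert_eq!(3, token.get_offset(1));
}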
// #[cfg(test)] 231 | // mod tests { 232 | // use super::*; 233 | // use crate::test_utils::*; 234 | 235 | // fn test_service(ctx: &TestContext) -> QueryIndexService { 236 | // QueryIndexService { 237 | // document_store: Box::new(ctx.document_store().clone()), 238 | // index_loader: Box::new(ctx.index_loader().clone()), 239 | // } 240 | // } 241 | 242 | // #[tokio::test] 243 | // async fn query_default_response() { 244 | // let ctx = setup() 245 | // .with_documents( 246 | // "test", 247 | // vec![json!({ 248 | // "__id": "foobar", 249 | // "title": "hello", 250 | // "author": "world" 251 | // })], 252 | // ) 253 | // .await; 254 | 255 | // let service = test_service(&ctx); 256 | 257 | // let request = ServiceRequest::create(QueryRequest { 258 | // query: "hello".into(), 259 | // with_partition: None, 260 | // }) 261 | // .with_path_param("index_id", "test"); 262 | 263 | // let response = service.handle_request(request).await.unwrap(); 264 | 265 | // assert_eq!( 266 | // QueryResponse { 267 | // matches: vec![SearchHit { 268 | // doc: json::json!({ 269 | // "__id": ["foobar"], 270 | // "title": ["hello"], 271 | // "author": ["world"], 272 | // }), 273 | // score: 0.28768212, 274 | // snippets: json::json!({ 275 | // "title": "hello" 276 | // }) 277 | // }] 278 | // }, 279 | // response 280 | // ); 281 | // } 282 | 283 | // #[tokio::test] 284 | // async fn query_document_with_un_indexed_fields() { 285 | // let ctx = setup() 286 | // .with_documents( 287 | // "test", 288 | // vec![json!({ 289 | // "__id": "foobar", 290 | // "title": "hello", 291 | // "meta": "world" 292 | // })], 293 | // ) 294 | // .await; 295 | 296 | // let service = test_service(&ctx); 297 | 298 | // let request = ServiceRequest::create(QueryRequest { 299 | // query: "hello".into(), 300 | // with_partition: None, 301 | // }) 302 | // .with_path_param("index_id", "test"); 303 | 304 | // let response = service.handle_request(request).await.unwrap(); 305 | 306 | // assert_eq!(1, response.matches.len()); 307 | // } 308 | 309 | // #[tokio::test] 310 | // async fn query_document_with_json_field() { 311 | // let ctx = setup() 312 | // .with_documents( 313 | // "test", 314 | // vec![json!({ 315 | // "__id": "foobar", 316 | // "title": "hello", 317 | // "props": { 318 | // "foo": "bar" 319 | // } 320 | // })], 321 | // ) 322 | // .await; 323 | 324 | // let service = test_service(&ctx); 325 | 326 | // let request = ServiceRequest::create(QueryRequest { 327 | // query: "props.foo:bar".into(), 328 | // with_partition: None, 329 | // }) 330 | // .with_path_param("index_id", "test"); 331 | 332 | // let response = service.handle_request(request).await.unwrap(); 333 | 334 | // assert_eq!(1, response.matches.len()); 335 | // } 336 | // } 337 | -------------------------------------------------------------------------------- /packages/pathery/src/service/index/stats_index.rs: -------------------------------------------------------------------------------- 1 | use std::fs; 2 | 3 | use async_trait::async_trait; 4 | use serde::{Deserialize, Serialize}; 5 | use serde_json as json; 6 | 7 | use crate::index::{IndexLoader, LambdaIndexLoader}; 8 | use crate::service::{ServiceHandler, ServiceRequest, ServiceResponse}; 9 | 10 | #[derive(Serialize, Deserialize)] 11 | pub struct SegmentStats { 12 | id: String, 13 | num_docs: u32, 14 | num_deleted: u32, 15 | index_size: f64, 16 | } 17 | 18 | #[derive(Serialize, Deserialize)] 19 | pub struct IndexStatsResponse { 20 | segments: Vec, 21 | } 22 | 23 | pub struct StatsIndexService { 24 | index_loader: Box, 
25 | }
26 |
27 | #[async_trait]
28 | impl ServiceHandler<json::Value, IndexStatsResponse> for StatsIndexService {
29 |     async fn handle_request(
30 |         &self,
31 |         request: ServiceRequest<json::Value>,
32 |     ) -> ServiceResponse<IndexStatsResponse> {
33 |         let index_id = request.path_param("index_id")?;
34 |
35 |         let index = self.index_loader.load_index(&index_id, None)?;
36 |
37 |         let metas = index.load_metas().unwrap();
38 |
39 |         let segment_files = fs::read_dir(format!("/mnt/pathery-data/{index_id}"))
40 |             .unwrap()
41 |             .filter_map(|entry| entry.ok())
42 |             .collect::<Vec<_>>();
43 |
44 |         let segments = metas
45 |             .segments
46 |             .iter()
47 |             .map(|s| {
48 |                 let segment_id = s.id().uuid_string();
49 |
50 |                 let index_size_bytes: u64 = segment_files
51 |                     .iter()
52 |                     .filter_map(|entry| {
53 |                         let filename = entry.file_name();
54 |                         let filename = filename.to_str()?;
55 |
56 |                         filename
57 |                             .starts_with(&segment_id)
58 |                             .then(|| entry.metadata())
59 |                             .and_then(Result::ok)
60 |                             .map(|m| m.len())
61 |                     })
62 |                     .sum();
63 |
64 |                 let index_size_mb: f64 = index_size_bytes as f64 / 1_000_000f64;
65 |
66 |                 SegmentStats {
67 |                     id: s.id().uuid_string(),
68 |                     num_docs: s.num_docs(),
69 |                     num_deleted: s.num_deleted_docs(),
70 |                     index_size: index_size_mb,
71 |                 }
72 |             })
73 |             .collect();
74 |
75 |         Ok(IndexStatsResponse { segments })
76 |     }
77 | }
78 |
79 | impl StatsIndexService {
80 |     pub async fn create() -> Self {
81 |         let index_loader = LambdaIndexLoader::create();
82 |
83 |         StatsIndexService {
84 |             index_loader: Box::new(index_loader.await),
85 |         }
86 |     }
87 | }
88 |
--------------------------------------------------------------------------------
/packages/pathery/src/service/mod.rs:
--------------------------------------------------------------------------------
1 | use std::collections::HashMap;
2 | use std::error::Error;
3 | use std::marker::PhantomData;
4 |
5 | use async_trait::async_trait;
6 | use http::Response;
7 | use lambda_http::{Body, RequestExt};
8 | use serde::{Deserialize, Serialize};
9 | use tracing::error;
10 |
11 | use crate::util;
12 |
13 | pub mod doc;
14 | pub mod index;
15 |
16 | #[derive(thiserror::Error, Debug)]
17 | pub enum ServiceError {
18 |     #[error("{0}")]
19 |     InvalidRequest(String),
20 |
21 |     #[error("Internal service error")]
22 |     InternalError { id: String, source: anyhow::Error },
23 |
24 |     #[error("Rate limit hit, back off and try request again.")]
25 |     RateLimit,
26 |
27 |     #[error("{0}")]
28 |     NotFound(String),
29 | }
30 |
31 | impl ServiceError {
32 |     pub fn invalid_request(message: &str) -> Self {
33 |         ServiceError::InvalidRequest(message.into())
34 |     }
35 |
36 |     pub fn internal_error<E>(source: E) -> Self
37 |     where E: Error + Send + Sync + 'static {
38 |         let id = util::generate_id();
39 |         error!(
40 |             message = "InternalServiceError",
41 |             id,
42 |             error = format!("{source:#?}")
43 |         );
44 |         ServiceError::InternalError {
45 |             id,
46 |             source: anyhow::Error::new(source),
47 |         }
48 |     }
49 |
50 |     pub fn not_found(message: &str) -> Self {
51 |         ServiceError::NotFound(message.into())
52 |     }
53 |
54 |     pub fn rate_limit() -> Self {
55 |         ServiceError::RateLimit
56 |     }
57 |
58 |     pub fn status(&self) -> u16 {
59 |         use ServiceError::*;
60 |         match self {
61 |             InvalidRequest(_) => 400,
62 |             InternalError { .. } => 500,
63 |             RateLimit => 429,
64 |             NotFound(_) => 404,
65 |         }
66 |     }
67 |
68 |     pub fn message(self) -> String {
69 |         use ServiceError::*;
70 |         match self {
71 |             InternalError { id, .. } => format!("Internal server error [id = {}]", id),
} => format!("Internal server error [id = {}]", id), 72 | InvalidRequest(message) => message, 73 | RateLimit => String::from("Too many requests"), 74 | NotFound(message) => message, 75 | } 76 | } 77 | } 78 | 79 | type ServiceResponse = Result; 80 | 81 | pub struct ServiceRequest { 82 | inner: lambda_http::Request, 83 | body: PhantomData, 84 | } 85 | 86 | impl ServiceRequest 87 | where B: for<'de> Deserialize<'de> 88 | { 89 | /// Useful for testing 90 | pub fn create(body: B) -> ServiceRequest 91 | where B: Serialize { 92 | let request = http::Request::builder(); 93 | 94 | let body = lambda_http::Body::from(serde_json::to_string(&body).unwrap()); 95 | 96 | let inner = request.body(body).unwrap(); 97 | 98 | ServiceRequest { 99 | inner, 100 | body: PhantomData, 101 | } 102 | } 103 | 104 | /// Useful for testing 105 | pub fn with_path_param(mut self, name: &str, value: &str) -> Self { 106 | let updated = self 107 | .inner 108 | .with_path_parameters(HashMap::from([(String::from(name), String::from(value))])); 109 | 110 | self.inner = updated; 111 | 112 | self 113 | } 114 | 115 | pub fn body(&self) -> Result { 116 | if let Body::Text(body) = self.inner.body() { 117 | Ok(serde_json::from_str(body).map_err(|err| { 118 | ServiceError::InvalidRequest(format!("Unable to parse body: {}", err.to_string())) 119 | })?) 120 | } else { 121 | Err(ServiceError::InvalidRequest(String::from( 122 | "Expected string for body", 123 | ))) 124 | } 125 | } 126 | 127 | pub fn path_param(&self, name: &str) -> Result { 128 | let path_params = self.inner.path_parameters(); 129 | let value = path_params 130 | .first(name) 131 | .expect(&format!("missing path param: {}", name)); 132 | 133 | Ok(String::from(value)) 134 | } 135 | } 136 | 137 | fn map_error_response( 138 | error: ServiceError, 139 | ) -> Result, lambda_http::Error> { 140 | let status = error.status(); 141 | let message = error.message(); 142 | 143 | let response = Response::builder() 144 | .header("Content-Type", "application/json") 145 | .status(status); 146 | 147 | let body = serde_json::to_string(&serde_json::json!({ "message": message }))?; 148 | 149 | Ok(response.body(Body::Text(body))?) 150 | } 151 | 152 | fn map_success_response( 153 | response: R, 154 | ) -> Result, lambda_http::Error> 155 | where R: Serialize { 156 | let body = serde_json::to_string(&response)?; 157 | Ok(http::Response::builder() 158 | .status(200) 159 | .header("Content-Type", "application/json") 160 | .body(lambda_http::Body::Text(body))?) 
161 | }
162 |
163 | #[async_trait]
164 | pub trait ServiceHandler<B, R>: Sync
165 | where
166 |     B: for<'de> Deserialize<'de> + Send,
167 |     R: Serialize,
168 | {
169 |     async fn handle_event(
170 |         &self,
171 |         event: lambda_http::Request,
172 |     ) -> Result<Response<Body>, lambda_http::Error> {
173 |         let request = ServiceRequest {
174 |             inner: event,
175 |             body: PhantomData,
176 |         };
177 |
178 |         self.handle_request(request)
179 |             .await
180 |             .map_or_else(map_error_response, map_success_response)
181 |     }
182 |
183 |     async fn handle_request(&self, request: ServiceRequest<B>) -> ServiceResponse<R>;
184 | }
185 |
186 | pub async fn start_service<B, R>(
187 |     service: &dyn ServiceHandler<B, R>,
188 | ) -> Result<(), lambda_http::Error>
189 | where
190 |     B: for<'de> Deserialize<'de> + Send,
191 |     R: Serialize,
192 | {
193 |     tracing_subscriber::fmt()
194 |         .json()
195 |         .with_max_level(tracing::Level::WARN)
196 |         .with_target(false)
197 |         .without_time()
198 |         .init();
199 |
200 |     lambda_http::run(lambda_http::service_fn(|event| async {
201 |         service.handle_event(event).await
202 |     }))
203 |     .await?;
204 |
205 |     Ok(())
206 | }
207 |
--------------------------------------------------------------------------------
/packages/pathery/src/store/document.rs:
--------------------------------------------------------------------------------
1 | use std::collections::HashMap;
2 | use std::error::Error;
3 | use std::result::Result as StdResult;
4 |
5 | use async_trait::async_trait;
6 | use aws_sdk_dynamodb as ddb;
7 | use ddb::model::{AttributeValue, KeysAndAttributes, PutRequest, WriteRequest};
8 | use ddb::types::SdkError;
9 | use serde::{Deserialize, Serialize};
10 | use tantivy::schema::NamedFieldDocument;
11 |
12 | use crate::search_doc::{DDBKey, SearchDoc, SearchDocId};
13 | use crate::service::ServiceError;
14 | use crate::util;
15 |
16 | impl<T> From<SdkError<T>> for ServiceError
17 | where T: Error + Sync + Send + 'static
18 | {
19 |     fn from(sdk_err: SdkError<T>) -> Self {
20 |         ServiceError::internal_error(sdk_err)
21 |     }
22 | }
23 |
24 | impl From<serde_dynamo::Error> for ServiceError {
25 |     fn from(err: serde_dynamo::Error) -> Self {
26 |         ServiceError::internal_error(err)
27 |     }
28 | }
29 |
30 | type Result<T> = StdResult<T, ServiceError>;
31 |
32 | #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
33 | pub struct SearchDocRef(SearchDocId);
34 |
35 | impl From<NamedFieldDocument> for SearchDocRef {
36 |     fn from(doc: NamedFieldDocument) -> Self {
37 |         let id = doc
38 |             .0
39 |             .get("__id")
40 |             .expect("__id should be set")
41 |             .first()
42 |             .expect("__id should exist")
43 |             .as_text()
44 |             .expect("__id should be string");
45 |
46 |         SearchDocRef(SearchDocId::parse(id))
47 |     }
48 | }
49 |
50 | #[async_trait]
51 | pub trait DocumentStore: Send + Sync {
52 |     /// Get documents by reference.
53 |     async fn get_documents(&self, refs: Vec<SearchDocRef>) -> Result<Vec<SearchDoc>>;
54 |
55 |     /// Save a document such that it can be retrieved with get_documents.
56 |     async fn save_documents(&self, documents: Vec<SearchDoc>) -> Result<Vec<SearchDocRef>>;
57 | }
58 |
59 | pub struct DDBDocumentStore {
60 |     table_name: String,
61 |     client: ddb::Client,
62 | }
63 |
64 | #[async_trait]
65 | impl DocumentStore for DDBDocumentStore {
66 |     async fn get_documents(&self, refs: Vec<SearchDocRef>) -> Result<Vec<SearchDoc>> {
67 |         let mut request = self.client.batch_get_item();
68 |
69 |         let mut keys_and_attrs = KeysAndAttributes::builder();
70 |
71 |         for doc_ref in refs {
72 |             let key = DDBKey::from(doc_ref.0);
73 |             keys_and_attrs = keys_and_attrs.keys(serde_dynamo::to_item(key)?);
74 |         }
75 |
76 |         request = request.request_items(&self.table_name, keys_and_attrs.build());
77 |
78 |         let response = request.send().await?;
79 |
80 |         let documents = response
81 |             .responses()
82 |             .expect("responses should be present")
83 |             .values()
84 |             .flatten()
85 |             .map(|item| serde_dynamo::from_item(item.clone()))
86 |             .collect::<StdResult<Vec<_>, _>>()?;
87 |
88 |         let unprocessed_ids = response
89 |             .unprocessed_keys()
90 |             .expect("unprocessed keys should be present")
91 |             .values()
92 |             .filter_map(KeysAndAttributes::keys)
93 |             .flatten()
94 |             .collect::<Vec<_>>();
95 |
96 |         if unprocessed_ids.len() > 0 {
97 |             return Err(ServiceError::rate_limit());
98 |         }
99 |
100 |         Ok(documents)
101 |     }
102 |
103 |     async fn save_documents(&self, documents: Vec<SearchDoc>) -> Result<Vec<SearchDocRef>> {
104 |         if documents.len() > 25 {
105 |             return Err(ServiceError::invalid_request(
106 |                 "Too many documents in request, max 25.",
107 |             ));
108 |         }
109 |
110 |         let mut writes = vec![];
111 |
112 |         for document in &documents {
113 |             let mut item: HashMap<String, AttributeValue> = serde_dynamo::to_item(document)?;
114 |
115 |             let key: HashMap<String, AttributeValue> =
116 |                 serde_dynamo::to_item(DDBKey::from(document.id().clone()))?;
117 |
118 |             item.extend(key);
119 |
120 |             let put_request = PutRequest::builder().set_item(Some(item)).build();
121 |
122 |             writes.push(WriteRequest::builder().put_request(put_request).build())
123 |         }
124 |
125 |         let response = self
126 |             .client
127 |             .batch_write_item()
128 |             .request_items(&self.table_name, writes)
129 |             .send()
130 |             .await?;
131 |
132 |         if let Some(items) = response.unprocessed_items() {
133 |             let unhandled_writes = items.values().flatten().collect::<Vec<_>>();
134 |             if unhandled_writes.len() > 0 {
135 |                 return Err(ServiceError::rate_limit());
136 |             }
137 |         };
138 |
139 |         Ok(documents
140 |             .into_iter()
141 |             .map(|doc| SearchDocRef(doc.id().clone()))
142 |             .collect())
143 |     }
144 | }
145 |
146 | impl DDBDocumentStore {
147 |     pub async fn create(table_name: Option<&str>) -> DDBDocumentStore {
148 |         let table_name = table_name
149 |             .map(String::from)
150 |             .unwrap_or_else(|| util::require_env("DATA_TABLE_NAME"));
151 |         let sdk_config = aws_config::load_from_env().await;
152 |         let client = aws_sdk_dynamodb::Client::new(&sdk_config);
153 |
154 |         DDBDocumentStore { table_name, client }
155 |     }
156 | }
157 |
158 | #[cfg(test)]
159 | pub mod test_util {
160 |     use std::collections::HashMap;
161 |     use std::sync::{Arc, Mutex};
162 |
163 |     use super::*;
164 |
165 |     #[derive(Clone, Debug)]
166 |     pub struct TestDocumentStore {
167 |         db: Arc<Mutex<HashMap<SearchDocId, SearchDoc>>>,
168 |     }
169 |
170 |     #[async_trait]
171 |     impl DocumentStore for TestDocumentStore {
172 |         async fn save_documents(&self, documents: Vec<SearchDoc>) -> Result<Vec<SearchDocRef>> {
173 |             let mut db = self.db.lock().unwrap();
174 |
175 |             for document in &documents {
176 |                 (*db).insert(document.id().clone(), document.clone());
177 |             }
178 |
179 |             Ok(documents
180 |                 .iter()
181 |                 .map(|x| SearchDocRef(x.id().clone()))
182 |                 .collect())
183 |         }
184 |
185 |         async fn get_documents(&self, refs: Vec<SearchDocRef>) -> Result<Vec<SearchDoc>> {
186 |             let db = self.db.lock().unwrap();
187 |
188 |             Ok(refs
189 |                 .iter()
190 |                 .map(|doc_ref| (*db).get(&doc_ref.0).unwrap().clone())
191 |                 .collect())
192 |         }
193 |     }
194 |
195 |     impl TestDocumentStore {
196 |         pub fn create() -> Self {
197 |             TestDocumentStore {
198 |                 db: Arc::new(Mutex::new(HashMap::new())),
199 |             }
200 |         }
201 |     }
202 | }
203 |
--------------------------------------------------------------------------------
/packages/pathery/src/store/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod document;
2 |
--------------------------------------------------------------------------------
/packages/pathery/src/util.rs:
--------------------------------------------------------------------------------
1 | use std::time::SystemTime;
2 |
3 | use chrono::{DateTime, Utc};
4 |
5 | pub fn generate_id() -> String {
6 |     let id = uuid::Uuid::new_v4();
7 |     id.to_string()
8 | }
9 |
10 | pub fn timestamp() -> String {
11 |     let now = SystemTime::now();
12 |     let now: DateTime<Utc> = now.into();
13 |     now.to_rfc3339()
14 | }
15 |
16 | pub fn require_env(var_name: &str) -> String {
17 |     std::env::var(var_name).expect(&format!("{var_name:?} should be set"))
18 | }
19 |
--------------------------------------------------------------------------------
/packages/pathery/src/worker/async_delete/client.rs:
--------------------------------------------------------------------------------
1 | use std::fmt::Debug;
2 |
3 | use async_trait::async_trait;
4 |
5 | use super::job::AsyncDeleteJob;
6 | use crate::service::ServiceError;
7 | use crate::util;
8 |
9 | #[async_trait]
10 | pub trait AsyncDeleteClient: Sync + Send + Debug {
11 |     async fn submit_job(&self, job: AsyncDeleteJob) -> Result<String, ServiceError>;
12 | }
13 |
14 | #[derive(Debug)]
15 | pub struct LambdaAsyncDeleteClient {
16 |     queue_url: String,
17 |
18 |     client: aws_sdk_sqs::Client,
19 | }
20 |
21 | #[async_trait]
22 | impl AsyncDeleteClient for LambdaAsyncDeleteClient {
23 |     async fn submit_job(&self, job: AsyncDeleteJob) -> Result<String, ServiceError> {
24 |         let body = serde_json::to_string(&job).expect("job should serialize");
25 |
26 |         let response = self
27 |             .client
28 |             .send_message()
29 |             .queue_url(&self.queue_url)
30 |             .message_body(body)
31 |             .send()
32 |             .await
33 |             .expect("job should queue");
34 |
35 |         Ok(response
36 |             .message_id()
37 |             .expect("message id should exist")
38 |             .to_string())
39 |     }
40 | }
41 |
42 | impl LambdaAsyncDeleteClient {
43 |     pub async fn create(queue_url: Option<&str>) -> LambdaAsyncDeleteClient {
44 |         let sdk_config = aws_config::load_from_env().await;
45 |
46 |         LambdaAsyncDeleteClient {
47 |             queue_url: queue_url
48 |                 .map(String::from)
49 |                 .unwrap_or_else(|| util::require_env("ASYNC_DELETE_QUEUE_URL")),
50 |             client: aws_sdk_sqs::Client::new(&sdk_config),
51 |         }
52 |     }
53 | }
54 |
--------------------------------------------------------------------------------
/packages/pathery/src/worker/async_delete/job.rs:
--------------------------------------------------------------------------------
1 | use std::path::PathBuf;
2 |
3 | use serde::{Deserialize, Serialize};
4 |
5 | #[derive(Serialize, Deserialize, Debug)]
6 | pub enum AsyncDeleteJob {
7 |     FSDelete(PathBuf),
8 | }
9 |
10 | impl AsyncDeleteJob {
11 |     pub fn fs_delete(path: PathBuf) -> AsyncDeleteJob {
12 |         AsyncDeleteJob::FSDelete(path)
13 |     }
14 | }
15 |
--------------------------------------------------------------------------------
/packages/pathery/src/worker/async_delete/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod client;
2 | pub mod job;
3 |
4 | use std::fs;
5 | use std::path::PathBuf;
6 |
7 | use serde_json as json;
8 |
9 | use crate::lambda::{self, sqs};
10 |
11 | pub fn fs_delete(path: PathBuf) {
12 |     fs::remove_file(path).expect("should be able to delete file");
13 | }
14 |
15 | pub async fn handle_event(event: sqs::SqsEvent) -> Result<(), lambda::Error> {
16 |     let records = event.payload.records;
17 |
18 |     let jobs = records
19 |         .iter()
20 |         .map(|message| message.body.as_ref().expect("Body should be present"))
21 |         .map(|body| {
22 |             let msg = json::from_str::<job::AsyncDeleteJob>(body.as_str())
23 |                 .expect("Message should be deserializable");
24 |             msg
25 |         })
26 |         .collect::<Vec<_>>();
27 |
28 |     for ele in jobs {
29 |         print!("{:?}", ele);
30 |         match ele {
31 |             job::AsyncDeleteJob::FSDelete(path) => fs_delete(path),
32 |         }
33 |     }
34 |
35 |     Ok(())
36 | }
37 |
--------------------------------------------------------------------------------
/packages/pathery/src/worker/index_writer/client.rs:
--------------------------------------------------------------------------------
1 | use async_trait::async_trait;
2 | use thiserror::Error;
3 |
4 | use super::job::Job;
5 | use crate::service::ServiceError;
6 | use crate::util;
7 |
8 | #[derive(Debug, Error)]
9 | pub enum IndexWriterClientError {}
10 |
11 | #[async_trait]
12 | pub trait IndexWriterClient: Sync + Send {
13 |     async fn submit_job(&self, job: Job) -> Result<String, ServiceError>;
14 | }
15 |
16 | pub struct LambdaIndexWriterClient {
17 |     queue_url: String,
18 |     client: aws_sdk_sqs::Client,
19 | }
20 |
21 | #[async_trait]
22 | impl IndexWriterClient for LambdaIndexWriterClient {
23 |     async fn submit_job(&self, job: Job) -> Result<String, ServiceError> {
24 |         let body = serde_json::to_string(&job).expect("job should serialize");
25 |
26 |         let response = self
27 |             .client
28 |             .send_message()
29 |             .queue_url(&self.queue_url)
30 |             .message_body(body)
31 |             .message_group_id(job.index_id)
32 |             .send()
33 |             .await
34 |             .expect("job should queue");
35 |
36 |         Ok(response
37 |             .message_id()
38 |             .expect("message id should exist")
39 |             .to_string())
40 |     }
41 | }
42 |
43 | impl LambdaIndexWriterClient {
44 |     pub async fn create(queue_url: Option<&str>) -> LambdaIndexWriterClient {
45 |         let sdk_config = aws_config::load_from_env().await;
46 |
47 |         LambdaIndexWriterClient {
48 |             queue_url: queue_url
49 |                 .map(String::from)
50 |                 .unwrap_or_else(|| util::require_env("INDEX_WRITER_QUEUE_URL")),
51 |             client: aws_sdk_sqs::Client::new(&sdk_config),
52 |         }
53 |     }
54 | }
55 |
56 | #[cfg(test)]
57 | pub mod test_utils {
58 |     use super::*;
59 |     use crate::index::test_util::TestIndexLoader;
60 |     use crate::index::{IndexExt, IndexLoader};
61 |     use crate::store::document::test_util::TestDocumentStore;
62 |     use crate::util;
63 |     use crate::worker::index_writer::handle_job;
64 |
65 |     #[derive(Clone)]
66 |     pub struct TestIndexWriterClient {
67 |         index_loader: TestIndexLoader,
68 |
69 |         document_store: TestDocumentStore,
70 |     }
71 |
72 |     #[async_trait]
73 |     impl IndexWriterClient for TestIndexWriterClient {
74 |         async fn submit_job(&self, job: Job) -> Result<String, ServiceError> {
75 |             let index = self.index_loader.load_index(&job.index_id, None)?;
76 |
77 |             let mut writer = index.default_writer();
78 |
79 |             handle_job(&mut writer, &self.document_store, job).await;
80 |
81 |             writer.commit().unwrap();
82 |
83 |             Ok(util::generate_id())
84 |         }
85 |     }
86 |
87 |     impl TestIndexWriterClient {
88 |         pub fn create(index_loader: TestIndexLoader, document_store: TestDocumentStore) -> Self {
89 |             TestIndexWriterClient {
90 |                 index_loader,
91 |                 document_store,
92 |             }
93 |         }
94 |     }
95 | }
96 |
--------------------------------------------------------------------------------
/packages/pathery/src/worker/index_writer/job.rs:
--------------------------------------------------------------------------------
1 | use serde::{Deserialize, Serialize};
2 |
3 | use crate::search_doc::SearchDocId;
4 | use crate::store::document::SearchDocRef;
5 |
6 | #[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
7 | pub enum IndexWriterOp {
8 |     IndexDoc { doc_ref: SearchDocRef },
9 |
10 |     DeleteDoc { doc_id: SearchDocId },
11 | }
12 |
13 | #[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
14 | pub struct Job {
15 |     pub index_id: String,
16 |     pub ops: Vec<IndexWriterOp>,
17 | }
18 |
19 | impl Job {
20 |     pub fn create(index_id: &str) -> Job {
21 |         Job {
22 |             index_id: index_id.into(),
23 |             ops: vec![],
24 |         }
25 |     }
26 |
27 |     pub fn index_doc(&mut self, doc_ref: SearchDocRef) {
28 |         self.ops.push(IndexWriterOp::IndexDoc { doc_ref })
29 |     }
30 |
31 |     pub fn delete_doc(&mut self, doc_id: SearchDocId) {
32 |         self.ops.push(IndexWriterOp::DeleteDoc { doc_id })
33 |     }
34 | }
35 |
--------------------------------------------------------------------------------
/packages/pathery/src/worker/index_writer/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod client;
2 | pub mod job;
3 |
4 | use std::collections::HashMap;
5 |
6 | use serde_json as json;
7 | use tantivy::{Document, IndexWriter, Term};
8 | use tracing::info;
9 |
10 | use self::job::{IndexWriterOp, Job};
11 | use crate::index::{IndexExt, IndexLoader};
12 | use crate::lambda::{self, sqs};
13 | use crate::store::document::{DocumentStore, SearchDocRef};
14 |
15 | fn delete_doc(writer: &IndexWriter, doc_id: &str) {
16 |     let index = writer.index();
17 |     let id_field = index.id_field();
18 |
19 |     writer.delete_term(Term::from_field_text(id_field, doc_id));
20 |     tracing::info!(message = "doc_deleted", doc_id);
21 | }
22 |
23 | fn index_doc(writer: &IndexWriter, doc: Document) {
24 |     let index = writer.index();
25 |     let id_field = index.id_field();
26 |     let doc_id = doc
27 |         .get_first(id_field)
28 |         .and_then(|id| id.as_text())
29 |         .expect("__id field should be present")
30 |         .to_string();
31 |
32 |     delete_doc(writer, &doc_id);
33 |     writer
34 |         .add_document(doc)
35 |         .expect("Adding a document should not error");
36 |     tracing::info!(message = "doc_indexed", doc_id);
37 | }
38 |
39 | pub async fn handle_job(writer: &mut IndexWriter, document_store: &dyn DocumentStore, job: Job) {
40 |     let schema = writer.index().schema();
41 |
42 |     let mut doc_refs: Vec<SearchDocRef> = vec![];
43 |
44 |     for op in job.ops {
45 |         match op {
46 |             IndexWriterOp::IndexDoc { doc_ref } => doc_refs.push(doc_ref),
47 |
48 |             IndexWriterOp::DeleteDoc { doc_id } => delete_doc(writer, doc_id.id()),
49 |         }
50 |     }
51 |
52 |     let docs = document_store.get_documents(doc_refs).await.unwrap();
53 |
54 |     for doc in docs {
55 |         let document = doc.document(&schema);
56 |         index_doc(writer, document);
57 |     }
58 | }
59 |
60 | pub async fn handle_event(
61 |     document_store: &dyn DocumentStore,
62 |     index_loader: &dyn IndexLoader,
63 |     event: sqs::SqsEvent,
64 | ) -> Result<(), lambda::Error> {
65 |     let records = event.payload.records;
66 |
67 |     let jobs = records
68 |         .iter()
69 |         .map(|message| message.body.as_ref().expect("Body should be present"))
70 |         .map(|body| {
71 |             let msg =
72 |                 json::from_str::<Job>(body.as_str()).expect("Message should be deserializable");
73 |             msg
74 |         })
75 |         .collect::<Vec<_>>();
76 |
77 |     let mut writers: HashMap<String, IndexWriter> = HashMap::new();
78 |
79 |     for job in jobs {
80 |         let index_id = &job.index_id;
81 |         let mut writer = writers.entry(index_id.to_string()).or_insert_with(|| {
82 |             index_loader
83 |                 .load_index(&index_id, None)
84 |                 .unwrap()
85 |                 .default_writer()
86 |         });
87 |
88 |         handle_job(&mut writer, document_store, job).await;
89 |     }
90 |
91 |     for (index, mut writer) in writers.into_iter() {
92 |         writer.commit().expect("commit should succeed");
93 |         info!(message = "index_commit", index);
94 |         writer
95 |             .wait_merging_threads()
96 |             .expect("merge should finish without error");
97 |     }
98 |
99 |     Ok(())
100 | }
101 |
102 | #[cfg(test)]
103 | mod tests {
104 |
105 |     use aws_lambda_events::sqs::{self, SqsMessage};
106 |     use lambda_http::Context;
107 |     use lambda_runtime::LambdaEvent;
108 |
109 |     use super::job::Job;
110 |     use super::{handle_event, *};
111 |     use crate::schema::SchemaLoader;
112 |     use crate::search_doc::SearchDoc;
113 |     use crate::test_utils::*;
114 |
115 |     #[tokio::test]
116 |     async fn test_indexing() {
117 |         let ctx = setup();
118 |
119 |         let schema = ctx.schema_loader().load_schema("test").unwrap();
120 |
121 |         let mut job = Job::create("test");
122 |
123 |         let document = SearchDoc::from_json(
124 |             &schema,
125 |             json!({
126 |                 "year": 1989
127 |             }),
128 |         )
129 |         .unwrap();
130 |
131 |         let doc_refs = ctx
132 |             .document_store()
133 |             .save_documents(vec![document])
134 |             .await
135 |             .unwrap();
136 |
137 |         for doc_ref in doc_refs {
138 |             job.index_doc(doc_ref);
139 |         }
140 |
141 |         let message = SqsMessage {
142 |             body: Some(json::to_string(&job).unwrap()),
143 |             ..Default::default()
144 |         };
145 |
146 |         let event = sqs::SqsEvent {
147 |             records: vec![message],
148 |         };
149 |
150 |         handle_event(
151 |             ctx.document_store(),
152 |             ctx.index_loader(),
153 |             LambdaEvent::new(event, Context::default()),
154 |         )
155 |         .await
156 |         .unwrap();
157 |
158 |         assert_eq!(
159 |             1,
160 |             ctx.index_loader()
161 |                 .load_index("test", None)
162 |                 .unwrap()
163 |                 .reader()
164 |                 .unwrap()
165 |                 .searcher()
166 |                 .num_docs()
167 |         );
168 |     }
169 | }
170 |
--------------------------------------------------------------------------------
/packages/pathery/src/worker/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod async_delete;
2 | pub mod index_writer;
3 |
--------------------------------------------------------------------------------
/pnpm-workspace.yaml:
--------------------------------------------------------------------------------
1 | packages:
2 |   - "app"
3 |   - "integration-test"
4 |   - "handlers/**"
5 |   - "packages/**"
6 |
--------------------------------------------------------------------------------
/rustfmt.toml:
--------------------------------------------------------------------------------
1 | comment_width = 120
2 | format_strings = true
3 | group_imports = "StdExternalCrate"
4 | imports_granularity = "Module"
5 | normalize_comments = true
6 | where_single_line = true
7 | wrap_comments = true
--------------------------------------------------------------------------------
/turbo.json:
--------------------------------------------------------------------------------
1 | {
2 |   "$schema": "https://turbo.build/schema.json",
3 |   "pipeline": {
4 |     "test": {
5 |       "outputs": []
6 |     },
7 |     "//#build:lambda": {
8 |       "inputs": [
9 |         "Cargo.lock",
10 |         "Cargo.toml",
11 |         "packages/**/*.rs",
12 |         "packages/**/Cargo.toml",
13 |         "handlers/**/*.rs",
14 |         "handlers/**/Cargo.toml",
15 |         ".cargo"
".cargo" 16 | ], 17 | "outputs": ["target/lambda"] 18 | }, 19 | "build": { 20 | "dependsOn": ["//#build:lambda", "^build"], 21 | "outputs": ["lib"] 22 | }, 23 | "synth": { 24 | "dependsOn": ["^build"], 25 | "outputs": ["cdk.out"] 26 | }, 27 | "deploy": { 28 | "dependsOn": ["^build"], 29 | "outputs": ["cdk.out"] 30 | } 31 | } 32 | } 33 | --------------------------------------------------------------------------------