├── .changeset ├── README.md └── config.json ├── .gitignore ├── .vscode └── settings.json ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── app ├── cdk.context.json ├── cdk.json ├── package.json └── src │ ├── app.ts │ └── test-data-stack │ └── index.ts ├── cspell.json ├── data.json ├── doc ├── api.md ├── diagram.png └── index-config.md ├── examples └── getting-started │ ├── README.md │ ├── cdk.json │ ├── package.json │ ├── src │ └── app.ts │ └── tsconfig.json ├── integration-test ├── package.json └── src │ └── index-test-data.ts ├── package.json ├── packages ├── pathery-cdk │ ├── CHANGELOG.md │ ├── README.md │ ├── package.json │ ├── src │ │ ├── config.ts │ │ ├── index.ts │ │ ├── pathery-dashboard.ts │ │ ├── pathery-stack.ts │ │ └── rust-function.ts │ └── tsconfig.json └── pathery │ ├── Cargo.toml │ └── src │ ├── bin │ ├── async-delete-worker.rs │ ├── batch-index.rs │ ├── delete-doc.rs │ ├── index-writer-worker.rs │ ├── post-index.rs │ ├── query-index-partition-fn.rs │ ├── query-index.rs │ └── stats-index.rs │ ├── directory.rs │ ├── function │ ├── mod.rs │ └── query_index_partition │ │ ├── client.rs │ │ └── mod.rs │ ├── index.rs │ ├── lambda │ ├── mod.rs │ └── sqs.rs │ ├── lib.rs │ ├── pagination.rs │ ├── schema.rs │ ├── search_doc.rs │ ├── serialize │ ├── compressed_json.rs │ └── mod.rs │ ├── service │ ├── doc.rs │ ├── index │ │ ├── batch_index.rs │ │ ├── mod.rs │ │ ├── post_index.rs │ │ ├── query_index.rs │ │ └── stats_index.rs │ └── mod.rs │ ├── store │ ├── document.rs │ └── mod.rs │ ├── util.rs │ └── worker │ ├── async_delete │ ├── client.rs │ ├── job.rs │ └── mod.rs │ ├── index_writer │ ├── client.rs │ ├── job.rs │ └── mod.rs │ └── mod.rs ├── pnpm-lock.yaml ├── pnpm-workspace.yaml ├── rustfmt.toml └── turbo.json /.changeset/README.md: -------------------------------------------------------------------------------- 1 | # Changesets 2 | 3 | Hello and welcome! This folder has been automatically generated by `@changesets/cli`, a build tool that works 4 | with multi-package repos, or single-package repos to help you version and publish your code. 
You can 5 | find the full documentation for it [in our repository](https://github.com/changesets/changesets) 6 | 7 | We have a quick list of common questions to get you started engaging with this project in 8 | [our documentation](https://github.com/changesets/changesets/blob/main/docs/common-questions.md) 9 | -------------------------------------------------------------------------------- /.changeset/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://unpkg.com/@changesets/config@2.2.0/schema.json", 3 | "changelog": "@changesets/cli/changelog", 4 | "commit": true, 5 | "fixed": [], 6 | "linked": [], 7 | "access": "public", 8 | "baseBranch": "main", 9 | "updateInternalDependencies": "patch", 10 | "ignore": ["app", "integration-test"] 11 | } 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dev-env.json 3 | 4 | .turbo 5 | .pathery 6 | 7 | cdk.out 8 | target 9 | cdk-outputs.json 10 | lib 11 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.formatOnSave": true, 3 | "rust-analyzer.rustfmt.extraArgs": ["+nightly"] 4 | } 5 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "packages/pathery" 4 | ] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Tyler van Hensbergen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pathery :fire: Serverless Search :fire: 2 | 3 | [![npm version](https://badge.fury.io/js/@pathery%2Fcdk.svg)](https://badge.fury.io/js/@pathery%2Fcdk) 4 | 5 | Pathery is a **serverless search service** built on AWS using Rust, CDK and [Tantivy][tantivy]. It uses AWS managed serverless offerings – DynamoDB, EFS, Lambda, SQS, and API Gateway – to the maximum extent possible. 
6 | 7 | **:bell: WARNING:** This is currently a work in progress and not ready for production usage. 8 | 9 | ## Features 10 | 11 | - **🔥 Fast full-text search**. Built on Rust to limit AWS Lambda cold start overhead. 12 | - **🥰 Simple REST API**. A [simple REST API][api-docs] to make search as easy as possible. 13 | - **👍 Easy to install**. Ships as a CDK Component, making it easy to [get started][get-started]. 14 | - **💵 Usage based infra**. No long running servers, only pay for what you use. 15 | - **🔼 Built for AWS**. Leans on AWS managed services to limit maintenance burden and maximize scalability. 16 | - Document store: [DynamoDB](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Introduction.html) 17 | - Index store: [Elastic File System (EFS)](https://docs.aws.amazon.com/efs/latest/ug/whatisefs.html) 18 | - Index writer & handler: [Lambda](https://docs.aws.amazon.com/lambda/latest/dg/welcome.html) 19 | - Index queue: [SQS](https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/welcome.html) 20 | - API: [API Gateway](https://docs.aws.amazon.com/apigateway/latest/developerguide/welcome.html) 21 | 22 | ## Getting Started 23 | 24 | Check out the [getting started guide][get-started] to deploy Pathery into your AWS account using CDK. 25 | 26 | [tantivy]: https://github.com/quickwit-oss/tantivy 27 | [get-started]: ./examples/getting-started/ 28 | [api-docs]: ./doc/api.md 29 | 30 | ## Architecture 31 | 32 | Follow along with the Dev Log: 33 | 34 | - [Pathery Dev Log #1: Performant Serverless Queries Without a Cluster](https://tvanhens.substack.com/p/pathery-dev-log-1-performant-serverless) 35 | - [Pathery Dev Log #2: Indexing and the Document Store](https://tvanhens.substack.com/p/pathery-dev-log-2-indexing-and-the) 36 | 37 | ![diagram](/doc/diagram.png) 38 | -------------------------------------------------------------------------------- /app/cdk.context.json: -------------------------------------------------------------------------------- 1 | { 2 | "availability-zones:account=117773642559:region=us-east-1": [ 3 | "us-east-1a", 4 | "us-east-1b", 5 | "us-east-1c", 6 | "us-east-1d", 7 | "us-east-1e", 8 | "us-east-1f" 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /app/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "ts-node --swc src/app.ts", 3 | "profile": "pathery-dev" 4 | } 5 | -------------------------------------------------------------------------------- /app/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "app", 3 | "private": true, 4 | "version": "0.0.0", 5 | "description": "", 6 | "main": "index.js", 7 | "scripts": { 8 | "build": "cdk synth", 9 | "deploy": "cdk deploy --all -O cdk-outputs.json --require-approval never --method direct" 10 | }, 11 | "dependencies": { 12 | "@pathery/cdk": "workspace:*", 13 | "aws-sdk": "^2.1262.0", 14 | "axios": "^1.2.0", 15 | "esbuild": "^0.15.16" 16 | }, 17 | "keywords": [], 18 | "author": "", 19 | "license": "ISC" 20 | } 21 | -------------------------------------------------------------------------------- /app/src/app.ts: -------------------------------------------------------------------------------- 1 | import { App } from "aws-cdk-lib"; 2 | import { PatheryStack } from "@pathery/cdk"; 3 | import { TestDataStack } from "./test-data-stack"; 4 | 5 | const app = new App(); 6 | 7 | const pathery = new PatheryStack(app, "pathery-dev", { 8 | 
config: { 9 | indexes: [ 10 | { 11 | prefix: "test-index-v1", 12 | fields: [ 13 | { 14 | name: "author", 15 | flags: ["TEXT"], 16 | kind: "text", 17 | }, 18 | { 19 | name: "song", 20 | flags: ["TEXT"], 21 | kind: "text", 22 | }, 23 | { 24 | name: "genre", 25 | flags: ["STRING"], 26 | kind: "text", 27 | }, 28 | { 29 | name: "releaseDate", 30 | flags: ["INDEXED"], 31 | kind: "i64", 32 | }, 33 | ], 34 | }, 35 | ], 36 | }, 37 | }); 38 | 39 | new TestDataStack(app, "pathery-test-data"); 40 | -------------------------------------------------------------------------------- /app/src/test-data-stack/index.ts: -------------------------------------------------------------------------------- 1 | import { CfnOutput, Stack } from "aws-cdk-lib"; 2 | import { Bucket } from "aws-cdk-lib/aws-s3"; 3 | import { Construct } from "constructs"; 4 | 5 | export class TestDataStack extends Stack { 6 | constructor(scope: Construct, id: string) { 7 | super(scope, id); 8 | 9 | const dataBucket = new Bucket(this, "DataBucket"); 10 | 11 | new CfnOutput(this, "DataBucketName", { 12 | value: dataBucket.bucketName, 13 | }); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /cspell.json: -------------------------------------------------------------------------------- 1 | // cSpell Settings 2 | { 3 | "version": "0.2", 4 | "language": "en", 5 | "words": [ 6 | "chrono", 7 | "Hensbergen", 8 | "Mmap", 9 | "Pathery", 10 | "Pirsig", 11 | "Runtimes", 12 | "upsert", 13 | "thiserror" 14 | ], 15 | "flagWords": [], 16 | "ignorePaths": ["**/node_modules/**", "target/**", "lib/**"] 17 | } 18 | -------------------------------------------------------------------------------- /data.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "title": "Zen and the Art of Motorcycle Maintenance", 4 | "author": "Robert Pirsig" 5 | }, 6 | { 7 | "title": "One Flew Over the Cuckoo's Nest", 8 | "author": "Ken Kesey" 9 | } 10 | ] 11 | -------------------------------------------------------------------------------- /doc/api.md: -------------------------------------------------------------------------------- 1 | # API Docs 2 | 3 | ## General 4 | 5 | The base url is the url of the API gateway and is emitted on installation via CDK. 6 | 7 | **Example** 8 | 9 | ``` 10 | https://.execute-api.us-east-1.amazonaws.com/prod 11 | ``` 12 | 13 | ## Index Operations 14 | 15 | ### Index a Document 16 | 17 | `POST /index/{index_id}` 18 | 19 | Indexes a document so that the document is searchable. 20 | A document can optionally provide an `__id` field to set the document id. 21 | If no `__id` is provided one is generated and returned. 22 | Indexing a document with an `__id` will upsert any previously indexed data with the provided `__id`. 23 | 24 | #### Parameters 25 | 26 | - `__id` - (optional) the document id to use for the document 27 | 28 | #### Examples 29 | 30 | **Basic Indexing** 31 | 32 | Request: 33 | 34 | ```bash 35 | http https://.execute-api.us-east-1.amazonaws.com/prod/index/book-index-1 \ 36 | author="Robert M. Pirsig" \ 37 | title="Zen and the Art of Motorcycle Maintenance" 38 | ``` 39 | 40 | Response: 41 | 42 | ```json 43 | { 44 | "__id": "b7c8aee4-9656-47a3-8217-df1b71056a83", 45 | "updated_at": "2022-11-14T21:17:58.824791120+00:00" 46 | } 47 | ``` 48 | 49 | **Providing an `\_\_id`** 50 | 51 | Request: 52 | 53 | ```bash 54 | http https://.execute-api.us-east-1.amazonaws.com/prod/index/book-index-1 \ 55 | author="Robert M. 
Pirsig" \ 56 | title="Zen and the Art of Motorcycle Maintenance" \ 57 | __id=zen 58 | ``` 59 | 60 | Response: 61 | 62 | ```json 63 | { 64 | "__id": "zen", 65 | "updated_at": "2022-11-14T21:17:58.824791120+00:00" 66 | } 67 | ``` 68 | 69 | ### Query a Document 70 | 71 | `POST /index/{index_id}/query` 72 | 73 | Query an index with a provided search string. 74 | 75 | #### Parameters 76 | 77 | - `query` - a query string to search against the index 78 | 79 | #### Examples 80 | 81 | **Simple Full Text Search** 82 | 83 | Request: 84 | 85 | ```bash 86 | http https://.execute-api.us-east-1.amazonaws.com/prod/index/book-index-1/query \ 87 | query="zen art" 88 | ``` 89 | 90 | Response: 91 | 92 | ```json 93 | { 94 | "matches": [ 95 | { 96 | "doc": { 97 | "__id": "ebf5c0a0-ca14-4471-bc21-5259d7898df3", 98 | "title": "Zen and the Art of Motorcycle Maintenance" 99 | }, 100 | "score": 0.57536423, 101 | "snippets": { 102 | "title": "Zen and the Art of Motorcycle Maintenance" 103 | } 104 | } 105 | ] 106 | } 107 | ``` 108 | 109 | ### Delete a Document 110 | 111 | `DELETE /index/{index_id}/doc/{doc_id}` 112 | 113 | Delete a document from an index such that it is no longer searchable. 114 | 115 | #### Examples 116 | 117 | **Simple Full Text Search** 118 | 119 | Request: 120 | 121 | ```bash 122 | http DELETE https://.execute-api.us-east-1.amazonaws.com/prod/index/book-index-1/doc/b7c8aee4-9656-47a3-8217-df1b71056a83 123 | ``` 124 | 125 | Response: 126 | 127 | ```json 128 | { 129 | "__id": "b7c8aee4-9656-47a3-8217-df1b71056a83", 130 | "deleted_at": "2022-11-14T21:30:04.845814727+00:00" 131 | } 132 | ``` 133 | -------------------------------------------------------------------------------- /doc/diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tvanhens/pathery/f3781d37df2c71f1debfcd5a68afa5a5a899f65b/doc/diagram.png -------------------------------------------------------------------------------- /doc/index-config.md: -------------------------------------------------------------------------------- 1 | # Coming Soon 2 | -------------------------------------------------------------------------------- /examples/getting-started/README.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | This guide will walk you through: 4 | 5 | 1. Project Setup 6 | 1. Pathery Deployment 7 | 1. Writing documents to an index 8 | 1. Querying an index 9 | 10 | ## Project Setup 11 | 12 | Pathery ships as a CDK Construct and requires TypeScript and AWS CDK to be installed. 13 | The minimum set of dependencies is shown below: 14 | 15 | **package.json** 16 | 17 | ```json 18 | { 19 | "name": "getting-started", 20 | "version": "0.0.0", 21 | "description": "", 22 | "main": "index.js", 23 | "scripts": { 24 | "deploy": "cdk deploy" 25 | }, 26 | "keywords": [], 27 | "author": "", 28 | "license": "ISC", 29 | "dependencies": { 30 | "@pathery/cdk": "^0.0.4", 31 | "@swc/core": "^1.3.14", 32 | "@types/node": "^18.11.9", 33 | "aws-cdk": "^2.50.0", 34 | "aws-cdk-lib": "^2.50.0", 35 | "constructs": "^10.1.155", 36 | "ts-node": "^10.9.1", 37 | "typescript": "^4.8.4" 38 | } 39 | } 40 | ``` 41 | 42 | Running `npm install` will install the required dependencies. 43 | Next, you can configure your first index pattern in `src/app.ts`. 44 | Index patterns define the field configuration for indexes that start with the given prefix. 
45 | 46 | In the example below, any index that starts with the name `book-index-v1-` will have the fields `author` and `title` indexed. 47 | You can read more about index configuration in the [index configuration guide][index-config]. 48 | 49 | **src/app.ts** 50 | 51 | ```typescript 52 | import { App } from "aws-cdk-lib"; 53 | import { PatheryStack } from "@pathery/cdk"; 54 | 55 | const app = new App(); 56 | 57 | new PatheryStack(app, "pathery-dev", { 58 | config: { 59 | indexes: [ 60 | { 61 | // Indexes starting with this prefix will use this config 62 | prefix: "book-index-v1-", 63 | fields: [ 64 | { 65 | // Index the field title 66 | name: "title", 67 | flags: ["STORED", "TEXT"], 68 | kind: "text", 69 | }, 70 | { 71 | // Index the field author 72 | name: "author", 73 | flags: ["STORED", "TEXT"], 74 | kind: "text", 75 | }, 76 | ], 77 | }, 78 | ], 79 | }, 80 | }); 81 | ``` 82 | 83 | Lastly, CDK needs to know where our CDK app is declared, so we include a `cdk.json`: 84 | 85 | **cdk.json** 86 | 87 | ```json 88 | { 89 | "app": "ts-node --swc src/app.ts" 90 | } 91 | ``` 92 | 93 | This is the minimum amount of setup required. Now we can deploy our Pathery search service. 94 | 95 | ## Deployment 96 | 97 | To deploy the project, run `npm run deploy`. 98 | 99 | If everything worked, you should see an output that looks like the one below: 100 | 101 | ```bash 102 | ✅ pathery-dev 103 | 104 | ✨ Deployment time: 55.94s 105 | 106 | Outputs: 107 | arn:aws:cloudformation:us-east-1:117773642559:stack/pathery-dev/f1c49c40-60b3-11ed-b19f-0e7f8a5bfcb7 108 | pathery-dev.ApiKeyOutput = 109 | pathery-dev.PatheryApiEndpointB5297505 = https://.execute-api.us-east-1.amazonaws.com/prod/ 110 | Stack ARN: 111 | 112 | ✨ Total time: 58.13s 113 | ``` 114 | 115 | Note the output called `pathery-dev.PatheryApiEndpointB5297505`; this is the URL to your search API. 116 | Let's save it to your shell environment for the next step by running: 117 | 118 | ```bash 119 | export PATHERY_ENDPOINT= 120 | ``` 121 | 122 | This endpoint is authenticated using an API key that gets automatically generated. 123 | Copy the ID on the right-hand side of the output `.ApiKeyOutput = ` and paste it into the line below for ``: 124 | 125 | ```bash 126 | export PATHERY_KEY="$(aws apigateway get-api-key --include-value --api-key --query value --output text)" 127 | ``` 128 | 129 | [index-config]: ../../doc/index-config.md 130 | 131 | ## Indexing a Document 132 | 133 | To index an example document, run: 134 | 135 | ```bash 136 | curl -X POST ${PATHERY_ENDPOINT}index/book-index-v1-test \ 137 | -H 'Content-Type: application/json' \ 138 | -H "x-api-key: ${PATHERY_KEY}" \ 139 | -d '{"title": "Zen and the Art of Motorcycle Maintenance", "author": "Robert Pirsig"}' 140 | ``` 141 | 142 | > **❕ Note** 143 | > 144 | > Our index name is `book-index-v1-test`. 145 | > The `book-index-v1-` prefix is required so that the index name matches the prefix in our configuration. 146 | > 147 | > **If you try to post to an index which does not match a configuration prefix, the request will fail.** 148 | 149 | If indexing is successful, you should see: 150 | 151 | ```json 152 | { 153 | "__id": "7a309cda-1314-4e0a-a97d-02ce2c5e24c7", 154 | "updated_at": "2022-11-17T17:49:28.835542383+00:00" 155 | } 156 | ``` 157 | 158 | Now we're ready to query our index.
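If you have many documents, the API also exposes a batch endpoint at `POST /index/{index_id}/batch`. A minimal sketch, assuming the batch endpoint accepts a JSON array of documents in the request body (this mirrors how this repository's integration test uploads batches; both book entries are just sample payloads):

```bash
# Index two sample documents in a single request.
curl -X POST ${PATHERY_ENDPOINT}index/book-index-v1-test/batch \
  -H 'Content-Type: application/json' \
  -H "x-api-key: ${PATHERY_KEY}" \
  -d '[{"title": "Zen and the Art of Motorcycle Maintenance", "author": "Robert Pirsig"}, {"title": "Lila: An Inquiry into Morals", "author": "Robert Pirsig"}]'
```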
159 | 160 | ## Querying an Index 161 | 162 | To query our index we can use a request like the one below: 163 | 164 | ```bash 165 | curl -X POST ${PATHERY_ENDPOINT}index/book-index-v1-test/query \ 166 | -H 'Content-Type: application/json' \ 167 | -H "x-api-key: ${PATHERY_KEY}" \ 168 | -d '{"query": "zen art pirsig"}' 169 | ``` 170 | 171 | You should see a response like the one below, note the matching search terms are highlighted in the `snippets` of the response: 172 | 173 | ```json 174 | { 175 | "matches": [ 176 | { 177 | "doc": { 178 | "__id": "7a309cda-1314-4e0a-a97d-02ce2c5e24c7", 179 | "author": "Robert Pirsig", 180 | "title": "Zen and the Art of Motorcycle Maintenance" 181 | }, 182 | "snippets": { 183 | "title": "Zen and the Art of Motorcycle Maintenance", 184 | "author": "Robert Pirsig" 185 | }, 186 | "score": 0.86304635 187 | } 188 | ] 189 | } 190 | ``` 191 | -------------------------------------------------------------------------------- /examples/getting-started/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "ts-node --swc src/app.ts" 3 | } 4 | -------------------------------------------------------------------------------- /examples/getting-started/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "getting-started", 3 | "version": "0.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "deploy": "cdk deploy" 8 | }, 9 | "keywords": [], 10 | "author": "", 11 | "license": "ISC", 12 | "dependencies": { 13 | "@pathery/cdk": "^0.0.4", 14 | "@swc/core": "^1.3.14", 15 | "@types/node": "^18.11.9", 16 | "aws-cdk": "^2.50.0", 17 | "aws-cdk-lib": "^2.50.0", 18 | "constructs": "^10.1.155", 19 | "ts-node": "^10.9.1", 20 | "typescript": "^4.8.4" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /examples/getting-started/src/app.ts: -------------------------------------------------------------------------------- 1 | import { App } from "aws-cdk-lib"; 2 | import { PatheryStack } from "@pathery/cdk"; 3 | 4 | const app = new App(); 5 | 6 | new PatheryStack(app, "pathery-dev", { 7 | config: { 8 | indexes: [ 9 | { 10 | prefix: "book-index-v1-", 11 | fields: [ 12 | { 13 | name: "title", 14 | flags: ["STORED", "TEXT"], 15 | kind: "text", 16 | }, 17 | { 18 | name: "author", 19 | flags: ["STORED", "TEXT"], 20 | kind: "text", 21 | }, 22 | ], 23 | }, 24 | ], 25 | }, 26 | }); 27 | -------------------------------------------------------------------------------- /examples/getting-started/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | /* Visit https://aka.ms/tsconfig to read more about this file */ 4 | 5 | /* Projects */ 6 | // "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */ 7 | // "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */ 8 | // "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */ 9 | // "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */ 10 | // "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. 
*/ 11 | // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */ 12 | 13 | /* Language and Environment */ 14 | "target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ 15 | // "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */ 16 | // "jsx": "preserve", /* Specify what JSX code is generated. */ 17 | // "experimentalDecorators": true, /* Enable experimental support for TC39 stage 2 draft decorators. */ 18 | // "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */ 19 | // "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */ 20 | // "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */ 21 | // "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */ 22 | // "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */ 23 | // "noLib": true, /* Disable including any library files, including the default lib.d.ts. */ 24 | // "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */ 25 | // "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */ 26 | 27 | /* Modules */ 28 | "module": "commonjs", /* Specify what module code is generated. */ 29 | // "rootDir": "./", /* Specify the root folder within your source files. */ 30 | // "moduleResolution": "node", /* Specify how TypeScript looks up a file from a given module specifier. */ 31 | // "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */ 32 | // "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */ 33 | // "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */ 34 | // "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */ 35 | // "types": [], /* Specify type package names to be included without being referenced in a source file. */ 36 | // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ 37 | // "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */ 38 | // "resolveJsonModule": true, /* Enable importing .json files. */ 39 | // "noResolve": true, /* Disallow 'import's, 'require's or ''s from expanding the number of files TypeScript should add to a project. */ 40 | 41 | /* JavaScript Support */ 42 | // "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */ 43 | // "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */ 44 | // "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */ 45 | 46 | /* Emit */ 47 | // "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */ 48 | // "declarationMap": true, /* Create sourcemaps for d.ts files. */ 49 | // "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. 
*/ 50 | // "sourceMap": true, /* Create source map files for emitted JavaScript files. */ 51 | // "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */ 52 | // "outDir": "./", /* Specify an output folder for all emitted files. */ 53 | // "removeComments": true, /* Disable emitting comments. */ 54 | // "noEmit": true, /* Disable emitting files from a compilation. */ 55 | // "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */ 56 | // "importsNotUsedAsValues": "remove", /* Specify emit/checking behavior for imports that are only used for types. */ 57 | // "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */ 58 | // "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */ 59 | // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ 60 | // "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */ 61 | // "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */ 62 | // "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */ 63 | // "newLine": "crlf", /* Set the newline character for emitting files. */ 64 | // "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */ 65 | // "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */ 66 | // "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */ 67 | // "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */ 68 | // "declarationDir": "./", /* Specify the output directory for generated declaration files. */ 69 | // "preserveValueImports": true, /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */ 70 | 71 | /* Interop Constraints */ 72 | // "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */ 73 | // "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */ 74 | "esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */ 75 | // "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */ 76 | "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */ 77 | 78 | /* Type Checking */ 79 | "strict": true, /* Enable all strict type-checking options. */ 80 | // "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */ 81 | // "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */ 82 | // "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */ 83 | // "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */ 84 | // "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. 
*/ 85 | // "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */ 86 | // "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */ 87 | // "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */ 88 | // "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */ 89 | // "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */ 90 | // "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */ 91 | // "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */ 92 | // "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */ 93 | // "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */ 94 | // "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */ 95 | // "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */ 96 | // "allowUnusedLabels": true, /* Disable error reporting for unused labels. */ 97 | // "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */ 98 | 99 | /* Completeness */ 100 | // "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */ 101 | "skipLibCheck": true /* Skip type checking all .d.ts files. */ 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /integration-test/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "integration-test", 3 | "version": "1.0.0", 4 | "private": true, 5 | "description": "", 6 | "main": "index.js", 7 | "scripts": { 8 | "test": "echo \"Error: no test specified\" && exit 1" 9 | }, 10 | "keywords": [], 11 | "author": "", 12 | "license": "ISC", 13 | "dependencies": { 14 | "@faker-js/faker": "^7.6.0", 15 | "aws-sdk": "^2.1262.0", 16 | "axios": "^1.2.0" 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /integration-test/src/index-test-data.ts: -------------------------------------------------------------------------------- 1 | import * as AWS from "aws-sdk"; 2 | import http, { AxiosError } from "axios"; 3 | import { faker } from "@faker-js/faker"; 4 | 5 | const maxBatch = 20_000; 6 | const batchSize = 25; 7 | const patheryEndpoint = 8 | "https://nlztni8cx5.execute-api.us-east-1.amazonaws.com/prod/"; 9 | const index_id = "test-index-v1-3"; 10 | const apiKeyId = "7xyag5xp0d"; 11 | 12 | const api = new AWS.APIGateway(); 13 | 14 | const s3 = new AWS.S3(); 15 | 16 | export async function getApiKey() { 17 | const response = await api 18 | .getApiKey({ 19 | apiKey: apiKeyId, 20 | includeValue: true, 21 | }) 22 | .promise(); 23 | 24 | const value = response.value; 25 | 26 | if (!value) { 27 | throw new Error("Could not get API key value"); 28 | } 29 | 30 | return value; 31 | } 32 | 33 | async function uploadBatch(apiKey: string, batch: any[]) { 34 | const batchUrl = `${patheryEndpoint}index/${index_id}/batch`; 35 | 36 | try { 37 | await http.post(batchUrl, batch, { 38 | headers: { 39 | "Content-Type": "application/json", 40 | "X-Api-Key": apiKey, 41 | }, 42 | }); 43 | return { status: "OK" as const }; 44 | } catch (err) { 45 | if (err instanceof 
AxiosError) { 46 | if (!err.response) { 47 | console.error(err); 48 | process.exit(1); 49 | } 50 | 51 | const message: string = err.response.data.message; 52 | const code = err.response.status; 53 | 54 | if (code !== 500) { 55 | console.error(err); 56 | process.exit(1); 57 | } 58 | 59 | console.log(`[${code}] ${message}`); 60 | 61 | return { status: "Error" as const, code, message }; 62 | } 63 | } 64 | } 65 | 66 | export async function* batchGenerator() { 67 | let batchNum = 1; 68 | 69 | let batch: unknown[] = []; 70 | 71 | while (true) { 72 | if (batchNum > maxBatch) { 73 | return batch; 74 | } 75 | const next = { 76 | author: faker.name.fullName(), 77 | song: faker.music.songName(), 78 | genre: faker.music.genre(), 79 | releaseDate: faker.date.past().getTime(), 80 | }; 81 | 82 | batch.push(next); 83 | 84 | if (batch.length >= batchSize) { 85 | console.log(`Uploading batch #${batchNum++}`); 86 | 87 | yield batch; 88 | 89 | batch = []; 90 | } 91 | } 92 | } 93 | 94 | async function startUploader( 95 | apiKey: string, 96 | batches: AsyncGenerator 97 | ) { 98 | for await (const batch of batches) { 99 | let attempts = 0; 100 | while (true) { 101 | if (attempts >= 3) { 102 | process.exit(1); 103 | } 104 | 105 | attempts++; 106 | 107 | const result = await uploadBatch(apiKey, batch); 108 | 109 | if (result?.status === "OK") { 110 | break; 111 | } 112 | 113 | console.log("Backing off..."); 114 | 115 | await new Promise((resolve) => { 116 | setTimeout(resolve, 2000); 117 | }); 118 | } 119 | } 120 | } 121 | 122 | export async function doIndex(numUploader: number) { 123 | const apiKey = await getApiKey(); 124 | 125 | const batches = batchGenerator(); 126 | 127 | const uploaderList: Promise[] = []; 128 | 129 | for (let i = 0; i < numUploader; i++) { 130 | uploaderList.push(startUploader(apiKey, batches)); 131 | } 132 | 133 | await Promise.all(uploaderList); 134 | 135 | console.log("Done"); 136 | } 137 | 138 | doIndex(10); 139 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pathery", 3 | "private": true, 4 | "version": "1.0.0", 5 | "description": "", 6 | "main": "index.js", 7 | "scripts": { 8 | "sso:dev": "aws sso login --profile=pathery-dev", 9 | "build:lambda": "cargo lambda build --release --arm64", 10 | "build": "turbo run build", 11 | "check:spell": "cspell '**/*.{md,ts,rs}'", 12 | "check": "npm run check:spell", 13 | "version": "changeset version", 14 | "publish": "pnpm publish -r --access public", 15 | "release": "npm run build && pnpm run publish", 16 | "deploy:example": "turbo run deploy" 17 | }, 18 | "keywords": [], 19 | "author": "", 20 | "license": "ISC", 21 | "dependencies": { 22 | "@changesets/cli": "^2.25.2", 23 | "@swc/core": "^1.3.14", 24 | "@types/node": "^18.11.9", 25 | "aws-cdk": "^2.50.0", 26 | "aws-cdk-lib": "^2.50.0", 27 | "constructs": "^10.1.155", 28 | "cspell": "^6.14.3", 29 | "depcheck": "^1.4.3", 30 | "ts-node": "^10.9.1", 31 | "turbo": "^1.6.3", 32 | "typescript": "^4.8.4" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /packages/pathery-cdk/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # @pathery/cdk 2 | 3 | ## 0.2.5 4 | 5 | ### Patch Changes 6 | 7 | - a97b5b2: feat: enable multithreading for query partition lambda 8 | 9 | ## 0.2.4 10 | 11 | ### Patch Changes 12 | 13 | - 371ab81: fix: run requests in parallel 14 | 15 | ## 
0.2.3 16 | 17 | ### Patch Changes 18 | 19 | - 3b2b99b: fix: stats endpoint missing env var 20 | 21 | ## 0.2.2 22 | 23 | ### Patch Changes 24 | 25 | - 6e4ad31: Feature: automatically fan out queries as index grows 26 | - 3142bbf: feat: add pagination via pagination token 27 | 28 | ## 0.2.1 29 | 30 | ### Patch Changes 31 | 32 | - f4f3868: fix: run deletes async on delay queue 33 | 34 | ## 0.2.0 35 | 36 | ### Minor Changes 37 | 38 | - b19295c: Compressed stored document representation. 39 | 40 | ## 0.1.1 41 | 42 | ### Patch Changes 43 | 44 | - 61cd70b: Fix: documents were not serializing to writer queue correctly. 45 | - 22598b6: Feature: Allow query handler memory size to be specified via CDK construct. 46 | - ea2676c: Add json field type to schema config. 47 | - 534908c: Fix: 404 error for missing index config 48 | - 576d352: Feature: Partition queries using the optional with_partition body param. 49 | - 534908c: Improvement: Use DynamoDB for original document storage. 50 | - 534908c: Fix: allow empty body for delete doc request 51 | - 653cd03: Feature: Add date field type. 52 | - 61cd70b: Feature: add i64 as index field type 53 | 54 | ## 0.1.0 55 | 56 | ### Minor Changes 57 | 58 | - 9ee82b6: Add API key authorization and generate default key. 59 | 60 | ### Patch Changes 61 | 62 | - 83cb85c: Allow IndexWriter config to be specified. 63 | 64 | ## 0.0.9 65 | 66 | ### Patch Changes 67 | 68 | - 03f647a: Add batch index endpoint 69 | 70 | ## 0.0.8 71 | 72 | ### Patch Changes 73 | 74 | - 38a8116: Fix: incorrect dashboard naming 75 | 76 | ## 0.0.7 77 | 78 | ### Patch Changes 79 | 80 | - 903af06: Add basic dashboard with errors and writer stats 81 | 82 | ## 0.0.6 83 | 84 | ### Patch Changes 85 | 86 | - bf29fa3: Fixes https://github.com/tvanhens/pathery/issues/1 87 | 88 | ## 0.0.5 89 | 90 | ### Patch Changes 91 | 92 | - 255c378: Add the STRING flag for text fields to enable exact-only matching. 93 | 94 | ## 0.0.4 95 | 96 | ### Patch Changes 97 | 98 | - 8220a12: Improve package docs, keywords and description. 99 | 100 | ## 0.0.3 101 | 102 | ### Patch Changes 103 | 104 | - c1e6d24: Add readme to package. 105 | 106 | ## 0.0.2 107 | 108 | ### Patch Changes 109 | 110 | - 1e8060d: Move configuration into construct props. 111 | -------------------------------------------------------------------------------- /packages/pathery-cdk/README.md: -------------------------------------------------------------------------------- 1 | # Pathery Serverless Search CDK Construct 2 | 3 | [![npm version](https://badge.fury.io/js/@pathery%2Fcdk.svg)](https://badge.fury.io/js/@pathery%2Fcdk) 4 | 5 | Pathery Search is a serverless search solution built on AWS. 6 | 7 | For more information, visit the [project page][project-page]. 8 | 9 | Pathery CDK is an [AWS CDK][aws-cdk] construct that packages the infrastructure required to deploy Pathery Search. 
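A minimal usage sketch, adapted from the example apps in this repository (field kinds and flags follow the `FieldConfig` types in `src/config.ts`):

```typescript
import { App } from "aws-cdk-lib";
import { PatheryStack } from "@pathery/cdk";

const app = new App();

// Indexes whose names begin with "book-index-v1-" use this field configuration.
new PatheryStack(app, "pathery-dev", {
  config: {
    indexes: [
      {
        prefix: "book-index-v1-",
        fields: [
          { name: "title", flags: ["TEXT"], kind: "text" },
          { name: "author", flags: ["TEXT"], kind: "text" },
        ],
      },
    ],
  },
});
```

Running `cdk deploy` on an app containing this stack provisions the search API and its supporting AWS resources.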
10 | 11 | [aws-cdk]: https://github.com/aws/aws-cdk 12 | [project-page]: https://github.com/tvanhens/pathery 13 | -------------------------------------------------------------------------------- /packages/pathery-cdk/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@pathery/cdk", 3 | "publishConfig": { 4 | "access": "public" 5 | }, 6 | "version": "0.2.5", 7 | "license": "MIT", 8 | "description": "AWS CDK Construct for Pathery Serverless Search.", 9 | "keywords": [ 10 | "aws", 11 | "cdk", 12 | "serverless", 13 | "search", 14 | "full-text" 15 | ], 16 | "author": "Tyler van Hensbergen", 17 | "main": "lib/index.js", 18 | "types": "lib/index.d.ts", 19 | "files": [ 20 | "target", 21 | "lib" 22 | ], 23 | "scripts": { 24 | "build:pack-deps": "mkdir -p target && cp -r ../../target/lambda/* target", 25 | "build": "npm run build:pack-deps && tsc" 26 | }, 27 | "peerDependencies": { 28 | "aws-cdk-lib": "^2.50.0", 29 | "constructs": "^10.1.155" 30 | }, 31 | "directories": { 32 | "lib": "lib" 33 | }, 34 | "devDependencies": {}, 35 | "repository": { 36 | "type": "git", 37 | "url": "git+https://github.com/tvanhens/pathery.git" 38 | }, 39 | "bugs": { 40 | "url": "https://github.com/tvanhens/pathery/issues" 41 | }, 42 | "homepage": "https://github.com/tvanhens/pathery#readme" 43 | } 44 | -------------------------------------------------------------------------------- /packages/pathery-cdk/src/config.ts: -------------------------------------------------------------------------------- 1 | export interface FieldConfig { 2 | /** 3 | * The name of the field to index. 4 | * 5 | * This must match the object key name of objects being indexed. 6 | */ 7 | name: string; 8 | 9 | /** 10 | * The kind of field. 11 | * 12 | * Kind descriptions: 13 | * 14 | * `text` - Indexes field values as `string`. 15 | * 16 | * `date` - Indexes field values as ints but serialized as ISO 80601 strings in transit. 17 | */ 18 | kind: K; 19 | 20 | /** 21 | * Flags to add additional indexing capabilities. 22 | * 23 | * Flag descriptions: 24 | * 25 | * 26 | * `TEXT` - (only for `text`) Marks this field for full-text indexing. 27 | * 28 | * `STRING` - (only for `text`) Marks this field for exact-string indexing. 29 | * 30 | * `INDEXED` - (only for `date`) Marks this field for ordered search indexing. 31 | */ 32 | flags: Flags[]; 33 | } 34 | 35 | export type TextFieldConfig = FieldConfig<"text", "STRING" | "TEXT" | "FAST">; 36 | 37 | export type DateFieldConfig = FieldConfig<"date", "INDEXED" | "FAST">; 38 | 39 | export type IntegerFieldConfig = FieldConfig<"i64", "INDEXED" | "FAST">; 40 | 41 | export type JsonFieldConfig = FieldConfig<"json", "TEXT">; 42 | 43 | export type IndexFieldConfig = 44 | | TextFieldConfig 45 | | DateFieldConfig 46 | | IntegerFieldConfig 47 | | JsonFieldConfig; 48 | 49 | export interface IndexConfig { 50 | /** 51 | * Prefix matcher for index name. 52 | * 53 | * Indexes that start with `prefix` will use the fields schema and configuration specified in this object. 54 | * 55 | * For example: 56 | * 57 | * ```ts 58 | * { prefix: `book-index-`, ... } 59 | * ``` 60 | * 61 | * will cause indexes named `book-index-1` and `book-index-foo` to match. 62 | */ 63 | prefix: string; 64 | 65 | /** 66 | * List of field configurations for the index. 67 | * 68 | * Documents must have fields that match the fields specified in this configuration in order to be indexed. 69 | * Fields which are not included in the list of fields will be ignored. 
70 | * 71 | * @example 72 | * String text field config: 73 | * 74 | * ```ts 75 | * { 76 | * name: "isbn", 77 | * kind: "text", 78 | * // Note "STRING" here which indexes the field as one string (e.g. no splitting). 79 | * flags: ["STRING"] 80 | * } 81 | * ``` 82 | * 83 | * @example 84 | * Full-text text field config: 85 | * 86 | * ```ts 87 | * { 88 | * name: "description", 89 | * kind: "text", 90 | * // Note "TEXT" flag which indexes the field as a full-text field splitting on characters such as spaces. 91 | * flags: ["TEXT"] 92 | * } 93 | * ``` 94 | */ 95 | fields: IndexFieldConfig[]; 96 | } 97 | 98 | export interface PatheryConfig { 99 | /** 100 | * List of index configurations. 101 | */ 102 | indexes: IndexConfig[]; 103 | } 104 | -------------------------------------------------------------------------------- /packages/pathery-cdk/src/index.ts: -------------------------------------------------------------------------------- 1 | import { PatheryStack } from "./pathery-stack"; 2 | 3 | export { PatheryStack }; 4 | -------------------------------------------------------------------------------- /packages/pathery-cdk/src/pathery-dashboard.ts: -------------------------------------------------------------------------------- 1 | import { Construct } from "constructs"; 2 | import { 3 | Column, 4 | Dashboard, 5 | GraphWidget, 6 | LogQueryWidget, 7 | MathExpression, 8 | Row, 9 | Shading, 10 | TextWidget, 11 | } from "aws-cdk-lib/aws-cloudwatch"; 12 | import { RustFunction } from "./rust-function"; 13 | import { Duration, Stack } from "aws-cdk-lib"; 14 | 15 | export interface PatheryDashboardProps { 16 | indexWriterWorker: RustFunction; 17 | } 18 | 19 | export class PatheryDashboard extends Construct { 20 | constructor(scope: Construct, id: string, props: PatheryDashboardProps) { 21 | super(scope, id); 22 | 23 | let stack = Stack.of(this); 24 | 25 | const dashboard = new Dashboard(this, "Resource", { 26 | dashboardName: `Pathery-${stack.stackName}`, 27 | }); 28 | 29 | let functions = stack.node 30 | .findAll() 31 | .filter((c): c is RustFunction => c instanceof RustFunction); 32 | 33 | let successRate = new MathExpression({ 34 | expression: "100 - ((errors / invocations) * 100)", 35 | period: Duration.minutes(1), 36 | usingMetrics: { 37 | errors: props.indexWriterWorker.metricErrors({ 38 | statistic: "sum", 39 | }), 40 | invocations: props.indexWriterWorker.metricInvocations({ 41 | statistic: "sum", 42 | }), 43 | }, 44 | color: "#72bf6a", 45 | label: "Success Rate", 46 | }); 47 | 48 | dashboard.addWidgets( 49 | new LogQueryWidget({ 50 | title: "Errors", 51 | logGroupNames: functions.map((f) => f.logGroup.logGroupName), 52 | queryLines: [ 53 | "fields @timestamp, @log, fields.message", 54 | "filter level = 'ERROR'", 55 | ], 56 | width: 24, 57 | }), 58 | new Column( 59 | new TextWidget({ 60 | markdown: "# IndexWriterWorker", 61 | width: 24, 62 | height: 1, 63 | }), 64 | new Row( 65 | new GraphWidget({ 66 | liveData: true, 67 | title: "IndexWriterWorker Execution", 68 | width: 12, 69 | left: [ 70 | props.indexWriterWorker.metricDuration({ 71 | period: Duration.minutes(1), 72 | statistic: "max", 73 | }), 74 | ], 75 | leftYAxis: { 76 | min: 0, 77 | label: "Latency (ms)", 78 | showUnits: false, 79 | }, 80 | right: [successRate], 81 | rightYAxis: { 82 | min: 0, 83 | max: 100, 84 | label: "Success Rate (%)", 85 | showUnits: false, 86 | }, 87 | leftAnnotations: [ 88 | { 89 | value: 90 | (props.indexWriterWorker.timeout?.toMilliseconds() ?? 
3000) * 91 | 0.75, 92 | fill: Shading.ABOVE, 93 | color: "#e6b400", 94 | label: "Timeout Warning", 95 | }, 96 | { 97 | value: 98 | props.indexWriterWorker.timeout?.toMilliseconds() ?? 3000, 99 | fill: Shading.ABOVE, 100 | color: "#f44336", 101 | label: "Timeout", 102 | }, 103 | ], 104 | }) 105 | ) 106 | ) 107 | ); 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /packages/pathery-cdk/src/pathery-stack.ts: -------------------------------------------------------------------------------- 1 | import { 2 | Stack, 3 | aws_lambda, 4 | CfnOutput, 5 | Duration, 6 | StackProps, 7 | } from "aws-cdk-lib"; 8 | import { 9 | ApiKey, 10 | EndpointType, 11 | LambdaIntegration, 12 | RestApi, 13 | } from "aws-cdk-lib/aws-apigateway"; 14 | import { 15 | GatewayVpcEndpointAwsService, 16 | InterfaceVpcEndpointAwsService, 17 | SubnetType, 18 | Vpc, 19 | } from "aws-cdk-lib/aws-ec2"; 20 | import { FileSystem } from "aws-cdk-lib/aws-efs"; 21 | import { Function, LayerVersion } from "aws-cdk-lib/aws-lambda"; 22 | import { Architecture, Code, Runtime } from "aws-cdk-lib/aws-lambda"; 23 | import { SqsEventSource } from "aws-cdk-lib/aws-lambda-event-sources"; 24 | import { IQueue, Queue } from "aws-cdk-lib/aws-sqs"; 25 | import { Construct } from "constructs"; 26 | import { PatheryConfig } from "./config"; 27 | import * as fs from "fs"; 28 | import { RustFunction } from "./rust-function"; 29 | import { PatheryDashboard } from "./pathery-dashboard"; 30 | import { 31 | AttributeType, 32 | BillingMode, 33 | ITable, 34 | Table, 35 | } from "aws-cdk-lib/aws-dynamodb"; 36 | 37 | export interface PatheryStackProps extends StackProps { 38 | config: PatheryConfig; 39 | 40 | /** 41 | * IndexWriter configuration overrides. 42 | */ 43 | indexWriter?: { 44 | /** 45 | * IndexWriter Lambda memorySize. 46 | * 47 | * @default 2048 48 | */ 49 | memorySize?: number; 50 | 51 | /** 52 | * IndexWriter Lambda timeout duration. 53 | * 54 | * @default Duration.minutes(1) 55 | */ 56 | timeout?: Duration; 57 | }; 58 | 59 | /** 60 | * QueryHandler configuration overrides. 61 | */ 62 | queryHandler?: { 63 | /** 64 | * IndexWriter Lambda memorySize. 
65 | * 66 | * @default 3008 67 | */ 68 | memorySize?: number; 69 | }; 70 | } 71 | 72 | export class PatheryStack extends Stack { 73 | readonly apiKey: ApiKey; 74 | 75 | readonly apiGateway: RestApi; 76 | 77 | private readonly table: ITable; 78 | 79 | private indexWriterQueue: IQueue; 80 | 81 | private deleteQueue: IQueue; 82 | 83 | constructor(scope: Construct, id: string, props: PatheryStackProps) { 84 | super(scope, id, props); 85 | 86 | this.table = new Table(this, "DataTable", { 87 | billingMode: BillingMode.PAY_PER_REQUEST, 88 | partitionKey: { 89 | name: "pk", 90 | type: AttributeType.STRING, 91 | }, 92 | sortKey: { 93 | name: "sk", 94 | type: AttributeType.STRING, 95 | }, 96 | timeToLiveAttribute: "__ttl", 97 | }); 98 | 99 | this.deleteQueue = new Queue(this, "DeleteQueue", { 100 | deliveryDelay: Duration.minutes(15), 101 | visibilityTimeout: Duration.minutes(2), 102 | }); 103 | 104 | this.indexWriterQueue = new Queue(this, "IndexWriterQueue", { 105 | fifo: true, 106 | contentBasedDeduplication: true, 107 | }); 108 | 109 | const vpc = new Vpc(this, "Vpc", { 110 | subnetConfiguration: [ 111 | { 112 | cidrMask: 28, 113 | name: "isolated", 114 | subnetType: SubnetType.PRIVATE_ISOLATED, 115 | }, 116 | ], 117 | }); 118 | vpc.addGatewayEndpoint("S3Endpoint", { 119 | service: GatewayVpcEndpointAwsService.S3, 120 | }); 121 | vpc.addGatewayEndpoint("DynamoEndpoint", { 122 | service: GatewayVpcEndpointAwsService.DYNAMODB, 123 | }); 124 | const sqsEndpoint = vpc.addInterfaceEndpoint("SqsGateway", { 125 | service: InterfaceVpcEndpointAwsService.SQS, 126 | }); 127 | sqsEndpoint.connections.allowDefaultPortFromAnyIpv4(); 128 | const lambdaEndpoint = vpc.addInterfaceEndpoint("LambdaEndpoint", { 129 | service: InterfaceVpcEndpointAwsService.LAMBDA, 130 | }); 131 | lambdaEndpoint.connections.allowDefaultPortFromAnyIpv4(); 132 | 133 | const efs = new FileSystem(this, "Filesystem", { 134 | vpc, 135 | }); 136 | 137 | let accessPoint = efs.addAccessPoint("ReadWrite", { 138 | createAcl: { 139 | ownerGid: "1001", 140 | ownerUid: "1001", 141 | permissions: "750", 142 | }, 143 | posixUser: { 144 | uid: "1001", 145 | gid: "1001", 146 | }, 147 | path: "/pathery-data", 148 | }); 149 | 150 | fs.mkdirSync(".pathery/layer/pathery", { recursive: true }); 151 | fs.writeFileSync( 152 | ".pathery/layer/pathery/config.json", 153 | JSON.stringify(props.config) 154 | ); 155 | let configLayer = new LayerVersion(this, "config-layer", { 156 | code: Code.fromAsset(".pathery/layer"), 157 | compatibleArchitectures: [Architecture.ARM_64], 158 | compatibleRuntimes: [Runtime.PROVIDED_AL2], 159 | }); 160 | 161 | const postIndex = new RustFunction(this, "post-index"); 162 | postIndex.addLayers(configLayer); 163 | this.indexWriterProducer(postIndex); 164 | 165 | const batchIndex = new RustFunction(this, "batch-index"); 166 | batchIndex.addLayers(configLayer); 167 | this.indexWriterProducer(batchIndex); 168 | 169 | const queryIndexPartition = new RustFunction( 170 | this, 171 | "query-index-partition-fn", 172 | { 173 | memorySize: props.queryHandler?.memorySize ?? 
3008, 174 | timeout: Duration.seconds(5), 175 | vpc, 176 | vpcSubnets: { 177 | subnets: vpc.isolatedSubnets, 178 | }, 179 | filesystem: aws_lambda.FileSystem.fromEfsAccessPoint( 180 | accessPoint, 181 | "/mnt/pathery-data" 182 | ), 183 | } 184 | ); 185 | queryIndexPartition.addLayers(configLayer); 186 | this.table.grantReadData(queryIndexPartition); 187 | queryIndexPartition.addEnvironment("DATA_TABLE_NAME", this.table.tableName); 188 | queryIndexPartition.addEnvironment( 189 | "ASYNC_DELETE_QUEUE_URL", 190 | this.deleteQueue.queueUrl 191 | ); 192 | 193 | const queryIndex = new RustFunction(this, "query-index", { 194 | memorySize: props.queryHandler?.memorySize ?? 3008, 195 | timeout: Duration.seconds(5), 196 | vpc, 197 | vpcSubnets: { 198 | subnets: vpc.isolatedSubnets, 199 | }, 200 | filesystem: aws_lambda.FileSystem.fromEfsAccessPoint( 201 | accessPoint, 202 | "/mnt/pathery-data" 203 | ), 204 | }); 205 | queryIndex.addLayers(configLayer); 206 | this.table.grantReadData(queryIndex); 207 | queryIndex.addEnvironment("DATA_TABLE_NAME", this.table.tableName); 208 | queryIndex.addEnvironment( 209 | "ASYNC_DELETE_QUEUE_URL", 210 | this.deleteQueue.queueUrl 211 | ); 212 | queryIndexPartition.grantInvoke(queryIndex); 213 | queryIndex.addEnvironment( 214 | "QUERY_INDEX_PARTITION_NAME", 215 | queryIndexPartition.functionName 216 | ); 217 | 218 | const statsIndex = new RustFunction(this, "stats-index", { 219 | vpc, 220 | vpcSubnets: { 221 | subnets: vpc.isolatedSubnets, 222 | }, 223 | filesystem: aws_lambda.FileSystem.fromEfsAccessPoint( 224 | accessPoint, 225 | "/mnt/pathery-data" 226 | ), 227 | }); 228 | statsIndex.addLayers(configLayer); 229 | // FIXME: This doesn't actually get used but is required to be 230 | // set because of some tangled internal dependencies. 
231 | statsIndex.addEnvironment( 232 | "ASYNC_DELETE_QUEUE_URL", 233 | this.deleteQueue.queueUrl 234 | ); 235 | 236 | const deleteDoc = new RustFunction(this, "delete-doc"); 237 | deleteDoc.addLayers(configLayer); 238 | this.indexWriterProducer(deleteDoc); 239 | 240 | const api = new RestApi(this, "PatheryApi", { 241 | restApiName: id, 242 | endpointConfiguration: { 243 | types: [EndpointType.REGIONAL], 244 | }, 245 | defaultMethodOptions: { 246 | apiKeyRequired: true, 247 | }, 248 | }); 249 | 250 | this.apiGateway = api; 251 | 252 | const apiKey = new ApiKey(this, "DefaultApiKey", {}); 253 | 254 | const plan = api.addUsagePlan("DefaultPlan", { 255 | apiStages: [ 256 | { 257 | api, 258 | stage: api.deploymentStage, 259 | }, 260 | ], 261 | }); 262 | 263 | plan.addApiKey(apiKey); 264 | 265 | this.apiKey = apiKey; 266 | 267 | const indexRoute = api.root.addResource("index"); 268 | 269 | const indexSingleRoute = indexRoute.addResource("{index_id}"); 270 | 271 | indexSingleRoute.addMethod("POST", new LambdaIntegration(postIndex)); 272 | 273 | const queryActionRoute = indexSingleRoute.addResource("query"); 274 | 275 | queryActionRoute.addMethod("POST", new LambdaIntegration(queryIndex)); 276 | 277 | const statsActionRoute = indexSingleRoute.addResource("stats"); 278 | 279 | statsActionRoute.addMethod("GET", new LambdaIntegration(statsIndex)); 280 | 281 | const batchIndexRoute = indexSingleRoute.addResource("batch"); 282 | 283 | batchIndexRoute.addMethod("POST", new LambdaIntegration(batchIndex)); 284 | 285 | const documentRoute = indexSingleRoute.addResource("doc"); 286 | 287 | const documentSingleRoute = documentRoute.addResource("{doc_id}"); 288 | 289 | documentSingleRoute.addMethod("DELETE", new LambdaIntegration(deleteDoc)); 290 | 291 | const indexWriterWorker = new RustFunction(this, "index-writer-worker", { 292 | memorySize: props.indexWriter?.memorySize ?? 2048, 293 | timeout: props.indexWriter?.timeout ?? Duration.minutes(1), 294 | vpc, 295 | vpcSubnets: { 296 | subnets: vpc.isolatedSubnets, 297 | }, 298 | filesystem: aws_lambda.FileSystem.fromEfsAccessPoint( 299 | accessPoint, 300 | "/mnt/pathery-data" 301 | ), 302 | }); 303 | indexWriterWorker.addLayers(configLayer); 304 | indexWriterWorker.addEventSource( 305 | new SqsEventSource(this.indexWriterQueue, { 306 | batchSize: 10, 307 | }) 308 | ); 309 | this.table.grantReadWriteData(indexWriterWorker); 310 | indexWriterWorker.addEnvironment("DATA_TABLE_NAME", this.table.tableName); 311 | this.deleteQueue.grantSendMessages(indexWriterWorker); 312 | indexWriterWorker.addEnvironment( 313 | "ASYNC_DELETE_QUEUE_URL", 314 | this.deleteQueue.queueUrl 315 | ); 316 | 317 | const asyncDeleteWorker = new RustFunction(this, "async-delete-worker", { 318 | memorySize: 2048, 319 | timeout: props.indexWriter?.timeout ?? 
Duration.minutes(1), 320 | vpc, 321 | vpcSubnets: { 322 | subnets: vpc.isolatedSubnets, 323 | }, 324 | filesystem: aws_lambda.FileSystem.fromEfsAccessPoint( 325 | accessPoint, 326 | "/mnt/pathery-data" 327 | ), 328 | }); 329 | asyncDeleteWorker.addLayers(configLayer); 330 | asyncDeleteWorker.addEventSource( 331 | new SqsEventSource(this.deleteQueue, { 332 | batchSize: 10, 333 | }) 334 | ); 335 | 336 | new PatheryDashboard(this, "Dashboard", { 337 | indexWriterWorker, 338 | }); 339 | 340 | new CfnOutput(this, "ApiKeyOutput", { 341 | value: apiKey.keyId, 342 | }); 343 | } 344 | 345 | private indexWriterProducer(lambda: Function) { 346 | this.table.grantWriteData(lambda); 347 | lambda.addEnvironment("DATA_TABLE_NAME", this.table.tableName); 348 | 349 | this.indexWriterQueue.grantSendMessages(lambda); 350 | lambda.addEnvironment( 351 | "INDEX_WRITER_QUEUE_URL", 352 | this.indexWriterQueue.queueUrl 353 | ); 354 | } 355 | } 356 | -------------------------------------------------------------------------------- /packages/pathery-cdk/src/rust-function.ts: -------------------------------------------------------------------------------- 1 | import { 2 | Architecture, 3 | Code, 4 | Function, 5 | FunctionProps, 6 | Runtime, 7 | } from "aws-cdk-lib/aws-lambda"; 8 | import { RetentionDays } from "aws-cdk-lib/aws-logs"; 9 | import { Construct } from "constructs"; 10 | import * as path from "path"; 11 | 12 | export class RustFunction extends Function { 13 | constructor(scope: Construct, id: string, props?: Partial) { 14 | const lambdaAssetPath = path.join(__dirname, "..", "target", id); 15 | super(scope, id, { 16 | ...props, 17 | code: Code.fromAsset(lambdaAssetPath), 18 | handler: "default", 19 | runtime: Runtime.PROVIDED_AL2, 20 | architecture: Architecture.ARM_64, 21 | logRetention: RetentionDays.THREE_DAYS, 22 | }); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /packages/pathery-cdk/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es2016", 4 | "module": "commonjs", 5 | "esModuleInterop": true, 6 | "forceConsistentCasingInFileNames": true, 7 | "strict": true, 8 | "skipLibCheck": true, 9 | "outDir": "lib", 10 | "declaration": true, 11 | "baseUrl": "./src", 12 | "declarationMap": true 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /packages/pathery/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | edition = "2021" 3 | name = "pathery" 4 | version = "0.1.0" 5 | 6 | [dependencies] 7 | anyhow = "1.0.66" 8 | async-trait = "0.1.58" 9 | aws-config = "0.51.0" 10 | aws-sdk-dynamodb = "0.21.0" 11 | aws-sdk-lambda = "0.21.0" 12 | aws-sdk-sqs = "0.21.0" 13 | aws-smithy-types = "0.51.0" 14 | aws_lambda_events = "0.7.2" 15 | base64 = "0.21.0" 16 | chrono = "0.4.23" 17 | http = "0.2.8" 18 | lambda_http = {version = "0.7", default-features = false, features = ["apigw_rest"]} 19 | lambda_runtime = "0.7" 20 | serde = {version = "1.0.147", features = ["derive"]} 21 | serde_dynamo = {version = "4", features = ["aws-sdk-dynamodb+0_21"]} 22 | serde_json = "1.0.87" 23 | tantivy = {version = "0.18.1"} 24 | tantivy-common = "0.3.0" 25 | thiserror = "1.0.37" 26 | tokio = {version = "1", features = ["full"]} 27 | tracing = {version = "0.1", features = ["log"]} 28 | tracing-subscriber = {version = "0.3", default-features = false, features = ["fmt", "json", "std"]} 29 | uuid = 
"1.2.1" 30 | zstd = "0.12.3" 31 | -------------------------------------------------------------------------------- /packages/pathery/src/bin/async-delete-worker.rs: -------------------------------------------------------------------------------- 1 | use pathery::lambda; 2 | use pathery::lambda::lambda_runtime::{run, service_fn}; 3 | use pathery::lambda::sqs; 4 | use pathery::worker::async_delete::handle_event; 5 | 6 | #[tokio::main] 7 | async fn main() -> Result<(), sqs::Error> { 8 | lambda::init_tracing(); 9 | 10 | run(service_fn(|event| handle_event(event))).await 11 | } 12 | -------------------------------------------------------------------------------- /packages/pathery/src/bin/batch-index.rs: -------------------------------------------------------------------------------- 1 | use pathery::service::index::BatchIndexService; 2 | use pathery::service::start_service; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<(), lambda_http::Error> { 6 | let service = BatchIndexService::create().await; 7 | 8 | start_service(&service).await 9 | } 10 | -------------------------------------------------------------------------------- /packages/pathery/src/bin/delete-doc.rs: -------------------------------------------------------------------------------- 1 | use pathery::service::doc::DeleteDocService; 2 | use pathery::service::start_service; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<(), lambda_http::Error> { 6 | let service = DeleteDocService::create().await; 7 | 8 | start_service(&service).await 9 | } 10 | -------------------------------------------------------------------------------- /packages/pathery/src/bin/index-writer-worker.rs: -------------------------------------------------------------------------------- 1 | use pathery::index::LambdaIndexLoader; 2 | use pathery::lambda; 3 | use pathery::lambda::lambda_runtime::{run, service_fn}; 4 | use pathery::lambda::sqs; 5 | use pathery::store::document::DDBDocumentStore; 6 | use pathery::worker::index_writer::handle_event; 7 | 8 | #[tokio::main] 9 | async fn main() -> Result<(), sqs::Error> { 10 | lambda::init_tracing(); 11 | 12 | let document_store = DDBDocumentStore::create(None).await; 13 | let index_loader = LambdaIndexLoader::create().await; 14 | 15 | run(service_fn(|event| { 16 | handle_event(&document_store, &index_loader, event) 17 | })) 18 | .await 19 | } 20 | -------------------------------------------------------------------------------- /packages/pathery/src/bin/post-index.rs: -------------------------------------------------------------------------------- 1 | use pathery::service::index::PostIndexService; 2 | use pathery::service::start_service; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<(), lambda_http::Error> { 6 | let service = PostIndexService::create().await; 7 | 8 | start_service(&service).await 9 | } 10 | -------------------------------------------------------------------------------- /packages/pathery/src/bin/query-index-partition-fn.rs: -------------------------------------------------------------------------------- 1 | use pathery::function::query_index_partition::handle_event; 2 | use pathery::index::LambdaIndexLoader; 3 | use pathery::lambda; 4 | use pathery::lambda::lambda_runtime::{run, service_fn}; 5 | use pathery::lambda::sqs; 6 | 7 | #[tokio::main] 8 | async fn main() -> Result<(), sqs::Error> { 9 | lambda::init_tracing(); 10 | 11 | let index_loader = LambdaIndexLoader::create().await; 12 | 13 | run(service_fn(|event| handle_event(&index_loader, event))).await 14 | } 15 | 
-------------------------------------------------------------------------------- /packages/pathery/src/bin/query-index.rs: -------------------------------------------------------------------------------- 1 | use pathery::service::index::QueryIndexService; 2 | use pathery::service::start_service; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<(), lambda_http::Error> { 6 | let service = QueryIndexService::create().await; 7 | 8 | start_service(&service).await 9 | } 10 | -------------------------------------------------------------------------------- /packages/pathery/src/bin/stats-index.rs: -------------------------------------------------------------------------------- 1 | use pathery::service::index::StatsIndexService; 2 | use pathery::service::start_service; 3 | 4 | #[tokio::main] 5 | async fn main() -> Result<(), lambda_http::Error> { 6 | let service = StatsIndexService::create().await; 7 | 8 | start_service(&service).await 9 | } 10 | -------------------------------------------------------------------------------- /packages/pathery/src/directory.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::path::{Path, PathBuf}; 3 | use std::sync::Arc; 4 | 5 | use tantivy::directory::error::OpenDirectoryError; 6 | use tantivy::directory::{DirectoryLock, MmapDirectory}; 7 | use tantivy::Directory; 8 | use tokio::runtime::Handle; 9 | 10 | use crate::pagination::SegmentMeta; 11 | use crate::worker::async_delete::client::AsyncDeleteClient; 12 | use crate::worker::async_delete::job::AsyncDeleteJob; 13 | 14 | struct NoopLockGuard; 15 | 16 | /// Directory that wraps MmapDirectory without using a lockfile. 17 | /// 18 | /// Using a FIFO SQS queue for orchestrating indexing removes the need for a lockfile. 19 | #[derive(Clone, Debug)] 20 | pub struct PatheryDirectory { 21 | directory_path: PathBuf, 22 | 23 | segments: Option>, 24 | 25 | inner: MmapDirectory, 26 | 27 | async_delete_client: Arc, 28 | 29 | handle: Handle, 30 | } 31 | 32 | impl PatheryDirectory { 33 | pub fn open
<P>
( 34 | directory_path: P, 35 | async_delete_client: &Arc, 36 | segments: Option>, 37 | ) -> Result 38 | where 39 | P: AsRef, 40 | { 41 | Ok(PatheryDirectory { 42 | directory_path: directory_path.as_ref().to_owned(), 43 | segments, 44 | inner: MmapDirectory::open(directory_path)?, 45 | async_delete_client: Arc::clone(async_delete_client), 46 | handle: Handle::try_current().unwrap(), 47 | }) 48 | } 49 | } 50 | 51 | impl Directory for PatheryDirectory { 52 | fn get_file_handle( 53 | &self, 54 | path: &std::path::Path, 55 | ) -> Result, tantivy::directory::error::OpenReadError> 56 | { 57 | self.inner.get_file_handle(path) 58 | } 59 | 60 | fn delete(&self, path: &std::path::Path) -> Result<(), tantivy::directory::error::DeleteError> { 61 | let path = self.directory_path.join(path.to_path_buf()); 62 | let job = AsyncDeleteJob::fs_delete(path); 63 | self.handle 64 | .block_on(self.async_delete_client.submit_job(job)) 65 | .expect("Message should queue successfully"); 66 | Ok(()) 67 | } 68 | 69 | fn exists( 70 | &self, 71 | path: &std::path::Path, 72 | ) -> Result { 73 | self.inner.exists(path) 74 | } 75 | 76 | fn open_write( 77 | &self, 78 | path: &std::path::Path, 79 | ) -> Result { 80 | self.inner.open_write(path) 81 | } 82 | 83 | fn atomic_read( 84 | &self, 85 | path: &std::path::Path, 86 | ) -> Result, tantivy::directory::error::OpenReadError> { 87 | let result = self.inner.atomic_read(path)?; 88 | 89 | // check that we are returning meta.json 90 | if path == Path::new("meta.json") { 91 | if let Some(segments) = &self.segments { 92 | let mut meta: HashMap = 93 | serde_json::from_slice(&result[..]).expect("meta.json should be parsable"); 94 | 95 | // let segments = meta 96 | // .get("segments") 97 | // .and_then(|s| s.as_array()) 98 | // .expect("segments should be set"); 99 | 100 | // let filtered_segments: Vec<_> = segments 101 | // .iter() 102 | // .enumerate() 103 | // .filter(|(idx, _)| (idx + self.partition_n) % self.total_partitions == 0) 104 | // .map(|(_, v)| v.to_owned()) 105 | // .collect(); 106 | let segments = serde_json::to_value(segments).unwrap(); 107 | 108 | meta.insert(String::from("segments"), segments); 109 | 110 | return Ok(serde_json::to_vec(&meta).expect("meta.json should serialize")); 111 | } 112 | } 113 | 114 | Ok(result) 115 | } 116 | 117 | fn atomic_write(&self, path: &std::path::Path, data: &[u8]) -> std::io::Result<()> { 118 | self.inner.atomic_write(path, data) 119 | } 120 | 121 | fn sync_directory(&self) -> std::io::Result<()> { 122 | self.inner.sync_directory() 123 | } 124 | 125 | fn watch( 126 | &self, 127 | watch_callback: tantivy::directory::WatchCallback, 128 | ) -> tantivy::Result { 129 | self.inner.watch(watch_callback) 130 | } 131 | 132 | fn acquire_lock( 133 | &self, 134 | _lock: &tantivy::directory::Lock, 135 | ) -> Result { 136 | Ok(DirectoryLock::from(Box::new(NoopLockGuard))) 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /packages/pathery/src/function/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod query_index_partition; 2 | -------------------------------------------------------------------------------- /packages/pathery/src/function/query_index_partition/client.rs: -------------------------------------------------------------------------------- 1 | use aws_smithy_types::Blob; 2 | 3 | use super::{PartitionQueryResponse, QueryRequest}; 4 | use crate::pagination::SegmentMeta; 5 | use crate::util; 6 | 7 | pub struct 
LambdaQueryIndexPartitionClient { 8 | function_name: String, 9 | 10 | client: aws_sdk_lambda::Client, 11 | } 12 | 13 | impl LambdaQueryIndexPartitionClient { 14 | pub async fn create() -> LambdaQueryIndexPartitionClient { 15 | let sdk_config = aws_config::load_from_env().await; 16 | let function_name = util::require_env("QUERY_INDEX_PARTITION_NAME"); 17 | 18 | LambdaQueryIndexPartitionClient { 19 | function_name, 20 | client: aws_sdk_lambda::Client::new(&sdk_config), 21 | } 22 | } 23 | 24 | pub async fn query_partition( 25 | &self, 26 | index_id: String, 27 | query: String, 28 | offset: usize, 29 | partition_n: usize, 30 | segments: Vec, 31 | ) -> PartitionQueryResponse { 32 | // TODO: Error handling and retries 33 | let request = self.client.invoke(); 34 | let request = request.function_name(&self.function_name); 35 | let input = QueryRequest { 36 | index_id, 37 | query, 38 | offset, 39 | partition_n, 40 | segments, 41 | }; 42 | let input = serde_json::to_vec(&input).expect("should serialize"); 43 | let input = Blob::new(input); 44 | let request = request.payload(input); 45 | let response = tokio::spawn(request.send()); 46 | let response = response.await.unwrap().expect("should succeed"); 47 | 48 | let payload = response.payload().expect("payload should exist"); 49 | let payload = payload.to_owned().into_inner(); 50 | let payload: PartitionQueryResponse = 51 | serde_json::from_slice(&payload).expect("payload should parse"); 52 | payload 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /packages/pathery/src/function/query_index_partition/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod client; 2 | 3 | use lambda_runtime::{Error, LambdaEvent}; 4 | use serde::{Deserialize, Serialize}; 5 | use tantivy::collector::TopDocs; 6 | use tantivy::query::QueryParser; 7 | use tantivy::schema::{Field, FieldType}; 8 | use tantivy::{DocAddress, Score}; 9 | 10 | use crate::index::IndexLoader; 11 | use crate::pagination::SegmentMeta; 12 | use crate::service::ServiceError; 13 | use crate::store::document::SearchDocRef; 14 | 15 | #[derive(Serialize, Deserialize, Debug)] 16 | pub struct QueryRequest { 17 | pub index_id: String, 18 | pub query: String, 19 | pub offset: usize, 20 | pub partition_n: usize, 21 | pub segments: Vec, 22 | } 23 | 24 | #[derive(Serialize, Deserialize, Debug, PartialEq)] 25 | pub struct PartitionSearchHit { 26 | pub doc_ref: SearchDocRef, 27 | pub score: f32, 28 | pub partition_n: usize, 29 | } 30 | 31 | #[derive(Serialize, Deserialize, Debug, PartialEq)] 32 | pub struct PartitionQueryResponse { 33 | pub matches: Vec, 34 | } 35 | 36 | pub async fn handle_event( 37 | index_loader: &dyn IndexLoader, 38 | event: LambdaEvent, 39 | ) -> Result { 40 | let body = event.payload; 41 | let index_id = body.index_id; 42 | 43 | let mut index = index_loader.load_index(&index_id, Some(body.segments))?; 44 | 45 | index.set_default_multithread_executor().unwrap(); 46 | 47 | let reader = index.reader().expect("Reader should load"); 48 | 49 | let searcher = reader.searcher(); 50 | 51 | let schema = index.schema(); 52 | 53 | let query_parser = QueryParser::for_index( 54 | &index, 55 | schema 56 | .fields() 57 | .filter_map(|(field, entry)| { 58 | if !entry.is_indexed() { 59 | return None; 60 | } 61 | match entry.field_type() { 62 | FieldType::Str(_) => Some(field), 63 | _ => None, 64 | } 65 | }) 66 | .collect::>(), 67 | ); 68 | 69 | let query = query_parser 70 | .parse_query(&body.query) 71 | 
.map_err(|err| ServiceError::invalid_request(&err.to_string()))?; 72 | 73 | let collector = TopDocs::with_limit(10).and_offset(body.offset); 74 | 75 | let top_docs: Vec<(Score, DocAddress)> = searcher 76 | .search(&query, &collector) 77 | .expect("search should succeed"); 78 | 79 | let matches: Vec<_> = top_docs 80 | .into_iter() 81 | .map(|(score, address)| { 82 | let document = searcher.doc(address).expect("doc should exist"); 83 | 84 | let named_doc = schema.to_named_doc(&document); 85 | 86 | let stored_ref = SearchDocRef::from(named_doc); 87 | 88 | PartitionSearchHit { 89 | doc_ref: stored_ref, 90 | score, 91 | partition_n: body.partition_n, 92 | } 93 | }) 94 | .collect(); 95 | 96 | if matches.len() == 0 { 97 | return Ok(PartitionQueryResponse { matches: vec![] }); 98 | } 99 | 100 | Ok(PartitionQueryResponse { matches }) 101 | } 102 | 103 | // #[cfg(test)] 104 | // mod tests { 105 | // use super::*; 106 | // use crate::test_utils::*; 107 | 108 | // fn test_service(ctx: &TestContext) -> QueryIndexService { 109 | // QueryIndexService { 110 | // document_store: Box::new(ctx.document_store().clone()), 111 | // index_loader: Box::new(ctx.index_loader().clone()), 112 | // } 113 | // } 114 | 115 | // #[tokio::test] 116 | // async fn query_default_response() { 117 | // let ctx = setup() 118 | // .with_documents( 119 | // "test", 120 | // vec![json!({ 121 | // "__id": "foobar", 122 | // "title": "hello", 123 | // "author": "world" 124 | // })], 125 | // ) 126 | // .await; 127 | 128 | // let service = test_service(&ctx); 129 | 130 | // let request = ServiceRequest::create(QueryRequest { 131 | // query: "hello".into(), 132 | // with_partition: None, 133 | // }) 134 | // .with_path_param("index_id", "test"); 135 | 136 | // let response = service.handle_request(request).await.unwrap(); 137 | 138 | // assert_eq!( 139 | // QueryResponse { 140 | // matches: vec![SearchHit { 141 | // doc: json::json!({ 142 | // "__id": ["foobar"], 143 | // "title": ["hello"], 144 | // "author": ["world"], 145 | // }), 146 | // score: 0.28768212, 147 | // snippets: json::json!({ 148 | // "title": "hello" 149 | // }) 150 | // }] 151 | // }, 152 | // response 153 | // ); 154 | // } 155 | 156 | // #[tokio::test] 157 | // async fn query_document_with_un_indexed_fields() { 158 | // let ctx = setup() 159 | // .with_documents( 160 | // "test", 161 | // vec![json!({ 162 | // "__id": "foobar", 163 | // "title": "hello", 164 | // "meta": "world" 165 | // })], 166 | // ) 167 | // .await; 168 | 169 | // let service = test_service(&ctx); 170 | 171 | // let request = ServiceRequest::create(QueryRequest { 172 | // query: "hello".into(), 173 | // with_partition: None, 174 | // }) 175 | // .with_path_param("index_id", "test"); 176 | 177 | // let response = service.handle_request(request).await.unwrap(); 178 | 179 | // assert_eq!(1, response.matches.len()); 180 | // } 181 | 182 | // #[tokio::test] 183 | // async fn query_document_with_json_field() { 184 | // let ctx = setup() 185 | // .with_documents( 186 | // "test", 187 | // vec![json!({ 188 | // "__id": "foobar", 189 | // "title": "hello", 190 | // "props": { 191 | // "foo": "bar" 192 | // } 193 | // })], 194 | // ) 195 | // .await; 196 | 197 | // let service = test_service(&ctx); 198 | 199 | // let request = ServiceRequest::create(QueryRequest { 200 | // query: "props.foo:bar".into(), 201 | // with_partition: None, 202 | // }) 203 | // .with_path_param("index_id", "test"); 204 | 205 | // let response = service.handle_request(request).await.unwrap(); 206 | 207 | // assert_eq!(1, 
response.matches.len()); 208 | // } 209 | // } 210 | -------------------------------------------------------------------------------- /packages/pathery/src/index.rs: -------------------------------------------------------------------------------- 1 | use std::fs; 2 | use std::path::Path; 3 | use std::sync::Arc; 4 | 5 | use tantivy::merge_policy::DefaultMergePolicy; 6 | use tantivy::schema::Field; 7 | use tantivy::{Index, IndexWriter}; 8 | 9 | use crate::directory::PatheryDirectory; 10 | use crate::pagination::SegmentMeta; 11 | use crate::schema::{SchemaLoader, SchemaProvider}; 12 | use crate::service::ServiceError; 13 | use crate::worker::async_delete::client::{AsyncDeleteClient, LambdaAsyncDeleteClient}; 14 | 15 | pub trait IndexLoader: Send + Sync { 16 | fn load_index( 17 | &self, 18 | index_id: &str, 19 | segments: Option>, 20 | ) -> Result; 21 | } 22 | 23 | pub struct LambdaIndexLoader { 24 | schema_loader: SchemaProvider, 25 | 26 | async_delete_client: Arc, 27 | } 28 | 29 | impl LambdaIndexLoader { 30 | pub async fn create() -> Self { 31 | let async_delete_client = LambdaAsyncDeleteClient::create(None).await; 32 | let async_delete_client = Arc::new(async_delete_client); 33 | 34 | Self { 35 | schema_loader: SchemaProvider::lambda(), 36 | async_delete_client, 37 | } 38 | } 39 | } 40 | 41 | impl IndexLoader for LambdaIndexLoader { 42 | fn load_index( 43 | &self, 44 | index_id: &str, 45 | segments: Option>, 46 | ) -> Result { 47 | let directory_path = format!("/mnt/pathery-data/{index_id}"); 48 | 49 | let mut index = if let Ok(existing_dir) = 50 | PatheryDirectory::open(&directory_path, &self.async_delete_client, segments) 51 | { 52 | Index::open(existing_dir).expect("Index should be openable") 53 | } else { 54 | fs::create_dir(&directory_path).expect("Directory should be creatable"); 55 | let schema = self.schema_loader.load_schema(index_id)?; 56 | Index::create_in_dir(Path::new(&directory_path), schema) 57 | .expect("Index should be creatable") 58 | }; 59 | 60 | index 61 | .set_default_multithread_executor() 62 | .expect("default multithread executor should succeed"); 63 | 64 | Ok(index) 65 | } 66 | } 67 | 68 | pub trait IndexExt { 69 | fn default_writer(&self) -> IndexWriter; 70 | 71 | fn id_field(&self) -> Field; 72 | } 73 | 74 | impl IndexExt for Index { 75 | fn default_writer(&self) -> IndexWriter { 76 | let writer = self 77 | .writer(100_000_000) 78 | .expect("Writer should be available"); 79 | 80 | let mut merge_policy = DefaultMergePolicy::default(); 81 | merge_policy.set_max_docs_before_merge(10_000); 82 | 83 | writer.set_merge_policy(Box::new(merge_policy)); 84 | 85 | writer 86 | } 87 | 88 | fn id_field(&self) -> Field { 89 | self.schema() 90 | .get_field("__id") 91 | .expect("__id field should exist") 92 | } 93 | } 94 | 95 | #[cfg(test)] 96 | pub mod test_util { 97 | use std::collections::HashMap; 98 | use std::sync::{Arc, Mutex}; 99 | 100 | use super::*; 101 | 102 | #[derive(Debug)] 103 | pub struct TestIndexLoader { 104 | schema_loader: SchemaProvider, 105 | 106 | table: Arc>>, 107 | } 108 | 109 | impl Clone for TestIndexLoader { 110 | fn clone(&self) -> Self { 111 | Self { 112 | schema_loader: self.schema_loader.clone(), 113 | table: self.table.clone(), 114 | } 115 | } 116 | } 117 | 118 | impl IndexLoader for TestIndexLoader { 119 | fn load_index( 120 | &self, 121 | index_id: &str, 122 | _segments: Option>, 123 | ) -> Result { 124 | let mut table = self.table.lock().unwrap(); 125 | 126 | let entry = (*table).entry(index_id.into()); 127 | 128 | let schema = 
self.schema_loader.load_schema(index_id)?; 129 | 130 | let index = entry.or_insert_with(|| Index::create_in_ram(schema)); 131 | 132 | Ok(index.clone()) 133 | } 134 | } 135 | 136 | impl TestIndexLoader { 137 | pub fn create(schema_loader: SchemaProvider) -> Self { 138 | TestIndexLoader { 139 | schema_loader, 140 | table: Arc::new(Mutex::new(HashMap::new())), 141 | } 142 | } 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /packages/pathery/src/lambda/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod sqs; 2 | 3 | pub use lambda_runtime::Error; 4 | pub use {lambda_runtime, tracing}; 5 | 6 | pub fn init_tracing() { 7 | tracing_subscriber::fmt() 8 | .json() 9 | .with_target(false) 10 | .without_time() 11 | .init(); 12 | } 13 | -------------------------------------------------------------------------------- /packages/pathery/src/lambda/sqs.rs: -------------------------------------------------------------------------------- 1 | use aws_lambda_events::event::sqs; 2 | pub use lambda_runtime::Error; 3 | use lambda_runtime::LambdaEvent; 4 | 5 | pub type SqsEvent = LambdaEvent; 6 | -------------------------------------------------------------------------------- /packages/pathery/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod directory; 2 | pub mod function; 3 | pub mod index; 4 | pub mod lambda; 5 | pub mod pagination; 6 | pub mod schema; 7 | pub mod search_doc; 8 | pub mod serialize; 9 | pub mod service; 10 | pub mod store; 11 | pub mod util; 12 | pub mod worker; 13 | 14 | pub(crate) use serde_json as json; 15 | 16 | #[cfg(test)] 17 | pub mod test_utils { 18 | pub use serde_json as json; 19 | pub use serde_json::json; 20 | 21 | use crate::index::test_util::TestIndexLoader; 22 | use crate::schema::{SchemaLoader, SchemaProvider}; 23 | use crate::search_doc::SearchDoc; 24 | use crate::store::document::test_util::TestDocumentStore; 25 | use crate::store::document::DocumentStore; 26 | use crate::worker::index_writer::client::test_utils::TestIndexWriterClient; 27 | use crate::worker::index_writer::client::IndexWriterClient; 28 | use crate::worker::index_writer::job::Job; 29 | 30 | pub struct TestContext { 31 | schema_loader: SchemaProvider, 32 | 33 | document_store: TestDocumentStore, 34 | 35 | writer_client: TestIndexWriterClient, 36 | 37 | index_loader: TestIndexLoader, 38 | } 39 | 40 | impl TestContext { 41 | pub async fn with_documents(self, index_id: &str, docs: Vec) -> TestContext { 42 | let schema = self.schema_loader.load_schema(index_id).unwrap(); 43 | let documents: Vec<_> = docs 44 | .into_iter() 45 | .map(|value| SearchDoc::from_json(&schema, value).unwrap()) 46 | .collect(); 47 | let doc_refs = self.document_store.save_documents(documents).await.unwrap(); 48 | let mut job = Job::create(index_id); 49 | for doc_ref in doc_refs { 50 | job.index_doc(doc_ref); 51 | } 52 | self.writer_client().submit_job(job).await.unwrap(); 53 | self 54 | } 55 | 56 | pub fn schema_loader(&self) -> &SchemaProvider { 57 | &self.schema_loader 58 | } 59 | 60 | pub fn document_store(&self) -> &TestDocumentStore { 61 | &self.document_store 62 | } 63 | 64 | pub fn writer_client(&self) -> &TestIndexWriterClient { 65 | &self.writer_client 66 | } 67 | 68 | pub fn index_loader(&self) -> &TestIndexLoader { 69 | &self.index_loader 70 | } 71 | } 72 | 73 | pub fn setup() -> TestContext { 74 | let config = json!({ 75 | "indexes": [ 76 | { 77 | "prefix": "test", 78 | 
"fields": [ 79 | { 80 | "name": "title", 81 | "kind": "text", 82 | "flags": ["TEXT"] 83 | }, 84 | { 85 | "name": "author", 86 | "kind": "text", 87 | "flags": ["TEXT"] 88 | }, 89 | { 90 | "name": "isbn", 91 | "kind": "text", 92 | "flags": ["STRING"] 93 | }, 94 | { 95 | "name": "date_added", 96 | "kind": "date", 97 | "flags": ["INDEXED", "FAST"] 98 | }, 99 | { 100 | "name": "meta", 101 | "kind": "text", 102 | "flags": [] 103 | }, 104 | { 105 | "name": "year", 106 | "kind": "i64", 107 | "flags": ["INDEXED"] 108 | }, 109 | { 110 | "name": "props", 111 | "kind": "json", 112 | "flags": ["TEXT"] 113 | } 114 | ] 115 | } 116 | ] 117 | }); 118 | 119 | let schema_loader = SchemaProvider::from_json(config); 120 | 121 | let index_loader = TestIndexLoader::create(schema_loader.clone()); 122 | 123 | let document_store = TestDocumentStore::create(); 124 | 125 | TestContext { 126 | schema_loader, 127 | writer_client: TestIndexWriterClient::create( 128 | index_loader.clone(), 129 | document_store.clone(), 130 | ), 131 | document_store, 132 | index_loader, 133 | } 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /packages/pathery/src/pagination.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | 3 | use base64::Engine; 4 | use serde::{Deserialize, Serialize}; 5 | use serde_json::Value; 6 | 7 | #[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] 8 | pub struct SegmentMeta { 9 | segment_id: String, 10 | 11 | #[serde(flatten)] 12 | extra: HashMap, 13 | } 14 | 15 | #[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] 16 | pub struct PaginationToken { 17 | query: String, 18 | segments: Vec, 19 | partition_state: Vec, 20 | } 21 | 22 | impl PaginationToken { 23 | pub fn new(query: T, total_partitions: usize) -> PaginationToken 24 | where T: Into { 25 | let mut partition_state: Vec = vec![]; 26 | partition_state.resize(total_partitions, 0); 27 | PaginationToken { 28 | query: query.into(), 29 | segments: vec![], 30 | partition_state, 31 | } 32 | } 33 | 34 | pub fn import_segments_json(&mut self, segments_json: Value) { 35 | let segments: Vec = serde_json::from_value(segments_json).unwrap(); 36 | self.segments = segments; 37 | } 38 | 39 | pub fn segments_for_partition(&self, n: usize) -> Vec { 40 | self.segments 41 | .iter() 42 | .enumerate() 43 | .filter(|(idx, _)| (idx + n) % self.partition_state.len() == 0) 44 | .map(|(_, x)| x.clone()) 45 | .collect() 46 | } 47 | 48 | pub fn inc_offset(&mut self, partition_n: usize) { 49 | let value = self.partition_state.get_mut(partition_n).unwrap(); 50 | *value = *value + 1; 51 | } 52 | 53 | pub fn get_offset(&self, partition_n: usize) -> usize { 54 | *self.partition_state.get(partition_n).unwrap() 55 | } 56 | 57 | pub fn get_query(&self) -> String { 58 | self.query.to_string() 59 | } 60 | 61 | pub fn serialize(&self) -> String { 62 | let json = serde_json::to_vec(self).expect("should serialize to json"); 63 | let compressed = zstd::encode_all(json.as_slice(), 20).expect("should encode"); 64 | base64::engine::general_purpose::STANDARD.encode(compressed) 65 | } 66 | 67 | pub fn parse(from: T) -> PaginationToken 68 | where T: Into { 69 | let decoded = base64::engine::general_purpose::STANDARD 70 | .decode(from.into()) 71 | .unwrap(); 72 | let decompressed = zstd::decode_all(decoded.as_slice()).unwrap(); 73 | serde_json::from_slice(&decompressed).unwrap() 74 | } 75 | } 76 | 77 | #[cfg(test)] 78 | mod tests { 79 | use serde_json::json; 80 | 81 | 
use super::PaginationToken; 82 | 83 | #[test] 84 | fn test_round_trip() { 85 | let mut token = PaginationToken::new("foobar", 2); 86 | token.import_segments_json(json!([ 87 | { 88 | "segment_id": "abc123", 89 | "foo": "bar" 90 | } 91 | ])); 92 | 93 | let token_str = token.serialize(); 94 | let parsed = PaginationToken::parse(token_str); 95 | 96 | println!("{:?}", parsed); 97 | 98 | assert_eq!(token, parsed); 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /packages/pathery/src/schema.rs: -------------------------------------------------------------------------------- 1 | use std::fs; 2 | 3 | use serde::{Deserialize, Serialize}; 4 | use serde_json as json; 5 | use tantivy::schema::{self, DocParsingError, Field, NumericOptions, Schema, TextOptions}; 6 | use thiserror::Error; 7 | 8 | use crate::service::ServiceError; 9 | 10 | #[derive(Serialize, Deserialize, Debug, Clone)] 11 | pub enum TextFieldOption { 12 | TEXT, 13 | STRING, 14 | FAST, 15 | } 16 | 17 | #[derive(Serialize, Deserialize, Debug, Clone)] 18 | pub enum NumericFieldOption { 19 | INDEXED, 20 | FAST, 21 | } 22 | 23 | #[derive(Serialize, Deserialize, Debug, Clone)] 24 | pub enum JsonFieldOption { 25 | TEXT, 26 | } 27 | 28 | #[derive(Serialize, Deserialize, Debug, Clone)] 29 | #[serde(tag = "kind")] 30 | pub enum FieldConfig { 31 | #[serde(rename = "text")] 32 | TextFieldConfig { 33 | name: String, 34 | flags: Vec, 35 | }, 36 | #[serde(rename = "date")] 37 | DateFieldConfig { 38 | name: String, 39 | flags: Vec, 40 | }, 41 | #[serde(rename = "i64")] 42 | IntegerFieldConfig { 43 | name: String, 44 | flags: Vec, 45 | }, 46 | #[serde(rename = "json")] 47 | JsonFieldConfig { 48 | name: String, 49 | flags: Vec, 50 | }, 51 | } 52 | 53 | #[derive(Serialize, Deserialize, Debug, Clone)] 54 | pub struct IndexConfig { 55 | prefix: String, 56 | fields: Vec, 57 | } 58 | 59 | #[derive(Serialize, Deserialize, Debug, Clone)] 60 | pub struct PatheryConfig { 61 | indexes: Vec, 62 | } 63 | 64 | pub trait SchemaLoader: Send + Sync { 65 | fn load_schema(&self, index_id: &str) -> Result; 66 | } 67 | 68 | #[derive(Error, Debug)] 69 | pub enum IndexDocError { 70 | #[error("Expected JSON object")] 71 | NotJsonObject, 72 | #[error("Request JSON object is empty")] 73 | EmptyDoc, 74 | #[error("Error parsing JSON object document")] 75 | DocParsingError(DocParsingError), 76 | } 77 | 78 | fn numeric_field_options(flags: &Vec) -> NumericOptions { 79 | flags 80 | .iter() 81 | .fold(NumericOptions::default(), |acc, opt| match opt { 82 | NumericFieldOption::INDEXED => acc | schema::INDEXED, 83 | NumericFieldOption::FAST => acc | schema::FAST, 84 | }) 85 | } 86 | 87 | pub trait SchemaExt { 88 | fn id_field(&self) -> Field; 89 | } 90 | 91 | impl SchemaExt for Schema { 92 | fn id_field(&self) -> Field { 93 | self.get_field("__id") 94 | .expect("__id field should be present") 95 | } 96 | } 97 | 98 | #[derive(Clone, Debug)] 99 | pub struct SchemaProvider { 100 | config: PatheryConfig, 101 | } 102 | 103 | impl SchemaProvider { 104 | pub fn lambda() -> Self { 105 | let config_path = "/opt/pathery/config.json"; 106 | let content = fs::read_to_string(config_path).expect("config should exist"); 107 | let config: PatheryConfig = json::from_str(&content).expect("config should parse"); 108 | 109 | SchemaProvider { config } 110 | } 111 | 112 | pub fn from_json(config: json::Value) -> Self { 113 | let config = json::from_value(config).expect("config should parse"); 114 | Self { config } 115 | } 116 | } 117 | 118 | impl 
SchemaLoader for SchemaProvider { 119 | fn load_schema(&self, index_id: &str) -> Result { 120 | let config = self 121 | .config 122 | .indexes 123 | .iter() 124 | .find(|config| index_id.starts_with(&config.prefix)) 125 | .ok_or_else(|| { 126 | ServiceError::not_found(&format!("Schema for index [{}] not found", index_id)) 127 | })?; 128 | 129 | let mut schema = Schema::builder(); 130 | 131 | for field in &config.fields { 132 | match &field { 133 | FieldConfig::TextFieldConfig { name, flags } => { 134 | let field_opts = 135 | flags 136 | .iter() 137 | .fold(TextOptions::default(), |acc, opt| match opt { 138 | TextFieldOption::TEXT => acc | schema::TEXT, 139 | TextFieldOption::STRING => acc | schema::STRING, 140 | TextFieldOption::FAST => acc | schema::FAST, 141 | }); 142 | schema.add_text_field(name, field_opts); 143 | } 144 | FieldConfig::DateFieldConfig { name, flags } => { 145 | schema.add_date_field(name, numeric_field_options(flags)); 146 | } 147 | FieldConfig::IntegerFieldConfig { name, flags } => { 148 | schema.add_i64_field(name, numeric_field_options(flags)); 149 | } 150 | FieldConfig::JsonFieldConfig { name, flags } => { 151 | let field_opts = 152 | flags 153 | .iter() 154 | .fold(TextOptions::default(), |acc, opt| match opt { 155 | JsonFieldOption::TEXT => acc | schema::TEXT, 156 | }); 157 | schema.add_json_field(name, field_opts); 158 | } 159 | } 160 | } 161 | 162 | // Add system schema fields 163 | 164 | // __id is the document id used for uniqueness 165 | schema.add_text_field("__id", schema::STRING | schema::STORED); 166 | 167 | Ok(schema.build()) 168 | } 169 | } 170 | 171 | #[cfg(test)] 172 | mod tests { 173 | use serde_json::json; 174 | 175 | use super::*; 176 | 177 | #[test] 178 | fn parse_test_config() { 179 | let config = json!({ 180 | "indexes": [{ 181 | "prefix": "book-index-v1-", 182 | "fields": [ 183 | { 184 | "name": "title", 185 | "flags": ["TEXT"], 186 | "kind": "text", 187 | }, 188 | { 189 | "name": "author", 190 | "flags": ["STRING"], 191 | "kind": "text", 192 | }, 193 | { 194 | "name": "date_added", 195 | "flags": ["INDEXED", "FAST"], 196 | "kind": "date", 197 | }, 198 | { 199 | "name": "year", 200 | "flags": ["INDEXED", "FAST"], 201 | "kind": "i64", 202 | }, 203 | { 204 | "name": "meta", 205 | "flags": ["TEXT"], 206 | "kind": "json" 207 | } 208 | ], 209 | }] 210 | }); 211 | 212 | serde_json::from_value::(config).expect("should not throw"); 213 | } 214 | 215 | #[test] 216 | fn serialize_schema() { 217 | let mut schema = Schema::builder(); 218 | 219 | schema.add_text_field("title", schema::STORED | schema::TEXT); 220 | schema.add_text_field("author", schema::STORED | schema::STRING); 221 | schema.add_date_field( 222 | "created_date", 223 | schema::STORED | schema::INDEXED | schema::FAST, 224 | ); 225 | 226 | let schema = schema.build(); 227 | 228 | println!("{}", json::to_string_pretty(&schema).expect("ok")); 229 | } 230 | } 231 | -------------------------------------------------------------------------------- /packages/pathery/src/search_doc.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | use serde_json::{json, Map, Value}; 3 | use tantivy::schema::{DocParsingError, Schema}; 4 | use tantivy::Document; 5 | use thiserror::Error; 6 | 7 | use crate::serialize::compressed_json; 8 | use crate::util; 9 | 10 | #[derive(Debug, Error, PartialEq, Eq)] 11 | pub enum SearchDocError { 12 | #[error("json value is not an object")] 13 | NotAnObject, 14 | 15 | #[error("invalid type for __id, 
expected string")] 16 | InvalidIdType, 17 | 18 | #[error("{0}")] 19 | SchemaValidationError(String), 20 | 21 | #[error("cannot index empty document")] 22 | EmptyDocument, 23 | } 24 | 25 | impl From for SearchDocError { 26 | fn from(err: DocParsingError) -> Self { 27 | SearchDocError::SchemaValidationError(err.to_string()) 28 | } 29 | } 30 | 31 | #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] 32 | pub struct DDBKey { 33 | pub pk: String, 34 | pub sk: String, 35 | } 36 | 37 | impl From for DDBKey { 38 | fn from(id: SearchDocId) -> Self { 39 | DDBKey { 40 | pk: format!("document|{}", id.0), 41 | sk: format!("document|{}", id.0), 42 | } 43 | } 44 | } 45 | 46 | #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)] 47 | #[serde(transparent)] 48 | pub struct SearchDocId(String); 49 | 50 | impl From for SearchDocId { 51 | fn from(key: DDBKey) -> Self { 52 | let doc_id = key 53 | .pk 54 | .split("|") 55 | .nth(1) 56 | .expect("key should be formatted correctly"); 57 | 58 | Self(doc_id.into()) 59 | } 60 | } 61 | 62 | impl SearchDocId { 63 | pub fn parse(id: &str) -> SearchDocId { 64 | SearchDocId(id.into()) 65 | } 66 | 67 | pub fn id(&self) -> &str { 68 | &self.0 69 | } 70 | } 71 | 72 | #[derive(Debug, Serialize, Deserialize, Clone)] 73 | pub struct SearchDoc { 74 | id: SearchDocId, 75 | #[serde(with = "compressed_json")] 76 | content: Map, 77 | } 78 | 79 | impl SearchDoc { 80 | /// Converts a JSON value into a SearchDoc if the document is valid according to the schema. 81 | /// Also generate an `__id` if no `__id` is present. 82 | pub fn from_json(schema: &Schema, json_value: Value) -> Result { 83 | let mut json_object = match json_value { 84 | Value::Object(obj) => obj, 85 | _ => return Err(SearchDocError::NotAnObject), 86 | }; 87 | 88 | let id = json_object 89 | .entry("__id") 90 | .or_insert_with(|| json!(util::generate_id())) 91 | .as_str() 92 | .ok_or_else(|| SearchDocError::InvalidIdType)? 93 | .to_string(); 94 | 95 | // Validate the document against the provided schema. 
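// json_object_to_doc returns a DocParsingError on any type mismatch, which the
// From<DocParsingError> impl above converts into SearchDocError::SchemaValidationError.
// Fields that are not declared in the schema are dropped silently rather than rejected,
// which is why the field_values() length check below treats a document whose only
// surviving value is the generated __id as empty.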
96 | let document = schema.json_object_to_doc(json_object.clone())?; 97 | 98 | if document.field_values().len() <= 1 { 99 | return Err(SearchDocError::EmptyDocument); 100 | } 101 | 102 | Ok(SearchDoc { 103 | id: SearchDocId(id), 104 | content: json_object, 105 | }) 106 | } 107 | 108 | pub fn id(&self) -> &SearchDocId { 109 | &self.id 110 | } 111 | 112 | pub fn document(&self, schema: &Schema) -> Document { 113 | schema 114 | .json_object_to_doc(self.content.clone()) 115 | .expect("should succeed since from_json validates") 116 | } 117 | } 118 | 119 | #[cfg(test)] 120 | mod tests { 121 | use tantivy::schema; 122 | 123 | use super::*; 124 | 125 | fn setup() -> Schema { 126 | let mut schema = Schema::builder(); 127 | schema.add_text_field("__id", schema::STRING); 128 | schema.add_text_field("name", schema::STRING); 129 | schema.build() 130 | } 131 | 132 | #[test] 133 | fn from_json_generates_id() { 134 | let schema = setup(); 135 | let value = json!({ 136 | "name": "world" 137 | }); 138 | 139 | let search_doc = SearchDoc::from_json(&schema, value).unwrap(); 140 | 141 | assert!(search_doc.id.0.len() > 0); 142 | } 143 | 144 | #[test] 145 | fn from_json_uses_id_when_exists() { 146 | let schema = setup(); 147 | let id = util::generate_id(); 148 | let value = json!({ "__id": id, "name": "world" }); 149 | 150 | let search_doc = SearchDoc::from_json(&schema, value).unwrap(); 151 | 152 | assert_eq!(id, search_doc.id.0); 153 | } 154 | 155 | #[test] 156 | fn from_json_returns_validation_error_when_schema_does_not_match() { 157 | let schema = setup(); 158 | let value = json!({ "name": 1234 }); 159 | 160 | let search_doc = SearchDoc::from_json(&schema, value).unwrap_err(); 161 | 162 | assert_eq!( 163 | SearchDocError::SchemaValidationError( 164 | "The field '\"name\"' could not be parsed: TypeError { expected: \"a string\", \ 165 | json: Number(1234) }" 166 | .into() 167 | ), 168 | search_doc, 169 | ); 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /packages/pathery/src/serialize/compressed_json.rs: -------------------------------------------------------------------------------- 1 | use serde::de::Visitor; 2 | use serde::{Deserializer, Serializer}; 3 | use serde_json::{Map, Value}; 4 | 5 | pub fn serialize(input: &Map, serializer: S) -> Result 6 | where S: Serializer { 7 | let json_bytes = serde_json::to_vec(input).unwrap(); 8 | let encoded_bytes = zstd::encode_all(json_bytes.as_slice(), 0).unwrap(); 9 | serializer.serialize_bytes(&encoded_bytes) 10 | } 11 | 12 | struct CompressedJsonVisitor; 13 | 14 | impl<'de> Visitor<'de> for CompressedJsonVisitor { 15 | type Value = Map; 16 | 17 | fn expecting(&self, _formatter: &mut std::fmt::Formatter) -> std::fmt::Result { 18 | todo!() 19 | } 20 | 21 | fn visit_bytes(self, v: &[u8]) -> Result 22 | where E: serde::de::Error { 23 | let decoded_bytes = zstd::decode_all(v).unwrap(); 24 | let deserialized = serde_json::from_slice(&decoded_bytes).unwrap(); 25 | Ok(deserialized) 26 | } 27 | } 28 | 29 | pub fn deserialize<'de, D>(deserializer: D) -> Result, D::Error> 30 | where D: Deserializer<'de> { 31 | deserializer.deserialize_bytes(CompressedJsonVisitor) 32 | } 33 | 34 | #[cfg(test)] 35 | mod tests { 36 | use std::collections::HashMap; 37 | 38 | use serde::{Deserialize, Serialize}; 39 | use serde_dynamo::{self, AttributeValue}; 40 | use serde_json::{json, Map, Value}; 41 | 42 | #[derive(Serialize, Deserialize, Debug, PartialEq, Clone)] 43 | struct MyType { 44 | #[serde(with = "super")] 45 | inner: Map, 46 | } 47 
| 48 | #[test] 49 | fn test_round_trip() { 50 | let init = MyType { 51 | inner: json!({ 52 | "hello": "world" 53 | }) 54 | .as_object() 55 | .unwrap() 56 | .to_owned(), 57 | }; 58 | 59 | let serialized: HashMap = 60 | serde_dynamo::to_item(init.clone()).unwrap(); 61 | let deserialized: MyType = serde_dynamo::from_item(serialized).unwrap(); 62 | 63 | assert_eq!(init, deserialized); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /packages/pathery/src/serialize/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod compressed_json; 2 | -------------------------------------------------------------------------------- /packages/pathery/src/service/doc.rs: -------------------------------------------------------------------------------- 1 | use async_trait::async_trait; 2 | use serde::{Deserialize, Serialize}; 3 | use serde_json as json; 4 | 5 | use super::{ServiceHandler, ServiceRequest, ServiceResponse}; 6 | use crate::search_doc::SearchDocId; 7 | use crate::worker::index_writer::client::{IndexWriterClient, LambdaIndexWriterClient}; 8 | use crate::worker::index_writer::job::Job; 9 | 10 | #[derive(Serialize, Deserialize, Debug)] 11 | pub struct PathParams { 12 | index_id: String, 13 | doc_id: String, 14 | } 15 | 16 | #[derive(Serialize)] 17 | pub struct DeleteDocResponse { 18 | pub job_id: String, 19 | } 20 | 21 | pub struct DeleteDocService { 22 | client: Box, 23 | } 24 | 25 | #[async_trait] 26 | impl ServiceHandler for DeleteDocService { 27 | async fn handle_request( 28 | &self, 29 | request: ServiceRequest, 30 | ) -> ServiceResponse { 31 | let index_id = request.path_param("index_id")?; 32 | let doc_id = request.path_param("doc_id")?; 33 | 34 | let mut job = Job::create(&index_id); 35 | 36 | job.delete_doc(SearchDocId::parse(&doc_id)); 37 | 38 | let job_id = self.client.submit_job(job).await?; 39 | 40 | Ok(DeleteDocResponse { job_id }) 41 | } 42 | } 43 | 44 | impl DeleteDocService { 45 | pub async fn create() -> Self { 46 | let client = LambdaIndexWriterClient::create(None).await; 47 | 48 | DeleteDocService { 49 | client: Box::new(client), 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /packages/pathery/src/service/index/batch_index.rs: -------------------------------------------------------------------------------- 1 | use async_trait::async_trait; 2 | use serde::Serialize; 3 | 4 | use crate::json; 5 | use crate::schema::{SchemaLoader, SchemaProvider}; 6 | use crate::search_doc::SearchDoc; 7 | use crate::service::{ServiceError, ServiceHandler, ServiceRequest, ServiceResponse}; 8 | use crate::store::document::{DDBDocumentStore, DocumentStore}; 9 | use crate::worker::index_writer::client::{IndexWriterClient, LambdaIndexWriterClient}; 10 | use crate::worker::index_writer::job::Job; 11 | 12 | #[derive(Serialize)] 13 | pub struct BatchIndexResponse { 14 | pub job_id: String, 15 | } 16 | 17 | pub struct BatchIndexService { 18 | schema_loader: Box, 19 | 20 | document_store: Box, 21 | 22 | index_writer: Box, 23 | } 24 | 25 | #[async_trait] 26 | impl ServiceHandler, BatchIndexResponse> for BatchIndexService { 27 | async fn handle_request( 28 | &self, 29 | request: ServiceRequest>, 30 | ) -> ServiceResponse { 31 | let body = request.body()?; 32 | 33 | let index_id = request.path_param("index_id")?; 34 | 35 | let schema = self.schema_loader.load_schema(&index_id)?; 36 | 37 | let mut job = Job::create(&index_id); 38 | 39 | let documents = body 40 | 
.into_iter() 41 | .map(|value| SearchDoc::from_json(&schema, value)) 42 | .collect::>(); 43 | 44 | let error = documents 45 | .iter() 46 | .enumerate() 47 | .filter_map(|(idx, result)| result.as_ref().err().map(|err| (idx, err))) 48 | .collect::>(); 49 | 50 | if let Some((idx, error)) = error.first() { 51 | return Err(ServiceError::invalid_request(&format!( 52 | "Error parsing document (path: [{}]): {}", 53 | idx, 54 | error.to_string() 55 | ))); 56 | } 57 | 58 | let documents = documents 59 | .into_iter() 60 | .filter_map(Result::ok) 61 | .collect::>(); 62 | 63 | let doc_refs = self.document_store.save_documents(documents).await?; 64 | 65 | for doc_ref in doc_refs { 66 | job.index_doc(doc_ref) 67 | } 68 | 69 | let job_id = self.index_writer.submit_job(job).await?; 70 | 71 | Ok(BatchIndexResponse { job_id }) 72 | } 73 | } 74 | 75 | impl BatchIndexService { 76 | pub async fn create() -> Self { 77 | let document_store = DDBDocumentStore::create(None).await; 78 | let writer_client = LambdaIndexWriterClient::create(None).await; 79 | let schema_loader = SchemaProvider::lambda(); 80 | 81 | BatchIndexService { 82 | document_store: Box::new(document_store), 83 | index_writer: Box::new(writer_client), 84 | schema_loader: Box::new(schema_loader), 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /packages/pathery/src/service/index/mod.rs: -------------------------------------------------------------------------------- 1 | mod batch_index; 2 | mod post_index; 3 | mod query_index; 4 | mod stats_index; 5 | 6 | pub use batch_index::BatchIndexService; 7 | pub use post_index::PostIndexService; 8 | pub use query_index::QueryIndexService; 9 | pub use stats_index::StatsIndexService; 10 | -------------------------------------------------------------------------------- /packages/pathery/src/service/index/post_index.rs: -------------------------------------------------------------------------------- 1 | use async_trait::async_trait; 2 | use serde::Serialize; 3 | 4 | use crate::schema::{SchemaLoader, SchemaProvider}; 5 | use crate::search_doc::SearchDoc; 6 | use crate::service::{ServiceError, ServiceHandler, ServiceRequest, ServiceResponse}; 7 | use crate::store::document::{DDBDocumentStore, DocumentStore}; 8 | use crate::worker::index_writer::client::{IndexWriterClient, LambdaIndexWriterClient}; 9 | use crate::worker::index_writer::job::Job; 10 | use crate::{json, util}; 11 | 12 | #[derive(Serialize, Debug)] 13 | pub struct PostIndexResponse { 14 | pub job_id: String, 15 | pub updated_at: String, 16 | } 17 | 18 | pub struct PostIndexService { 19 | schema_loader: Box, 20 | 21 | document_store: Box, 22 | 23 | writer_client: Box, 24 | } 25 | 26 | #[async_trait] 27 | impl ServiceHandler for PostIndexService { 28 | async fn handle_request( 29 | &self, 30 | request: ServiceRequest, 31 | ) -> ServiceResponse { 32 | let body = request.body()?; 33 | 34 | let index_id = request.path_param("index_id")?; 35 | 36 | let schema = self.schema_loader.load_schema(&index_id)?; 37 | 38 | let document = SearchDoc::from_json(&schema, body) 39 | .map_err(|err| ServiceError::invalid_request(&err.to_string()))?; 40 | 41 | let doc_refs = self.document_store.save_documents(vec![document]).await?; 42 | 43 | let mut job = Job::create(&index_id); 44 | 45 | for doc_ref in doc_refs { 46 | job.index_doc(doc_ref); 47 | } 48 | 49 | let job_id = self.writer_client.submit_job(job).await?; 50 | 51 | Ok(PostIndexResponse { 52 | job_id, 53 | updated_at: util::timestamp(), 54 | }) 55 | } 56 | } 
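A note for callers of this endpoint: __id is optional in the request body. SearchDoc::from_json (see search_doc.rs above) generates an id when none is supplied and keeps the caller's value when one is present, and __id is the uniqueness key the schema builder adds to every index, so clients that need stable document identities should send their own. A small sketch of the two accepted request shapes; the field values are invented:

use serde_json::json;

fn main() {
    // The service generates __id (via util::generate_id()) before indexing:
    let create = json!({
        "title": "Zen and the Art of Motorcycle Maintenance",
        "author": "Robert Pirsig"
    });

    // The caller supplies __id, the uniqueness key declared in schema.rs:
    let with_id = json!({
        "__id": "book-42", // hypothetical id
        "title": "Zen and the Art of Motorcycle Maintenance",
        "author": "Robert Pirsig"
    });

    println!("{create}\n{with_id}");
}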
57 | 58 | impl PostIndexService { 59 | pub async fn create() -> Self { 60 | let document_store = DDBDocumentStore::create(None).await; 61 | let writer_client = LambdaIndexWriterClient::create(None).await; 62 | let schema_loader = SchemaProvider::lambda(); 63 | 64 | PostIndexService { 65 | document_store: Box::new(document_store), 66 | writer_client: Box::new(writer_client), 67 | schema_loader: Box::new(schema_loader), 68 | } 69 | } 70 | } 71 | 72 | #[cfg(test)] 73 | mod tests { 74 | use super::*; 75 | use crate::test_utils::*; 76 | 77 | pub fn test_service() -> PostIndexService { 78 | let ctx = setup(); 79 | 80 | let schema_loader = Box::new(ctx.schema_loader().clone()); 81 | let document_store = Box::new(ctx.document_store().clone()); 82 | let writer_client = Box::new(ctx.writer_client().clone()); 83 | 84 | PostIndexService { 85 | schema_loader, 86 | document_store, 87 | writer_client, 88 | } 89 | } 90 | 91 | #[tokio::test] 92 | async fn post_index_doc_with_no_id() { 93 | let service = test_service(); 94 | 95 | let doc = json::json!({ 96 | "title": "Zen and the Art of Motorcycle Maintenance", 97 | "author": "Robert Pirsig", 98 | "date_added": "2022-11-23T18:24:40Z", 99 | "isbn": "0060589469" 100 | }); 101 | 102 | let request = ServiceRequest::create(doc).with_path_param("index_id", "test"); 103 | 104 | service.handle_request(request).await.unwrap(); 105 | } 106 | 107 | #[tokio::test] 108 | async fn post_index_non_object() { 109 | let service = test_service(); 110 | 111 | let doc = json::json!([]); 112 | 113 | let request = ServiceRequest::create(doc).with_path_param("index_id", "test"); 114 | 115 | let response = service.handle_request(request).await.unwrap_err(); 116 | 117 | assert_eq!(400, response.status()); 118 | assert_eq!("json value is not an object", response.message()); 119 | } 120 | 121 | #[tokio::test] 122 | async fn post_index_value_that_does_not_match_schema() { 123 | let service = test_service(); 124 | 125 | let doc = json::json!({"title": 1}); 126 | 127 | let request = ServiceRequest::create(doc).with_path_param("index_id", "test"); 128 | 129 | let response = service.handle_request(request).await.unwrap_err(); 130 | 131 | assert_eq!(400, response.status()); 132 | assert_eq!( 133 | "The field '\"title\"' could not be parsed: TypeError { expected: \"a string\", json: \ 134 | Number(1) }", 135 | response.message() 136 | ); 137 | } 138 | 139 | #[tokio::test] 140 | async fn post_index_field_that_does_not_exist() { 141 | let service = test_service(); 142 | 143 | let doc = json::json!({ 144 | "foobar": "baz", 145 | }); 146 | 147 | let request = ServiceRequest::create(doc).with_path_param("index_id", "test"); 148 | 149 | let response = service.handle_request(request).await.unwrap_err(); 150 | 151 | // Empty because the non-existent field does not explicitly trigger a failure - it just 152 | // doesn't get indexed. 
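// SearchDoc::from_json is therefore left with only the generated __id, returns
// SearchDocError::EmptyDocument, and the service surfaces that as the 400 below.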
153 | assert_eq!(400, response.status()); 154 | assert_eq!("cannot index empty document", response.message()); 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /packages/pathery/src/service/index/query_index.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::sync::Arc; 3 | 4 | use async_trait::async_trait; 5 | use serde::{Deserialize, Serialize}; 6 | use tantivy::query::QueryParser; 7 | use tantivy::schema::{Field, FieldType}; 8 | use tantivy::{Index, SnippetGenerator, TantivyError}; 9 | use tracing::info; 10 | 11 | use crate::function::query_index_partition::client::LambdaQueryIndexPartitionClient; 12 | use crate::function::query_index_partition::PartitionSearchHit; 13 | use crate::index::{IndexExt, IndexLoader, LambdaIndexLoader}; 14 | use crate::json; 15 | use crate::pagination::PaginationToken; 16 | use crate::service::{ServiceError, ServiceHandler, ServiceRequest, ServiceResponse}; 17 | use crate::store::document::{DDBDocumentStore, DocumentStore}; 18 | 19 | #[derive(Serialize, Deserialize, Debug)] 20 | pub struct QueryRequest { 21 | pub query: String, 22 | pub pagination_token: Option, 23 | } 24 | 25 | #[derive(Serialize, Deserialize, Debug, PartialEq)] 26 | pub struct SearchHit { 27 | pub doc: json::Value, 28 | pub snippets: json::Value, 29 | pub score: f32, 30 | } 31 | 32 | #[derive(Serialize, Deserialize, Debug, PartialEq)] 33 | pub struct QueryResponse { 34 | pub matches: Vec, 35 | pub pagination_token: Option, 36 | } 37 | 38 | pub struct QueryIndexService { 39 | index_loader: Box, 40 | 41 | document_store: Box, 42 | 43 | query_index_paritition_client: Arc, 44 | } 45 | 46 | #[async_trait] 47 | impl ServiceHandler for QueryIndexService { 48 | async fn handle_request( 49 | &self, 50 | request: ServiceRequest, 51 | ) -> ServiceResponse { 52 | let body = request.body()?; 53 | 54 | let index_id = request.path_param("index_id")?; 55 | 56 | let index = self.index_loader.load_index(&index_id, None)?; 57 | 58 | let metas = index.load_metas().unwrap(); 59 | let num_docs: u32 = metas.segments.iter().map(|seg| seg.num_docs()).sum(); 60 | info!("Doc count: {}", num_docs); 61 | 62 | let total_partitions = (num_docs / 60_000) + 1; 63 | info!("Total partitions: {}", total_partitions); 64 | 65 | let mut pagination_token = match body.pagination_token { 66 | Some(token) => PaginationToken::parse(token), 67 | None => { 68 | let mut pagination_token = 69 | PaginationToken::new(&body.query, total_partitions as usize); 70 | let metas = index.load_metas().unwrap(); 71 | let segments = metas.segments; 72 | let segments_json = serde_json::to_value(segments).unwrap(); 73 | pagination_token.import_segments_json(segments_json); 74 | pagination_token 75 | } 76 | }; 77 | 78 | let requests: Vec<_> = (0..total_partitions) 79 | .map(|partition_n| { 80 | let query_client = Arc::clone(&self.query_index_paritition_client); 81 | let index_id = index_id.clone(); 82 | let ro_token = pagination_token.clone(); 83 | 84 | tokio::spawn(async move { 85 | query_client 86 | .query_partition( 87 | index_id.clone(), 88 | ro_token.get_query(), 89 | ro_token.get_offset(partition_n as usize), 90 | partition_n as usize, 91 | ro_token.segments_for_partition(partition_n as usize), 92 | ) 93 | .await 94 | }) 95 | }) 96 | .collect(); 97 | 98 | let mut matches: Vec = Vec::new(); 99 | 100 | for request in requests { 101 | let mut response = request.await.unwrap(); 102 | let response = 
response.matches.as_mut(); 103 | matches.append(response); 104 | } 105 | 106 | matches.sort_by(|a, b| b.score.total_cmp(&a.score)); 107 | matches.truncate(10); 108 | 109 | for match_one in &matches { 110 | pagination_token.inc_offset(match_one.partition_n) 111 | } 112 | 113 | println!("{}", serde_json::to_string(&pagination_token).unwrap()); 114 | 115 | if matches.len() == 0 { 116 | return Ok(QueryResponse { 117 | matches: vec![], 118 | pagination_token: None, 119 | }); 120 | } 121 | 122 | let retrieved_matches = self 123 | .document_store 124 | .get_documents( 125 | matches 126 | .iter() 127 | .map(|one_match| one_match.doc_ref.clone()) 128 | .collect(), 129 | ) 130 | .await 131 | .unwrap(); 132 | 133 | let snippet_index = Index::create_in_ram(index.schema()); 134 | let mut snippet_writer = snippet_index.default_writer(); 135 | let snippet_reader = snippet_index.reader().unwrap(); 136 | let snippet_schema = snippet_index.schema(); 137 | 138 | let query_parser = QueryParser::for_index( 139 | &snippet_index, 140 | snippet_schema 141 | .fields() 142 | .filter_map(|(field, entry)| { 143 | if !entry.is_indexed() { 144 | return None; 145 | } 146 | match entry.field_type() { 147 | FieldType::Str(_) => Some(field), 148 | _ => None, 149 | } 150 | }) 151 | .collect::>(), 152 | ); 153 | 154 | let query = query_parser 155 | .parse_query(&body.query) 156 | .map_err(|err| ServiceError::invalid_request(&err.to_string()))?; 157 | 158 | let matches = retrieved_matches 159 | .iter() 160 | .zip(matches) 161 | .map(|(search_doc, one_match)| { 162 | let document = search_doc.document(&snippet_schema); 163 | let named_doc = snippet_schema.to_named_doc(&document); 164 | snippet_writer.add_document(document.clone()).unwrap(); 165 | snippet_writer.commit().unwrap(); 166 | snippet_reader.reload().unwrap(); 167 | let snippet_searcher = snippet_reader.searcher(); 168 | 169 | let snippets: HashMap = document 170 | .field_values() 171 | .iter() 172 | .filter_map(|field_value| { 173 | // Only text fields are supported for snippets 174 | let text = field_value.value().as_text()?; 175 | 176 | let generator = match SnippetGenerator::create( 177 | &snippet_searcher, 178 | &query, 179 | field_value.field(), 180 | ) { 181 | Ok(generator) => Some(generator), 182 | // InvalidArgument is returned when field is not indexed 183 | Err(TantivyError::InvalidArgument(_)) => None, 184 | Err(err) => panic!("{}", err.to_string()), 185 | }?; 186 | 187 | let snippet = generator.snippet(text).to_html(); 188 | 189 | if snippet.is_empty() { 190 | None 191 | } else { 192 | Some(( 193 | snippet_schema.get_field_name(field_value.field()).into(), 194 | snippet, 195 | )) 196 | } 197 | }) 198 | .collect(); 199 | 200 | SearchHit { 201 | score: one_match.score, 202 | doc: json::to_value(named_doc).expect("named doc should serialize"), 203 | snippets: json::to_value(snippets).expect("snippets should serialize"), 204 | } 205 | }) 206 | .collect(); 207 | 208 | Ok(QueryResponse { 209 | matches, 210 | pagination_token: Some(pagination_token.serialize()), 211 | }) 212 | } 213 | } 214 | 215 | impl QueryIndexService { 216 | pub async fn create() -> QueryIndexService { 217 | let document_store = DDBDocumentStore::create(None).await; 218 | let index_loader = LambdaIndexLoader::create(); 219 | 220 | QueryIndexService { 221 | document_store: Box::new(document_store), 222 | index_loader: Box::new(index_loader.await), 223 | query_index_paritition_client: Arc::new( 224 | LambdaQueryIndexPartitionClient::create().await, 225 | ), 226 | } 227 | } 228 | } 229 | 230 | 
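To make the paging mechanics above concrete, here is a small sketch of how the opaque pagination_token evolves between requests, using only the PaginationToken API from pagination.rs. The query string, partition count, and hit distribution are invented, and the first-page call to import_segments_json is omitted:

use pathery::pagination::PaginationToken;

fn main() {
    // First page: one offset slot per partition (total_partitions = num_docs / 60_000 + 1).
    let mut token = PaginationToken::new("title:hello", 2);

    // Suppose page one returned 7 hits from partition 0 and 3 from partition 1;
    // handle_request bumps the matching slot once per returned match.
    for _ in 0..7 {
        token.inc_offset(0);
    }
    for _ in 0..3 {
        token.inc_offset(1);
    }
    let opaque = token.serialize(); // JSON -> zstd -> base64, handed back to the caller

    // Follow-up request: the caller echoes the token and each partition query
    // resumes from its own offset.
    let token = PaginationToken::parse(opaque);
    assert_eq!(7, token.get_offset(0));
    assert_eq!(3, token.get_offset(1));
}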
// #[cfg(test)] 231 | // mod tests { 232 | // use super::*; 233 | // use crate::test_utils::*; 234 | 235 | // fn test_service(ctx: &TestContext) -> QueryIndexService { 236 | // QueryIndexService { 237 | // document_store: Box::new(ctx.document_store().clone()), 238 | // index_loader: Box::new(ctx.index_loader().clone()), 239 | // } 240 | // } 241 | 242 | // #[tokio::test] 243 | // async fn query_default_response() { 244 | // let ctx = setup() 245 | // .with_documents( 246 | // "test", 247 | // vec![json!({ 248 | // "__id": "foobar", 249 | // "title": "hello", 250 | // "author": "world" 251 | // })], 252 | // ) 253 | // .await; 254 | 255 | // let service = test_service(&ctx); 256 | 257 | // let request = ServiceRequest::create(QueryRequest { 258 | // query: "hello".into(), 259 | // with_partition: None, 260 | // }) 261 | // .with_path_param("index_id", "test"); 262 | 263 | // let response = service.handle_request(request).await.unwrap(); 264 | 265 | // assert_eq!( 266 | // QueryResponse { 267 | // matches: vec![SearchHit { 268 | // doc: json::json!({ 269 | // "__id": ["foobar"], 270 | // "title": ["hello"], 271 | // "author": ["world"], 272 | // }), 273 | // score: 0.28768212, 274 | // snippets: json::json!({ 275 | // "title": "hello" 276 | // }) 277 | // }] 278 | // }, 279 | // response 280 | // ); 281 | // } 282 | 283 | // #[tokio::test] 284 | // async fn query_document_with_un_indexed_fields() { 285 | // let ctx = setup() 286 | // .with_documents( 287 | // "test", 288 | // vec![json!({ 289 | // "__id": "foobar", 290 | // "title": "hello", 291 | // "meta": "world" 292 | // })], 293 | // ) 294 | // .await; 295 | 296 | // let service = test_service(&ctx); 297 | 298 | // let request = ServiceRequest::create(QueryRequest { 299 | // query: "hello".into(), 300 | // with_partition: None, 301 | // }) 302 | // .with_path_param("index_id", "test"); 303 | 304 | // let response = service.handle_request(request).await.unwrap(); 305 | 306 | // assert_eq!(1, response.matches.len()); 307 | // } 308 | 309 | // #[tokio::test] 310 | // async fn query_document_with_json_field() { 311 | // let ctx = setup() 312 | // .with_documents( 313 | // "test", 314 | // vec![json!({ 315 | // "__id": "foobar", 316 | // "title": "hello", 317 | // "props": { 318 | // "foo": "bar" 319 | // } 320 | // })], 321 | // ) 322 | // .await; 323 | 324 | // let service = test_service(&ctx); 325 | 326 | // let request = ServiceRequest::create(QueryRequest { 327 | // query: "props.foo:bar".into(), 328 | // with_partition: None, 329 | // }) 330 | // .with_path_param("index_id", "test"); 331 | 332 | // let response = service.handle_request(request).await.unwrap(); 333 | 334 | // assert_eq!(1, response.matches.len()); 335 | // } 336 | // } 337 | -------------------------------------------------------------------------------- /packages/pathery/src/service/index/stats_index.rs: -------------------------------------------------------------------------------- 1 | use std::fs; 2 | 3 | use async_trait::async_trait; 4 | use serde::{Deserialize, Serialize}; 5 | use serde_json as json; 6 | 7 | use crate::index::{IndexLoader, LambdaIndexLoader}; 8 | use crate::service::{ServiceHandler, ServiceRequest, ServiceResponse}; 9 | 10 | #[derive(Serialize, Deserialize)] 11 | pub struct SegmentStats { 12 | id: String, 13 | num_docs: u32, 14 | num_deleted: u32, 15 | index_size: f64, 16 | } 17 | 18 | #[derive(Serialize, Deserialize)] 19 | pub struct IndexStatsResponse { 20 | segments: Vec, 21 | } 22 | 23 | pub struct StatsIndexService { 24 | index_loader: Box, 
25 | }
26 |
27 | #[async_trait]
28 | impl ServiceHandler<json::Value, IndexStatsResponse> for StatsIndexService {
29 |     async fn handle_request(
30 |         &self,
31 |         request: ServiceRequest<json::Value>,
32 |     ) -> ServiceResponse<IndexStatsResponse> {
33 |         let index_id = request.path_param("index_id")?;
34 |
35 |         let index = self.index_loader.load_index(&index_id, None)?;
36 |
37 |         let metas = index.load_metas().unwrap();
38 |
39 |         let segment_files = fs::read_dir(format!("/mnt/pathery-data/{index_id}"))
40 |             .unwrap()
41 |             .filter_map(|entry| entry.ok())
42 |             .collect::<Vec<_>>();
43 |
44 |         let segments = metas
45 |             .segments
46 |             .iter()
47 |             .map(|s| {
48 |                 let segment_id = s.id().uuid_string();
49 |
50 |                 let index_size_bytes: u64 = segment_files
51 |                     .iter()
52 |                     .filter_map(|entry| {
53 |                         let filename = entry.file_name();
54 |                         let filename = filename.to_str()?;
55 |
56 |                         filename
57 |                             .starts_with(&segment_id)
58 |                             .then(|| entry.metadata())
59 |                             .and_then(Result::ok)
60 |                             .map(|m| m.len())
61 |                     })
62 |                     .sum();
63 |
64 |                 let index_size_mb: f64 = index_size_bytes as f64 / 1_000_000f64;
65 |
66 |                 SegmentStats {
67 |                     id: s.id().uuid_string(),
68 |                     num_docs: s.num_docs(),
69 |                     num_deleted: s.num_deleted_docs(),
70 |                     index_size: index_size_mb,
71 |                 }
72 |             })
73 |             .collect();
74 |
75 |         Ok(IndexStatsResponse { segments })
76 |     }
77 | }
78 |
79 | impl StatsIndexService {
80 |     pub async fn create() -> Self {
81 |         let index_loader = LambdaIndexLoader::create();
82 |
83 |         StatsIndexService {
84 |             index_loader: Box::new(index_loader.await),
85 |         }
86 |     }
87 | }
88 |
--------------------------------------------------------------------------------
/packages/pathery/src/service/mod.rs:
--------------------------------------------------------------------------------
1 | use std::collections::HashMap;
2 | use std::error::Error;
3 | use std::marker::PhantomData;
4 |
5 | use async_trait::async_trait;
6 | use http::Response;
7 | use lambda_http::{Body, RequestExt};
8 | use serde::{Deserialize, Serialize};
9 | use tracing::error;
10 |
11 | use crate::util;
12 |
13 | pub mod doc;
14 | pub mod index;
15 |
16 | #[derive(thiserror::Error, Debug)]
17 | pub enum ServiceError {
18 |     #[error("{0}")]
19 |     InvalidRequest(String),
20 |
21 |     #[error("Internal service error")]
22 |     InternalError { id: String, source: anyhow::Error },
23 |
24 |     #[error("Rate limit hit, back off and try request again.")]
25 |     RateLimit,
26 |
27 |     #[error("{0}")]
28 |     NotFound(String),
29 | }
30 |
31 | impl ServiceError {
32 |     pub fn invalid_request(message: &str) -> Self {
33 |         ServiceError::InvalidRequest(message.into())
34 |     }
35 |
36 |     pub fn internal_error<E>(source: E) -> Self
37 |     where E: Error + Send + Sync + 'static {
38 |         let id = util::generate_id();
39 |         error!(
40 |             message = "InternalServiceError",
41 |             id,
42 |             error = format!("{source:#?}")
43 |         );
44 |         ServiceError::InternalError {
45 |             id,
46 |             source: anyhow::Error::new(source),
47 |         }
48 |     }
49 |
50 |     pub fn not_found(message: &str) -> Self {
51 |         ServiceError::NotFound(message.into())
52 |     }
53 |
54 |     pub fn rate_limit() -> Self {
55 |         ServiceError::RateLimit
56 |     }
57 |
58 |     pub fn status(&self) -> u16 {
59 |         use ServiceError::*;
60 |         match self {
61 |             InvalidRequest(_) => 400,
62 |             InternalError { .. } => 500,
63 |             RateLimit => 429,
64 |             NotFound(_) => 404,
65 |         }
66 |     }
67 |
68 |     pub fn message(self) -> String {
69 |         use ServiceError::*;
70 |         match self {
71 |             InternalError { id, .. } => format!("Internal server error [id = {}]", id),
} => format!("Internal server error [id = {}]", id), 72 | InvalidRequest(message) => message, 73 | RateLimit => String::from("Too many requests"), 74 | NotFound(message) => message, 75 | } 76 | } 77 | } 78 | 79 | type ServiceResponse = Result; 80 | 81 | pub struct ServiceRequest { 82 | inner: lambda_http::Request, 83 | body: PhantomData, 84 | } 85 | 86 | impl ServiceRequest 87 | where B: for<'de> Deserialize<'de> 88 | { 89 | /// Useful for testing 90 | pub fn create(body: B) -> ServiceRequest 91 | where B: Serialize { 92 | let request = http::Request::builder(); 93 | 94 | let body = lambda_http::Body::from(serde_json::to_string(&body).unwrap()); 95 | 96 | let inner = request.body(body).unwrap(); 97 | 98 | ServiceRequest { 99 | inner, 100 | body: PhantomData, 101 | } 102 | } 103 | 104 | /// Useful for testing 105 | pub fn with_path_param(mut self, name: &str, value: &str) -> Self { 106 | let updated = self 107 | .inner 108 | .with_path_parameters(HashMap::from([(String::from(name), String::from(value))])); 109 | 110 | self.inner = updated; 111 | 112 | self 113 | } 114 | 115 | pub fn body(&self) -> Result { 116 | if let Body::Text(body) = self.inner.body() { 117 | Ok(serde_json::from_str(body).map_err(|err| { 118 | ServiceError::InvalidRequest(format!("Unable to parse body: {}", err.to_string())) 119 | })?) 120 | } else { 121 | Err(ServiceError::InvalidRequest(String::from( 122 | "Expected string for body", 123 | ))) 124 | } 125 | } 126 | 127 | pub fn path_param(&self, name: &str) -> Result { 128 | let path_params = self.inner.path_parameters(); 129 | let value = path_params 130 | .first(name) 131 | .expect(&format!("missing path param: {}", name)); 132 | 133 | Ok(String::from(value)) 134 | } 135 | } 136 | 137 | fn map_error_response( 138 | error: ServiceError, 139 | ) -> Result, lambda_http::Error> { 140 | let status = error.status(); 141 | let message = error.message(); 142 | 143 | let response = Response::builder() 144 | .header("Content-Type", "application/json") 145 | .status(status); 146 | 147 | let body = serde_json::to_string(&serde_json::json!({ "message": message }))?; 148 | 149 | Ok(response.body(Body::Text(body))?) 150 | } 151 | 152 | fn map_success_response( 153 | response: R, 154 | ) -> Result, lambda_http::Error> 155 | where R: Serialize { 156 | let body = serde_json::to_string(&response)?; 157 | Ok(http::Response::builder() 158 | .status(200) 159 | .header("Content-Type", "application/json") 160 | .body(lambda_http::Body::Text(body))?) 
161 | }
162 |
163 | #[async_trait]
164 | pub trait ServiceHandler<B, R>: Sync
165 | where
166 |     B: for<'de> Deserialize<'de> + Send,
167 |     R: Serialize,
168 | {
169 |     async fn handle_event(
170 |         &self,
171 |         event: lambda_http::Request,
172 |     ) -> Result<Response<Body>, lambda_http::Error> {
173 |         let request = ServiceRequest {
174 |             inner: event,
175 |             body: PhantomData,
176 |         };
177 |
178 |         self.handle_request(request)
179 |             .await
180 |             .map_or_else(map_error_response, map_success_response)
181 |     }
182 |
183 |     async fn handle_request(&self, request: ServiceRequest<B>) -> ServiceResponse<R>;
184 | }
185 |
186 | pub async fn start_service<B, R>(
187 |     service: &dyn ServiceHandler<B, R>,
188 | ) -> Result<(), lambda_http::Error>
189 | where
190 |     B: for<'de> Deserialize<'de> + Send,
191 |     R: Serialize,
192 | {
193 |     tracing_subscriber::fmt()
194 |         .json()
195 |         .with_max_level(tracing::Level::WARN)
196 |         .with_target(false)
197 |         .without_time()
198 |         .init();
199 |
200 |     lambda_http::run(lambda_http::service_fn(|event| async {
201 |         service.handle_event(event).await
202 |     }))
203 |     .await?;
204 |
205 |     Ok(())
206 | }
207 |
--------------------------------------------------------------------------------
/packages/pathery/src/store/document.rs:
--------------------------------------------------------------------------------
1 | use std::collections::HashMap;
2 | use std::error::Error;
3 | use std::result::Result as StdResult;
4 |
5 | use async_trait::async_trait;
6 | use aws_sdk_dynamodb as ddb;
7 | use ddb::model::{AttributeValue, KeysAndAttributes, PutRequest, WriteRequest};
8 | use ddb::types::SdkError;
9 | use serde::{Deserialize, Serialize};
10 | use tantivy::schema::NamedFieldDocument;
11 |
12 | use crate::search_doc::{DDBKey, SearchDoc, SearchDocId};
13 | use crate::service::ServiceError;
14 | use crate::util;
15 |
16 | impl<T> From<SdkError<T>> for ServiceError
17 | where T: Error + Sync + Send + 'static
18 | {
19 |     fn from(sdk_err: SdkError<T>) -> Self {
20 |         ServiceError::internal_error(sdk_err)
21 |     }
22 | }
23 |
24 | impl From<serde_dynamo::Error> for ServiceError {
25 |     fn from(err: serde_dynamo::Error) -> Self {
26 |         ServiceError::internal_error(err)
27 |     }
28 | }
29 |
30 | type Result<T> = StdResult<T, ServiceError>;
31 |
32 | #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
33 | pub struct SearchDocRef(SearchDocId);
34 |
35 | impl From<NamedFieldDocument> for SearchDocRef {
36 |     fn from(doc: NamedFieldDocument) -> Self {
37 |         let id = doc
38 |             .0
39 |             .get("__id")
40 |             .expect("__id should be set")
41 |             .first()
42 |             .expect("__id should exist")
43 |             .as_text()
44 |             .expect("__id should be string");
45 |
46 |         SearchDocRef(SearchDocId::parse(id))
47 |     }
48 | }
49 |
50 | #[async_trait]
51 | pub trait DocumentStore: Send + Sync {
52 |     /// Get documents by reference.
53 |     async fn get_documents(&self, refs: Vec<SearchDocRef>) -> Result<Vec<SearchDoc>>;
54 |
55 |     /// Save a document such that it can be retrieved with get_documents.
56 |     async fn save_documents(&self, documents: Vec<SearchDoc>) -> Result<Vec<SearchDocRef>>;
57 | }
58 |
59 | pub struct DDBDocumentStore {
60 |     table_name: String,
61 |     client: ddb::Client,
62 | }
63 |
64 | #[async_trait]
65 | impl DocumentStore for DDBDocumentStore {
66 |     async fn get_documents(&self, refs: Vec<SearchDocRef>) -> Result<Vec<SearchDoc>> {
67 |         let mut request = self.client.batch_get_item();
68 |
69 |         let mut keys_and_attrs = KeysAndAttributes::builder();
70 |
71 |         for doc_ref in refs {
72 |             let key = DDBKey::from(doc_ref.0);
73 |             keys_and_attrs = keys_and_attrs.keys(serde_dynamo::to_item(key)?);
74 |         }
75 |
76 |         request = request.request_items(&self.table_name, keys_and_attrs.build());
77 |
78 |         let response = request.send().await?;
79 |
80 |         let documents = response
81 |             .responses()
82 |             .expect("responses should be present")
83 |             .values()
84 |             .flatten()
85 |             .map(|item| serde_dynamo::from_item(item.clone()))
86 |             .collect::<StdResult<Vec<_>, _>>()?;
87 |
88 |         let unprocessed_ids = response
89 |             .unprocessed_keys()
90 |             .expect("unprocessed keys should be present")
91 |             .values()
92 |             .filter_map(KeysAndAttributes::keys)
93 |             .flatten()
94 |             .collect::<Vec<_>>();
95 |
96 |         if unprocessed_ids.len() > 0 {
97 |             return Err(ServiceError::rate_limit());
98 |         }
99 |
100 |         Ok(documents)
101 |     }
102 |
103 |     async fn save_documents(&self, documents: Vec<SearchDoc>) -> Result<Vec<SearchDocRef>> {
104 |         if documents.len() > 25 {
105 |             return Err(ServiceError::invalid_request(
106 |                 "Too many documents in request, max 25.",
107 |             ));
108 |         }
109 |
110 |         let mut writes = vec![];
111 |
112 |         for document in &documents {
113 |             let mut item: HashMap<String, AttributeValue> = serde_dynamo::to_item(document)?;
114 |
115 |             let key: HashMap<String, AttributeValue> =
116 |                 serde_dynamo::to_item(DDBKey::from(document.id().clone()))?;
117 |
118 |             item.extend(key);
119 |
120 |             let put_request = PutRequest::builder().set_item(Some(item)).build();
121 |
122 |             writes.push(WriteRequest::builder().put_request(put_request).build())
123 |         }
124 |
125 |         let response = self
126 |             .client
127 |             .batch_write_item()
128 |             .request_items(&self.table_name, writes)
129 |             .send()
130 |             .await?;
131 |
132 |         if let Some(items) = response.unprocessed_items() {
133 |             let unhandled_writes = items.values().flatten().collect::<Vec<_>>();
134 |             if unhandled_writes.len() > 0 {
135 |                 return Err(ServiceError::rate_limit());
136 |             }
137 |         };
138 |
139 |         Ok(documents
140 |             .into_iter()
141 |             .map(|doc| SearchDocRef(doc.id().clone()))
142 |             .collect())
143 |     }
144 | }
145 |
146 | impl DDBDocumentStore {
147 |     pub async fn create(table_name: Option<&str>) -> DDBDocumentStore {
148 |         let table_name = table_name
149 |             .map(String::from)
150 |             .unwrap_or_else(|| util::require_env("DATA_TABLE_NAME"));
151 |         let sdk_config = aws_config::load_from_env().await;
152 |         let client = aws_sdk_dynamodb::Client::new(&sdk_config);
153 |
154 |         DDBDocumentStore { table_name, client }
155 |     }
156 | }
157 |
158 | #[cfg(test)]
159 | pub mod test_util {
160 |     use std::collections::HashMap;
161 |     use std::sync::{Arc, Mutex};
162 |
163 |     use super::*;
164 |
165 |     #[derive(Clone, Debug)]
166 |     pub struct TestDocumentStore {
167 |         db: Arc<Mutex<HashMap<SearchDocId, SearchDoc>>>,
168 |     }
169 |
170 |     #[async_trait]
171 |     impl DocumentStore for TestDocumentStore {
172 |         async fn save_documents(&self, documents: Vec<SearchDoc>) -> Result<Vec<SearchDocRef>> {
173 |             let mut db = self.db.lock().unwrap();
174 |
175 |             for document in &documents {
176 |                 (*db).insert(document.id().clone(), document.clone());
177 |             }
178 |
179 |             Ok(documents
180 |                 .iter()
181 |                 .map(|x| SearchDocRef(x.id().clone()))
182 |                 .collect())
183 |         }
184 |
185 |         async fn get_documents(&self, refs: Vec<SearchDocRef>) -> Result<Vec<SearchDoc>> {
186 |             let db = self.db.lock().unwrap();
187 |
188 |             Ok(refs
189 |                 .iter()
190 |                 .map(|doc_ref| (*db).get(&doc_ref.0).unwrap().clone())
191 |                 .collect())
192 |         }
193 |     }
194 |
195 |     impl TestDocumentStore {
196 |         pub fn create() -> Self {
197 |             TestDocumentStore {
198 |                 db: Arc::new(Mutex::new(HashMap::new())),
199 |             }
200 |         }
201 |     }
202 | }
203 |
--------------------------------------------------------------------------------
/packages/pathery/src/store/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod document;
2 |
--------------------------------------------------------------------------------
/packages/pathery/src/util.rs:
--------------------------------------------------------------------------------
1 | use std::time::SystemTime;
2 |
3 | use chrono::{DateTime, Utc};
4 |
5 | pub fn generate_id() -> String {
6 |     let id = uuid::Uuid::new_v4();
7 |     id.to_string()
8 | }
9 |
10 | pub fn timestamp() -> String {
11 |     let now = SystemTime::now();
12 |     let now: DateTime<Utc> = now.into();
13 |     now.to_rfc3339()
14 | }
15 |
16 | pub fn require_env(var_name: &str) -> String {
17 |     std::env::var(var_name).expect(&format!("{var_name:?} should be set"))
18 | }
19 |
--------------------------------------------------------------------------------
/packages/pathery/src/worker/async_delete/client.rs:
--------------------------------------------------------------------------------
1 | use std::fmt::Debug;
2 |
3 | use async_trait::async_trait;
4 |
5 | use super::job::AsyncDeleteJob;
6 | use crate::service::ServiceError;
7 | use crate::util;
8 |
9 | #[async_trait]
10 | pub trait AsyncDeleteClient: Sync + Send + Debug {
11 |     async fn submit_job(&self, job: AsyncDeleteJob) -> Result<String, ServiceError>;
12 | }
13 |
14 | #[derive(Debug)]
15 | pub struct LambdaAsyncDeleteClient {
16 |     queue_url: String,
17 |
18 |     client: aws_sdk_sqs::Client,
19 | }
20 |
21 | #[async_trait]
22 | impl AsyncDeleteClient for LambdaAsyncDeleteClient {
23 |     async fn submit_job(&self, job: AsyncDeleteJob) -> Result<String, ServiceError> {
24 |         let body = serde_json::to_string(&job).expect("job should serialize");
25 |
26 |         let response = self
27 |             .client
28 |             .send_message()
29 |             .queue_url(&self.queue_url)
30 |             .message_body(body)
31 |             .send()
32 |             .await
33 |             .expect("job should queue");
34 |
35 |         Ok(response
36 |             .message_id()
37 |             .expect("message id should exist")
38 |             .to_string())
39 |     }
40 | }
41 |
42 | impl LambdaAsyncDeleteClient {
43 |     pub async fn create(queue_url: Option<&str>) -> LambdaAsyncDeleteClient {
44 |         let sdk_config = aws_config::load_from_env().await;
45 |
46 |         LambdaAsyncDeleteClient {
47 |             queue_url: queue_url
48 |                 .map(String::from)
49 |                 .unwrap_or_else(|| util::require_env("ASYNC_DELETE_QUEUE_URL")),
50 |             client: aws_sdk_sqs::Client::new(&sdk_config),
51 |         }
52 |     }
53 | }
54 |
--------------------------------------------------------------------------------
/packages/pathery/src/worker/async_delete/job.rs:
--------------------------------------------------------------------------------
1 | use std::path::PathBuf;
2 |
3 | use serde::{Deserialize, Serialize};
4 |
5 | #[derive(Serialize, Deserialize, Debug)]
6 | pub enum AsyncDeleteJob {
7 |     FSDelete(PathBuf),
8 | }
9 |
10 | impl AsyncDeleteJob {
11 |     pub fn fs_delete(path: PathBuf) -> AsyncDeleteJob {
12 |         AsyncDeleteJob::FSDelete(path)
13 |     }
14 | }
15 |
--------------------------------------------------------------------------------
/packages/pathery/src/worker/async_delete/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod client;
2 | pub mod job;
3 |
4 | use std::fs;
5 | use std::path::PathBuf;
6 |
7 | use serde_json as json;
8 |
9 | use crate::lambda::{self, sqs};
10 |
11 | pub fn fs_delete(path: PathBuf) {
12 |     fs::remove_file(path).expect("should be able to delete file");
13 | }
14 |
15 | pub async fn handle_event(event: sqs::SqsEvent) -> Result<(), lambda::Error> {
16 |     let records = event.payload.records;
17 |
18 |     let jobs = records
19 |         .iter()
20 |         .map(|message| message.body.as_ref().expect("Body should be present"))
21 |         .map(|body| {
22 |             let msg = json::from_str::<job::AsyncDeleteJob>(body.as_str())
23 |                 .expect("Message should be deserializable");
24 |             msg
25 |         })
26 |         .collect::<Vec<_>>();
27 |
28 |     for ele in jobs {
29 |         print!("{:?}", ele);
30 |         match ele {
31 |             job::AsyncDeleteJob::FSDelete(path) => fs_delete(path),
32 |         }
33 |     }
34 |
35 |     Ok(())
36 | }
37 |
--------------------------------------------------------------------------------
/packages/pathery/src/worker/index_writer/client.rs:
--------------------------------------------------------------------------------
1 | use async_trait::async_trait;
2 | use thiserror::Error;
3 |
4 | use super::job::Job;
5 | use crate::service::ServiceError;
6 | use crate::util;
7 |
8 | #[derive(Debug, Error)]
9 | pub enum IndexWriterClientError {}
10 |
11 | #[async_trait]
12 | pub trait IndexWriterClient: Sync + Send {
13 |     async fn submit_job(&self, job: Job) -> Result<String, ServiceError>;
14 | }
15 |
16 | pub struct LambdaIndexWriterClient {
17 |     queue_url: String,
18 |     client: aws_sdk_sqs::Client,
19 | }
20 |
21 | #[async_trait]
22 | impl IndexWriterClient for LambdaIndexWriterClient {
23 |     async fn submit_job(&self, job: Job) -> Result<String, ServiceError> {
24 |         let body = serde_json::to_string(&job).expect("job should serialize");
25 |
26 |         let response = self
27 |             .client
28 |             .send_message()
29 |             .queue_url(&self.queue_url)
30 |             .message_body(body)
31 |             .message_group_id(job.index_id)
32 |             .send()
33 |             .await
34 |             .expect("job should queue");
35 |
36 |         Ok(response
37 |             .message_id()
38 |             .expect("message id should exist")
39 |             .to_string())
40 |     }
41 | }
42 |
43 | impl LambdaIndexWriterClient {
44 |     pub async fn create(queue_url: Option<&str>) -> LambdaIndexWriterClient {
45 |         let sdk_config = aws_config::load_from_env().await;
46 |
47 |         LambdaIndexWriterClient {
48 |             queue_url: queue_url
49 |                 .map(String::from)
50 |                 .unwrap_or_else(|| util::require_env("INDEX_WRITER_QUEUE_URL")),
51 |             client: aws_sdk_sqs::Client::new(&sdk_config),
52 |         }
53 |     }
54 | }
55 |
56 | #[cfg(test)]
57 | pub mod test_utils {
58 |     use super::*;
59 |     use crate::index::test_util::TestIndexLoader;
60 |     use crate::index::{IndexExt, IndexLoader};
61 |     use crate::store::document::test_util::TestDocumentStore;
62 |     use crate::util;
63 |     use crate::worker::index_writer::handle_job;
64 |
65 |     #[derive(Clone)]
66 |     pub struct TestIndexWriterClient {
67 |         index_loader: TestIndexLoader,
68 |
69 |         document_store: TestDocumentStore,
70 |     }
71 |
72 |     #[async_trait]
73 |     impl IndexWriterClient for TestIndexWriterClient {
74 |         async fn submit_job(&self, job: Job) -> Result<String, ServiceError> {
75 |             let index = self.index_loader.load_index(&job.index_id, None)?;
76 |
77 |             let mut writer = index.default_writer();
78 |
79 |             handle_job(&mut writer, &self.document_store, job).await;
80 |
81 |             writer.commit().unwrap();
82 |
83 |             Ok(util::generate_id())
84 |         }
85 |     }
86 |
87 |     impl TestIndexWriterClient {
88 |         pub fn create(index_loader: TestIndexLoader, document_store: TestDocumentStore) -> Self {
89 |             TestIndexWriterClient {
90 |                 index_loader,
91 |                 document_store,
92 |             }
93 |         }
94 |     }
95 | }
96 |
--------------------------------------------------------------------------------
/packages/pathery/src/worker/index_writer/job.rs:
--------------------------------------------------------------------------------
1 | use serde::{Deserialize, Serialize};
2 |
3 | use crate::search_doc::SearchDocId;
4 | use crate::store::document::SearchDocRef;
5 |
6 | #[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
7 | pub enum IndexWriterOp {
8 |     IndexDoc { doc_ref: SearchDocRef },
9 |
10 |     DeleteDoc { doc_id: SearchDocId },
11 | }
12 |
13 | #[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
14 | pub struct Job {
15 |     pub index_id: String,
16 |     pub ops: Vec<IndexWriterOp>,
17 | }
18 |
19 | impl Job {
20 |     pub fn create(index_id: &str) -> Job {
21 |         Job {
22 |             index_id: index_id.into(),
23 |             ops: vec![],
24 |         }
25 |     }
26 |
27 |     pub fn index_doc(&mut self, doc_ref: SearchDocRef) {
28 |         self.ops.push(IndexWriterOp::IndexDoc { doc_ref })
29 |     }
30 |
31 |     pub fn delete_doc(&mut self, doc_id: SearchDocId) {
32 |         self.ops.push(IndexWriterOp::DeleteDoc { doc_id })
33 |     }
34 | }
35 |
--------------------------------------------------------------------------------
/packages/pathery/src/worker/index_writer/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod client;
2 | pub mod job;
3 |
4 | use std::collections::HashMap;
5 |
6 | use serde_json as json;
7 | use tantivy::{Document, IndexWriter, Term};
8 | use tracing::info;
9 |
10 | use self::job::{IndexWriterOp, Job};
11 | use crate::index::{IndexExt, IndexLoader};
12 | use crate::lambda::{self, sqs};
13 | use crate::store::document::{DocumentStore, SearchDocRef};
14 |
15 | fn delete_doc(writer: &IndexWriter, doc_id: &str) {
16 |     let index = writer.index();
17 |     let id_field = index.id_field();
18 |
19 |     writer.delete_term(Term::from_field_text(id_field, doc_id));
20 |     tracing::info!(message = "doc_deleted", doc_id);
21 | }
22 |
23 | fn index_doc(writer: &IndexWriter, doc: Document) {
24 |     let index = writer.index();
25 |     let id_field = index.id_field();
26 |     let doc_id = doc
27 |         .get_first(id_field)
28 |         .and_then(|id| id.as_text())
29 |         .expect("__id field should be present")
30 |         .to_string();
31 |
32 |     delete_doc(writer, &doc_id);
33 |     writer
34 |         .add_document(doc)
35 |         .expect("Adding a document should not error");
36 |     tracing::info!(message = "doc_indexed", doc_id);
37 | }
38 |
39 | pub async fn handle_job(writer: &mut IndexWriter, document_store: &dyn DocumentStore, job: Job) {
40 |     let schema = writer.index().schema();
41 |
42 |     let mut doc_refs: Vec<SearchDocRef> = vec![];
43 |
44 |     for op in job.ops {
45 |         match op {
46 |             IndexWriterOp::IndexDoc { doc_ref } => doc_refs.push(doc_ref),
47 |
48 |             IndexWriterOp::DeleteDoc { doc_id } => delete_doc(writer, doc_id.id()),
49 |         }
50 |     }
51 |
52 |     let docs = document_store.get_documents(doc_refs).await.unwrap();
53 |
54 |     for doc in docs {
55 |         let document = doc.document(&schema);
56 |         index_doc(writer, document);
57 |     }
58 | }
59 |
60 | pub async fn handle_event(
61 |     document_store: &dyn DocumentStore,
62 |     index_loader: &dyn IndexLoader,
63 |     event: sqs::SqsEvent,
64 | ) -> Result<(), lambda::Error> {
65 |     let records = event.payload.records;
66 |
67 |     let jobs = records
68 |         .iter()
69 |         .map(|message| message.body.as_ref().expect("Body should be present"))
70 |         .map(|body| {
71 |             let msg =
72 |                 json::from_str::<Job>(body.as_str()).expect("Message should be deserializable");
73 |             msg
74 |         })
75 |         .collect::<Vec<_>>();
76 |
77 |     let mut writers: HashMap<String, IndexWriter> = HashMap::new();
78 |
79 |     for job in jobs {
80 |         let index_id = &job.index_id;
81 |         let mut writer = writers.entry(index_id.to_string()).or_insert_with(|| {
82 |             index_loader
83 |                 .load_index(&index_id, None)
84 |                 .unwrap()
85 |                 .default_writer()
86 |         });
87 |
88 |         handle_job(&mut writer, document_store, job).await;
89 |     }
90 |
91 |     for (index, mut writer) in writers.into_iter() {
92 |         writer.commit().expect("commit should succeed");
93 |         info!(message = "index_commit", index);
94 |         writer
95 |             .wait_merging_threads()
96 |             .expect("merge should finish without error");
97 |     }
98 |
99 |     Ok(())
100 | }
101 |
102 | #[cfg(test)]
103 | mod tests {
104 |
105 |     use aws_lambda_events::sqs::{self, SqsMessage};
106 |     use lambda_http::Context;
107 |     use lambda_runtime::LambdaEvent;
108 |
109 |     use super::job::Job;
110 |     use super::{handle_event, *};
111 |     use crate::schema::SchemaLoader;
112 |     use crate::search_doc::SearchDoc;
113 |     use crate::test_utils::*;
114 |
115 |     #[tokio::test]
116 |     async fn test_indexing() {
117 |         let ctx = setup();
118 |
119 |         let schema = ctx.schema_loader().load_schema("test").unwrap();
120 |
121 |         let mut job = Job::create("test");
122 |
123 |         let document = SearchDoc::from_json(
124 |             &schema,
125 |             json!({
126 |                 "year": 1989
127 |             }),
128 |         )
129 |         .unwrap();
130 |
131 |         let doc_refs = ctx
132 |             .document_store()
133 |             .save_documents(vec![document])
134 |             .await
135 |             .unwrap();
136 |
137 |         for doc_ref in doc_refs {
138 |             job.index_doc(doc_ref);
139 |         }
140 |
141 |         let message = SqsMessage {
142 |             body: Some(json::to_string(&job).unwrap()),
143 |             ..Default::default()
144 |         };
145 |
146 |         let event = sqs::SqsEvent {
147 |             records: vec![message],
148 |         };
149 |
150 |         handle_event(
151 |             ctx.document_store(),
152 |             ctx.index_loader(),
153 |             LambdaEvent::new(event, Context::default()),
154 |         )
155 |         .await
156 |         .unwrap();
157 |
158 |         assert_eq!(
159 |             1,
160 |             ctx.index_loader()
161 |                 .load_index("test", None)
162 |                 .unwrap()
163 |                 .reader()
164 |                 .unwrap()
165 |                 .searcher()
166 |                 .num_docs()
167 |         );
168 |     }
169 | }
170 |
--------------------------------------------------------------------------------
/packages/pathery/src/worker/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod async_delete;
2 | pub mod index_writer;
3 |
--------------------------------------------------------------------------------
/pnpm-workspace.yaml:
--------------------------------------------------------------------------------
1 | packages:
2 |   - "app"
3 |   - "integration-test"
4 |   - "handlers/**"
5 |   - "packages/**"
6 |
--------------------------------------------------------------------------------
/rustfmt.toml:
--------------------------------------------------------------------------------
1 | comment_width = 120
2 | format_strings = true
3 | group_imports = "StdExternalCrate"
4 | imports_granularity = "Module"
5 | normalize_comments = true
6 | where_single_line = true
7 | wrap_comments = true
--------------------------------------------------------------------------------
/turbo.json:
--------------------------------------------------------------------------------
1 | {
2 |   "$schema": "https://turbo.build/schema.json",
3 |   "pipeline": {
4 |     "test": {
5 |       "outputs": []
6 |     },
7 |     "//#build:lambda": {
8 |       "inputs": [
9 |         "Cargo.lock",
10 |         "Cargo.toml",
11 |         "packages/**/*.rs",
12 |         "packages/**/Cargo.toml",
13 |         "handlers/**/*.rs",
14 |         "handlers/**/Cargo.toml",
15 |         ".cargo"
".cargo" 16 | ], 17 | "outputs": ["target/lambda"] 18 | }, 19 | "build": { 20 | "dependsOn": ["//#build:lambda", "^build"], 21 | "outputs": ["lib"] 22 | }, 23 | "synth": { 24 | "dependsOn": ["^build"], 25 | "outputs": ["cdk.out"] 26 | }, 27 | "deploy": { 28 | "dependsOn": ["^build"], 29 | "outputs": ["cdk.out"] 30 | } 31 | } 32 | } 33 | --------------------------------------------------------------------------------