├── .dockerignore ├── .eslintignore ├── .eslintrc ├── .gitignore ├── .prettierrc ├── README.md ├── docs ├── labels.png ├── set_webhook.png └── setting_webhook.png ├── infra └── cdk │ ├── .npmignore │ ├── README.md │ ├── bin │ └── cdk.ts │ ├── cdk.json │ ├── jest.config.js │ ├── lib │ └── omnivore-tagging-stack.ts │ ├── package-lock.json │ ├── package.json │ ├── test │ └── cdk.test.ts │ └── tsconfig.json ├── package-lock.json ├── package.json ├── src ├── app.ts ├── clients │ ├── ai │ │ ├── bedrock.ts │ │ ├── client.ts │ │ ├── openAi.ts │ │ └── prompt.ts │ └── omnivore │ │ └── omnivore.ts ├── env.ts ├── lambda.ts ├── lib │ ├── article.ts │ ├── embedding.ts │ ├── labels.ts │ ├── service │ │ └── page.ts │ ├── store │ │ ├── Store.ts │ │ ├── labelDynamoDBStore.ts │ │ └── labelLocal.ts │ └── util │ │ ├── cache.ts │ │ ├── logger.ts │ │ └── math.ts ├── resources │ └── config.ts ├── routes │ └── webhook.ts └── types │ ├── AiClient.ts │ ├── Bedrock.ts │ ├── Embedded.ts │ ├── Feeds.ts │ ├── OmnivoreArticle.ts │ ├── OmnivoreSchema.ts │ ├── Webhook.ts │ └── globals.d.ts └── tsconfig.json /.dockerignore: -------------------------------------------------------------------------------- 1 | cdk* -------------------------------------------------------------------------------- /.eslintignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | src/lib/util/logger.ts -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "eslint:recommended", 4 | "plugin:@typescript-eslint/recommended", 5 | "plugin:@typescript-eslint/recommended-requiring-type-checking", 6 | "plugin:prettier/recommended" 7 | ], 8 | "parser": "@typescript-eslint/parser", 9 | "parserOptions": { 10 | "ecmaVersion": "latest", 11 | "project": "tsconfig.json" 12 | }, 13 | "plugins": ["@typescript-eslint"], 14 | "rules": { 15 | 
"semi": [2, "never"], 16 | "@typescript-eslint/require-await": "off", 17 | "@typescript-eslint/no-unused-vars": [ 18 | "warn", 19 | { 20 | "argsIgnorePattern": "^_", 21 | "ignoreRestSiblings": true, 22 | "varsIgnorePattern": "^_", 23 | "caughtErrorsIgnorePattern": "^_" 24 | } 25 | ], 26 | "@typescript-eslint/no-misused-promises": ["error", { 27 | "checksVoidReturn": false 28 | }], 29 | "@typescript-eslint/no-floating-promises": "off", 30 | "@typescript-eslint/no-unsafe-call": "warn", 31 | "@typescript-eslint/no-unsafe-assignment": "warn", 32 | "@typescript-eslint/no-unsafe-member-access": "warn", 33 | "@typescript-eslint/no-unsafe-return": "warn" 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # IntelliJ IDEA 2 | .idea/ 3 | 4 | # Compiled files 5 | node_modules/ 6 | dist/ 7 | out/ 8 | 9 | # Editor-specific files 10 | *.iml 11 | *.ipr 12 | *.iws 13 | .idea/ 14 | 15 | # Logs and databases 16 | *.log 17 | *.sqlite 18 | *.sqlite3 19 | 20 | # OS-specific files 21 | .DS_Store 22 | Thumbs.db 23 | 24 | cdk.out -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "semi": false, 3 | "singleQuote": true 4 | } 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Omnivore Automatic Labelling 2 | 3 | This project aims to implement automatic labeling using Omnivores built in Webhook Functionality and Open AI's Embedding API. 4 | 5 | This repository provides a way to deploy this via AWS using API Gateway, and Lambda. I also have included a Fastify Docker Image that can be deployed to any webserver. 
6 | 7 | ## Improving labeling Accuracy: 8 | 9 | Labeling accuracy can be improved by adding a short description to each Label in Omnivore. This gives the embedding model more context. 10 | 11 | ![label](./docs/labels.png) 12 | 13 | Note that the article's title and description are what get embedded and compared against your labels. 14 | 15 | ## Labeling Filtering 16 | 17 | A filter list is included in `src/resources/config.ts`. 18 | 19 | It currently filters out the RSS and Newsletter tags. These are system defaults and rarely used. 20 | 21 | You can also add your own tags to this list to prevent them from being applied automatically. 22 | 23 | ## Labeling Strategies 24 | 25 | Three labeling strategies are provided. The active strategy can be changed in `src/resources/config.ts`. 26 | 27 | ### PERCENTAGES 28 | 29 | This is the default method. 30 | 31 | It uses a set of percentage thresholds to decide which labels to add. 32 | 33 | ``` 34 | { 35 | "percentage": 0.9, 36 | "maxLabels": 5 37 | }, 38 | ``` 39 | 40 | The above adds up to 5 labels whose similarity to the article is above 90%. 41 | 42 | 43 | ### HIGHEST_SIMILARITY 44 | 45 | This adds `maxLabels` labels to each article. 46 | 47 | It always picks the labels with the highest similarity. 48 | 49 | ### PER_LABEL_PERCENTAGE 50 | 51 | This allows you to set an individual threshold per tag. If a tag is not in the list, or its similarity is below its threshold, it will not be added. 52 | 53 | 54 | ## Technology 55 | 56 | * OpenAI Embedding Models 57 | * Omnivore API 58 | * AWS API Gateway 59 | * AWS Lambda 60 | * DynamoDB (Storage to avoid constantly recalculating the embedding) 61 | 62 | ## How to Deploy 63 | 64 | The AWS CDK is used to simplify deployment. This requires you to have an AWS Account. 
65 | 66 | ``npx cdk deploy -c open_api_key= -c omnivore_auth=`` 67 | 68 | This will provide an API Gateway, the endpoint will be the API Gateway Endpoint /page 69 | 70 | This can then be added in the Omnivore Webhook 71 | 72 | ![Webhook](./docs/setting_webhook.png) 73 | 74 | ![Webhook](./docs/set_webhook.png) 75 | 76 | 77 | ## TODO: 78 | 79 | I need to add updated labels. At the moment in the deployed version the only way to update the labels is to delete them from DynamoDB. In the hosted version it would be to restart the server. It does not currently add new labels, but we could add this later. 80 | 81 | -------------------------------------------------------------------------------- /docs/labels.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Podginator/omnivore-automatic-labelling/06109be4ed9a91439bceec4ccd7be6a9854d0131/docs/labels.png -------------------------------------------------------------------------------- /docs/set_webhook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Podginator/omnivore-automatic-labelling/06109be4ed9a91439bceec4ccd7be6a9854d0131/docs/set_webhook.png -------------------------------------------------------------------------------- /docs/setting_webhook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Podginator/omnivore-automatic-labelling/06109be4ed9a91439bceec4ccd7be6a9854d0131/docs/setting_webhook.png -------------------------------------------------------------------------------- /infra/cdk/.npmignore: -------------------------------------------------------------------------------- 1 | *.ts 2 | !*.d.ts 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | -------------------------------------------------------------------------------- /infra/cdk/README.md: 
-------------------------------------------------------------------------------- 1 | # Welcome to your CDK TypeScript project 2 | 3 | This is a blank project for CDK development with TypeScript. 4 | 5 | The `cdk.json` file tells the CDK Toolkit how to execute your app. 6 | 7 | ## Useful commands 8 | 9 | * `npm run build` compile typescript to js 10 | * `npm run watch` watch for changes and compile 11 | * `npm run test` perform the jest unit tests 12 | * `npx cdk deploy` deploy this stack to your default AWS account/region 13 | * `npx cdk diff` compare deployed stack with current state 14 | * `npx cdk synth` emits the synthesized CloudFormation template 15 | -------------------------------------------------------------------------------- /infra/cdk/bin/cdk.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | import 'source-map-support/register'; 3 | import * as cdk from 'aws-cdk-lib'; 4 | import { OmnivoreTaggingStack } from '../lib/omnivore-tagging-stack'; 5 | 6 | const app = new cdk.App(); 7 | new OmnivoreTaggingStack(app, 'CdkStack', { 8 | /* If you don't specify 'env', this stack will be environment-agnostic. 9 | * Account/Region-dependent features and context lookups will not work, 10 | * but a single synthesized template can be deployed anywhere. */ 11 | 12 | /* Uncomment the next line to specialize this stack for the AWS Account 13 | * and Region that are implied by the current CLI configuration. */ 14 | // env: { account: process.env.CDK_DEFAULT_ACCOUNT, region: process.env.CDK_DEFAULT_REGION }, 15 | 16 | /* Uncomment the next line if you know exactly what Account and Region you 17 | * want to deploy the stack to. 
*/ 18 | // env: { account: '123456789012', region: 'us-east-1' }, 19 | 20 | /* For more information, see https://docs.aws.amazon.com/cdk/latest/guide/environments.html */ 21 | }); -------------------------------------------------------------------------------- /infra/cdk/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "npx ts-node --prefer-ts-exts bin/cdk.ts", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "**/*.d.ts", 11 | "**/*.js", 12 | "tsconfig.json", 13 | "package*.json", 14 | "yarn.lock", 15 | "node_modules", 16 | "test" 17 | ] 18 | }, 19 | "context": { 20 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 21 | "@aws-cdk/core:checkSecretUsage": true, 22 | 23 | "@aws-cdk/core:target-partitions": [ 24 | "aws", 25 | "aws-cn" 26 | ], 27 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 28 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 29 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 30 | "@aws-cdk/aws-iam:minimizePolicies": true, 31 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 32 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 33 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 34 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 35 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 36 | "@aws-cdk/core:enablePartitionLiterals": true, 37 | "@aws-cdk/aws-events:eventsTargetQueueSameAccount": true, 38 | "@aws-cdk/aws-iam:standardizedServicePrincipals": true, 39 | "@aws-cdk/aws-ecs:disableExplicitDeploymentControllerForCircuitBreaker": true, 40 | "@aws-cdk/aws-iam:importedRoleStackSafeDefaultPolicyName": true, 41 | "@aws-cdk/aws-s3:serverAccessLogsUseBucketPolicy": true, 42 | "@aws-cdk/aws-route53-patters:useCertificate": true, 43 | "@aws-cdk/customresources:installLatestAwsSdkDefault": false, 44 | 
"@aws-cdk/aws-rds:databaseProxyUniqueResourceName": true, 45 | "@aws-cdk/aws-codedeploy:removeAlarmsFromDeploymentGroup": true, 46 | "@aws-cdk/aws-apigateway:authorizerChangeDeploymentLogicalId": true, 47 | "@aws-cdk/aws-ec2:launchTemplateDefaultUserData": true, 48 | "@aws-cdk/aws-secretsmanager:useAttachedSecretResourcePolicyForSecretTargetAttachments": true, 49 | "@aws-cdk/aws-redshift:columnId": true, 50 | "@aws-cdk/aws-stepfunctions-tasks:enableEmrServicePolicyV2": true, 51 | "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true, 52 | "@aws-cdk/aws-apigateway:requestValidatorUniqueId": true, 53 | "@aws-cdk/aws-kms:aliasNameRef": true, 54 | "@aws-cdk/aws-autoscaling:generateLaunchTemplateInsteadOfLaunchConfig": true, 55 | "@aws-cdk/core:includePrefixInUniqueNameGeneration": true, 56 | "@aws-cdk/aws-efs:denyAnonymousAccess": true, 57 | "@aws-cdk/aws-opensearchservice:enableOpensearchMultiAzWithStandby": true, 58 | "@aws-cdk/aws-lambda-nodejs:useLatestRuntimeVersion": true, 59 | "@aws-cdk/aws-efs:mountTargetOrderInsensitiveLogicalId": true, 60 | "@aws-cdk/aws-rds:auroraClusterChangeScopeOfInstanceParameterGroupWithEachParameters": true, 61 | "@aws-cdk/aws-appsync:useArnForSourceApiAssociationIdentifier": true, 62 | "@aws-cdk/aws-rds:preventRenderingDeprecatedCredentials": true, 63 | "@aws-cdk/aws-codepipeline-actions:useNewDefaultBranchForCodeCommitSource": true 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /infra/cdk/jest.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | testEnvironment: 'node', 3 | roots: ['/test'], 4 | testMatch: ['**/*.test.ts'], 5 | transform: { 6 | '^.+\\.tsx?$': 'ts-jest' 7 | } 8 | }; 9 | -------------------------------------------------------------------------------- /infra/cdk/lib/omnivore-tagging-stack.ts: -------------------------------------------------------------------------------- 1 | import * as dotenv from 
/**
 * CloudFormation stack for the Omnivore automatic-labelling webhook.
 *
 * Provisions:
 *  - a DynamoDB table (`labelEmbeddings`, string partition key `id`),
 *  - a Node.js 20 Lambda bundled from `src/lambda.ts`,
 *  - a regional REST API exposing `POST /page`, which invokes the Lambda
 *    asynchronously (invocation type `Event`) and answers `200`.
 *
 * The `open_api_key` and `omnivore_auth` values are read from CDK context
 * (`-c open_api_key=... -c omnivore_auth=...`) via `getContext`.
 */
export class OmnivoreTaggingStack extends cdk.Stack {
  constructor(scope: Construct, id: string, props?: cdk.StackProps) {
    super(scope, id, props);

    const tableName = "labelEmbeddings";

    // Table is destroyed together with the stack (RemovalPolicy.DESTROY).
    const table = new dynamodb.Table(this, `${tableName}Table`, {
      tableName: tableName,
      partitionKey: {
        name: "id",
        type: dynamodb.AttributeType.STRING,
      },
      removalPolicy: RemovalPolicy.DESTROY,
      billingMode: dynamodb.BillingMode.PAY_PER_REQUEST,
    });

    const lambdaFn = new Lambda.NodejsFunction(this, "IncomingWebhook", {
      entry: path.join(__dirname, "../../../src/lambda.ts"),
      depsLockFilePath: path.join(__dirname, "../../../package-lock.json"),
      handler: "handler",
      runtime: Runtime.NODEJS_20_X,
      memorySize: 1024,
      // Without an explicit timeout the function gets the 3 second default,
      // which is too tight for a handler that awaits remote API calls.
      timeout: cdk.Duration.seconds(30),
      // NOTE(review): both secrets end up as plaintext Lambda environment
      // variables; consider Secrets Manager / SSM if that is a concern.
      environment: {
        OPENAI_API_KEY: this.node.getContext("open_api_key"),
        OMNIVORE_API_KEY: this.node.getContext("omnivore_auth"),
        DYNAMODB_TABLE_NAME: table.tableName
      },
    });

    table.grantReadWriteData(lambdaFn);

    const gateway = new apigw.RestApi(this, 'OmnivoreWebhookRestAPI', {
      endpointTypes: [apigw.EndpointType.REGIONAL],
      deployOptions: {
        metricsEnabled: true,
        loggingLevel: apigw.MethodLoggingLevel.INFO,
        // NOTE(review): data tracing is enabled on the `prod` stage —
        // confirm that logging request data is intended.
        dataTraceEnabled: true,
        stageName: 'prod',
        methodOptions: {
          '/*/*': {
            throttlingRateLimit: 100,
            throttlingBurstLimit: 200,
          },
        },
      },
    });

    const basePath = gateway.root.addResource('page');

    // Non-proxy integration: the header below makes API Gateway invoke the
    // Lambda asynchronously, so the webhook caller is not kept waiting.
    const lambdaIntegration = new apigw.LambdaIntegration(lambdaFn, {
      proxy: false,
      requestParameters: {'integration.request.header.X-Amz-Invocation-Type': "'Event'"},
      integrationResponses: [
        {
          statusCode: '200',
        },
      ],
    });

    basePath.addMethod('POST', lambdaIntegration, {
      methodResponses: [
        {
          statusCode: '200',
        },
      ],
    });

  }
}
To run these tests, uncomment this file along with the 6 | // example resource in lib/omnivore-tagging-stack.ts 7 | test('SQS Queue Created', () => { 8 | // const app = new cdk.App(); 9 | // // WHEN 10 | // const stack = new Cdk.OmnivoreTaggingStack(app, 'MyTestStack'); 11 | // // THEN 12 | // const template = Template.fromStack(stack); 13 | 14 | // template.hasResourceProperties('AWS::SQS::Queue', { 15 | // VisibilityTimeout: 300 16 | // }); 17 | }); 18 | -------------------------------------------------------------------------------- /infra/cdk/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2020", 4 | "module": "commonjs", 5 | "lib": [ 6 | "es2020", 7 | "dom" 8 | ], 9 | "declaration": true, 10 | "strict": true, 11 | "noImplicitAny": true, 12 | "strictNullChecks": true, 13 | "noImplicitThis": true, 14 | "alwaysStrict": true, 15 | "noUnusedLocals": false, 16 | "noUnusedParameters": false, 17 | "noImplicitReturns": true, 18 | "noFallthroughCasesInSwitch": false, 19 | "inlineSourceMap": true, 20 | "inlineSources": true, 21 | "experimentalDecorators": true, 22 | "strictPropertyInitialization": false, 23 | "typeRoots": [ 24 | "./node_modules/@types" 25 | ] 26 | }, 27 | "exclude": [ 28 | "node_modules", 29 | "cdk.out" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "omnivore-automatic-tagging", 3 | "version": "1.0.0", 4 | "description": "An automatic tagging solution for Omnivore using Omnivore Webhooks", 5 | "main": "index.js", 6 | "scripts": { 7 | "build": "tsc", 8 | "dev": "ts-node-dev --files src/app.ts", 9 | "lint": "eslint src --ext ts,js,tsx,jsx", 10 | "lint:fix": "eslint src --fix --ext ts,js,tsx,jsx" 11 | }, 12 | "author": "Thomas Rogers", 13 | "license": "ISC", 14 | "dependencies": { 15 | 
/** Fastify application instance; exported so it can be reused outside this module. */
export const app = fastify()

webhook.registerRoutes(app)

// Error Handling - Uses http-errors
// Replies with the error's own status code when it carries one, else 500.
app.setErrorHandler((err: FastifyError, _req, reply) => {
  if (!err) {
    return reply.status(500).send()
  }

  if (err.statusCode) {
    return reply.status(err.statusCode).send(err)
  }

  return reply.status(500).send(err)
})

// Only start listening when this file is the process entry point.
if (require.main === module) {
  app.listen({ port: 3000 }, (err) => {
    // A failed bind (e.g. port already in use) must not be reported as success.
    if (err) {
      console.error(err)
      process.exit(1)
    }
    console.log('Listening on Port 3000')
  })
}
/**
 * `AiClient` implementation backed by the AWS Bedrock runtime HTTP API.
 *
 * Requests go through an axios instance whose request interceptor signs them
 * with SigV4 (`aws4-axios`) for the `bedrock` service in the given region.
 */
export class BedrockClient implements AiClient {
  client: AxiosInstance
  tokenLimit = 100_000 // (Perhaps. Not even sure of the validity of this.)
  embeddingLimit = 8000

  /**
   * @param params Region and endpoint of the Bedrock runtime; defaults to
   *   `us-west-2`.
   */
  constructor(
    params: BedrockClientParams = {
      region: 'us-west-2',
      endpoint: 'https://bedrock-runtime.us-west-2.amazonaws.com',
    },
  ) {
    this.client = axios.create({
      baseURL: params.endpoint,
    })
    const interceptor = aws4Interceptor({
      options: {
        region: params.region,
        service: 'bedrock',
      },
    })

    this.client.interceptors.request.use(interceptor)
    this.client.defaults.headers.common['Accept'] = '*/*'
    this.client.defaults.headers.common['Content-Type'] = 'application/json'
  }

  /**
   * Builds the HTTP body for an invoke call: everything in `invokeParams`
   * except `model` (which goes in the URL), with the prompt wrapped in the
   * Human/Assistant turn markers.
   */
  _extractHttpBody(
    invokeParams: BedrockInvokeParams,
  ): Partial<BedrockInvokeParams> {
    const { model: _, prompt, ...httpCommands } = invokeParams
    return { ...httpCommands, prompt: this._wrapPrompt(prompt) }
  }

  /**
   * Wraps a raw prompt in the Claude text-completion turn format, which
   * requires a blank line before both `Human:` and `Assistant:`.
   */
  _wrapPrompt(prompt: string): string {
    return `\n\nHuman: ${prompt}\n\nAssistant:`
  }

  /**
   * Requests an embedding for `text` from `cohere.embed-english-v3`.
   *
   * @throws Error when the response contains no embedding.
   */
  async getEmbeddings(text: string): Promise<Embedding> {
    const { data } = await this.client.post<BedrockClientResponse>(
      `/model/cohere.embed-english-v3/invoke`,
      { texts: [text], input_type: 'clustering' },
    )
    const embedding = data.embeddings?.[0]
    if (embedding === undefined) {
      throw new Error('Bedrock returned no embedding for the supplied text')
    }
    return embedding
  }

  /**
   * Asks `anthropic.claude-v2` to summarise `text` using `SUMMARISE_PROMPT`
   * and returns the `completion` field of the response.
   */
  async summarizeText(text: string): Promise<string> {
    const summariseParams = {
      model: 'anthropic.claude-v2',
      max_tokens_to_sample: 8192,
      temperature: 1,
      top_k: 250,
      top_p: 0.999,
      // Real newlines, matching the turn marker emitted by _wrapPrompt; the
      // previous value contained literal backslashes and could never match.
      stop_sequences: ['\n\nHuman:'],
      anthropic_version: 'bedrock-2023-05-31',
      prompt: SUMMARISE_PROMPT(text),
    }

    const { data } = await this.client.post<BedrockClientResponse>(
      `/model/${summariseParams.model}/invoke`,
      this._extractHttpBody(summariseParams),
    )
    return data.completion
  }
}
export type OpenAiParams = {
  // defaults to process.env["OPENAI_API_KEY"], falling back to the legacy
  // process.env["OPEN_AI_KEY"]
  apiKey: string
}

/**
 * `AiClient` implementation backed by the OpenAI API.
 *
 * Embeddings use `text-embedding-ada-002`; summaries use `gpt-3.5-turbo`.
 */
export class OpenAiClient implements AiClient {
  client: OpenAI
  tokenLimit = 4096
  embeddingLimit = 8191

  /**
   * @param openAiParams Client options. When omitted, the API key is read
   *   from `OPENAI_API_KEY` (the variable the CDK stack sets on the Lambda)
   *   and, failing that, from the older `OPEN_AI_KEY`.
   */
  constructor(
    openAiParams: OpenAiParams = {
      apiKey: (process.env.OPENAI_API_KEY ?? process.env.OPEN_AI_KEY)!,
    },
  ) {
    this.client = new OpenAI(openAiParams)
  }

  /**
   * Returns the embedding vector for `input`.
   *
   * @throws Error when the API response contains no embedding.
   */
  async getEmbeddings(input: string): Promise<Embedding> {
    const response = await this.client.embeddings.create({
      input,
      model: 'text-embedding-ada-002',
    })

    const first = response.data[0]
    if (first === undefined) {
      throw new Error('OpenAI returned no embedding for the supplied input')
    }
    return first.embedding
  }

  /**
   * Summarises `text` with a single, non-streaming chat completion.
   *
   * @returns The content of the first choice, or `''` when there is none.
   */
  async summarizeText(text: string): Promise<string> {
    const prompt = `${SUMMARISE_PROMPT(text)}`
    const completion = await this.client.chat.completions.create({
      messages: [{ role: 'user', content: prompt }],
      model: 'gpt-3.5-turbo',
      stream: false,
    })

    return completion.choices[0]?.message?.content ?? ''
  }
}
/**
 * Thin client for the Omnivore GraphQL API.
 *
 * Every request is a POST to `${API_URL}/graphql` authenticated with an
 * `auth=<token>` cookie. Instances are created through
 * {@link OmnivoreClient.createOmnivoreClient}, which first resolves the
 * username that belongs to the token.
 *
 * NOTE(review): the queries select `...Error { errorCodes }` variants, but
 * the methods read the success shape unconditionally — an error response
 * would surface as `undefined` rather than as a thrown error.
 */
export class OmnivoreClient {
  username: string
  token: string

  private constructor(username: string, token: string) {
    this.username = username
    this.token = token
  }

  /** Resolves the username for `token` and returns a ready-to-use client. */
  static async createOmnivoreClient(token: string): Promise<OmnivoreClient> {
    return new OmnivoreClient(await this.getUsername(token), token)
  }

  /** Request options (auth cookie + JSON content type) shared by all calls. */
  private static requestConfig(token: string) {
    return {
      headers: {
        Cookie: `auth=${token};`,
        'Content-Type': 'application/json',
      },
    }
  }

  /** Fetches the profile username of the account that owns `token`. */
  private static async getUsername(token: string): Promise<string> {
    const data = JSON.stringify({
      query: `query GetUsername {
        me {
          profile {
            username
          }
        }
      }
      `,
    })

    const response = await axios
      .post(`${API_URL}/graphql`, data, OmnivoreClient.requestConfig(token))
      .catch((error) => {
        console.error(error)
        throw error
      })

    return response.data.data.me.profile.username as string
  }

  /** Returns up to 1000 search edges matching `in:inbox`. */
  async fetchPages(): Promise<SearchItemEdge[]> {
    const data = {
      query: `query Search($after: String, $first: Int, $query: String) {
        search(first: $first, after: $after, query: $query) {
          ... on SearchSuccess {
            edges {
              cursor
              node {
                id
                title
                slug
                url
                pageType
                contentReader
                createdAt
                isArchived
                author
                image
                description
                publishedAt
                ownedByViewer
                originalArticleUrl
                uploadFileId
                labels {
                  id
                  name
                  color
                }
                pageId
                shortId
                quote
                annotation
                state
                siteName
                subscription
                readAt
                savedAt
                wordsCount
              }
            }
            pageInfo {
              hasNextPage
              hasPreviousPage
              startCursor
              endCursor
              totalCount
            }
          }
          ... on SearchError {
            errorCodes
          }
        }
      }`,
      variables: { query: 'in:inbox', after: '0', first: 1000 },
    }

    // Authenticate with the instance token like every other method, rather
    // than a separate OMNIVORE_AUTH_TOKEN environment variable.
    const response = await axios
      .post(
        `${API_URL}/graphql`,
        data,
        OmnivoreClient.requestConfig(this.token),
      )
      .catch((error) => {
        console.error(error)
        throw error
      })

    return response.data.data.search.edges as SearchItemEdge[]
  }

  /** Fetches a single article (including `content`) by its slug. */
  async fetchPage(slug: string): Promise<Article> {
    const data = JSON.stringify({
      variables: {
        username: this.username,
        slug,
      },
      query: `query GetArticle(
        $username: String!
        $slug: String!
      ) {
        article(username: $username, slug: $slug) {
          ... on ArticleSuccess {
            article {
              id,
              title,
              url,
              author,
              savedAt,
              description,
              image
              content
            }
          }
          ... on ArticleError {
            errorCodes
          }
        }
      }
      `,
    })

    const response: AxiosResponse<{ data: { article: ArticleSuccess } }> =
      await axios.post(
        `${API_URL}/graphql`,
        data,
        OmnivoreClient.requestConfig(this.token),
      )

    return response.data.data.article.article
  }

  /** Returns all labels defined on the account. */
  async getUsersTags(): Promise<Label[]> {
    const data = JSON.stringify({
      query: `query GetLabels{
        labels {
          ... on LabelsSuccess {
            labels {
              id,
              name,
              color,
              description,
              createdAt,
              position,
              internal
            }
          }
          ... on LabelsError {
            errorCodes
          }
        }
      }
      `,
    })

    const response: AxiosResponse<{ data: { labels: LabelsSuccess } }> =
      await axios.post(
        `${API_URL}/graphql`,
        data,
        OmnivoreClient.requestConfig(this.token),
      )

    return response.data.data.labels.labels
  }

  /** Runs the `setLabels` mutation for `pageId` with the ids of `labels`. */
  async setLabels(pageId: string, labels: Label[]): Promise<void> {
    const mutation = `mutation SetLabels($input: SetLabelsInput!) {
      setLabels(input: $input) {
        ... on SetLabelsSuccess {
          labels {
            ...LabelFields
          }
        }
        ... on SetLabelsError {
          errorCodes
        }
      }
    }

    fragment LabelFields on Label {
      id
      name
      color
      description
      createdAt
    }`

    const labelIds = labels.map((it) => it.id)

    await axios.post(
      `${API_URL}/graphql`,
      { query: mutation, variables: { input: { pageId, labelIds } } },
      OmnivoreClient.requestConfig(this.token),
    )
  }

  /**
   * Archives the link with the given id.
   *
   * @returns `true` once the HTTP request has completed; a failed request
   *   rejects instead.
   */
  async archiveLink(id: string): Promise<boolean> {
    const mutation = `mutation ArchivePage($id: ID!) {
      setLinkArchived (input: {linkId: $id, archived: true}) {
        ... on ArchiveLinkSuccess {
          linkId
          message
        }
        ... on ArchiveLinkError {
          message
          errorCodes
        }
      }
    }`

    await axios.post(
      `${API_URL}/graphql`,
      { query: mutation, variables: { id } },
      OmnivoreClient.requestConfig(this.token),
    )
    return true
  }
}
/**
 * Lambda entry point for the Omnivore webhook.
 *
 * Logs the incoming event and labels the page via
 * `retrieveSimilarLabelsForPage`, unless the event should be skipped:
 *  - the payload carries no `page`, or
 *  - the page state is not `SUCCEEDED` and the action is not `updated`.
 *
 * NOTE(review): `Context` is imported from 'aws-cdk/lib/settings', which
 * looks like the CDK CLI settings type rather than the Lambda runtime
 * context — the argument is unused here, but confirm the intended type.
 *
 * @param event Webhook payload forwarded by API Gateway.
 * @param _ Unused invocation context.
 */
export const handler = async (
  event: PageWebhookInput,
  _: Context,
): Promise<void> => {
  // Guard before touching event.page.*: a payload without a page would
  // otherwise throw on the first log line.
  if (!event?.page) {
    logger.error('Received a webhook event without a page, exiting early...')
    return
  }

  logger.log(`Retrieved Page: ${event.page.title}, Trying to get Labels...`)
  logger.log(JSON.stringify(event))
  if (event.page.state !== 'SUCCEEDED' && event.action !== 'updated') {
    logger.error('Not able to parse an incomplete article, exiting early...')
    return
  }
  await retrieveSimilarLabelsForPage(event.page)
}
/**
 * PER_LABEL_PERCENTAGE strategy: keeps a label only when it has an entry in
 * `SELECTION_CONFIG.labels` and its similarity is strictly greater than that
 * entry. Input order is preserved.
 */
const getPerTag = (
  labelProbability: { label: Label; similarity: number }[],
): Label[] => {
  const thresholds = SELECTION_CONFIG.labels!
  const selected: Label[] = []

  for (const { label, similarity } of labelProbability) {
    const threshold = thresholds[label.name]
    // Labels without a configured threshold are never applied.
    if (threshold == undefined) {
      continue
    }
    if (threshold < similarity) {
      selected.push(label)
    }
  }

  return selected
}