├── .nvmrc ├── .cursorignore ├── services └── meridian-ml-service │ ├── src │ └── meridian_ml_service │ │ ├── __init__.py │ │ ├── schemas.py │ │ ├── config.py │ │ ├── main.py │ │ ├── dependencies.py │ │ └── embeddings.py │ ├── .env.example │ ├── .dockerignore │ ├── fly.toml │ ├── README.md │ ├── pyproject.toml │ └── Dockerfile ├── apps ├── frontend │ ├── src │ │ ├── public │ │ │ ├── robots.txt │ │ │ ├── favicon.ico │ │ │ ├── favicon-16x16.png │ │ │ ├── favicon-32x32.png │ │ │ ├── apple-touch-icon.png │ │ │ ├── android-chrome-192x192.png │ │ │ ├── android-chrome-512x512.png │ │ │ └── site.webmanifest │ │ ├── server │ │ │ ├── tsconfig.json │ │ │ ├── api │ │ │ │ ├── briefs │ │ │ │ │ ├── latest.get.ts │ │ │ │ │ ├── index.get.ts │ │ │ │ │ └── [slug] │ │ │ │ │ │ └── index.get.ts │ │ │ │ ├── reports.get.ts │ │ │ │ ├── admin │ │ │ │ │ ├── login.post.ts │ │ │ │ │ └── sources │ │ │ │ │ │ ├── [id] │ │ │ │ │ │ ├── init-dos.post.ts │ │ │ │ │ │ ├── index.delete.ts │ │ │ │ │ │ └── details.get.ts │ │ │ │ │ │ ├── index.post.ts │ │ │ │ │ │ └── index.get.ts │ │ │ │ └── subscribe.post.ts │ │ │ └── lib │ │ │ │ └── utils.ts │ │ ├── app.vue │ │ ├── shared │ │ │ └── types.ts │ │ ├── composables │ │ │ ├── useSEO.ts │ │ │ ├── useReadingProgess.ts │ │ │ ├── useStickyElement.ts │ │ │ └── useTableOfContents.ts │ │ ├── pages │ │ │ ├── briefs │ │ │ │ ├── latest.vue │ │ │ │ └── index.vue │ │ │ ├── admin │ │ │ │ └── login.vue │ │ │ └── index.vue │ │ ├── layouts │ │ │ ├── admin.vue │ │ │ └── default.vue │ │ ├── plugins │ │ │ └── markdown.ts │ │ └── components │ │ │ └── SubscriptionForm.vue │ ├── tsconfig.json │ ├── eslint.config.mjs │ ├── .env.example │ ├── .gitignore │ ├── tailwind.config.ts │ ├── package.json │ ├── nuxt.config.ts │ └── README.md ├── backend │ ├── tsconfig.build.json │ ├── test │ │ ├── tsconfig.json │ │ ├── parseRss.spec.ts │ │ ├── utils.spec.ts │ │ ├── parseArticle.spec.ts │ │ ├── rateLimiter.spec.ts │ │ └── fixtures │ │ │ └── ft_com.xml │ ├── biome.json │ ├── src │ │ ├── lib │ │ │ ├── tryCatchAsync.ts │ │ │ ├── embeddings.ts │ │ │ ├── utils.ts │ │ │ ├── logger.ts │ │ │ ├── parsers.ts │ │ │ ├── rateLimiter.ts │ │ │ └── articleFetchers.ts │ │ ├── app.ts │ │ ├── prompts │ │ │ └── articleRepresentation.prompt.ts │ │ ├── routers │ │ │ ├── reports.router.ts │ │ │ ├── events.router.ts │ │ │ ├── sources.router.ts │ │ │ ├── openGraph.router.ts │ │ │ └── durableObjects.router.ts │ │ └── index.ts │ ├── package.json │ ├── tsconfig.json │ └── wrangler.jsonc └── briefs │ └── src │ ├── events.py │ └── llm.py ├── packages └── database │ ├── migrations │ ├── 0000_blushing_boomerang.sql │ ├── 0003_regular_swordsman.sql │ ├── 0002_calm_sebastian_shaw.sql │ ├── meta │ │ ├── 0000_snapshot.json │ │ └── _journal.json │ └── 0001_premium_wolfpack.sql │ ├── .env.example │ ├── README.MD │ ├── Makefile │ ├── drizzle.config.ts │ ├── src │ ├── index.ts │ ├── database.ts │ ├── seed.ts │ ├── validators │ │ ├── dataSourceConfig.ts │ │ └── analysisPayload.ts │ └── schema.ts │ ├── tsconfig.json │ └── package.json ├── screenshot.png ├── .vscode ├── extensions.json └── settings.json ├── pnpm-workspace.yaml ├── .prettierrc ├── turbo.json ├── package.json ├── LICENSE ├── .gitignore ├── .github └── workflows │ └── deploy-services.yaml └── README.md /.nvmrc: -------------------------------------------------------------------------------- 1 | 22.14.0 -------------------------------------------------------------------------------- /.cursorignore: -------------------------------------------------------------------------------- 1 | node_modules 2 
| .venv -------------------------------------------------------------------------------- /services/meridian-ml-service/src/meridian_ml_service/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/frontend/src/public/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Allow: / 3 | Disallow: /api -------------------------------------------------------------------------------- /packages/database/migrations/0000_blushing_boomerang.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION vector; -------------------------------------------------------------------------------- /screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iliane5/meridian/HEAD/screenshot.png -------------------------------------------------------------------------------- /apps/frontend/src/server/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "../../.nuxt/tsconfig.server.json" 3 | } 4 | -------------------------------------------------------------------------------- /packages/database/.env.example: -------------------------------------------------------------------------------- 1 | DATABASE_URL="postgresql://postgres:mysecretpassword@localhost:5432/postgres" -------------------------------------------------------------------------------- /packages/database/migrations/0003_regular_swordsman.sql: -------------------------------------------------------------------------------- 1 | ALTER TABLE "ingested_items" ADD COLUMN "embedding_text" text; -------------------------------------------------------------------------------- /apps/frontend/src/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iliane5/meridian/HEAD/apps/frontend/src/public/favicon.ico -------------------------------------------------------------------------------- /apps/frontend/src/public/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iliane5/meridian/HEAD/apps/frontend/src/public/favicon-16x16.png -------------------------------------------------------------------------------- /apps/frontend/src/public/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iliane5/meridian/HEAD/apps/frontend/src/public/favicon-32x32.png -------------------------------------------------------------------------------- /apps/frontend/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | // https://nuxt.com/docs/guide/concepts/typescript 3 | "extends": "./.nuxt/tsconfig.json" 4 | } 5 | -------------------------------------------------------------------------------- /packages/database/README.MD: -------------------------------------------------------------------------------- 1 | docker run --name my-postgres -e POSTGRES_PASSWORD=mysecretpassword -p 5432:5432 pgvector/pgvector:pg16 2 | -------------------------------------------------------------------------------- /apps/frontend/src/public/apple-touch-icon.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/iliane5/meridian/HEAD/apps/frontend/src/public/apple-touch-icon.png -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": ["astro-build.astro-vscode", "unifiedjs.vscode-mdx"], 3 | "unwantedRecommendations": [], 4 | } 5 | -------------------------------------------------------------------------------- /apps/frontend/src/public/android-chrome-192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iliane5/meridian/HEAD/apps/frontend/src/public/android-chrome-192x192.png -------------------------------------------------------------------------------- /apps/frontend/src/public/android-chrome-512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iliane5/meridian/HEAD/apps/frontend/src/public/android-chrome-512x512.png -------------------------------------------------------------------------------- /packages/database/Makefile: -------------------------------------------------------------------------------- 1 | db: 2 | @docker run -d --name my-postgres -e POSTGRES_PASSWORD=mysecretpassword -p 5432:5432 pgvector/pgvector:pg16 && pnpm migrate && pnpm studio -------------------------------------------------------------------------------- /apps/backend/tsconfig.build.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./tsconfig.json", 3 | "compilerOptions": { 4 | "outDir": "./dist", 5 | "declaration": true, 6 | "declarationMap": true 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /pnpm-workspace.yaml: -------------------------------------------------------------------------------- 1 | packages: 2 | - apps/* 3 | - packages/* 4 | 5 | ignoredBuiltDependencies: 6 | - unrs-resolver 7 | - workerd 8 | 9 | onlyBuiltDependencies: 10 | - '@biomejs/biome' 11 | -------------------------------------------------------------------------------- /apps/frontend/src/app.vue: -------------------------------------------------------------------------------- 1 | 9 | -------------------------------------------------------------------------------- /apps/backend/test/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "../tsconfig.json", 3 | "compilerOptions": { 4 | "types": ["@cloudflare/vitest-pool-workers"] 5 | }, 6 | "include": ["./**/*.ts", "../worker-configuration.d.ts"], 7 | "exclude": [] 8 | } 9 | -------------------------------------------------------------------------------- /apps/frontend/eslint.config.mjs: -------------------------------------------------------------------------------- 1 | // @ts-check 2 | import withNuxt from './.nuxt/eslint.config.mjs'; 3 | 4 | export default withNuxt( 5 | // Your custom configs here, 6 | { 7 | rules: { 8 | 'vue/html-self-closing': 'off', 9 | }, 10 | } 11 | ); 12 | -------------------------------------------------------------------------------- /services/meridian-ml-service/.env.example: -------------------------------------------------------------------------------- 1 | # Model to use for embeddings (example) 2 | EMBEDDING_MODEL_NAME=intfloat/multilingual-e5-small 3 | 4 | # Port for local development (if not using Cloud Run's $PORT) 5 | # PORT=8080 6 | 7 | # API Token for 
auth 8 | # API_TOKEN=hunter2 -------------------------------------------------------------------------------- /apps/frontend/.env.example: -------------------------------------------------------------------------------- 1 | NUXT_DATABASE_URL="postgresql://postgres:mysecretpassword@localhost:5432/postgres" 2 | NUXT_SESSION_PASSWORD=password-with-at-least-32-characters 3 | NUXT_PUBLIC_WORKER_API="http://localhost:8787" 4 | NUXT_ADMIN_USERNAME="admin" 5 | NUXT_ADMIN_PASSWORD="hunter2" 6 | NUXT_WORKER_API_TOKEN="hunter2" -------------------------------------------------------------------------------- /apps/frontend/.gitignore: -------------------------------------------------------------------------------- 1 | # Nuxt dev/build outputs 2 | .output 3 | .data 4 | .nuxt 5 | .nitro 6 | .cache 7 | dist 8 | 9 | # Node dependencies 10 | node_modules 11 | 12 | # Logs 13 | logs 14 | *.log 15 | 16 | # Misc 17 | .DS_Store 18 | .fleet 19 | .idea 20 | 21 | # Local env files 22 | .env 23 | .env.* 24 | !.env.example 25 | -------------------------------------------------------------------------------- /packages/database/drizzle.config.ts: -------------------------------------------------------------------------------- 1 | import 'dotenv/config'; 2 | import { defineConfig } from 'drizzle-kit'; 3 | 4 | export default defineConfig({ 5 | out: './migrations', 6 | schema: './src/schema.ts', 7 | dialect: 'postgresql', 8 | dbCredentials: { 9 | url: process.env.DATABASE_URL || '', 10 | }, 11 | }); 12 | -------------------------------------------------------------------------------- /apps/frontend/tailwind.config.ts: -------------------------------------------------------------------------------- 1 | import typography from '@tailwindcss/typography'; 2 | import type { Config } from 'tailwindcss'; 3 | 4 | export default { 5 | content: ['./src/**/*.{js,ts,jsx,tsx,vue}'], 6 | darkMode: 'class', 7 | theme: { 8 | extend: {}, 9 | }, 10 | plugins: [typography], 11 | } satisfies Config; 12 | -------------------------------------------------------------------------------- /packages/database/migrations/0002_calm_sebastian_shaw.sql: -------------------------------------------------------------------------------- 1 | ALTER TYPE "public"."ingested_item_status" ADD VALUE 'FAILED_RENDER' BEFORE 'FAILED_FETCH';--> statement-breakpoint 2 | ALTER TYPE "public"."ingested_item_status" ADD VALUE 'FAILED_EMBEDDING' BEFORE 'SKIPPED_PDF';--> statement-breakpoint 3 | ALTER TYPE "public"."ingested_item_status" ADD VALUE 'FAILED_R2_UPLOAD' BEFORE 'SKIPPED_PDF'; -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "semi": true, 3 | "singleQuote": true, 4 | "tabWidth": 2, 5 | "printWidth": 120, 6 | "trailingComma": "es5", 7 | "bracketSpacing": true, 8 | "arrowParens": "avoid", 9 | "endOfLine": "lf", 10 | "overrides": [ 11 | { 12 | "files": "*.astro", 13 | "options": { 14 | "parser": "astro" 15 | } 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /packages/database/src/index.ts: -------------------------------------------------------------------------------- 1 | export * from './schema'; 2 | export { and, inArray, desc, eq, gte, isNull, sql, lte, isNotNull, not, cosineDistance, gt } from 'drizzle-orm'; 3 | export * from './database'; 4 | export { RssSourceConfigV1, DataSourceConfigWrapper } from './validators/dataSourceConfig'; 5 | export { AnalysisPayloadBaseV1, 
AnalysisPayloadWrapper } from './validators/analysisPayload'; 6 | -------------------------------------------------------------------------------- /apps/frontend/src/shared/types.ts: -------------------------------------------------------------------------------- 1 | export interface Brief { 2 | slug: string; 3 | date: { 4 | month: string; 5 | day: number; 6 | year: number; 7 | }; 8 | id: number; 9 | title: string; 10 | content: string; 11 | createdAt: Date; 12 | totalArticles: number; 13 | totalSources: number; 14 | usedArticles: number; 15 | usedSources: number; 16 | model_author: string | null; 17 | } 18 | -------------------------------------------------------------------------------- /apps/frontend/src/public/site.webmanifest: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Meridian", 3 | "short_name": "Meridian", 4 | "icons": [ 5 | { "src": "/android-chrome-192x192.png", "sizes": "192x192", "type": "image/png" }, 6 | { "src": "/android-chrome-512x512.png", "sizes": "512x512", "type": "image/png" } 7 | ], 8 | "theme_color": "#ffffff", 9 | "background_color": "#ffffff", 10 | "display": "standalone" 11 | } 12 | -------------------------------------------------------------------------------- /packages/database/src/database.ts: -------------------------------------------------------------------------------- 1 | import { drizzle } from 'drizzle-orm/postgres-js'; 2 | import postgres from 'postgres'; 3 | 4 | import * as schema from './schema'; 5 | 6 | export const client = (url: string, options?: postgres.Options<{}> | undefined) => postgres(url, options); 7 | 8 | export const getDb = (url: string, options?: postgres.Options<{}> | undefined) => 9 | drizzle(client(url, options), { schema }); 10 | -------------------------------------------------------------------------------- /packages/database/migrations/meta/0000_snapshot.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "558fb55b-0e9b-4a90-b83b-7add24c77c0b", 3 | "prevId": "00000000-0000-0000-0000-000000000000", 4 | "version": "7", 5 | "dialect": "postgresql", 6 | "tables": {}, 7 | "enums": {}, 8 | "schemas": {}, 9 | "views": {}, 10 | "sequences": {}, 11 | "roles": {}, 12 | "policies": {}, 13 | "_meta": { 14 | "columns": {}, 15 | "schemas": {}, 16 | "tables": {} 17 | } 18 | } -------------------------------------------------------------------------------- /services/meridian-ml-service/src/meridian_ml_service/schemas.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | 3 | 4 | class EmbeddingRequest(BaseModel): 5 | texts: list[str] = Field(..., min_length=1, description="List of texts to embed") 6 | 7 | 8 | class EmbeddingResponse(BaseModel): 9 | embeddings: list[list[float]] = Field( 10 | ..., description="List of computed embeddings" 11 | ) 12 | model_name: str = Field(..., description="Name of the model used") 13 | -------------------------------------------------------------------------------- /turbo.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://turbo.build/schema.json", 3 | "ui": "stream", 4 | "tasks": { 5 | "build": { 6 | "dependsOn": ["^build"], 7 | "inputs": ["$TURBO_DEFAULT$", ".env*"], 8 | "outputs": ["dist/**"] 9 | }, 10 | "lint": { 11 | "dependsOn": ["^lint"] 12 | }, 13 | "typecheck": { 14 | "dependsOn": ["^typecheck"] 15 | }, 16 | "dev": { 17 | "cache": false, 18 | "persistent": 
true 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /apps/frontend/src/composables/useSEO.ts: -------------------------------------------------------------------------------- 1 | export function useSEO(opts: { title: string; description: string; ogImage: string; ogUrl: string }) { 2 | return useSeoMeta({ 3 | title: opts.title, 4 | description: opts.description, 5 | ogTitle: opts.title, 6 | ogDescription: opts.description, 7 | twitterTitle: opts.title, 8 | twitterDescription: opts.description, 9 | ogImage: opts.ogImage, 10 | twitterImage: opts.ogImage, 11 | twitterCard: 'summary_large_image', 12 | ogLocale: 'en_US', 13 | ogUrl: opts.ogUrl, 14 | }); 15 | } 16 | -------------------------------------------------------------------------------- /services/meridian-ml-service/.dockerignore: -------------------------------------------------------------------------------- 1 | Dockerfile 2 | .dockerignore 3 | .git 4 | .gitignore 5 | __pycache__/ 6 | *.pyc 7 | *.pyo 8 | *.pyd 9 | .env 10 | .venv/ 11 | venv/ 12 | env/ 13 | *.env.* 14 | !.env.example 15 | # Additional exclusions for ML projects 16 | .pytest_cache/ 17 | .mypy_cache/ 18 | .ruff_cache/ 19 | **/.cache/ 20 | **/__pycache__/ 21 | **/*.egg-info/ 22 | dist/ 23 | build/ 24 | .coverage 25 | htmlcov/ 26 | .ipynb_checkpoints/ 27 | **/*.ipynb 28 | .DS_Store 29 | # Documentation 30 | docs/ 31 | README.md 32 | # Tests 33 | tests/ 34 | # Dev tooling 35 | .github/ -------------------------------------------------------------------------------- /packages/database/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://json.schemastore.org/tsconfig", 3 | "display": "Node 20", 4 | "_version": "20.1.0", 5 | 6 | "compilerOptions": { 7 | "lib": ["es2023"], 8 | "module": "nodenext", 9 | "target": "es2022", 10 | 11 | "strict": true, 12 | "esModuleInterop": true, 13 | "skipLibCheck": true, 14 | "moduleResolution": "node16", 15 | 16 | "baseUrl": ".", 17 | "paths": { 18 | "@/*": ["./src/*"] 19 | } 20 | }, 21 | "include": ["**/*.ts"], 22 | "exclude": ["dist", "build", "node_modules"] 23 | } 24 | -------------------------------------------------------------------------------- /apps/frontend/src/server/api/briefs/latest.get.ts: -------------------------------------------------------------------------------- 1 | import { $reports, desc } from '@meridian/database'; 2 | import { ensureDate, generateReportSlug, getDB } from '~/server/lib/utils'; 3 | 4 | export default defineEventHandler(async event => { 5 | const latestReport = await getDB(event).query.$reports.findFirst({ 6 | orderBy: desc($reports.createdAt), 7 | columns: { id: true, createdAt: true, title: true }, 8 | }); 9 | if (latestReport === undefined) { 10 | throw createError({ statusCode: 404, statusMessage: 'No reports found' }); 11 | } 12 | 13 | return generateReportSlug(ensureDate(latestReport.createdAt)); 14 | }); 15 | -------------------------------------------------------------------------------- /apps/backend/biome.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://biomejs.dev/schemas/1.9.4/schema.json", 3 | "vcs": { 4 | "enabled": false, 5 | "clientKind": "git", 6 | "useIgnoreFile": false 7 | }, 8 | "files": { 9 | "ignoreUnknown": false, 10 | "ignore": ["worker-configuration.d.ts"] 11 | }, 12 | "formatter": { 13 | "enabled": false 14 | }, 15 | "organizeImports": { 16 | "enabled": true 17 | }, 18 | "linter": { 
19 | "enabled": true, 20 | "rules": { 21 | "recommended": true 22 | } 23 | }, 24 | "javascript": { 25 | "formatter": { 26 | "quoteStyle": "double" 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /apps/backend/src/lib/tryCatchAsync.ts: -------------------------------------------------------------------------------- 1 | import { type Result, err, ok } from 'neverthrow'; 2 | 3 | /** 4 | * Wraps an existing Promise, converting resolution to Ok and rejection/throw to Err. 5 | * The error type is 'unknown' because anything can be thrown. 6 | * 7 | * @param promise The promise to wrap. 8 | * @returns A Promise resolving to a Result. 9 | */ 10 | export async function tryCatchAsync(promise: Promise): Promise> { 11 | try { 12 | const value = await promise; 13 | return ok(value); 14 | } catch (error) { 15 | // Catches synchronous throws during promise creation *and* promise rejections. 16 | return err(error); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /apps/frontend/src/server/api/briefs/index.get.ts: -------------------------------------------------------------------------------- 1 | import { $reports, desc } from '@meridian/database'; 2 | import { ensureDate, formatReportDate, generateReportSlug, getDB } from '~/server/lib/utils'; 3 | 4 | export default defineEventHandler(async event => { 5 | const reports = await getDB(event).query.$reports.findMany({ 6 | orderBy: desc($reports.createdAt), 7 | columns: { id: true, createdAt: true, title: true }, 8 | }); 9 | 10 | // Process reports to add date and slug 11 | return reports.map(report => { 12 | const createdAt = ensureDate(report.createdAt); 13 | return { 14 | ...report, 15 | date: formatReportDate(createdAt), 16 | slug: generateReportSlug(createdAt), 17 | }; 18 | }); 19 | }); 20 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "meridian", 3 | "private": true, 4 | "license": "MIT", 5 | "author": { 6 | "name": "Iliane Amadou", 7 | "email": "mail@iliane.xyz", 8 | "url": "https://iliane.xyz" 9 | }, 10 | "scripts": { 11 | "build": "turbo run build", 12 | "dev": "turbo run dev", 13 | "lint": "turbo run lint", 14 | "format": "prettier --write \"**/*.{ts,tsx,md,vue}\"", 15 | "typecheck": "turbo run typecheck" 16 | }, 17 | "devDependencies": { 18 | "prettier": "^3.5.3", 19 | "turbo": "^2.4.4", 20 | "typescript": "5.8.2", 21 | "@biomejs/biome": "^1.9.4", 22 | "eslint": "^9.27.0" 23 | }, 24 | "packageManager": "pnpm@10.9.0", 25 | "engines": { 26 | "node": ">=22" 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /packages/database/src/seed.ts: -------------------------------------------------------------------------------- 1 | import 'dotenv/config'; 2 | 3 | import { $sources } from './schema'; 4 | import { getDb } from './database'; 5 | 6 | async function main() { 7 | await getDb(process.env.DATABASE_URL!) 
-------------------------------------------------------------------------------- /apps/frontend/src/server/api/briefs/index.get.ts: -------------------------------------------------------------------------------- 1 | import { $reports, desc } from '@meridian/database'; 2 | import { ensureDate, formatReportDate, generateReportSlug, getDB } from '~/server/lib/utils'; 3 | 4 | export default defineEventHandler(async event => { 5 | const reports = await getDB(event).query.$reports.findMany({ 6 | orderBy: desc($reports.createdAt), 7 | columns: { id: true, createdAt: true, title: true }, 8 | }); 9 | 10 | // Process reports to add date and slug 11 | return reports.map(report => { 12 | const createdAt = ensureDate(report.createdAt); 13 | return { 14 | ...report, 15 | date: formatReportDate(createdAt), 16 | slug: generateReportSlug(createdAt), 17 | }; 18 | }); 19 | }); 20 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "meridian", 3 | "private": true, 4 | "license": "MIT", 5 | "author": { 6 | "name": "Iliane Amadou", 7 | "email": "mail@iliane.xyz", 8 | "url": "https://iliane.xyz" 9 | }, 10 | "scripts": { 11 | "build": "turbo run build", 12 | "dev": "turbo run dev", 13 | "lint": "turbo run lint", 14 | "format": "prettier --write \"**/*.{ts,tsx,md,vue}\"", 15 | "typecheck": "turbo run typecheck" 16 | }, 17 | "devDependencies": { 18 | "prettier": "^3.5.3", 19 | "turbo": "^2.4.4", 20 | "typescript": "5.8.2", 21 | "@biomejs/biome": "^1.9.4", 22 | "eslint": "^9.27.0" 23 | }, 24 | "packageManager": "pnpm@10.9.0", 25 | "engines": { 26 | "node": ">=22" 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /packages/database/src/seed.ts: -------------------------------------------------------------------------------- 1 | import 'dotenv/config'; 2 | 3 | import { $sources } from './schema'; 4 | import { getDb } from './database'; 5 | 6 | async function main() { 7 | await getDb(process.env.DATABASE_URL!) 8 | .insert($sources) 9 | .values({ 10 | id: 1, 11 | name: 'Hacker news', 12 | url: 'https://news.ycombinator.com/rss', 13 | scrape_frequency: 1, 14 | category: 'news', 15 | paywall: false, 16 | lastChecked: new Date(), 17 | }) 18 | .onConflictDoNothing(); 19 | } 20 | 21 | main() 22 | .then(() => { 23 | console.log('✅ Seeded database'); 24 | process.exit(0); 25 | }) 26 | .catch(err => { 27 | console.error('Error seeding database', err); 28 | process.exit(1); 29 | }); 30 | -------------------------------------------------------------------------------- /apps/frontend/src/pages/briefs/latest.vue: -------------------------------------------------------------------------------- 1 | 19 | 20 | 25 | -------------------------------------------------------------------------------- /packages/database/src/validators/dataSourceConfig.ts: -------------------------------------------------------------------------------- 1 | import { z } from 'zod'; 2 | 3 | // RSS Source Configuration Schema v1.0 4 | export const RssSourceConfigV1 = z.object({ 5 | url: z.string().url(), 6 | rss_paywall: z.boolean().optional().default(false), 7 | config_schema_version: z.literal('1.0'), 8 | }); 9 | 10 | // Base Data Source Configuration Wrapper 11 | // Discriminated union that can wrap different source configs 12 | export const DataSourceConfigWrapper = z.discriminatedUnion('source_type', [ 13 | z.object({ 14 | source_type: z.literal('RSS'), 15 | config: RssSourceConfigV1, 16 | }), 17 | ]); 18 | 19 | // Type exports for TypeScript usage 20 | export type RssSourceConfigV1Type = z.infer<typeof RssSourceConfigV1>; 21 | export type DataSourceConfigWrapperType = z.infer<typeof DataSourceConfigWrapper>; 22 |
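A short sketch of validating a stored source config with this wrapper (the sample object is illustrative):

```ts
import { DataSourceConfigWrapper } from '@meridian/database';

const raw = {
  source_type: 'RSS',
  config: { url: 'https://news.ycombinator.com/rss', config_schema_version: '1.0' },
};

// safeParse never throws; the discriminated union picks the branch by source_type.
const parsed = DataSourceConfigWrapper.safeParse(raw);
if (parsed.success) {
  // rss_paywall was optional with a default, so parsing fills it in as false.
  console.log(parsed.data.config.rss_paywall);
} else {
  console.error(parsed.error.flatten());
}
```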
-------------------------------------------------------------------------------- /packages/database/migrations/meta/_journal.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "7", 3 | "dialect": "postgresql", 4 | "entries": [ 5 | { 6 | "idx": 0, 7 | "version": "7", 8 | "when": 1745430466343, 9 | "tag": "0000_blushing_boomerang", 10 | "breakpoints": true 11 | }, 12 | { 13 | "idx": 1, 14 | "version": "7", 15 | "when": 1748143359245, 16 | "tag": "0001_premium_wolfpack", 17 | "breakpoints": true 18 | }, 19 | { 20 | "idx": 2, 21 | "version": "7", 22 | "when": 1748146595683, 23 | "tag": "0002_calm_sebastian_shaw", 24 | "breakpoints": true 25 | }, 26 | { 27 | "idx": 3, 28 | "version": "7", 29 | "when": 1748576379169, 30 | "tag": "0003_regular_swordsman", 31 | "breakpoints": true 32 | } 33 | ] 34 | } -------------------------------------------------------------------------------- /apps/frontend/src/server/api/reports.get.ts: -------------------------------------------------------------------------------- 1 | import { ensureDate, formatReportDate, generateReportSlug, getDB } from '~/server/lib/utils'; 2 | 3 | export default defineEventHandler(async event => { 4 | const reports = await getDB(event).query.$reports.findMany(); 5 | 6 | // Process reports to add date and slug 7 | const processedReports = reports 8 | .map(report => { 9 | const createdAt = ensureDate(report.createdAt); 10 | return { 11 | ...report, 12 | date: formatReportDate(createdAt), 13 | slug: generateReportSlug(createdAt), 14 | }; 15 | }) 16 | .sort((a, b) => { 17 | const dateA = a.createdAt ? new Date(a.createdAt).getTime() : 0; 18 | const dateB = b.createdAt ? new Date(b.createdAt).getTime() : 0; 19 | return dateB - dateA; 20 | }); 21 | 22 | return processedReports; 23 | }); 24 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.associations": { 3 | "wrangler.json": "jsonc" 4 | }, 5 | "typescript.tsdk": "node_modules/typescript/lib", 6 | 7 | "editor.defaultFormatter": "esbenp.prettier-vscode", 8 | 9 | "eslint.useFlatConfig": true, 10 | "eslint.validate": ["vue", "typescript", "javascript"], 11 | "eslint.workingDirectories": ["./apps/frontend"], 12 | 13 | // hide git ignored files 14 | "files.exclude": { 15 | "**/*.turbo": true, 16 | "**/.turbo": true, 17 | "**/.venv": true, 18 | "**/node_modules": true, 19 | "**/.nuxt": true, 20 | "**/.output": true, 21 | "**/dist": true, 22 | "**/.wrangler": true, 23 | "**/.mypy_cache": true, 24 | "**/.ruff_cache": true, 25 | "**/*.egg-info": true, 26 | "**/__pycache__": true 27 | }, 28 | "[python]": { 29 | "editor.defaultFormatter": "charliermarsh.ruff" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /packages/database/src/validators/analysisPayload.ts: -------------------------------------------------------------------------------- 1 | import { z } from 'zod'; 2 | import type { DataSourceConfigWrapper, RssSourceConfigV1 } from './dataSourceConfig'; 3 | 4 | // Analysis Payload Base Schema v1.0 5 | export const AnalysisPayloadBaseV1 = z.object({ 6 | schema_version: z.literal('1.0'), 7 | analysis_type: z.string(), 8 | data: z.record(z.unknown()), 9 | }); 10 | 11 | // Base Analysis Payload Wrapper 12 | // Discriminated union that can wrap different analysis payloads 13 | export const AnalysisPayloadWrapper = z.discriminatedUnion('analysis_type', [ 14 | z.object({ 15 | analysis_type: z.literal('RSS'), 16 | data: AnalysisPayloadBaseV1, 17 | }), 18 | ]); 19 | 20 | // Type exports for TypeScript usage 21 | export type AnalysisPayloadBaseV1Type = z.infer<typeof AnalysisPayloadBaseV1>; 22 | export type AnalysisPayloadWrapperType = z.infer<typeof AnalysisPayloadWrapper>; 23 | -------------------------------------------------------------------------------- /apps/backend/src/app.ts: -------------------------------------------------------------------------------- 1 | import { Hono } from 'hono'; 2 | import { trimTrailingSlash } from 'hono/trailing-slash'; 3 | import type { Env } from './index'; 4 | import durableObjectsRouter from './routers/durableObjects.router'; 5 | import eventsRouter from './routers/events.router'; 6 | import openGraph from './routers/openGraph.router'; 7 | import reportsRouter from './routers/reports.router'; 8 | import sourcesRouter from './routers/sources.router'; 9 | 10 | export type HonoEnv = { Bindings: Env }; 11 | 12 | const app = new Hono<HonoEnv>() 13 | .use(trimTrailingSlash()) 14 | .get('/favicon.ico', async c => c.notFound()) // disable favicon 15 | .route('/reports', reportsRouter) 16 | .route('/sources', sourcesRouter) 17 | .route('/openGraph', openGraph) 18 | .route('/events', eventsRouter) 19 | .route('/do', durableObjectsRouter) 20 | .get('/ping', async c => c.json({ pong: true })); 21 | 22 | export default app; 23 | -------------------------------------------------------------------------------- /apps/frontend/src/layouts/admin.vue: -------------------------------------------------------------------------------- 1 | 16 | 17 | 33 | -------------------------------------------------------------------------------- /packages/database/package.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "name": "@meridian/database", 3 | "version": "0.0.0", 4 | "private": true, 5 | "license": "MIT", 6 | "author": { 7 | "name": "Iliane Amadou", 8 | "email": "mail@iliane.xyz", 9 | "url": "https://iliane.xyz" 10 | }, 11 | "publishConfig": { 12 | "access": "public" 13 | }, 14 | "exports": { 15 | ".": "./src/index.ts" 16 | }, 17 | "scripts": { 18 | "migrate": "drizzle-kit migrate", 19 | "generate": "drizzle-kit generate", 20 | "studio": "drizzle-kit studio", 21 | "typecheck": "tsc --noEmit", 22 | "seed": "tsx src/seed.ts" 23 | }, 24 | "dependencies": { 25 | "drizzle-orm": "^0.42.0", 26 | "postgres": "^3.4.5", 27 | "tsx": "^4.19.3", 28 | "zod": "^3.22.4" 29 | }, 30 | "devDependencies": { 31 | "@types/node": "^22.13.14", 32 | "dotenv": "^16.4.7", 33 | "drizzle-kit": "^0.31.0", 34 | "typescript": "^5.8.2" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /apps/frontend/src/pages/briefs/index.vue: -------------------------------------------------------------------------------- 1 | 15 | 16 | 26 | -------------------------------------------------------------------------------- /apps/frontend/src/server/api/admin/login.post.ts: -------------------------------------------------------------------------------- 1 | import { z } from 'zod'; 2 | 3 | const loginSchema = z.object({ username: z.string(), password: z.string() }); 4 | 5 | export default eventHandler(async event => { 6 | const config = useRuntimeConfig(event); 7 | 8 | const bodyResult = loginSchema.safeParse(await readBody(event)); 9 | if (bodyResult.success === false) { 10 | throw createError({ statusCode: 400, message: 'Invalid request body' }); 11 | } 12 | 13 | const { username, password } = bodyResult.data; 14 | if (username !== config.admin.username || password !== config.admin.password) { 15 | throw createError({ statusCode: 401, message: 'Wrong password' }); 16 | } 17 | 18 | try { 19 | await setUserSession(event, { user: { login: 'admin' }, loggedInAt: Date.now() }); 20 | } catch (error) { 21 | console.error('Failed to set user session', error); 22 | throw createError({ statusCode: 500, message: 'Failed to set user session' }); 23 | } 24 | 25 | return setResponseStatus(event, 201); 26 | }); 27 | -------------------------------------------------------------------------------- /services/meridian-ml-service/fly.toml: -------------------------------------------------------------------------------- 1 | # fly.toml app configuration file generated for meridian-ml-service on 2025-04-25T16:34:14+02:00 2 | # 3 | # See https://fly.io/docs/reference/configuration/ for information about how to use this file. 
4 | # 5 | 6 | app = 'meridian-ml-service' 7 | primary_region = 'cdg' 8 | 9 | [build] 10 | 11 | [http_service] 12 | internal_port = 8080 13 | force_https = true 14 | auto_stop_machines = 'stop' 15 | auto_start_machines = true 16 | min_machines_running = 0 17 | processes = ['app'] 18 | 19 | [[vm]] 20 | memory = '1gb' 21 | cpu_kind = 'shared' 22 | cpus = 1 23 | 24 | # Add a health check pointing to your root or ping endpoint 25 | [[services.http_checks]] 26 | interval = "10s" 27 | timeout = "2s" 28 | grace_period = "5s" # Give it time to start, especially with model download 29 | method = "GET" 30 | path = "/ping" # or "/" 31 | protocol = "http" 32 | port = 8080 33 | 34 | [[services.ports]] 35 | handlers = ["http"] 36 | port = 8080 37 | force_https = true # optional -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Iliane Amadou 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /apps/frontend/src/server/lib/utils.ts: -------------------------------------------------------------------------------- 1 | import { getDb } from '@meridian/database'; 2 | import type { H3Event } from 'h3'; 3 | 4 | export const MONTH_NAMES = [ 5 | 'January', 6 | 'February', 7 | 'March', 8 | 'April', 9 | 'May', 10 | 'June', 11 | 'July', 12 | 'August', 13 | 'September', 14 | 'October', 15 | 'November', 16 | 'December', 17 | ]; 18 | 19 | export interface FormattedDate { 20 | month: string; 21 | day: number; 22 | year: number; 23 | } 24 | 25 | export function formatReportDate(date: Date): FormattedDate { 26 | return { 27 | month: MONTH_NAMES[date.getUTCMonth()], 28 | day: date.getUTCDate(), 29 | year: date.getUTCFullYear(), 30 | }; 31 | } 32 | 33 | export function generateReportSlug(date: Date): string { 34 | const { month, day, year } = formatReportDate(date); 35 | return `${month.toLowerCase()}-${day}-${year}`; 36 | } 37 | 38 | export function ensureDate(dateInput: Date | string | null | undefined): Date { 39 | return dateInput ? 
new Date(dateInput) : new Date(); 40 | } 41 | 42 | export function getDB(event: H3Event) { 43 | return getDb(useRuntimeConfig(event).database.url); 44 | } 45 | -------------------------------------------------------------------------------- /apps/frontend/src/plugins/markdown.ts: -------------------------------------------------------------------------------- 1 | import MarkdownIt from 'markdown-it'; 2 | import mdColorDefault from 'markdown-it-color'; 3 | // @ts-expect-error - no types for this package 4 | import mdDeflistDefault from 'markdown-it-deflist'; 5 | 6 | import { defineNuxtPlugin } from '#app'; 7 | 8 | // Helper to get the actual function, handling CJS/ESM differences 9 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 10 | const unwrapDefault = (mod: any) => mod.default || mod; 11 | 12 | const markdownItColor = unwrapDefault(mdColorDefault); 13 | const markdownItDeflist = unwrapDefault(mdDeflistDefault); 14 | 15 | export default defineNuxtPlugin({ 16 | name: 'markdown-it', 17 | setup() { 18 | const md = new MarkdownIt({ 19 | linkify: true, 20 | breaks: true, 21 | typographer: true, 22 | html: true, // Be careful with this if markdown comes from users! 23 | }) 24 | .use(markdownItDeflist) 25 | .use(markdownItColor, { defaultClassName: 'text-primary' }); 26 | 27 | return { 28 | provide: { 29 | md: md, // Provide the configured instance 30 | }, 31 | }; 32 | }, 33 | }); 34 |
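A hedged sketch of consuming the injected renderer from a component (the component and markdown string are illustrative):

```ts
// In any component's <script setup>; the plugin above provides $md app-wide.
const { $md } = useNuxtApp();

const source = 'Term\n: definition rendered by markdown-it-deflist';
const html = computed(() => $md.render(source));
// A template would bind this with v-html; since the plugin enables raw HTML
// passthrough, only render trusted markdown this way.
```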
-------------------------------------------------------------------------------- /services/meridian-ml-service/README.md: -------------------------------------------------------------------------------- 1 | **3. Development Workflow & VS Code:** 2 | 3 | - **Setup:** 4 | 1. Install `uv`: Follow instructions at [https://github.com/astral-sh/uv](https://github.com/astral-sh/uv) 5 | 2. Create a virtual environment: `uv venv` (creates `.venv`) 6 | 3. Activate it: `source .venv/bin/activate` 7 | 4. Install dependencies: `uv pip install -e .[dev]` (Installs package in editable mode + dev deps) 8 | 5. Copy `.env.example` to `.env` if needed for local settings. 9 | - **Running Locally:** 10 | `uvicorn meridian_ml_service.main:app --reload --host 0.0.0.0 --port 8080` 11 | - **Linting/Formatting:** 12 | `uv run ruff check . --fix` 13 | `uv run ruff format .` 14 | - **Type Checking:** 15 | `uv run mypy src/` 16 | - **VS Code:** 17 | 1. Install the official **Python** extension (Microsoft). 18 | 2. Install the **Ruff** extension (Astral Software). Configure it to use `ruff format` on save if desired. 19 | 3. Install the **Mypy Type Checker** extension (Microsoft). 20 | 4. Ensure VS Code detects and uses the `.venv` virtual environment. Your editor should now show linting/formatting/type errors inline. 21 | -------------------------------------------------------------------------------- /apps/backend/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@meridian/backend", 3 | "version": "0.0.0", 4 | "private": true, 5 | "license": "MIT", 6 | "author": { 7 | "name": "Iliane Amadou", 8 | "email": "mail@iliane.xyz", 9 | "url": "https://iliane.xyz" 10 | }, 11 | "scripts": { 12 | "dev": "wrangler dev", 13 | "test": "vitest run", 14 | "cf-typegen": "wrangler types", 15 | "lint": "biome check .", 16 | "lint:fix": "biome check --write .", 17 | "typecheck": "tsc --noEmit" 18 | }, 19 | "devDependencies": { 20 | "@biomejs/biome": "1.9.4", 21 | "@cloudflare/vitest-pool-workers": "^0.8.19", 22 | "@types/node": "^22.14.1", 23 | "typescript": "^5.8.2", 24 | "vitest": "^3.1.2", 25 | "wrangler": "^4.14.0" 26 | }, 27 | "dependencies": { 28 | "@ai-sdk/google": "^1.2.13", 29 | "@cloudflare/puppeteer": "^1.0.2", 30 | "@hono/zod-validator": "^0.4.3", 31 | "@meridian/database": "workspace:*", 32 | "@mozilla/readability": "^0.6.0", 33 | "ai": "^4.3.9", 34 | "fast-xml-parser": "^5.2.1", 35 | "hono": "^4.7.7", 36 | "linkedom": "^0.18.9", 37 | "neverthrow": "^8.2.0", 38 | "workers-og": "^0.0.25", 39 | "zod": "^3.24.3" 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /services/meridian-ml-service/src/meridian_ml_service/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from functools import lru_cache 3 | from typing import Optional 4 | 5 | from dotenv import load_dotenv 6 | from pydantic import BaseModel, Field 7 | 8 | # Load environment variables from .env file 9 | load_dotenv() 10 | 11 | 12 | # Using a simple class for now, can switch to pydantic-settings later if needed 13 | class Settings(BaseModel): 14 | embedding_model_name: str = "intfloat/multilingual-e5-small" # Default 15 | api_token: Optional[str] = Field( 16 | default=None, description="Optional API token for authentication" 17 | ) 18 | 19 | 20 | @lru_cache # Cache the settings object 21 | def get_settings() -> Settings: 22 | """Loads settings, prioritizing environment variables.""" 23 | model_name_from_env = os.getenv("EMBEDDING_MODEL_NAME") 24 | api_token_from_env = os.getenv("API_TOKEN") 25 | return Settings( 26 | embedding_model_name=( 27 | model_name_from_env 28 | if model_name_from_env 29 | else "intfloat/multilingual-e5-small" 30 | ), 31 | api_token=api_token_from_env, 32 | ) 33 | 34 | 35 | settings = get_settings() # Load settings once on module import 36 | -------------------------------------------------------------------------------- /apps/frontend/src/server/api/admin/sources/[id]/init-dos.post.ts: -------------------------------------------------------------------------------- 1 | import { getDB } from '~/server/lib/utils'; 2 | import { $data_sources, eq } from '@meridian/database'; 3 | 4 | export default defineEventHandler(async event => { 5 | await requireUserSession(event); // require auth 6 | 7 | const sourceId = Number(getRouterParam(event, 'id')); 8 | if (Number.isNaN(sourceId)) { 9 | throw createError({ statusCode: 400, statusMessage: 'Invalid source ID' }); 10 | } 11 | 12 | const db = getDB(event); 13 | const config = useRuntimeConfig(); 14 | 15 | const source = await db.query.$data_sources.findFirst({ where: eq($data_sources.id, sourceId) }); 16 | if (source === undefined) { 17 | throw createError({ statusCode: 404, statusMessage: 'Source not found' }); 18 | } 19 
| 20 | try { 21 | await fetch(`${config.public.WORKER_API}/do/admin/source/${sourceId}/init`, { 22 | method: 'POST', 23 | headers: { 24 | Authorization: `Bearer ${config.worker.api_token}`, 25 | }, 26 | }); 27 | } catch (error) { 28 | console.error('Failed to initialize DO', error); 29 | throw createError({ statusCode: 500, statusMessage: 'Failed to initialize DO' }); 30 | } 31 | 32 | return { success: true }; 33 | }); 34 | -------------------------------------------------------------------------------- /apps/backend/src/lib/embeddings.ts: -------------------------------------------------------------------------------- 1 | import { err, ok } from 'neverthrow'; 2 | import { z } from 'zod'; 3 | import type { Env } from '../index'; 4 | import { tryCatchAsync } from './tryCatchAsync'; 5 | 6 | const embeddingsResponseSchema = z.object({ 7 | embeddings: z.array(z.array(z.number())), 8 | }); 9 | 10 | export async function createEmbeddings(env: Env, texts: string[]) { 11 | const response = await tryCatchAsync( 12 | fetch(`${env.MERIDIAN_ML_SERVICE_URL}/embeddings`, { 13 | method: 'POST', 14 | body: JSON.stringify({ texts }), 15 | headers: { 16 | Authorization: `Bearer ${env.MERIDIAN_ML_SERVICE_API_KEY}`, 17 | 'Content-Type': 'application/json', 18 | }, 19 | }) 20 | ); 21 | if (response.isErr()) { 22 | return err(response.error); 23 | } 24 | if (!response.value.ok) { 25 | return err(new Error(`Failed to fetch embeddings: ${response.value.statusText}`)); 26 | } 27 | 28 | const jsonResult = await tryCatchAsync(response.value.json()); 29 | if (jsonResult.isErr()) { 30 | return err(jsonResult.error); 31 | } 32 | 33 | const parsedResponse = embeddingsResponseSchema.safeParse(jsonResult.value); 34 | if (parsedResponse.success === false) { 35 | return err(new Error(`Invalid response ${JSON.stringify(parsedResponse.error)}`)); 36 | } 37 | 38 | return ok(parsedResponse.data.embeddings); 39 | } 40 | -------------------------------------------------------------------------------- /apps/frontend/src/server/api/admin/sources/[id]/index.delete.ts: -------------------------------------------------------------------------------- 1 | import { $data_sources, eq } from '@meridian/database'; 2 | import { getDB } from '~/server/lib/utils'; 3 | 4 | export default defineEventHandler(async event => { 5 | await requireUserSession(event); // require auth 6 | 7 | const sourceId = Number(getRouterParam(event, 'id')); 8 | if (Number.isNaN(sourceId)) { 9 | throw createError({ statusCode: 400, statusMessage: 'Invalid source ID' }); 10 | } 11 | 12 | const db = getDB(event); 13 | const source = await db.query.$data_sources.findFirst({ where: eq($data_sources.id, sourceId) }); 14 | if (source === undefined) { 15 | throw createError({ statusCode: 404, statusMessage: 'Source not found' }); 16 | } 17 | 18 | const config = useRuntimeConfig(); 19 | 20 | try { 21 | const response = await fetch(`${config.public.WORKER_API}/do/admin/source/${sourceId}`, { 22 | method: 'DELETE', 23 | headers: { 24 | Authorization: `Bearer ${config.worker.api_token}`, 25 | }, 26 | }); 27 | if (!response.ok) { 28 | throw new Error(`Failed to delete source: ${response.statusText}`); 29 | } 30 | } catch (error) { 31 | console.error('Failed to delete source:', error); 32 | throw createError({ statusCode: 500, statusMessage: 'Failed to delete source' }); 33 | } 34 | 35 | return { success: true }; 36 | }); 37 | -------------------------------------------------------------------------------- /apps/frontend/package.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "name": "@meridian/frontend", 3 | "private": true, 4 | "type": "module", 5 | "version": "0.0.0", 6 | "license": "MIT", 7 | "author": { 8 | "name": "Iliane Amadou", 9 | "email": "mail@iliane.xyz", 10 | "url": "https://iliane.xyz" 11 | }, 12 | "scripts": { 13 | "dev": "nuxt dev", 14 | "build": "nuxt build", 15 | "lint": "eslint .", 16 | "lint:fix": "eslint . --fix", 17 | "preview": "nuxt preview", 18 | "postinstall": "nuxt prepare", 19 | "typecheck": "nuxt typecheck" 20 | }, 21 | "devDependencies": { 22 | "@headlessui/vue": "^1.7.23", 23 | "@heroicons/vue": "^2.2.0", 24 | "@mailerlite/mailerlite-nodejs": "^1.4.0", 25 | "@meridian/database": "workspace:*", 26 | "@nuxt/eslint": "1.4.1", 27 | "@nuxtjs/color-mode": "3.5.2", 28 | "@radix-ui/colors": "^3.0.0", 29 | "@tailwindcss/typography": "^0.5.16", 30 | "@tailwindcss/vite": "^4.1.4", 31 | "@types/markdown-it": "^14.1.2", 32 | "@unhead/vue": "^2.0.8", 33 | "markdown-it": "^14.1.0", 34 | "markdown-it-color": "^2.1.1", 35 | "markdown-it-deflist": "^3.0.0", 36 | "nuxt": "^3.16.2", 37 | "nuxt-auth-utils": "0.5.20", 38 | "tailwindcss": "^4.1.4", 39 | "vue": "^3.5.13", 40 | "vue-router": "^4.5.0", 41 | "vue-tsc": "^2.2.10", 42 | "wrangler": "^4.13.0", 43 | "zod": "^3.24.3" 44 | }, 45 | "dependencies": { 46 | "date-fns": "^4.1.0" 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /apps/frontend/src/pages/admin/login.vue: -------------------------------------------------------------------------------- 1 | 31 | 32 | 42 | -------------------------------------------------------------------------------- /apps/backend/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | /* Visit https://aka.ms/tsconfig.json to read more about this file */ 4 | 5 | /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ 6 | "target": "es2022", 7 | /* Specify a set of bundled library declaration files that describe the target runtime environment. */ 8 | "lib": ["es2022"], 9 | /* Specify what JSX code is generated. */ 10 | "jsx": "react-jsx", 11 | 12 | /* Specify what module code is generated. */ 13 | "module": "es2022", 14 | /* Specify how TypeScript looks up a file from a given module specifier. */ 15 | "moduleResolution": "bundler", 16 | /* Enable importing .json files */ 17 | "resolveJsonModule": true, 18 | 19 | /* Allow JavaScript files to be a part of your program. Use the `checkJS` option to get errors from these files. */ 20 | "allowJs": true, 21 | /* Enable error reporting in type-checked JavaScript files. */ 22 | "checkJs": false, 23 | 24 | /* Disable emitting files from a compilation. */ 25 | "noEmit": true, 26 | 27 | /* Ensure that each file can be safely transpiled without relying on other imports. */ 28 | "isolatedModules": true, 29 | /* Allow 'import x from y' when a module doesn't have a default export. */ 30 | "allowSyntheticDefaultImports": true, 31 | /* Ensure that casing is correct in imports. */ 32 | "forceConsistentCasingInFileNames": true, 33 | 34 | /* Enable all strict type-checking options. */ 35 | "strict": true, 36 | 37 | /* Skip type checking all .d.ts files. 
*/ 38 | "skipLibCheck": true 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /apps/backend/src/lib/utils.ts: -------------------------------------------------------------------------------- 1 | import { getDb as getDbFromDatabase } from '@meridian/database'; 2 | import type { Context } from 'hono'; 3 | import type { HonoEnv } from '../app'; 4 | 5 | export function getDb(hyperdrive: Hyperdrive) { 6 | return getDbFromDatabase(hyperdrive.connectionString, { 7 | // Workers limit the number of concurrent external connections, so be sure to limit 8 | // the size of the local connection pool that postgres.js may establish. 9 | max: 5, 10 | // If you are not using array types in your Postgres schema, 11 | // disabling this will save you an extra round-trip every time you connect. 12 | fetch_types: false, 13 | }); 14 | } 15 | 16 | export function hasValidAuthToken(c: Context<HonoEnv>) { 17 | const auth = c.req.header('Authorization'); 18 | if (auth === undefined || auth !== `Bearer ${c.env.API_TOKEN}`) { 19 | return false; 20 | } 21 | return true; 22 | } 23 | 24 | export const userAgents = [ 25 | // ios (golden standard for publishers) 26 | 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1', // iphone safari (best overall) 27 | 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/123.0.6312.87 Mobile/15E148 Safari/604.1', // iphone chrome 28 | 29 | // android (good alternatives) 30 | 'Mozilla/5.0 (Linux; Android 14; SM-S908B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36', // samsung flagship 31 | 'Mozilla/5.0 (Linux; Android 14; Pixel 8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Mobile Safari/537.36', // pixel 32 | ]; 33 |
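A minimal sketch of wiring this guard into a route group (the import paths and route name are illustrative):

```ts
import { Hono } from 'hono';
import { hasValidAuthToken } from './lib/utils';
import type { HonoEnv } from './app';

const admin = new Hono<HonoEnv>();

// Reject any request whose Authorization header does not match the configured API_TOKEN.
admin.use('*', async (c, next) => {
  if (!hasValidAuthToken(c)) {
    return c.json({ error: 'unauthorized' }, 401);
  }
  await next();
});
```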
-------------------------------------------------------------------------------- /apps/frontend/src/server/api/admin/sources/index.post.ts: -------------------------------------------------------------------------------- 1 | import { getDB } from '~/server/lib/utils'; 2 | import { z } from 'zod'; 3 | import type { DataSourceConfigWrapper } from '@meridian/database'; 4 | import { $data_sources } from '@meridian/database'; 5 | 6 | const schema = z.object({ 7 | url: z.string().url(), 8 | }); 9 | 10 | export default defineEventHandler(async event => { 11 | await requireUserSession(event); // require auth 12 | 13 | const bodyResult = schema.safeParse(await readBody(event)); 14 | if (bodyResult.success === false) { 15 | throw createError({ statusCode: 400, statusMessage: 'Invalid request body' }); 16 | } 17 | 18 | try { 19 | await getDB(event) 20 | .insert($data_sources) 21 | .values({ 22 | name: 'Unknown', 23 | source_type: 'RSS', 24 | config: { 25 | source_type: 'RSS', 26 | config: { 27 | config_schema_version: '1.0', 28 | rss_paywall: false, 29 | url: bodyResult.data.url, 30 | }, 31 | } satisfies z.infer<typeof DataSourceConfigWrapper>, 32 | }); 33 | } catch (error) { 34 | console.error('Failed to add source', error); 35 | throw createError({ statusCode: 500, statusMessage: 'Failed to add source' }); 36 | } 37 | 38 | const config = useRuntimeConfig(); 39 | 40 | try { 41 | await fetch(`${config.public.WORKER_API}/do/admin/initialize-dos`, { 42 | method: 'POST', 43 | headers: { 44 | Authorization: `Bearer ${config.worker.api_token}`, 45 | }, 46 | }); 47 | } catch (error) { 48 | console.error('Failed to initialize DOs', error); 49 | throw createError({ statusCode: 500, statusMessage: 'Failed to initialize DOs' }); 50 | } 51 | 52 | return { 53 | success: true, 54 | }; 55 | }); 56 | -------------------------------------------------------------------------------- /services/meridian-ml-service/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "meridian_ml_service" 3 | version = "0.1.0" 4 | description = "Python service for ML tasks (embeddings, clustering) for Meridian." 5 | authors = [{ name = "Iliane Amadou", email = "mail@iliane.xyz" }] 6 | requires-python = ">=3.11" # Stable, well-supported, performant 7 | dependencies = [ 8 | "fastapi>=0.115.12", # Last version with Pydantic v2 support 9 | "uvicorn[standard]>=0.34.2", # Includes performance extras 10 | "pydantic>=2.11.3", 11 | "numpy>=2.2.0", 12 | "torch>=2.6.0", # CPU version will be installed via extra-index-url 13 | "transformers>=4.51.3", 14 | "sentence-transformers>=4.1.0", # Often simplifies embedding tasks 15 | "python-dotenv>=1.1.0", 16 | # Add later when needed: 17 | # "umap-learn>=0.5.5", 18 | # "hdbscan>=0.8.33", 19 | ] 20 | 21 | [project.optional-dependencies] 22 | dev = [ 23 | "ruff>=0.4.4", # Fast linter/formatter 24 | "mypy>=1.10.0", # Static type checker 25 | ] 26 | 27 | # Configuration for Ruff (Linter/Formatter) 28 | [tool.ruff] 29 | line-length = 88 30 | target-version = "py311" 31 | 32 | [tool.ruff.lint] 33 | # See https://docs.astral.sh/ruff/rules/ for rule codes 34 | select = ["E", "F", "W", "I", "N", "UP", "B", "A", "C4", "T20", "SIM", "PTH"] 35 | ignore = ["E501"] # Ignore line length rule (handled by formatter) 36 | 37 | [tool.ruff.format] 38 | quote-style = "double" 39 | 40 | # Configuration for Mypy (Type Checker) 41 | [tool.mypy] 42 | python_version = "3.11" 43 | warn_return_any = true 44 | warn_unused_configs = true 45 | ignore_missing_imports = true # Be pragmatic initially 46 | # Add stricter checks as needed 47 | 48 | # Build system config (standard for setuptools/uv) 49 | [build-system] 50 | requires = ["setuptools>=61.0"] 51 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /apps/frontend/src/server/api/subscribe.post.ts: -------------------------------------------------------------------------------- 1 | import MailerLite from '@mailerlite/mailerlite-nodejs'; 2 | import { $newsletter } from '@meridian/database'; 3 | import { z } from 'zod'; 4 | import { getDB } from '../lib/utils'; 5 | 6 | export default defineEventHandler(async event => { 7 | const config = useRuntimeConfig(event); 8 | 9 | // Parse the request body to get the email 10 | const body = await readBody(event); 11 | const bodyContent = z.object({ email: z.string().email() }).safeParse(body); 12 | if (bodyContent.success === false) { 13 | throw createError({ statusCode: 400, statusMessage: 'Invalid email format' }); 14 | } 15 | 16 | try { 17 | // Insert email into the newsletter table 18 | await Promise.all([ 19 | getDB(event).insert($newsletter).values({ email: bodyContent.data.email }).onConflictDoNothing(), 20 | (async () => { 21 | if (config.mailerlite.api_key === undefined || config.mailerlite.group_id === undefined) { 22 | console.warn('MailerLite is not configured'); 23 | return; // nothing if mailerlite is not configured 24 | } 25 | const mailerlite = new MailerLite({ api_key: config.mailerlite.api_key }); 26 | try { 27 | await mailerlite.subscribers.createOrUpdate({ 28 | email: bodyContent.data.email, 29 | groups: [config.mailerlite.group_id], 30 | }); 31 | } catch (error) { 32 |
console.error('MailerLite error:', error); 33 | throw createError({ statusCode: 500, statusMessage: 'MailerLite error' }); 34 | } 35 | })(), 36 | ]); 37 | 38 | return { success: true, message: 'Successfully subscribed' }; 39 | } catch (error) { 40 | console.error('Database error:', error); 41 | throw createError({ statusCode: 500, statusMessage: 'Database error' }); 42 | } 43 | }); 44 | -------------------------------------------------------------------------------- /apps/frontend/src/pages/index.vue: -------------------------------------------------------------------------------- 1 | 44 | 45 | 55 | -------------------------------------------------------------------------------- /apps/frontend/src/composables/useReadingProgess.ts: -------------------------------------------------------------------------------- 1 | function throttle<T extends (...args: unknown[]) => unknown>(func: T, wait: number) { 2 | let timeout: ReturnType<typeof setTimeout> | null = null; 3 | let lastArgs: Parameters<T> | null = null; 4 | 5 | const throttled = (...args: Parameters<T>) => { 6 | lastArgs = args; 7 | 8 | if (!timeout) { 9 | func(...args); 10 | timeout = setTimeout(() => { 11 | if (lastArgs) func(...lastArgs); 12 | timeout = null; 13 | lastArgs = null; 14 | }, wait); 15 | } 16 | }; 17 | 18 | throttled.cancel = () => { 19 | if (timeout) { 20 | clearTimeout(timeout); 21 | timeout = null; 22 | lastArgs = null; 23 | } 24 | }; 25 | 26 | return throttled; 27 | } 28 | 29 | export function useReadingProgress() { 30 | const readingProgress = ref(0); 31 | const showBackToTop = ref(false); 32 | let scrollListener: () => void; 33 | 34 | const calculateProgress = () => { 35 | const scrollTop = document.documentElement.scrollTop; 36 | const scrollHeight = document.documentElement.scrollHeight - document.documentElement.clientHeight; 37 | readingProgress.value = scrollHeight > 0 ? 
(scrollTop / scrollHeight) * 100 : 0; 38 | showBackToTop.value = scrollTop > 500; // Show back to top button after scrolling down 500px 39 | }; 40 | 41 | const throttledCalculateProgress = throttle(calculateProgress, 25); 42 | 43 | const scrollToTop = () => { 44 | window.scrollTo({ top: 0, behavior: 'smooth' }); 45 | }; 46 | 47 | onMounted(() => { 48 | scrollListener = throttledCalculateProgress; 49 | window.addEventListener('scroll', scrollListener); 50 | calculateProgress(); // Initial calculation 51 | }); 52 | 53 | onUnmounted(() => { 54 | window.removeEventListener('scroll', scrollListener); 55 | throttledCalculateProgress.cancel(); 56 | }); 57 | 58 | return { 59 | readingProgress, 60 | showBackToTop, 61 | scrollToTop, 62 | }; 63 | } 64 | -------------------------------------------------------------------------------- /services/meridian-ml-service/src/meridian_ml_service/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from fastapi import Depends, FastAPI, HTTPException 3 | 4 | from .config import settings 5 | from .dependencies import ( 6 | ModelDep, 7 | verify_token, 8 | ) # Import auth dependency 9 | 10 | from .embeddings import compute_embeddings 11 | from .schemas import EmbeddingRequest, EmbeddingResponse 12 | 13 | app = FastAPI( 14 | title="Meridian ML Service", 15 | description="Handles ML tasks like embeddings and clustering.", 16 | version="0.1.0", 17 | ) 18 | 19 | 20 | # Simple root endpoint for health check 21 | @app.get("/") 22 | async def read_root(): 23 | return {"status": "ok", "service": "Meridian ML Service"} 24 | 25 | 26 | @app.get("/ping") 27 | async def ping(): 28 | return {"pong": True} 29 | 30 | 31 | @app.post("/embeddings", response_model=EmbeddingResponse) 32 | async def api_compute_embeddings( 33 | request: EmbeddingRequest, 34 | model_components: ModelDep, # ModelDep already includes Depends 35 | _: None = Depends(verify_token), 36 | ): 37 | """ 38 | Computes embeddings for the provided list of texts. 
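A quick request/response sketch (shape follows the EmbeddingRequest/EmbeddingResponse schemas; values are illustrative): POST /embeddings with {"texts": ["first document", "second document"]} returns {"embeddings": [[0.01, ...], [0.02, ...]], "model_name": "<configured embedding model>"}. 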
39 | """ 40 | print(f"Received request to embed {len(request.texts)} texts.") 41 | try: 42 | embeddings_np: np.ndarray = compute_embeddings( 43 | texts=request.texts, 44 | model_components=model_components, 45 | ) 46 | 47 | embeddings_list: list[list[float]] = embeddings_np.tolist() 48 | 49 | return EmbeddingResponse( 50 | embeddings=embeddings_list, model_name=settings.embedding_model_name 51 | ) 52 | except Exception as e: 53 | print(f"ERROR during embedding computation: {e}") 54 | # Consider more specific error handling based on exception types 55 | raise HTTPException( 56 | status_code=500, 57 | detail=f"Internal server error during embedding computation: {str(e)}", 58 | ) from e 59 | -------------------------------------------------------------------------------- /apps/frontend/src/server/api/briefs/[slug]/index.get.ts: -------------------------------------------------------------------------------- 1 | import { $reports, and, gte, lte } from '@meridian/database'; 2 | import { ensureDate, formatReportDate, getDB } from '~/server/lib/utils'; 3 | 4 | interface Brief { 5 | id: number; 6 | createdAt: Date; 7 | title: string; 8 | content: string; 9 | model_author: string | null; 10 | totalArticles: number; 11 | totalSources: number; 12 | usedSources: number; 13 | usedArticles: number; 14 | slug: string; 15 | date: { 16 | month: string; 17 | day: number; 18 | year: number; 19 | }; 20 | } 21 | 22 | export default defineEventHandler(async event => { 23 | const slug = getRouterParam(event, 'slug'); 24 | if (slug === undefined) { 25 | throw createError({ statusCode: 400, statusMessage: 'Slug is required' }); 26 | } 27 | 28 | // decode slug & get date 29 | const date = new Date(slug); 30 | if (Number.isNaN(date.getTime())) { 31 | throw createError({ statusCode: 400, statusMessage: 'Invalid slug' }); 32 | } 33 | 34 | // set start/end of the day for date range query 35 | const startOfDay = new Date(date.getFullYear(), date.getMonth(), date.getDate()); 36 | const endOfDay = new Date(date.getFullYear(), date.getMonth(), date.getDate() + 1); 37 | 38 | // get report created on this day 39 | const report = await getDB(event).query.$reports.findFirst({ 40 | where: and(gte($reports.createdAt, startOfDay), lte($reports.createdAt, endOfDay)), 41 | columns: { 42 | id: true, 43 | createdAt: true, 44 | title: true, 45 | content: true, 46 | model_author: true, 47 | totalArticles: true, 48 | totalSources: true, 49 | usedSources: true, 50 | usedArticles: true, 51 | }, 52 | }); 53 | if (report === undefined) { 54 | throw createError({ statusCode: 404, statusMessage: 'Report not found' }); 55 | } 56 | 57 | return { 58 | ...report, 59 | slug, 60 | date: formatReportDate(ensureDate(report.createdAt)), 61 | } satisfies Brief; 62 | }); 63 | -------------------------------------------------------------------------------- /apps/briefs/src/events.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | from datetime import date 4 | from pydantic import BaseModel, field_validator 5 | from typing import Optional 6 | import pandas as pd 7 | from datetime import datetime 8 | from typing import Optional 9 | from pydantic import BaseModel, field_validator 10 | from dotenv import load_dotenv 11 | 12 | load_dotenv() 13 | 14 | 15 | class Source(BaseModel): 16 | id: int 17 | name: str 18 | 19 | 20 | class Event(BaseModel): 21 | id: int 22 | sourceId: int 23 | url: str 24 | title: str 25 | publishDate: datetime # changed from date to datetime 26 | contentFileKey: str 
27 | primary_location: str 28 | completeness: str 29 | content_quality: str 30 | event_summary_points: list[str] 31 | thematic_keywords: list[str] 32 | topic_tags: list[str] 33 | key_entities: list[str] 34 | content_focus: list[str] 35 | embedding: list[float] 36 | createdAt: datetime 37 | 38 | @field_validator("publishDate", mode="before") 39 | @classmethod 40 | def parse_date(cls, value): 41 | if value is None: 42 | return None 43 | 44 | # Handle ISO format with timezone info 45 | try: 46 | return datetime.fromisoformat(value) 47 | except ValueError: 48 | # For older Python versions or non-standard formats 49 | # you might need dateutil 50 | from dateutil import parser 51 | 52 | return parser.parse(value) 53 | 54 | 55 | def get_events(date: Optional[str] = None): 56 | url = "http://localhost:8787/events" 57 | 58 | if date: 59 | url += f"?date={date}" 60 | 61 | response = requests.get( 62 | url, 63 | headers={"Authorization": f"Bearer {os.environ.get('MERIDIAN_SECRET_KEY')}"}, 64 | ) 65 | data = response.json() 66 | 67 | sources = [Source(**source) for source in data["sources"]] 68 | events = [Event(**event) for event in data["events"]] 69 | 70 | return sources, events 71 | -------------------------------------------------------------------------------- /apps/backend/src/prompts/articleRepresentation.prompt.ts: -------------------------------------------------------------------------------- 1 | export function getArticleRepresentationPrompt(title: string, url: string, text: string) { 2 | return ` 3 | Transform article into standardized format. No repeated info across fields. 4 | 5 | Fields: 6 | Topic: technology/politics/business/health/agriculture/sports/international 7 | Subtopic: specific area (ai-research, elections, trade-policy) 8 | Geography: global/us/china/europe/[city]/[region] 9 | Scope: policy/technical/market/social-impact/breaking-news/analysis 10 | Urgency: breaking/developing/routine/historical 11 | Source: mainstream/trade/academic/government/blog 12 | Entities: [max 5 key people/orgs/products/places] 13 | Tags: [max 5 additional specifics not covered above] 14 | 15 | Examples: 16 | 17 | INPUT: """ 18 | Nvidia CEO Jensen Huang Warns Companies to Adopt AI Now 19 | 20 | Nvidia CEO delivered stark warning to business leaders yesterday, stating companies must integrate AI immediately or face obsolescence. Speaking to Fortune 500 executives, emphasized current AI revolution represents 'once-in-a-lifetime transformation'. Stock surged 180% this year as AI chip demand accelerates. 21 | """ 22 | 23 | OUTPUT: 24 | Topic: technology 25 | Subtopic: business-strategy 26 | Geography: us 27 | Scope: market 28 | Urgency: routine 29 | Source: mainstream 30 | Entities: [Jensen Huang, Nvidia, Fortune 500] 31 | Tags: [stock-surge, 180-percent, chip-demand] 32 | 33 | INPUT: """ 34 | Breaking: Emergency Wheat Export Ban by Inner Mongolia Agricultural Ministry 35 | 36 | Ministry announced immediate wheat export suspension today, citing food security concerns amid drought. Affects 2.3 million tons scheduled for neighboring provinces. Farmers concerned about revenue losses, traders predict price volatility. 37 | """ 38 | 39 | OUTPUT: 40 | Topic: agriculture 41 | Subtopic: trade-policy 42 | Geography: inner-mongolia 43 | Scope: breaking-news 44 | Urgency: breaking 45 | Source: mainstream 46 | Entities: [Inner Mongolia Agricultural Ministry] 47 | Tags: [export-ban, drought, 2.3-million-tons, price-volatility] 48 | 49 | INPUT: """ 50 | # [${title}](${url}) 51 | 52 | ${text.slice(0, 1500)}... 
53 | """ 54 | 55 | OUTPUT: 56 | `.trim(); 57 | } 58 | -------------------------------------------------------------------------------- /apps/frontend/nuxt.config.ts: -------------------------------------------------------------------------------- 1 | import tailwindcss from '@tailwindcss/vite'; 2 | 3 | // https://nuxt.com/docs/api/configuration/nuxt-config 4 | export default defineNuxtConfig({ 5 | app: { 6 | head: { 7 | htmlAttrs: { lang: 'en' }, 8 | link: [{ rel: 'icon', type: 'image/png', href: '/favicon.ico' }], 9 | }, 10 | }, 11 | 12 | colorMode: { classSuffix: '', preference: 'system', fallback: 'system' }, 13 | compatibilityDate: '2025-03-01', 14 | css: ['~/assets/css/main.css'], 15 | 16 | devtools: { enabled: true }, 17 | devServer: { host: '0.0.0.0' }, 18 | 19 | modules: ['@nuxtjs/color-mode', 'nuxt-auth-utils', '@nuxt/eslint'], 20 | 21 | nitro: { prerender: { autoSubfolderIndex: false }, cloudflare: { nodeCompat: true, deployConfig: true } }, 22 | 23 | routeRules: { 24 | // Cache the list of briefs for 1 hour on CDN, 15 mins in browser 25 | // Allow serving stale data for up to a day while revalidating 26 | '/api/briefs': { 27 | cache: { 28 | maxAge: 60 * 15, // 15 minutes browser cache 29 | staleMaxAge: 60 * 60 * 24, // 1 day stale-while-revalidate on CDN 30 | }, 31 | }, 32 | // Cache individual briefs for longer (assuming they don't change once published) 33 | // Cache for 1 day on CDN, 1 hour in browser 34 | '/api/briefs/**': { 35 | // Matches /api/briefs/some-slug, /api/briefs/another-slug etc. 36 | cache: { 37 | maxAge: 60 * 60, // 1 hour browser cache 38 | staleMaxAge: 60 * 60 * 24 * 7, // 1 week stale-while-revalidate on CDN 39 | }, 40 | }, 41 | }, 42 | 43 | // In production, these are set via the environment variables 44 | // NUXT_+{key} 45 | runtimeConfig: { 46 | database: { url: '' }, // NUXT_DATABASE_URL 47 | mailerlite: { api_key: '', group_id: '' }, // NUXT_MAILERLITE_API_KEY, NUXT_MAILERLITE_GROUP_ID 48 | admin: { username: 'admin', password: 'hunter2' }, // NUXT_ADMIN_USERNAME, NUXT_ADMIN_PASSWORD 49 | worker: { api_token: 'hunter2' }, // NUXT_WORKER_API_TOKEN 50 | 51 | // IMPORTANT: all "public" config is exposed to the client 52 | public: { WORKER_API: 'http://localhost:8787' }, // NUXT_PUBLIC_WORKER_API 53 | }, 54 | 55 | srcDir: 'src', 56 | 57 | vite: { plugins: [tailwindcss()] }, 58 | }); 59 | -------------------------------------------------------------------------------- /apps/backend/src/lib/logger.ts: -------------------------------------------------------------------------------- 1 | // Define the basic structure for your logs 2 | interface LogEntry { 3 | level: 'debug' | 'info' | 'warn' | 'error'; 4 | message: string; 5 | timestamp: string; 6 | context?: Record; 7 | error?: { 8 | message: string; 9 | stack?: string; 10 | cause?: unknown; 11 | }; 12 | } 13 | 14 | // Basic logger class 15 | export class Logger { 16 | private baseContext: Record; 17 | 18 | constructor(baseContext: Record = {}) { 19 | // Clone the context to prevent mutation issues if the source object changes 20 | this.baseContext = { ...baseContext }; 21 | } 22 | 23 | // Method to create a "child" logger with additional context 24 | child(additionalContext: Record): Logger { 25 | return new Logger({ ...this.baseContext, ...additionalContext }); 26 | } 27 | 28 | // Central logging function 29 | private log(level: LogEntry['level'], message: string, context?: Record, error?: Error) { 30 | const entry: LogEntry = { 31 | level, 32 | message, 33 | timestamp: new Date().toISOString(), 34 | // 
Merge base context, method-specific context 35 | context: { ...this.baseContext, ...context }, 36 | }; 37 | 38 | if (error) { 39 | entry.error = { 40 | message: error.message, 41 | stack: error.stack, 42 | // Include cause if available 43 | ...(error.cause ? { cause: error.cause } : {}), 44 | }; 45 | } 46 | 47 | // The core idea: output structured JSON via console.log 48 | // Logpush / Tail Workers will pick this up. 49 | console.log(JSON.stringify(entry)); 50 | } 51 | 52 | // Convenience methods for different levels 53 | debug(message: string, context?: Record<string, unknown>) { 54 | this.log('debug', message, context); 55 | } 56 | 57 | info(message: string, context?: Record<string, unknown>) { 58 | this.log('info', message, context); 59 | } 60 | 61 | warn(message: string, context?: Record<string, unknown>, error?: Error) { 62 | this.log('warn', message, context, error); 63 | } 64 | 65 | error(message: string, context?: Record<string, unknown>, error?: Error) { 66 | this.log('error', message, context, error); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /services/meridian-ml-service/src/meridian_ml_service/dependencies.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated, Union 2 | import asyncio 3 | 4 | 5 | from fastapi import Depends, HTTPException, Security 6 | from fastapi.security import APIKeyHeader 7 | from starlette.status import HTTP_403_FORBIDDEN 8 | 9 | from .config import settings 10 | from .embeddings import ModelComponents, load_embedding_model 11 | 12 | # Global lock for model loading 13 | _model_lock = asyncio.Lock() 14 | _model_instance: Union[ModelComponents, None] = None 15 | 16 | 17 | async def get_embedding_model() -> ModelComponents: 18 | """FastAPI dependency to get the loaded embedding model components in a thread-safe way.""" 19 | global _model_instance 20 | 21 | if _model_instance is not None: 22 | return _model_instance 23 | 24 | async with _model_lock: 25 | # double-check pattern to avoid race conditions 26 | if _model_instance is not None: 27 | return _model_instance 28 | 29 | try: 30 | _model_instance = load_embedding_model() 31 | return _model_instance 32 | except Exception as e: 33 | # Consider how to handle model loading failure more gracefully in API 34 | # Maybe return HTTP 503 Service Unavailable? 
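# One possible sketch of that option (HTTPException is already imported above): raise HTTPException(status_code=503, detail="Embedding model unavailable") from e 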
35 | print(f"FATAL: Could not provide embedding model: {e}") 36 | raise # Let FastAPI handle internal server error for now 37 | 38 | 39 | ModelDep = Annotated[ModelComponents, Depends(get_embedding_model)] 40 | 41 | api_key_header = APIKeyHeader(name="Authorization", auto_error=False) 42 | 43 | 44 | async def verify_token(api_key: Union[str, None] = Security(api_key_header)) -> None: 45 | if settings.api_token is None: 46 | return # auth is disabled if no token is configured 47 | 48 | if api_key is None: 49 | raise HTTPException( 50 | status_code=HTTP_403_FORBIDDEN, detail="Invalid or missing API token" 51 | ) 52 | 53 | # Extract token from Bearer format 54 | token = api_key 55 | if api_key.startswith("Bearer "): 56 | token = api_key[7:] # Remove "Bearer " prefix 57 | 58 | if token != settings.api_token: 59 | raise HTTPException( 60 | status_code=HTTP_403_FORBIDDEN, detail="Invalid or missing API token" 61 | ) 62 | -------------------------------------------------------------------------------- /apps/frontend/src/composables/useStickyElement.ts: -------------------------------------------------------------------------------- 1 | export function useStickyElement( 2 | targetRef: Ref, 3 | options: IntersectionObserverInit = { threshold: 0 } 4 | ) { 5 | const isSticky = ref(false); 6 | let observer: IntersectionObserver | null = null; 7 | 8 | const setupObserver = () => { 9 | if (observer) observer.disconnect(); // Clean up previous observer 10 | 11 | if (targetRef.value) { 12 | // Use a placeholder element *before* the target to detect when the target *would* leave the screen top 13 | // Or observe the target itself and check entry.boundingClientRect.top <= options.rootMargin top value (if set) 14 | // Simpler approach: Observe the target and set sticky when it's *not* intersecting the *top* of the viewport. 15 | // Let's observe a sentinel element placed *above* the header for simplicity if possible, 16 | // otherwise observe the header itself and use rootMargin. 17 | 18 | // Assuming we observe the element *itself* and want it sticky when it scrolls *off* the top. 19 | // We need a negative top margin equal to the element's height or just 1px if we only care when it *starts* scrolling off. 20 | // Let's stick to the original logic: observe the element, become sticky when *not* intersecting. 21 | // This requires the element to *start* within the viewport. 22 | 23 | observer = new IntersectionObserver( 24 | ([entry]) => { 25 | // Becomes sticky when the *observed element* is no longer intersecting the viewport (at the top) 26 | // This interpretation might depend on where the observed element is relative to the sticky element itself. 27 | // If targetRef *is* the element becoming sticky, this works. 
28 | isSticky.value = !entry.isIntersecting; 29 | }, 30 | options // Use provided options (e.g., { threshold: 0 }) 31 | ); 32 | observer.observe(targetRef.value); 33 | } 34 | }; 35 | 36 | onMounted(() => { 37 | // Need to wait for the element to be mounted and potentially rendered 38 | nextTick(setupObserver); 39 | }); 40 | 41 | onUnmounted(() => { 42 | if (observer) { 43 | observer.disconnect(); 44 | } 45 | }); 46 | 47 | // Re-setup if the target element changes (e.g., v-if) 48 | watch(targetRef, () => { 49 | nextTick(setupObserver); 50 | }); 51 | 52 | return { 53 | isSticky, 54 | }; 55 | } 56 | -------------------------------------------------------------------------------- /apps/backend/src/routers/reports.router.ts: -------------------------------------------------------------------------------- 1 | import { zValidator } from '@hono/zod-validator'; 2 | import { $reports, desc } from '@meridian/database'; 3 | import { Hono } from 'hono'; 4 | import { z } from 'zod'; 5 | import type { HonoEnv } from '../app'; 6 | import { tryCatchAsync } from '../lib/tryCatchAsync'; 7 | import { getDb, hasValidAuthToken } from '../lib/utils'; 8 | 9 | const route = new Hono<HonoEnv>() 10 | .get('/last-report', async c => { 11 | // check auth token 12 | const hasValidToken = hasValidAuthToken(c); 13 | if (!hasValidToken) { 14 | return c.json({ error: 'Unauthorized' }, 401); 15 | } 16 | 17 | const reportResult = await tryCatchAsync( 18 | getDb(c.env.HYPERDRIVE).query.$reports.findFirst({ 19 | orderBy: desc($reports.createdAt), 20 | }) 21 | ); 22 | if (reportResult.isErr()) { 23 | return c.json({ error: 'Failed to fetch last report' }, 500); 24 | } 25 | 26 | const report = reportResult.value; 27 | if (report === undefined) { 28 | return c.json({ error: 'No report found' }, 404); 29 | } 30 | 31 | return c.json(report); 32 | }) 33 | .post( 34 | '/report', 35 | zValidator( 36 | 'json', 37 | z.object({ 38 | title: z.string(), 39 | content: z.string(), 40 | totalArticles: z.number(), 41 | totalSources: z.number(), 42 | usedArticles: z.number(), 43 | usedSources: z.number(), 44 | tldr: z.string(), 45 | createdAt: z.coerce.date(), 46 | model_author: z.string(), 47 | clustering_params: z.object({ 48 | umap: z.object({ 49 | n_neighbors: z.number(), 50 | }), 51 | hdbscan: z.object({ 52 | min_cluster_size: z.number(), 53 | min_samples: z.number(), 54 | epsilon: z.number(), 55 | }), 56 | }), 57 | }) 58 | ), 59 | async c => { 60 | if (!hasValidAuthToken(c)) { 61 | return c.json({ error: 'Unauthorized' }, 401); 62 | } 63 | 64 | const db = getDb(c.env.HYPERDRIVE); 65 | const body = c.req.valid('json'); 66 | 67 | const reportResult = await tryCatchAsync(db.insert($reports).values(body)); 68 | if (reportResult.isErr()) { 69 | return c.json({ error: 'Failed to insert report' }, 500); 70 | } 71 | 72 | return c.json({ success: true }); 73 | } 74 | ); 75 | 76 | export default route; 77 | -------------------------------------------------------------------------------- /apps/frontend/src/layouts/default.vue: -------------------------------------------------------------------------------- 1 | 4 | 5 | 64 | -------------------------------------------------------------------------------- /apps/backend/src/routers/events.router.ts: -------------------------------------------------------------------------------- 1 | import { $data_sources, $ingested_items, and, gte, isNotNull, lte } from '@meridian/database'; 2 | import { Hono } from 'hono'; 3 | import type { HonoEnv } from '../app'; 4 | import { getDb, hasValidAuthToken } from 
'../lib/utils'; 5 | 6 | const route = new Hono<HonoEnv>().get('/', async c => { 7 | // require bearer auth token 8 | const hasValidToken = hasValidAuthToken(c); 9 | if (!hasValidToken) { 10 | return c.json({ error: 'Unauthorized' }, 401); 11 | } 12 | 13 | // Check if a date query parameter was provided in yyyy-mm-dd format 14 | const dateParam = c.req.query('date'); 15 | 16 | let endDate: Date; 17 | if (dateParam) { 18 | // Parse the date parameter explicitly with UTC 19 | // Append T07:00:00Z to ensure it's 7am UTC 20 | endDate = new Date(`${dateParam}T07:00:00Z`); 21 | // Check if date is valid 22 | if (Number.isNaN(endDate.getTime())) { 23 | return c.json({ error: 'Invalid date format. Please use yyyy-mm-dd' }, 400); 24 | } 25 | } else { 26 | // Use current date if no date parameter was provided 27 | endDate = new Date(); 28 | // Set to 7am UTC today 29 | endDate.setUTCHours(7, 0, 0, 0); 30 | } 31 | 32 | // Create a 30-hour window ending at 7am UTC on the specified date 33 | const startDate = new Date(endDate.getTime() - 30 * 60 * 60 * 1000); 34 | 35 | const db = getDb(c.env.HYPERDRIVE); 36 | const [allSources, events] = await Promise.all([ 37 | db.select({ id: $data_sources.id, name: $data_sources.name }).from($data_sources), 38 | db 39 | .select({ 40 | id: $ingested_items.id, 41 | sourceId: $ingested_items.data_source_id, 42 | url: $ingested_items.url_to_original, 43 | title: $ingested_items.display_title, 44 | publishDate: $ingested_items.published_at, 45 | contentFileKey: $ingested_items.raw_data_r2_key, 46 | embedding: $ingested_items.embedding, 47 | createdAt: $ingested_items.ingested_at, 48 | }) 49 | .from($ingested_items) 50 | .where( 51 | and( 52 | isNotNull($ingested_items.embedding), 53 | gte($ingested_items.published_at, startDate), 54 | lte($ingested_items.published_at, endDate), 55 | isNotNull($ingested_items.processed_at) 56 | ) 57 | ), 58 | ]); 59 | 60 | return c.json({ 61 | sources: allSources, 62 | events, 63 | dateRange: { 64 | startDate: startDate.toISOString(), 65 | endDate: endDate.toISOString(), 66 | }, 67 | }); 68 | }); 69 | 70 | export default route; 71 | -------------------------------------------------------------------------------- /apps/frontend/README.md: -------------------------------------------------------------------------------- 1 | # Meridian Frontend 2 | 3 | This is the Nuxt 3 frontend application for the [Meridian project](https://github.com/iliane5/meridian) (your personal AI intelligence agency). It provides the web interface for viewing generated intelligence briefs and managing sources (admin). 4 | 5 | Built with: 6 | 7 | - [Nuxt 3](https://nuxt.com/) (Vue 3) 8 | - [Tailwind CSS](https://tailwindcss.com/) (with Radix UI colors) 9 | - [TypeScript](https://www.typescriptlang.org/) 10 | 11 | ## Key Features 12 | 13 | - Displays daily intelligence briefs with rich formatting (`/briefs/[slug]`). 14 | - Interactive Table of Contents for easy navigation within briefs. 15 | - Subscription form for updates (`/`). 16 | - Consumes the Meridian API (via Nitro server routes in `/server/api` and potentially external workers). 17 | 18 | ## Setup 19 | 20 | Make sure you have [Node.js](https://nodejs.org/) (v22+ recommended) and [pnpm](https://pnpm.io/) installed. 
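You can quickly sanity-check your toolchain first (optional):

```bash
node --version   # expect v22.x or newer
pnpm --version
```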
21 | 22 | From the _root_ of the Meridian monorepo: 23 | 24 | ```bash 25 | # Install all workspace dependencies 26 | pnpm install 27 | ``` 28 | 29 | Or, if you're only working within this app (less common in a monorepo): 30 | 31 | ```bash 32 | cd apps/frontend 33 | pnpm install 34 | ``` 35 | 36 | You'll also need to ensure the necessary environment variables are configured (likely in a `.env` file in the root or this directory, depending on your setup) – particularly for the database connection (`DATABASE_URL`) and any external API endpoints (`WORKER_API`). See the [main project README](https://github.com/iliane5/meridian#setup) for full setup details. 37 | 38 | ## Development Server 39 | 40 | Start the Nuxt development server (usually on `http://localhost:3000`): 41 | 42 | ```bash 43 | # From the root directory 44 | pnpm --filter @meridian/frontend dev 45 | 46 | # Or from the apps/frontend directory 47 | pnpm dev 48 | ``` 49 | 50 | ## Production Build 51 | 52 | Build the application for production: 53 | 54 | ```bash 55 | # From the root directory 56 | pnpm --filter @meridian/frontend build 57 | 58 | # Or from the apps/frontend directory 59 | pnpm build 60 | ``` 61 | 62 | Locally preview the production build: 63 | 64 | ```bash 65 | # From the root directory 66 | pnpm --filter @meridian/frontend preview 67 | 68 | # Or from the apps/frontend directory 69 | pnpm preview 70 | ``` 71 | 72 | ## Deployment 73 | 74 | This application is typically deployed using [Cloudflare Pages](https://pages.cloudflare.com/). 75 | 76 | Check out the [Nuxt deployment documentation](https://nuxt.com/docs/getting-started/deployment) for general deployment information. 77 | -------------------------------------------------------------------------------- /apps/backend/src/routers/sources.router.ts: -------------------------------------------------------------------------------- 1 | import { zValidator } from '@hono/zod-validator'; 2 | import { $data_sources, eq } from '@meridian/database'; 3 | import { Hono } from 'hono'; 4 | import { z } from 'zod'; 5 | import type { HonoEnv } from '../app'; 6 | import { Logger } from '../lib/logger'; 7 | import { tryCatchAsync } from '../lib/tryCatchAsync'; 8 | import { getDb, hasValidAuthToken } from '../lib/utils'; 9 | 10 | const logger = new Logger({ router: 'sources' }); 11 | 12 | const route = new Hono<HonoEnv>().delete( 13 | '/:id', 14 | zValidator( 15 | 'param', 16 | z.object({ 17 | id: z.coerce.number(), 18 | }) 19 | ), 20 | async c => { 21 | if (!hasValidAuthToken(c)) { 22 | return c.json({ error: 'Unauthorized' }, 401); 23 | } 24 | 25 | const routeLogger = logger.child({ 26 | operation: 'delete-source', 27 | source_id: c.req.valid('param').id, 28 | }); 29 | routeLogger.info('Attempting to delete source'); 30 | 31 | const db = getDb(c.env.HYPERDRIVE); 32 | 33 | const sourceResult = await tryCatchAsync( 34 | db.query.$data_sources.findFirst({ 35 | where: eq($data_sources.id, c.req.valid('param').id), 36 | }) 37 | ); 38 | if (sourceResult.isErr()) { 39 | const error = sourceResult.error instanceof Error ? 
sourceResult.error : new Error(String(sourceResult.error)); 40 | routeLogger.error('Failed to fetch source', undefined, error); 41 | return c.json({ error: 'Failed to fetch source' }, 500); 42 | } 43 | 44 | const source = sourceResult.value; 45 | if (source === undefined) { 46 | routeLogger.warn('Source not found'); 47 | return c.json({ error: "Source doesn't exist" }, 404); 48 | } 49 | 50 | routeLogger.debug('Source found, proceeding with deletion', { source_url: source.config.config.url }); 51 | const doId = c.env.DATA_SOURCE_INGESTOR.idFromName(source.config.config.url); // Use URL for ID stability 52 | const stub = c.env.DATA_SOURCE_INGESTOR.get(doId); 53 | 54 | const deleteResult = await tryCatchAsync( 55 | Promise.all([db.delete($data_sources).where(eq($data_sources.id, c.req.valid('param').id)), stub.destroy()]) 56 | ); 57 | if (deleteResult.isErr()) { 58 | const error = deleteResult.error instanceof Error ? deleteResult.error : new Error(String(deleteResult.error)); 59 | routeLogger.error('Failed to delete source', undefined, error); 60 | return c.json({ error: 'Failed to delete source' }, 500); 61 | } 62 | 63 | routeLogger.info('Source deleted successfully'); 64 | return c.json({ success: true }); 65 | } 66 | ); 67 | 68 | export default route; 69 | -------------------------------------------------------------------------------- /apps/backend/test/parseRss.spec.ts: -------------------------------------------------------------------------------- 1 | import { readFileSync } from 'node:fs'; 2 | import path from 'node:path'; 3 | import { describe, expect, it } from 'vitest'; 4 | import { parseRSSFeed } from '../src/lib/parsers'; 5 | 6 | describe('parseRssFeed', () => { 7 | // helper to load fixtures 8 | const loadFixture = (filename: string) => readFileSync(path.join(__dirname, 'fixtures', filename), 'utf-8'); 9 | 10 | it('handles independant.co.uk feed', async () => { 11 | const xml = loadFixture('independant_co_uk.xml'); 12 | const result = await parseRSSFeed(xml); 13 | if (result.isErr()) throw result.error; 14 | 15 | expect(result.value).toHaveLength(100); 16 | 17 | expect(result.value[0].title).toBe( 18 | 'Trump makes good on promise as thousands of JFK assassination files are released: Live updates' 19 | ); 20 | expect(result.value[0].link).toBe( 21 | 'https://www.independent.co.uk/news/world/americas/us-politics/jfk-files-released-assassination-trump-b2717229.html' 22 | ); 23 | expect(result.value[0].pubDate).toStrictEqual(new Date('Tue, 18 Mar 2025 23:24:58 GMT')); 24 | }); 25 | 26 | it('handles cn.nytimes.com feed', async () => { 27 | const xml = loadFixture('cn_nytimes_com.xml'); 28 | const result = await parseRSSFeed(xml); 29 | if (result.isErr()) throw result.error; 30 | 31 | expect(result.value).toHaveLength(20); 32 | 33 | expect(result.value[0].title).toBe('前高管揭Facebook内幕:配合北京开发审查工具'); 34 | expect(result.value[0].link).toBe('https://cn.nytimes.com/culture/20250318/careless-people-sarah-wynn-williams/'); 35 | expect(result.value[0].pubDate).toStrictEqual(new Date('Tue, 18 Mar 2025 04:59:35 +0800')); 36 | }); 37 | 38 | it('handles ft.com feed', async () => { 39 | const xml = loadFixture('ft_com.xml'); 40 | const result = await parseRSSFeed(xml); 41 | 42 | if (result.isErr()) throw result.error; 43 | 44 | expect(result.value).toHaveLength(25); 45 | 46 | expect(result.value[0].title).toBe('‘If Trump defies a Supreme Court order, will it matter to markets?’'); 47 | expect(result.value[0].link).toBe('https://www.ft.com/content/2e579290-fc0c-4b88-8703-f0bae45266d9'); 48 | 
expect(result.value[0].pubDate).toStrictEqual(new Date('Tue, 18 Mar 2025 23:34:47 GMT')); 49 | }); 50 | 51 | it('handles theverge.com feed', async () => { 52 | const xml = loadFixture('theverge_com.xml'); 53 | const result = await parseRSSFeed(xml); 54 | if (result.isErr()) throw result.error; 55 | 56 | expect(result.value).toHaveLength(10); 57 | 58 | expect(result.value[0].title).toBe('The Boeing Starliner astronauts returned to Earth today'); 59 | expect(result.value[0].link).toBe( 60 | 'https://www.theverge.com/news/628311/nasa-crew-10-mission-starliner-astronauts-return-spacex' 61 | ); 62 | expect(result.value[0].pubDate).toStrictEqual(new Date('2025-03-18T18:04:44-04:00')); 63 | }); 64 | }); 65 | -------------------------------------------------------------------------------- /apps/frontend/src/components/SubscriptionForm.vue: -------------------------------------------------------------------------------- 1 | 52 | 53 | 87 | -------------------------------------------------------------------------------- /packages/database/migrations/0001_premium_wolfpack.sql: -------------------------------------------------------------------------------- 1 | CREATE TYPE "public"."ingested_item_status" AS ENUM('NEW', 'PENDING_PROCESSING', 'PROCESSED', 'FAILED_FETCH', 'FAILED_PROCESSING', 'SKIPPED_PDF', 'SKIPPED_TOO_OLD');--> statement-breakpoint 2 | CREATE TYPE "public"."source_type" AS ENUM('RSS');--> statement-breakpoint 3 | CREATE TABLE IF NOT EXISTS "data_sources" ( 4 | "id" serial PRIMARY KEY NOT NULL, 5 | "name" text NOT NULL, 6 | "source_type" "source_type" NOT NULL, 7 | "config" jsonb NOT NULL, 8 | "config_version_hash" text, 9 | "publisher_id" integer, 10 | "scrape_frequency_minutes" integer DEFAULT 240 NOT NULL, 11 | "last_checked" timestamp, 12 | "do_initialized_at" timestamp, 13 | "created_at" timestamp DEFAULT now() NOT NULL, 14 | "updated_at" timestamp DEFAULT now() NOT NULL 15 | ); 16 | --> statement-breakpoint 17 | CREATE TABLE IF NOT EXISTS "ingested_items" ( 18 | "id" bigserial PRIMARY KEY NOT NULL, 19 | "item_id_from_source" text NOT NULL, 20 | "raw_data_r2_key" text NOT NULL, 21 | "display_title" text, 22 | "url_to_original" text NOT NULL, 23 | "published_at" timestamp, 24 | "status" "ingested_item_status" DEFAULT 'NEW', 25 | "content_body_r2_key" text, 26 | "content_body_text" text, 27 | "word_count" integer, 28 | "analysis_payload" jsonb, 29 | "source_specific_metadata" jsonb, 30 | "used_browser" boolean, 31 | "embedding" vector(384), 32 | "fail_reason" text, 33 | "data_source_id" integer NOT NULL, 34 | "processed_at" timestamp, 35 | "ingested_at" timestamp DEFAULT CURRENT_TIMESTAMP, 36 | CONSTRAINT "ingested_items_url_to_original_unique" UNIQUE("url_to_original"), 37 | CONSTRAINT "uniqueSourceItem" UNIQUE("data_source_id","item_id_from_source") 38 | ); 39 | --> statement-breakpoint 40 | CREATE TABLE IF NOT EXISTS "newsletter" ( 41 | "id" serial PRIMARY KEY NOT NULL, 42 | "email" text NOT NULL, 43 | "created_at" timestamp DEFAULT CURRENT_TIMESTAMP, 44 | CONSTRAINT "newsletter_email_unique" UNIQUE("email") 45 | ); 46 | --> statement-breakpoint 47 | CREATE TABLE IF NOT EXISTS "publishers" ( 48 | "id" serial PRIMARY KEY NOT NULL, 49 | "name" text NOT NULL, 50 | "base_url" text, 51 | "created_at" timestamp DEFAULT now() NOT NULL 52 | ); 53 | --> statement-breakpoint 54 | CREATE TABLE IF NOT EXISTS "reports" ( 55 | "id" serial PRIMARY KEY NOT NULL, 56 | "title" text NOT NULL, 57 | "content" text NOT NULL, 58 | "total_articles" integer NOT NULL, 59 | "total_sources" integer NOT NULL, 
60 | "used_articles" integer NOT NULL, 61 | "used_sources" integer NOT NULL, 62 | "tldr" text, 63 | "clustering_params" jsonb, 64 | "model_author" text, 65 | "created_at" timestamp DEFAULT CURRENT_TIMESTAMP NOT NULL 66 | ); 67 | --> statement-breakpoint 68 | ALTER TABLE IF EXISTS "data_sources" ADD CONSTRAINT "data_sources_publisher_id_publishers_id_fk" FOREIGN KEY ("publisher_id") REFERENCES "public"."publishers"("id") ON DELETE no action ON UPDATE no action;--> statement-breakpoint 69 | ALTER TABLE IF EXISTS "ingested_items" ADD CONSTRAINT "ingested_items_data_source_id_data_sources_id_fk" FOREIGN KEY ("data_source_id") REFERENCES "public"."data_sources"("id") ON DELETE no action ON UPDATE no action;--> statement-breakpoint 70 | CREATE INDEX IF NOT EXISTS "embeddingIndex" ON "ingested_items" USING hnsw ("embedding" vector_cosine_ops); -------------------------------------------------------------------------------- /apps/frontend/src/server/api/admin/sources/[id]/details.get.ts: -------------------------------------------------------------------------------- 1 | import { $ingested_items, $data_sources, eq, and, desc, ingestedItemStatusEnum } from '@meridian/database'; 2 | import { getDB } from '~/server/lib/utils'; 3 | 4 | // to access the enums 5 | type ArticleStatus = (typeof ingestedItemStatusEnum.enumValues)[number]; 6 | 7 | export default defineEventHandler(async event => { 8 | await requireUserSession(event); // require auth 9 | 10 | const sourceId = Number(getRouterParam(event, 'id')); 11 | if (Number.isNaN(sourceId)) { 12 | throw createError({ statusCode: 400, statusMessage: 'Invalid source ID' }); 13 | } 14 | 15 | // get source details 16 | const db = getDB(event); 17 | const source = await db.query.$data_sources.findFirst({ where: eq($data_sources.id, sourceId) }); 18 | if (source === undefined) { 19 | throw createError({ statusCode: 404, statusMessage: 'Source not found' }); 20 | } 21 | 22 | // get query params for filtering and sorting 23 | const query = getQuery(event); 24 | const page = Number(query.page) || 1; 25 | const pageSize = 50; 26 | const status = query.status as string; 27 | const sortBy = (query.sortBy as string) || 'createdAt'; 28 | const sortOrder = query.sortOrder === 'asc' ? 'asc' : 'desc'; 29 | 30 | // build where clause 31 | const conditions = [eq($ingested_items.data_source_id, sourceId)]; 32 | 33 | // only add conditions if they're valid enum values 34 | if (ingestedItemStatusEnum.enumValues.includes(status as ArticleStatus)) { 35 | conditions.push(eq($ingested_items.status, status as ArticleStatus)); 36 | } 37 | 38 | const whereClause = and(...conditions); 39 | 40 | // determine sort field 41 | const sortField = 42 | sortBy === 'publishedAt' 43 | ? $ingested_items.published_at 44 | : sortBy === 'processedAt' 45 | ? $ingested_items.processed_at 46 | : $ingested_items.ingested_at; 47 | 48 | // get articles with filters and sorting 49 | const articles = await db.query.$ingested_items.findMany({ 50 | where: whereClause, 51 | orderBy: sortOrder === 'asc' ? sortField : desc(sortField), 52 | limit: pageSize, 53 | offset: (page - 1) * pageSize, 54 | }); 55 | 56 | // get total count with filters 57 | const totalCount = await db.query.$ingested_items.findMany({ 58 | where: whereClause, 59 | columns: { id: true }, 60 | }); 61 | 62 | return { 63 | id: source.id, 64 | name: source.name, 65 | url: source.config.config.url, 66 | initialized: source.do_initialized_at !== null, 67 | frequency: 68 | source.scrape_frequency_minutes <= 60 69 | ? 
'Hourly' 70 | : source.scrape_frequency_minutes <= 240 71 | ? '4 Hours' 72 | : source.scrape_frequency_minutes <= 360 73 | ? '6 Hours' 74 | : 'Daily', 75 | lastFetched: source.lastChecked?.toISOString(), 76 | articles: articles.map(article => ({ 77 | id: article.id, 78 | title: article.display_title ?? 'Unknown', 79 | url: article.url_to_original ?? 'Unknown', 80 | publishedAt: article.published_at?.toISOString(), 81 | status: article.status, 82 | failReason: article.fail_reason, 83 | processedAt: article.processed_at?.toISOString(), 84 | createdAt: article.ingested_at?.toISOString(), 85 | hasEmbedding: article.embedding !== null, 86 | })), 87 | pagination: { 88 | currentPage: page, 89 | totalPages: Math.ceil(totalCount.length / pageSize), 90 | totalItems: totalCount.length, 91 | }, 92 | }; 93 | }); 94 | -------------------------------------------------------------------------------- /services/meridian-ml-service/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | 3 | # --- Builder Stage --- 4 | FROM python:3.11-slim AS builder 5 | 6 | # Install uv (your project uses it, it's fast) 7 | RUN pip install uv 8 | 9 | WORKDIR /app 10 | 11 | # Copy only dependency definitions first for better layer caching 12 | COPY pyproject.toml ./ 13 | 14 | # Install dependencies efficiently in one step and clean up 15 | RUN uv pip install --system --no-cache --index-strategy unsafe-best-match --extra-index-url https://download.pytorch.org/whl/cpu --requirement pyproject.toml && \ 16 | rm -rf /root/.cache /tmp/* /var/tmp/* 17 | 18 | # Pre-download the model and save it to a known location 19 | RUN mkdir -p /app/models && \ 20 | python3 -c "from transformers import AutoTokenizer, AutoModel; \ 21 | model_name = 'intfloat/multilingual-e5-small'; \ 22 | tokenizer = AutoTokenizer.from_pretrained(model_name); \ 23 | model = AutoModel.from_pretrained(model_name); \ 24 | tokenizer.save_pretrained('/app/models'); \ 25 | model.save_pretrained('/app/models')" 26 | 27 | # --- Runtime Stage --- 28 | FROM python:3.11-slim 29 | 30 | # Install only runtime essentials and clean up in one layer 31 | RUN apt-get update && \ 32 | apt-get install -y --no-install-recommends \ 33 | ca-certificates && \ 34 | apt-get clean && \ 35 | rm -rf /var/lib/apt/lists/* 36 | 37 | WORKDIR /app 38 | 39 | # Create a non-root user for security 40 | RUN useradd --create-home --shell /bin/bash appuser 41 | USER appuser 42 | WORKDIR /home/appuser/app 43 | 44 | # Copy installed dependencies from the builder stage's system python env 45 | COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages 46 | COPY --from=builder /usr/local/bin /usr/local/bin 47 | 48 | # Copy the pre-downloaded model 49 | COPY --from=builder /app/models /home/appuser/app/models 50 | 51 | # Copy your application code 52 | # Important: Ensure the path matches your project structure relative to the Dockerfile 53 | # Assuming Dockerfile is in 'meridian-ml-service' directory 54 | COPY --chown=appuser:appuser ./src ./src 55 | # Good practice, though likely not needed at runtime here 56 | COPY --chown=appuser:appuser pyproject.toml ./ 57 | 58 | # Environment variables 59 | ENV PYTHONUNBUFFERED=1 \ 60 | # Add src directory to Python path so modules can be found 61 | PYTHONPATH=/home/appuser/app:${PYTHONPATH} \ 62 | # Fly.io routes traffic to the internal port uvicorn listens on (8080, see CMD below) 63 | # Default model from your config. Can be overridden via fly secrets. 
64 | EMBEDDING_MODEL_NAME="/home/appuser/app/models" \ 65 | # API_TOKEN should be provided via secrets at runtime, not in the Dockerfile 66 | # Set Hugging Face cache directory to somewhere writeable by appuser 67 | HF_HOME=/home/appuser/.cache/huggingface \ 68 | TRANSFORMERS_CACHE=/home/appuser/.cache/huggingface/transformers \ 69 | HF_HUB_CACHE=/home/appuser/.cache/huggingface/hub 70 | 71 | # Ensure the cache directory exists and is owned by the app user 72 | # This RUN command executes as root before switching back to appuser implicitly for CMD 73 | USER root 74 | RUN mkdir -p /home/appuser/.cache/huggingface && \ 75 | chown -R appuser:appuser /home/appuser/.cache 76 | USER appuser 77 | 78 | # Expose the default port. Fly will map this. 79 | EXPOSE 8080 80 | 81 | # Run the application using uvicorn 82 | # Update the import path to match your module structure 83 | # Use $PORT which fly provides. 84 | CMD ["uvicorn", "src.meridian_ml_service.main:app", "--host", "0.0.0.0", "--port", "8080"] -------------------------------------------------------------------------------- /apps/backend/wrangler.jsonc: -------------------------------------------------------------------------------- 1 | /** 2 | * For more details on how to configure Wrangler, refer to: 3 | * https://developers.cloudflare.com/workers/wrangler/configuration/ 4 | */ 5 | { 6 | "$schema": "node_modules/wrangler/config-schema.json", 7 | "name": "meridian-backend", 8 | "main": "src/index.ts", 9 | "compatibility_date": "2025-04-30", 10 | "compatibility_flags": ["nodejs_compat"], 11 | "migrations": [ 12 | { 13 | "new_sqlite_classes": ["DataSourceIngestorDO"], 14 | "tag": "v1", 15 | }, 16 | ], 17 | "durable_objects": { 18 | "bindings": [ 19 | { 20 | "class_name": "DataSourceIngestorDO", 21 | "name": "DATA_SOURCE_INGESTOR", 22 | }, 23 | ], 24 | }, 25 | "observability": { 26 | "enabled": true, 27 | }, 28 | "hyperdrive": [ 29 | { 30 | "binding": "HYPERDRIVE", 31 | "id": "b748bf8359b74c519d64501151cecd80", 32 | "localConnectionString": "postgresql://postgres:mysecretpassword@localhost:5432/postgres", 33 | }, 34 | ], 35 | /** 36 | * Smart Placement 37 | * Docs: https://developers.cloudflare.com/workers/configuration/smart-placement/#smart-placement 38 | */ 39 | "placement": { "mode": "smart" }, 40 | /** 41 | * Bindings 42 | * Bindings allow your Worker to interact with resources on the Cloudflare Developer Platform, including 43 | * databases, object storage, AI inference, real-time communication and more. 44 | * https://developers.cloudflare.com/workers/runtime-apis/bindings/ 45 | */ 46 | "queues": { 47 | "producers": [ 48 | { 49 | "queue": "meridian-article-processing-queue-prod", 50 | "binding": "ARTICLE_PROCESSING_QUEUE", 51 | }, 52 | ], 53 | "consumers": [ 54 | { 55 | "queue": "meridian-article-processing-queue-prod", 56 | "max_batch_size": 100, 57 | "max_batch_timeout": 30, 58 | "max_retries": 5, 59 | "dead_letter_queue": "meridian-article-processing-dlq", 60 | // "retry_delay": 60 61 | }, 62 | ], 63 | }, 64 | "r2_buckets": [ 65 | { 66 | "binding": "ARTICLES_BUCKET", 67 | "bucket_name": "meridian-articles-prod", 68 | "preview_bucket_name": "meridian-articles-dev", 69 | "jurisdiction": "eu", 70 | }, 71 | ], 72 | "workflows": [ 73 | { 74 | "name": "meridian_process_ingested_item", 75 | "binding": "PROCESS_INGESTED_ITEM", 76 | "class_name": "ProcessIngestedItemWorkflow", 77 | }, 78 | ], 79 | // !!! NOTE !!! 
: tail workers make workers with durable objects CRASH for now - 30/04/2025 80 | // "tail_consumers": [ 81 | // { 82 | // "service": "meridian-backend", 83 | // }, 84 | // ], 85 | /** 86 | * Environment Variables 87 | * https://developers.cloudflare.com/workers/wrangler/configuration/#environment-variables 88 | */ 89 | // "vars": { "MY_VARIABLE": "production_value" }, 90 | /** 91 | * Note: Use secrets to store sensitive data. 92 | * https://developers.cloudflare.com/workers/configuration/secrets/ 93 | */ 94 | 95 | /** 96 | * Static Assets 97 | * https://developers.cloudflare.com/workers/static-assets/binding/ 98 | */ 99 | // "assets": { "directory": "./public/", "binding": "ASSETS" }, 100 | 101 | /** 102 | * Service Bindings (communicate between multiple Workers) 103 | * https://developers.cloudflare.com/workers/wrangler/configuration/#service-bindings 104 | */ 105 | // "services": [{ "binding": "MY_SERVICE", "service": "my-service" }] 106 | } 107 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore 2 | 3 | # Logs 4 | 5 | logs 6 | *.log 7 | npm-debug.log* 8 | yarn-debug.log* 9 | yarn-error.log* 10 | lerna-debug.log* 11 | .pnpm-debug.log* 12 | 13 | # Caches 14 | 15 | .cache 16 | 17 | # Diagnostic reports (https://nodejs.org/api/report.html) 18 | 19 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 20 | 21 | # Runtime data 22 | 23 | pids 24 | *.pid 25 | *.seed 26 | *.pid.lock 27 | 28 | # Directory for instrumented libs generated by jscoverage/JSCover 29 | 30 | lib-cov 31 | 32 | # Coverage directory used by tools like istanbul 33 | 34 | coverage 35 | *.lcov 36 | 37 | # nyc test coverage 38 | 39 | .nyc_output 40 | 41 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 42 | 43 | .grunt 44 | 45 | # Bower dependency directory (https://bower.io/) 46 | 47 | bower_components 48 | 49 | # node-waf configuration 50 | 51 | .lock-wscript 52 | 53 | # Compiled binary addons (https://nodejs.org/api/addons.html) 54 | 55 | build/Release 56 | 57 | # Dependency directories 58 | 59 | node_modules/ 60 | jspm_packages/ 61 | 62 | # Snowpack dependency directory (https://snowpack.dev/) 63 | 64 | web_modules/ 65 | 66 | # TypeScript cache 67 | 68 | *.tsbuildinfo 69 | 70 | # Optional npm cache directory 71 | 72 | .npm 73 | 74 | # Optional eslint cache 75 | 76 | .eslintcache 77 | 78 | # Optional stylelint cache 79 | 80 | .stylelintcache 81 | 82 | # Microbundle cache 83 | 84 | .rpt2_cache/ 85 | .rts2_cache_cjs/ 86 | .rts2_cache_es/ 87 | .rts2_cache_umd/ 88 | 89 | # Optional REPL history 90 | 91 | .node_repl_history 92 | 93 | # Output of 'npm pack' 94 | 95 | *.tgz 96 | 97 | # Yarn Integrity file 98 | 99 | .yarn-integrity 100 | 101 | # dotenv environment variable files 102 | 103 | .env 104 | .env.development.local 105 | .env.test.local 106 | .env.production.local 107 | .env.local 108 | 109 | # parcel-bundler cache (https://parceljs.org/) 110 | 111 | .parcel-cache 112 | 113 | # Next.js build output 114 | 115 | .next 116 | out 117 | 118 | # Nuxt.js build / generate output 119 | 120 | .nuxt 121 | dist 122 | 123 | # Gatsby files 124 | 125 | # Comment in the public line in if your project uses Gatsby and not Next.js 126 | 127 | # https://nextjs.org/blog/next-9-1#public-directory-support 128 | 129 | # public 130 | 131 | # vuepress build output 132 | 133 | .vuepress/dist 134 | 135 | # vuepress 
v2.x temp and cache directory 136 | 137 | .temp 138 | 139 | # Docusaurus cache and generated files 140 | 141 | .docusaurus 142 | 143 | # Serverless directories 144 | 145 | .serverless/ 146 | 147 | # FuseBox cache 148 | 149 | .fusebox/ 150 | 151 | # DynamoDB Local files 152 | 153 | .dynamodb/ 154 | 155 | # TernJS port file 156 | 157 | .tern-port 158 | 159 | # Stores VSCode versions used for testing VSCode extensions 160 | 161 | .vscode-test 162 | 163 | # yarn v2 164 | 165 | .yarn/cache 166 | .yarn/unplugged 167 | .yarn/build-state.yml 168 | .yarn/install-state.gz 169 | .pnp.* 170 | 171 | # IntelliJ based IDEs 172 | .idea 173 | 174 | # Finder (MacOS) folder config 175 | .DS_Store 176 | 177 | .venv 178 | 179 | .wrangler 180 | 181 | .turbo 182 | 183 | notebooks 184 | 185 | browser-worker 186 | 187 | .dev.vars 188 | 189 | apps/scrapers/feeds.json 190 | 191 | __pycache__ 192 | 193 | apps/briefs 194 | 195 | # python stuff 196 | .mypy_cache 197 | .ruff_cache 198 | *.egg-info 199 | 200 | 201 | # TEMP 202 | feeds.json 203 | CONTRIBUTING.md 204 | MAIN_LIST.JSON 205 | NOTES.MD 206 | SETUP.MD 207 | TODO.MD 208 | reportV5-confidential.ipynb 209 | -------------------------------------------------------------------------------- /apps/backend/src/index.ts: -------------------------------------------------------------------------------- 1 | import app from './app'; 2 | import { DataSourceIngestorDO } from './durable_objects/dataSourceIngestorDO'; 3 | import { Logger } from './lib/logger'; 4 | import { type ProcessArticlesParams, startProcessArticleWorkflow } from './workflows/processIngestedItem.workflow'; 5 | 6 | export type Env = { 7 | // Bindings 8 | ARTICLES_BUCKET: R2Bucket; 9 | ARTICLE_PROCESSING_QUEUE: Queue; 10 | DATA_SOURCE_INGESTOR: DurableObjectNamespace; 11 | PROCESS_INGESTED_ITEM: Workflow; 12 | HYPERDRIVE: Hyperdrive; 13 | 14 | // Secrets 15 | API_TOKEN: string; 16 | 17 | AXIOM_DATASET: string | undefined; // optional, use if you want to send logs to axiom 18 | AXIOM_TOKEN: string | undefined; // optional, use if you want to send logs to axiom 19 | 20 | CLOUDFLARE_API_TOKEN: string; 21 | CLOUDFLARE_ACCOUNT_ID: string; 22 | 23 | DATABASE_URL: string; 24 | 25 | GEMINI_API_KEY: string; 26 | GEMINI_BASE_URL: string; 27 | 28 | MERIDIAN_ML_SERVICE_URL: string; 29 | MERIDIAN_ML_SERVICE_API_KEY: string; 30 | }; 31 | 32 | // Create a base logger for the queue handler 33 | const queueLogger = new Logger({ service: 'article-queue-handler' }); 34 | 35 | export default { 36 | fetch: app.fetch, 37 | async queue(batch: MessageBatch, env: Env): Promise<void> { 38 | const batchLogger = queueLogger.child({ batch_size: batch.messages.length }); 39 | batchLogger.info('Received batch of articles to process'); 40 | 41 | const articlesToProcess: number[] = []; 42 | for (const message of batch.messages) { 43 | const { ingested_item_ids } = message.body as ProcessArticlesParams; 44 | batchLogger.debug('Processing message', { message_id: message.id, article_count: ingested_item_ids.length }); 45 | 46 | for (const id of ingested_item_ids) { 47 | articlesToProcess.push(id); 48 | } 49 | } 50 | 51 | batchLogger.info('Articles extracted from batch', { total_articles: articlesToProcess.length }); 52 | 53 | if (articlesToProcess.length === 0) { 54 | batchLogger.info('Queue batch was empty, nothing to process'); 55 | batch.ackAll(); // Acknowledge the empty batch 56 | return; 57 | } 58 | 59 | // Process articles in chunks of 96 60 | const CHUNK_SIZE = 96; 61 | const articleChunks = []; 62 | for (let i = 0; i < 
articlesToProcess.length; i += CHUNK_SIZE) { 63 | articleChunks.push(articlesToProcess.slice(i, i + CHUNK_SIZE)); 64 | } 65 | 66 | batchLogger.info('Split articles into chunks', { chunk_count: articleChunks.length }); 67 | 68 | // Process each chunk sequentially 69 | for (const chunk of articleChunks) { 70 | const workflowResult = await startProcessArticleWorkflow(env, { ingested_item_ids: chunk }); 71 | if (workflowResult.isErr()) { 72 | batchLogger.error( 73 | 'Failed to trigger ProcessArticles Workflow', 74 | { error_message: workflowResult.error.message, chunk_size: chunk.length }, 75 | workflowResult.error 76 | ); 77 | // Retry the entire batch if Workflow creation failed 78 | batch.retryAll({ delaySeconds: 30 }); // Retry after 30 seconds 79 | return; 80 | } 81 | 82 | batchLogger.info('Successfully triggered ProcessArticles Workflow for chunk', { 83 | workflow_id: workflowResult.value.id, 84 | chunk_size: chunk.length, 85 | }); 86 | } 87 | 88 | batch.ackAll(); // Acknowledge the entire batch after all chunks are processed 89 | }, 90 | } satisfies ExportedHandler<Env>; 91 | 92 | export { DataSourceIngestorDO }; 93 | export { ProcessIngestedItemWorkflow } from './workflows/processIngestedItem.workflow'; 94 | -------------------------------------------------------------------------------- /apps/backend/test/utils.spec.ts: -------------------------------------------------------------------------------- 1 | import type { Context } from 'hono'; 2 | import { beforeEach, describe, expect, it, vi } from 'vitest'; 3 | import type { HonoEnv } from '../src/app'; 4 | import { hasValidAuthToken } from '../src/lib/utils'; 5 | 6 | describe('hasValidAuthToken', () => { 7 | // Mock Context object 8 | let mockContext: Context<HonoEnv>; 9 | const validToken = 'valid-token-12345'; 10 | 11 | beforeEach(() => { 12 | // Reset mocks 13 | vi.resetAllMocks(); 14 | 15 | // Create a mock context with request headers and environment 16 | mockContext = { 17 | req: { 18 | header: vi.fn(), 19 | }, 20 | env: { 21 | API_TOKEN: validToken, 22 | }, 23 | } as unknown as Context<HonoEnv>; 24 | }); 25 | 26 | it('should return true when Authorization header has the correct Bearer token', () => { 27 | // Setup header mock to return the valid token 28 | mockContext.req.header = vi.fn().mockImplementation((name: string) => { 29 | if (name === 'Authorization') return `Bearer ${validToken}`; 30 | return undefined; 31 | }); 32 | 33 | // Call the function 34 | const result = hasValidAuthToken(mockContext); 35 | 36 | // Assert 37 | expect(result).toBe(true); 38 | expect(mockContext.req.header).toHaveBeenCalledWith('Authorization'); 39 | }); 40 | 41 | it('should return false when Authorization header is missing', () => { 42 | // Setup header mock to return undefined 43 | mockContext.req.header = vi.fn().mockImplementation((name: string) => { 44 | return undefined; 45 | }); 46 | 47 | // Call the function 48 | const result = hasValidAuthToken(mockContext); 49 | 50 | // Assert 51 | expect(result).toBe(false); 52 | expect(mockContext.req.header).toHaveBeenCalledWith('Authorization'); 53 | }); 54 | 55 | it('should return false when Authorization header has incorrect token value', () => { 56 | // Setup header mock to return an invalid token 57 | mockContext.req.header = vi.fn().mockImplementation((name: string) => { 58 | if (name === 'Authorization') return 'Bearer wrong-token'; 59 | return undefined; 60 | }); 61 | 62 | // Call the function 63 | const result = hasValidAuthToken(mockContext); 64 | 65 | // Assert 66 | expect(result).toBe(false); 67 | 
expect(mockContext.req.header).toHaveBeenCalledWith('Authorization'); 68 | }); 69 | 70 | it('should return false when Authorization header uses a scheme other than Bearer', () => { 71 | // Setup header mock to return a non-Bearer token 72 | mockContext.req.header = vi.fn().mockImplementation((name: string) => { 73 | if (name === 'Authorization') return `Basic ${validToken}`; 74 | return undefined; 75 | }); 76 | 77 | // Call the function 78 | const result = hasValidAuthToken(mockContext); 79 | 80 | // Assert 81 | expect(result).toBe(false); 82 | expect(mockContext.req.header).toHaveBeenCalledWith('Authorization'); 83 | }); 84 | 85 | it('should return false when API_TOKEN environment variable is not set or empty', () => { 86 | // Mock the environment with an empty API_TOKEN 87 | mockContext.env.API_TOKEN = ''; 88 | 89 | // Setup header mock to return a valid token format 90 | mockContext.req.header = vi.fn().mockImplementation((name: string) => { 91 | if (name === 'Authorization') return `Bearer ${validToken}`; 92 | return undefined; 93 | }); 94 | 95 | // Call the function 96 | const result = hasValidAuthToken(mockContext); 97 | 98 | // Assert 99 | expect(result).toBe(false); 100 | expect(mockContext.req.header).toHaveBeenCalledWith('Authorization'); 101 | }); 102 | }); 103 | -------------------------------------------------------------------------------- /.github/workflows/deploy-services.yaml: -------------------------------------------------------------------------------- 1 | name: Deploy services 2 | on: 3 | push: 4 | branches: 5 | - main 6 | jobs: 7 | deploy: 8 | runs-on: ubuntu-latest 9 | timeout-minutes: 10 10 | steps: 11 | - uses: actions/checkout@v4 12 | - uses: pnpm/action-setup@v4 13 | with: 14 | version: 10.9.0 15 | 16 | - uses: actions/setup-node@v4 17 | with: 18 | node-version: '22.14.0' 19 | cache: 'pnpm' 20 | 21 | - name: Install dependencies 22 | run: pnpm install 23 | 24 | - name: Check Git status before generating migrations 25 | run: git status --porcelain 26 | id: pre_migration_status 27 | working-directory: 'packages/database' 28 | 29 | - name: Generate migrations 30 | run: pnpm generate 31 | working-directory: 'packages/database' 32 | 33 | - name: Check if new migrations were created 34 | id: check_migrations 35 | run: | 36 | git status --porcelain 37 | if [[ $(git status --porcelain | grep -E "^\?\?" | wc -l) -gt 0 ]]; then 38 | echo "New migration files were created during CI. Please run 'pnpm generate' locally and commit the changes." 39 | echo "new_files=true" >> $GITHUB_OUTPUT 40 | exit 1 41 | fi 42 | if [[ $(git status --porcelain | grep -E "^M" | wc -l) -gt 0 ]]; then 43 | echo "Existing migration files were modified during CI. Please run 'pnpm generate' locally and commit the changes." 44 | echo "modified_files=true" >> $GITHUB_OUTPUT 45 | exit 1 46 | fi 47 | echo "No new or modified migration files detected." 
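# Reminder on the checks above: in 'git status --porcelain' output, a '??' prefix marks untracked (newly generated) files and an 'M' prefix marks modified tracked files.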
48 | working-directory: 'packages/database' 49 | 50 | - name: Run database migrations 51 | run: pnpm migrate 52 | working-directory: 'packages/database' 53 | env: 54 | DATABASE_URL: ${{ secrets.DATABASE_URL }} 55 | 56 | - name: Build & Deploy Worker 57 | uses: cloudflare/wrangler-action@v3 58 | with: 59 | apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }} 60 | accountId: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} 61 | packageManager: pnpm 62 | workingDirectory: 'apps/backend' 63 | environment: production 64 | secrets: | 65 | API_TOKEN 66 | AXIOM_DATASET 67 | AXIOM_TOKEN 68 | CLOUDFLARE_API_TOKEN 69 | CLOUDFLARE_ACCOUNT_ID 70 | DATABASE_URL 71 | GEMINI_BASE_URL 72 | GEMINI_API_KEY 73 | MERIDIAN_ML_SERVICE_URL 74 | MERIDIAN_ML_SERVICE_API_KEY 75 | env: 76 | API_TOKEN: ${{ secrets.API_TOKEN }} 77 | AXIOM_DATASET: ${{ secrets.AXIOM_DATASET }} 78 | AXIOM_TOKEN: ${{ secrets.AXIOM_TOKEN }} 79 | DATABASE_URL: ${{ secrets.DATABASE_URL }} 80 | CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} 81 | CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} 82 | GEMINI_BASE_URL: ${{ secrets.GEMINI_BASE_URL }} 83 | GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} 84 | MERIDIAN_ML_SERVICE_URL: ${{ secrets.MERIDIAN_ML_SERVICE_URL }} 85 | MERIDIAN_ML_SERVICE_API_KEY: ${{ secrets.MERIDIAN_ML_SERVICE_API_KEY }} 86 | 87 | # - name: Build Nuxt Application 88 | # run: pnpm build --filter=@meridian/frontend # Or 'yarn generate', ensure this matches your static build script in package.json (npx nuxi generate) 89 | # env: 90 | # NUXT_DATABASE_URL: ${{ secrets.DATABASE_URL }} 91 | 92 | # - name: Publish to Cloudflare Pages 93 | # uses: cloudflare/wrangler-action@v3 # Use the official Cloudflare Wrangler action 94 | # with: 95 | # apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }} # Use the secret token 96 | # accountId: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} # Use the secret account ID 97 | # command: pages deploy apps/frontend/dist --project-name=meridian-frontend --branch=main 98 | # secrets: | 99 | # NUXT_DATABASE_URL 100 | # env: 101 | # NUXT_DATABASE_URL: ${{ secrets.DATABASE_URL }} 102 | # Replace YOUR_CLOUDFLARE_PAGES_PROJECT_NAME with the actual name from Step 3 103 | # The --branch flag tells Cloudflare which production branch this deployment corresponds to 104 | -------------------------------------------------------------------------------- /services/meridian-ml-service/src/meridian_ml_service/embeddings.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | from typing import Any 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn.functional as F # noqa: N812 7 | from tqdm import tqdm 8 | from transformers import AutoModel, AutoTokenizer 9 | 10 | from .config import settings # Import settings instance 11 | 12 | # Re-using your type alias and functions, adding type hints and minor adjustments 13 | ModelComponents = tuple[Any, Any, torch.device] 14 | 15 | 16 | @lru_cache(maxsize=1) # Cache the loaded model globally 17 | def load_embedding_model() -> ModelComponents: 18 | """Loads tokenizer, model from HuggingFace based on settings.""" 19 | model_name = settings.embedding_model_name 20 | print(f"Attempting to load embedding model: {model_name}") 21 | try: 22 | tokenizer = AutoTokenizer.from_pretrained( 23 | model_name, local_files_only=True, trust_remote_code=True 24 | ) 25 | model = AutoModel.from_pretrained( 26 | model_name, local_files_only=True, trust_remote_code=True 27 | ) 28 | 29 | device = torch.device("cuda" if 
torch.cuda.is_available() else "cpu") 30 | model.to(device) 31 | model.eval() 32 | print(f"Embedding model '{model_name}' loaded successfully on device: {device}") 33 | return tokenizer, model, device 34 | except Exception as e: 35 | print(f"ERROR: Failed to load model: {e}") 36 | raise # Critical failure 37 | 38 | 39 | def _average_pool( 40 | last_hidden_states: torch.Tensor, attention_mask: torch.Tensor 41 | ) -> torch.Tensor: 42 | """Helper function for pooling.""" 43 | last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0) 44 | return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] 45 | 46 | 47 | def compute_embeddings( 48 | texts: list[str], 49 | model_components: ModelComponents, 50 | batch_size: int = 32, # Make configurable later if needed 51 | normalize: bool = True, 52 | e5_prefix: str | None = None, 53 | ) -> np.ndarray: 54 | """Computes embeddings for a list of texts using the provided model components.""" 55 | tokenizer, model, device = model_components 56 | all_embeddings: list[np.ndarray] = [] 57 | 58 | if e5_prefix: 59 | texts_to_embed = [f"{e5_prefix}{text}" for text in texts] 60 | print(f"Adding prefix '{e5_prefix}' to texts for embedding.") 61 | else: 62 | texts_to_embed = texts 63 | 64 | print(f"Computing embeddings for {len(texts_to_embed)} texts...") 65 | for i in tqdm( 66 | range(0, len(texts_to_embed), batch_size), 67 | desc="Computing Embeddings", 68 | leave=False, 69 | ): 70 | batch_texts = texts_to_embed[i : i + batch_size] 71 | try: 72 | batch_dict = tokenizer( 73 | batch_texts, 74 | max_length=512, 75 | padding=True, 76 | truncation=True, 77 | return_tensors="pt", 78 | ).to(device) 79 | except Exception as e: 80 | print(f"ERROR: Tokenization failed for batch starting at index {i}: {e}") 81 | raise 82 | 83 | with torch.no_grad(): 84 | try: 85 | outputs = model(**batch_dict) 86 | embeddings = _average_pool( 87 | outputs.last_hidden_state, batch_dict["attention_mask"] 88 | ) 89 | except Exception as e: 90 | print( 91 | f"ERROR: Model inference failed for batch starting at index {i}: {e}" 92 | ) 93 | raise 94 | 95 | if normalize: 96 | embeddings = F.normalize(embeddings, p=2, dim=1) 97 | 98 | all_embeddings.append(embeddings.cpu().numpy()) 99 | 100 | if not all_embeddings: 101 | print("Warning: No embeddings generated.") 102 | # Determine embedding dimension dynamically or return empty array of correct shape if possible 103 | # Example: get embedding dim from model config if loaded 104 | # embedding_dim = model.config.hidden_size 105 | # return np.empty((0, embedding_dim), dtype=np.float32) 106 | # Fallback for now: 107 | return np.empty((0, 0), dtype=np.float32) 108 | 109 | final_embeddings = np.vstack(all_embeddings) 110 | print(f"Embeddings computed. 
Shape: {final_embeddings.shape}") 111 | return final_embeddings 112 | -------------------------------------------------------------------------------- /packages/database/src/schema.ts: -------------------------------------------------------------------------------- 1 | import { 2 | boolean, 3 | index, 4 | integer, 5 | jsonb, 6 | pgEnum, 7 | pgTable, 8 | serial, 9 | text, 10 | timestamp, 11 | vector, 12 | bigserial, 13 | unique, 14 | } from 'drizzle-orm/pg-core'; 15 | import { sql } from 'drizzle-orm'; 16 | import type { DataSourceConfigWrapperType } from './validators/dataSourceConfig'; 17 | import type { AnalysisPayloadWrapper } from './validators/analysisPayload'; 18 | 19 | /** 20 | * Note: We use $ to denote the table objects 21 | * This frees up the uses of sources, articles, reports, etc as variables in the codebase 22 | **/ 23 | 24 | export const ingestedItemStatusEnum = pgEnum('ingested_item_status', [ 25 | 'NEW', 26 | 'PENDING_PROCESSING', 27 | 'PROCESSED', 28 | 'FAILED_RENDER', 29 | 'FAILED_FETCH', 30 | 'FAILED_PROCESSING', 31 | 'FAILED_EMBEDDING', 32 | 'FAILED_R2_UPLOAD', 33 | 'SKIPPED_PDF', 34 | 'SKIPPED_TOO_OLD', 35 | ]); 36 | 37 | export const sourceTypeEnum = pgEnum('source_type', ['RSS']); 38 | 39 | export const $publishers = pgTable('publishers', { 40 | id: serial('id').primaryKey(), 41 | name: text('name').notNull(), 42 | base_url: text('base_url'), 43 | created_at: timestamp('created_at', { mode: 'date' }).defaultNow().notNull(), 44 | }); 45 | 46 | export const $data_sources = pgTable('data_sources', { 47 | id: serial('id').primaryKey(), 48 | name: text('name').notNull(), 49 | source_type: sourceTypeEnum().notNull(), 50 | config: jsonb('config').$type<DataSourceConfigWrapperType>().notNull(), // Stores source-specific config like {"url": "...", "config_schema_version": "1.0", "paywall": false, "category": "..."} 51 | config_version_hash: text('config_version_hash'), // Hash of config to detect changes 52 | publisher_id: integer('publisher_id').references(() => $publishers.id), 53 | scrape_frequency_minutes: integer('scrape_frequency_minutes').notNull().default(240), // Default: 4 hours 54 | lastChecked: timestamp('last_checked', { mode: 'date' }), 55 | do_initialized_at: timestamp('do_initialized_at', { mode: 'date' }), 56 | created_at: timestamp('created_at', { mode: 'date' }).defaultNow().notNull(), 57 | updated_at: timestamp('updated_at', { mode: 'date' }).defaultNow().notNull(), 58 | }); 59 | 60 | export const $ingested_items = pgTable( 61 | 'ingested_items', 62 | { 63 | id: bigserial('id', { mode: 'number' }).primaryKey(), 64 | 65 | item_id_from_source: text('item_id_from_source').notNull(), // RSS guid, Tweet ID, etc.
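// Dedup note: source-native IDs only repeat across different feeds, so uniqueness is
// enforced per source by the unique('uniqueSourceItem') constraint on
// (data_source_id, item_id_from_source) declared at the bottom of this table.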
66 | raw_data_r2_key: text('raw_data_r2_key').notNull(), // R2 key for original payload 67 | 68 | display_title: text('display_title'), // nullable, might be derived later 69 | url_to_original: text('url_to_original').notNull().unique(), 70 | published_at: timestamp('published_at', { mode: 'date' }), 71 | 72 | status: ingestedItemStatusEnum().default('NEW'), 73 | 74 | content_body_r2_key: text('content_body_r2_key'), // R2 key for processed text 75 | content_body_text: text('content_body_text'), // inline snippet or full text if small 76 | word_count: integer('word_count'), 77 | 78 | embedding_text: text('embedding_text'), // text used to generate embedding 79 | analysis_payload: jsonb('analysis_payload').$type<AnalysisPayloadWrapper>(), // structured LLM analysis 80 | source_specific_metadata: jsonb('source_specific_metadata'), // small, queryable metadata 81 | 82 | usedBrowser: boolean('used_browser'), 83 | embedding: vector('embedding', { dimensions: 384 }), 84 | fail_reason: text('fail_reason'), 85 | 86 | data_source_id: integer('data_source_id') 87 | .references(() => $data_sources.id) 88 | .notNull(), 89 | 90 | processed_at: timestamp('processed_at', { mode: 'date' }), 91 | ingested_at: timestamp('ingested_at', { mode: 'date' }).default(sql`CURRENT_TIMESTAMP`), 92 | }, 93 | table => [ 94 | index('embeddingIndex').using('hnsw', table.embedding.op('vector_cosine_ops')), 95 | unique('uniqueSourceItem').on(table.data_source_id, table.item_id_from_source), 96 | ] 97 | ); 98 | 99 | export const $reports = pgTable('reports', { 100 | id: serial('id').primaryKey(), 101 | title: text('title').notNull(), 102 | content: text('content').notNull(), 103 | 104 | totalArticles: integer('total_articles').notNull(), 105 | totalSources: integer('total_sources').notNull(), 106 | 107 | usedArticles: integer('used_articles').notNull(), 108 | usedSources: integer('used_sources').notNull(), 109 | 110 | tldr: text('tldr'), 111 | 112 | clustering_params: jsonb('clustering_params'), 113 | 114 | model_author: text('model_author'), 115 | 116 | createdAt: timestamp('created_at', { mode: 'date' }) 117 | .default(sql`CURRENT_TIMESTAMP`) 118 | .notNull(), 119 | }); 120 | 121 | export const $newsletter = pgTable('newsletter', { 122 | id: serial('id').primaryKey(), 123 | email: text('email').notNull().unique(), 124 | createdAt: timestamp('created_at', { mode: 'date' }).default(sql`CURRENT_TIMESTAMP`), 125 | }); 126 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Meridian: Your Personal Intelligence Agency 2 | 3 | [![Build Status](https://img.shields.io/github/actions/workflow/status/iliane5/meridian/deploy-services.yaml?branch=main)](https://github.com/iliane5/meridian/actions/workflows/deploy-services.yaml) 4 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 5 | 6 | **Presidential-level intelligence briefings, built with AI, tailored for you.** 7 | 8 | Meridian cuts through news noise by scraping hundreds of sources, analyzing stories with AI, and delivering concise, personalized daily briefs. 9 | 10 |
11 | ![Meridian Brief Example](screenshot.png) 12 |
13 | 14 | ## Why It Exists 15 | 16 | Presidents get tailored daily intelligence briefs. Now with AI, you can too. Meridian delivers: 17 | 18 | - Key global events filtered by relevance 19 | - Context and underlying drivers 20 | - Analysis of implications 21 | - Open-source transparency 22 | 23 | Built for the curious who want depth beyond headlines without the time sink. 24 | 25 | ## Key Features 26 | 27 | - **Source Coverage**: Hundreds of diverse news sources 28 | - **AI Analysis**: Multi-stage LLM processing (Gemini) for article and cluster analysis 29 | - **Smart Clustering**: Embeddings + UMAP + HDBSCAN to group related articles 30 | - **Personalized Briefing**: Daily brief with analytical voice and continuity tracking 31 | - **Web Interface**: Clean Nuxt 3 frontend 32 | 33 | ## How It Works 34 | 35 | ```mermaid 36 | graph TD 37 | A[RSS Feed URLs] --> B(Scraper Workflow CF); 38 | B --> C[Article Metadata DB]; 39 | C --> D(Article Processor Workflow CF); 40 | D -- Fetches --> E{Content Extraction}; 41 | E -- Standard --> F[Direct Fetch]; 42 | E -- Complex/Paywall --> G[Browser Rendering API]; 43 | F --> H[LLM Article Analysis]; 44 | G --> H; 45 | H --> I[Processed Articles DB]; 46 | I --> J(Brief Generation Python); 47 | J -- Embeddings --> K[UMAP/HDBSCAN Clustering]; 48 | K --> L[LLM Cluster Review]; 49 | L --> M[LLM Deep Analysis JSON]; 50 | M --> N[Markdown Summary Generation]; 51 | O[Previous Day TLDR DB] --> P{Final Briefing LLM}; 52 | N --> P; 53 | P --> Q[Final Brief Markdown]; 54 | Q --> R[Reports DB]; 55 | R --> S(Frontend API CF); 56 | S --> T[Frontend UI Nuxt]; 57 | ``` 58 | 59 | 1. **Scraping**: Cloudflare Workers fetch RSS feeds, store metadata 60 | 2. **Processing**: Extract text, analyze with Gemini for relevance and structure 61 | 3. **Brief Generation**: Cluster articles, generate analysis, synthesize final brief 62 | 4. **Frontend**: Display briefs via Nuxt/Cloudflare 63 | 64 | ## Tech Stack 65 | 66 | - **Infrastructure**: Turborepo, Cloudflare (Workers, Workflows, Pages) 67 | - **Backend**: Hono, TypeScript, PostgreSQL, Drizzle 68 | - **AI/ML**: Gemini models, multilingual-e5-small embeddings, UMAP, HDBSCAN 69 | - **Frontend**: Nuxt 3, Vue 3, Tailwind 70 | 71 | ## Setup 72 | 73 | **Prerequisites**: Node.js v22+, pnpm v9.15+, Python 3.10+, PostgreSQL, Cloudflare account, Google AI API key 74 | 75 | ```bash 76 | git clone https://github.com/iliane5/meridian.git 77 | cd meridian 78 | pnpm install 79 | # Configure .env files 80 | pnpm --filter @meridian/database migrate 81 | # Deploy via Wrangler, run Python briefing notebook manually 82 | ``` 83 | 84 | ## Status & Next Steps 85 | 86 | - ✅ **Core Pipeline**: Scraping, processing, analysis working 87 | - ⏳ **Top Priority**: Automate brief generation (currently manual Python notebook) 88 | - ⚠️ **Monitoring**: Improve scraping robustness 89 | - 🔜 **Future**: Add testing, newsletter distribution 90 | 91 | ## AI Collaboration 92 | 93 | This project benefited significantly from AI assistance: 94 | 95 | - **Claude 3.7 Sonnet**: Contributed to early architecture brainstorming, generated browser js scraping scripts, refined prompts, and called me out when I was overthinking or overengineering. 
96 | - **Gemini 2.5 Pro**: Excelled with long-context tasks - comparing outputs across different prompt variants, reviewing the entire codebase before open-sourcing, and nailing the analytical tone for briefs 97 | - **Gemini 2.0 Flash**: The true unsung hero of this project - blazing fast, dirt cheap, and surprisingly capable when prompted well. It's the workhorse that makes running Meridian economically viable without sponsors or grants. Essentially free intelligence at scale. 98 | 99 | The first two compressed months of dev work into days and made building this way more fun. But Flash isn't just a time-saver—it's the engine that makes Meridian possible at all. No human is reading 2000+ articles daily and analyzing 100+ story clusters. Having AI peers for brainstorming felt like cheating; having AI workers for the actual intelligence pipeline feels like living in the future. 100 | 101 | ## License 102 | 103 | MIT License - See [LICENSE](./LICENSE) file for details. 104 | 105 | --- 106 | 107 | _Built because we live in an age of magic, and we keep forgetting to use it._ 108 | -------------------------------------------------------------------------------- /apps/backend/src/routers/openGraph.router.ts: -------------------------------------------------------------------------------- 1 | import { zValidator } from '@hono/zod-validator'; 2 | import { Hono } from 'hono'; 3 | import { ImageResponse } from 'workers-og'; 4 | import { z } from 'zod'; 5 | import type { HonoEnv } from '../app'; 6 | 7 | const getBriefOpenGraph = (opts: { title: string; date: Date; totalArticles: number; totalSources: number }) => 8 | ` 9 | [lines 10-56: inline HTML/CSS template markup stripped from this listing; the recoverable text content of the 1200x630 brief card is: the formatted date ${opts.date.toLocaleDateString('en-US', { month: 'long', day: 'numeric', year: 'numeric' })}, the line "Intelligence brief · ${opts.totalArticles} articles · ${opts.totalSources} sources", the site name "news.iliane.xyz", and the title ${decodeURIComponent(opts.title.trim())}] 57 | `; 58 | 59 | const getHomeOpenGraph = () => ` 60 | [lines 60-111: inline HTML/CSS template markup stripped from this listing; the recoverable text content of the home card is: the wordmark "Meridian" and the tagline "a daily brief of everything important happening that i care about, with actual analysis beyond headlines"] 112 | `; 113 | 114 | const route = new Hono<HonoEnv>() 115 | .get('/default', async c => { 116 | const response = new ImageResponse(getHomeOpenGraph(), { width: 1200, height: 630 }); 117 | response.headers.set('Cache-Control', 'public, max-age=86400'); // Cache for 1 day 118 | return response; 119 | }) 120 | .get( 121 | '/brief', 122 | zValidator( 123 | 'query', 124 | z.object({ 125 | title: z.string(), 126 | date: z.string().transform(val => new Date(Number.parseInt(val))), 127 | articles: z.string().transform(val => Number.parseInt(val)), 128 | sources: z.string().transform(val => Number.parseInt(val)), 129 | }) 130 | ), 131 | async c => { 132 | const query = c.req.valid('query'); 133 | const response = new ImageResponse( 134 | getBriefOpenGraph({ 135 | title: query.title, 136 | date: query.date, 137 | totalArticles: query.articles, 138 | totalSources: query.sources, 139 | }), 140 | { width: 1200, height: 630 } 141 | ); 142 | // Cache brief images for longer since they don't change much despite having params 143 | response.headers.set('Cache-Control', 'public, max-age=86400, stale-while-revalidate=43200'); 144 | return response; 145 | } 146 | ); 147 | 148 | export default route; 149 | -------------------------------------------------------------------------------- /apps/backend/test/parseArticle.spec.ts: -------------------------------------------------------------------------------- 1 | import { Readability } from '@mozilla/readability'; 2 | import * as linkedom from 'linkedom'; 3 | import { beforeEach, describe, expect, it, vi } from 'vitest'; 4 | import { parseArticle } from '../src/lib/parsers'; 5 | 6 | // Mock the Readability and parseHTML dependencies 7 | vi.mock('@mozilla/readability', () => { 8 | return { 9 | Readability: vi.fn(), 10 | }; 11 | }); 12 | 13 | vi.mock('linkedom', () => { 14 | return { 15 | parseHTML: vi.fn(), 16 | }; 17 | }); 18 | 19 | describe('parseArticle', () => { 20 | // Note: Testing Readability itself is hard. Focus on the wrapper.
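// The tests below therefore stub Readability and linkedom entirely and assert only on the
// neverthrow Result (ok/err and error type) that the parseArticle wrapper returns.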
21 | 22 | beforeEach(() => { 23 | vi.resetAllMocks(); 24 | 25 | // Default mocks for linkedom 26 | vi.mocked(linkedom.parseHTML).mockReturnValue({ 27 | document: 'mock-document', 28 | } as unknown); 29 | }); 30 | 31 | it('should return an error Result if Readability constructor or parse() throws an exception', () => { 32 | // Setup: Make Readability throw an error 33 | vi.mocked(Readability).mockImplementation(() => { 34 | throw new Error('Readability error'); 35 | }); 36 | 37 | // Execute 38 | const result = parseArticle({ html: 'Test' }); 39 | 40 | // Verify 41 | expect(result.isErr()).toBe(true); 42 | if (result.isErr()) { 43 | expect(result.error.type).toBe('READABILITY_ERROR'); 44 | } 45 | }); 46 | 47 | it('should return an error Result if Readability returns null', () => { 48 | // Setup: Make Readability.parse() return null 49 | vi.mocked(Readability).mockImplementation(() => { 50 | return { 51 | parse: () => null, 52 | } as unknown as Readability; 53 | }); 54 | 55 | // Execute 56 | const result = parseArticle({ html: 'Test' }); 57 | 58 | // Verify 59 | expect(result.isErr()).toBe(true); 60 | if (result.isErr()) { 61 | expect(result.error.type).toBe('NO_ARTICLE_FOUND'); 62 | } 63 | }); 64 | 65 | it('should return an error Result if Readability result is missing title', () => { 66 | // Setup: Make Readability.parse() return an object without a title 67 | vi.mocked(Readability).mockImplementation(() => { 68 | return { 69 | parse: () => ({ 70 | title: '', // empty title 71 | textContent: 'Some content', 72 | }), 73 | } as unknown as Readability; 74 | }); 75 | 76 | // Execute 77 | const result = parseArticle({ html: 'Test' }); 78 | 79 | // Verify 80 | expect(result.isErr()).toBe(true); 81 | if (result.isErr()) { 82 | expect(result.error.type).toBe('NO_ARTICLE_FOUND'); 83 | } 84 | }); 85 | 86 | it('should return an error Result if Readability result is missing textContent', () => { 87 | // Setup: Make Readability.parse() return an object without textContent 88 | vi.mocked(Readability).mockImplementation(() => { 89 | return { 90 | parse: () => ({ 91 | title: 'Article Title', 92 | textContent: '', // empty textContent 93 | }), 94 | } as unknown as Readability; 95 | }); 96 | 97 | // Execute 98 | const result = parseArticle({ html: 'Test' }); 99 | 100 | // Verify 101 | expect(result.isErr()).toBe(true); 102 | if (result.isErr()) { 103 | expect(result.error.type).toBe('NO_ARTICLE_FOUND'); 104 | } 105 | }); 106 | 107 | it('should return the extracted title, cleaned textContent, and publishedTime when successful', () => { 108 | // Setup: Make Readability.parse() return a valid article 109 | vi.mocked(Readability).mockImplementation(() => { 110 | return { 111 | parse: () => ({ 112 | title: 'Article Title', 113 | textContent: 'Article content here', 114 | publishedTime: '2025-03-18T18:04:44-04:00', 115 | }), 116 | } as unknown as Readability; 117 | }); 118 | 119 | // Execute 120 | const result = parseArticle({ html: 'Test' }); 121 | 122 | // Verify 123 | expect(result.isOk()).toBe(true); 124 | if (result.isOk()) { 125 | expect(result.value).toEqual({ 126 | title: 'Article Title', 127 | text: 'Article content here', 128 | publishedTime: '2025-03-18T18:04:44-04:00', 129 | }); 130 | } 131 | }); 132 | 133 | it('should clean and normalize whitespace in the extracted textContent', () => { 134 | // Setup: Make Readability.parse() return messy text content 135 | const messyText = ' Multiple spaces \n\n\n and \t\t tabs \n and extra newlines '; 136 | vi.mocked(Readability).mockImplementation(() => { 137 | 
return { 138 | parse: () => ({ 139 | title: 'Article Title', 140 | textContent: messyText, 141 | }), 142 | } as unknown as Readability; 143 | }); 144 | 145 | // Execute 146 | const result = parseArticle({ html: 'Test' }); 147 | 148 | // Verify 149 | expect(result.isOk()).toBe(true); 150 | if (result.isOk()) { 151 | // The text should be cleaned according to the cleanString function logic 152 | expect(result.value.text).toBe('Multiple spaces\nand tabs\nand extra newlines'); 153 | } 154 | }); 155 | }); 156 | -------------------------------------------------------------------------------- /apps/frontend/src/server/api/admin/sources/index.get.ts: -------------------------------------------------------------------------------- 1 | import { sql, $ingested_items, and, gte } from '@meridian/database'; 2 | import { getDB } from '~/server/lib/utils'; 3 | 4 | export default defineEventHandler(async event => { 5 | await requireUserSession(event); // require auth 6 | 7 | const db = getDB(event); 8 | const sources = await db.query.$data_sources.findMany(); 9 | if (sources.length === 0) { 10 | return { overview: null, sources: [] }; 11 | } 12 | 13 | // get article stats for last 7 days 14 | const sevenDaysAgo = new Date(Date.now() - 7 * 24 * 60 * 60 * 1000); 15 | const articleStats = await db.query.$ingested_items.findMany({ 16 | where: sql`ingested_at >= ${sevenDaysAgo.toISOString()}`, 17 | columns: { 18 | data_source_id: true, 19 | status: true, 20 | ingested_at: true, 21 | processed_at: true, 22 | }, 23 | }); 24 | 25 | // calculate per-source stats 26 | const sourceStats = sources.map(source => { 27 | const sourceArticles = articleStats.filter(a => a.data_source_id === source.id); 28 | const last24hArticles = sourceArticles.filter( 29 | a => a.ingested_at && new Date(a.ingested_at) > new Date(Date.now() - 24 * 60 * 60 * 1000) 30 | ); 31 | 32 | // calculate health metrics 33 | const totalArticles = sourceArticles.length; 34 | const processedArticles = sourceArticles.filter(a => a.status === 'PROCESSED'); 35 | const failedArticles = sourceArticles.filter(a => a.status?.startsWith('FAILED_')); // failure statuses are FAILED_* (see ingested_item_status enum) 36 | 37 | // calculate processing time for processed articles 38 | const processingTimes = processedArticles 39 | .map(a => 40 | a.processed_at && a.ingested_at ? new Date(a.processed_at).getTime() - new Date(a.ingested_at).getTime() : null 41 | ) 42 | .filter(time => time !== null); 43 | 44 | const avgProcessingTime = processingTimes.length 45 | ? Math.round(processingTimes.reduce((a, b) => a + b, 0) / processingTimes.length / 1000) // in seconds 46 | : null; 47 | 48 | return { 49 | id: source.id, 50 | name: source.name, 51 | url: source.config.config.url, 52 | paywall: source.config.config.rss_paywall, 53 | frequency: 54 | source.scrape_frequency_minutes <= 60 55 | ? 'Hourly' 56 | : source.scrape_frequency_minutes <= 120 57 | ? '4 Hours' 58 | : source.scrape_frequency_minutes <= 180 59 | ? '6 Hours' 60 | : 'Daily', 61 | lastChecked: source.lastChecked?.toISOString(), 62 | 63 | // article counts 64 | totalArticles: sourceArticles.length, 65 | avgPerDay: last24hArticles.length / 24, 66 | 67 | // health metrics 68 | processSuccessRate: totalArticles ? (processedArticles.length / totalArticles) * 100 : null, 69 | errorRate: totalArticles ?
(failedArticles.length / totalArticles) * 100 : null, 70 | avgProcessingTime, 71 | }; 72 | }); 73 | 74 | // get global stats 75 | const startOfToday = new Date(); 76 | startOfToday.setUTCHours(0, 0, 0, 0); 77 | 78 | const [lastSourceCheck, lastArticleProcessed, lastArticleFetched, todayStats, staleSources] = await Promise.all([ 79 | // get latest source check 80 | db.query.$data_sources.findFirst({ 81 | orderBy: sql`last_checked DESC NULLS LAST`, 82 | columns: { lastChecked: true }, 83 | }), 84 | // get latest processed article 85 | db.query.$ingested_items.findFirst({ 86 | where: sql`status = 'PROCESSED'`, 87 | orderBy: sql`processed_at DESC NULLS LAST`, 88 | columns: { processed_at: true }, 89 | }), 90 | // get latest fetched article 91 | db.query.$ingested_items.findFirst({ 92 | orderBy: sql`ingested_at DESC NULLS LAST`, 93 | columns: { ingested_at: true }, 94 | }), 95 | // get today's stats 96 | db.query.$ingested_items.findMany({ 97 | where: and(gte($ingested_items.ingested_at, startOfToday)), 98 | columns: { 99 | status: true, 100 | ingested_at: true, 101 | processed_at: true, 102 | }, 103 | }), 104 | // get stale sources count 105 | db.query.$data_sources.findMany({ 106 | where: sql`( 107 | (scrape_frequency_minutes <= 60 AND last_checked < NOW() - INTERVAL '2 hours') OR 108 | (scrape_frequency_minutes <= 120 AND last_checked < NOW() - INTERVAL '8 hours') OR 109 | (scrape_frequency_minutes <= 180 AND last_checked < NOW() - INTERVAL '12 hours') OR 110 | (scrape_frequency_minutes <= 240 AND last_checked < NOW() - INTERVAL '48 hours') 111 | )`, 112 | columns: { id: true }, 113 | }), 114 | ]); 115 | 116 | const overview = { 117 | lastSourceCheck: lastSourceCheck?.lastChecked?.toISOString() ?? null, 118 | lastArticleProcessed: lastArticleProcessed?.processed_at?.toISOString() ?? null, 119 | lastArticleFetched: lastArticleFetched?.ingested_at?.toISOString() ?? 
null, 120 | articlesProcessedToday: todayStats.filter(a => a.status === 'PROCESSED').length, 121 | articlesFetchedToday: todayStats.length, 122 | errorsToday: todayStats.filter(a => a.status?.startsWith('FAILED_')).length, 123 | staleSourcesCount: staleSources.length, 124 | totalSourcesCount: sources.length, 125 | }; 126 | 127 | return { 128 | overview, 129 | sources: sourceStats, 130 | }; 131 | }); 132 | -------------------------------------------------------------------------------- /apps/backend/src/lib/parsers.ts: -------------------------------------------------------------------------------- 1 | import { Readability } from '@mozilla/readability'; 2 | import { XMLParser } from 'fast-xml-parser'; 3 | import { parseHTML } from 'linkedom'; 4 | import { Result, err, ok } from 'neverthrow'; 5 | import { z } from 'zod'; 6 | 7 | const rssFeedSchema = z.object({ 8 | title: z.string().min(1), 9 | link: z.string(), 10 | pubDate: z.date().nullable(), 11 | }); 12 | 13 | function cleanString(text: string) { 14 | return text 15 | .replace(/[ \t]+/g, ' ') // collapse spaces/tabs 16 | .replace(/\n\s+/g, '\n') // clean spaces after newlines 17 | .replace(/\s+\n/g, '\n') // clean spaces before newlines 18 | .replace(/\n{3,}/g, '\n\n') // keep max 2 consecutive newlines 19 | .trim(); // clean edges 20 | } 21 | 22 | function cleanUrl(url: string) { 23 | const u = new URL(url); 24 | 25 | const paramsToRemove = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content', 'fbclid', 'gclid']; 26 | for (const param of paramsToRemove) { 27 | u.searchParams.delete(param); 28 | } 29 | 30 | return u.toString(); 31 | } 32 | 33 | /** 34 | * Parses an RSS/XML feed content to extract article information 35 | * 36 | * Handles various RSS feed formats and structures while normalizing the output. 37 | * Extracts titles, links, and publication dates from the feed items. 38 | * 39 | * @param xml The XML content of the RSS feed as a string 40 | * @returns A Result containing either an array of parsed feed items or an error 41 | */ 42 | export async function parseRSSFeed(xml: string): Promise<Result<z.infer<typeof rssFeedSchema>[], Error>> { 43 | const safeParser = Result.fromThrowable( 44 | (xml: string) => new XMLParser({ ignoreAttributes: false, attributeNamePrefix: '@_' }).parse(xml), 45 | e => (e instanceof Error ? e : new Error(String(e))) 46 | ); 47 | 48 | const parsedXml = safeParser(xml); 49 | if (parsedXml.isErr()) { 50 | return err(new Error(`Parse error: ${parsedXml.error.message}`)); 51 | } 52 | 53 | const result = parsedXml.value; 54 | 55 | // handle various feed structures 56 | let items = result.rss?.channel?.item || result.feed?.entry || result.item || result['rdf:RDF']?.item || []; 57 | 58 | // handle single item case 59 | items = Array.isArray(items) ?
items : [items]; 60 | 61 | // biome-ignore lint/suspicious/noExplicitAny: 62 | const properItems = items.map((item: any) => { 63 | let title = ''; 64 | let link = ''; 65 | let id = ''; 66 | let pubDateString: string | null = null; 67 | 68 | if (typeof item.title === 'string') { 69 | title = item.title; 70 | } else if (typeof item.title === 'object' && item.title['#text']) { 71 | title = item.title['#text']; 72 | } else { 73 | title = 'UNKNOWN'; 74 | } 75 | 76 | if (typeof item.link === 'string') { 77 | link = item.link; 78 | } else if (typeof item.link === 'object' && item.link['@_href']) { 79 | link = item.link['@_href']; 80 | } else if (typeof item.guid === 'string') { 81 | link = item.guid; 82 | } else { 83 | link = 'UNKNOWN'; 84 | } 85 | 86 | if (typeof item.guid === 'string') { 87 | id = item.guid; 88 | } else if (typeof item.guid === 'object' && item.guid['#text']) { 89 | id = item.guid['#text']; 90 | } else { 91 | id = 'UNKNOWN'; 92 | } 93 | 94 | if (typeof item.pubDate === 'string') { 95 | pubDateString = item.pubDate; 96 | } else if (typeof item.published === 'string') { 97 | pubDateString = item.published; 98 | } else if (typeof item.updated === 'string') { 99 | pubDateString = item.updated; 100 | } 101 | 102 | let pubDate: Date | null = null; 103 | if (pubDateString) { 104 | pubDate = new Date(pubDateString); 105 | if (Number.isNaN(pubDate.getTime())) { 106 | pubDate = null; 107 | } 108 | } 109 | 110 | return { 111 | title: cleanString(title), 112 | link: cleanUrl(cleanString(link)), 113 | id: cleanString(id), 114 | pubDate, 115 | }; 116 | }); 117 | 118 | // standardize the items 119 | const parsedItems = z.array(rssFeedSchema).safeParse(properItems); 120 | if (parsedItems.success === false) { 121 | return err(new Error(`Validation error: ${parsedItems.error.message}`)); 122 | } 123 | 124 | return ok(parsedItems.data); 125 | } 126 | 127 | /** 128 | * Parses HTML content to extract article text and metadata 129 | * 130 | * Uses Mozilla Readability to identify and extract the main content 131 | * from an HTML document, ignoring navigation, ads, and other non-content elements. 132 | * 133 | * @param opts Object containing the HTML content to parse 134 | * @returns A Result containing either the parsed article data or an error object 135 | */ 136 | export function parseArticle(opts: { html: string }) { 137 | const safeReadability = Result.fromThrowable( 138 | (html: string) => new Readability(parseHTML(html).document).parse(), 139 | e => (e instanceof Error ? 
e : new Error(String(e))) 140 | ); 141 | 142 | const articleResult = safeReadability(opts.html); 143 | if (articleResult.isErr()) { 144 | return err({ type: 'READABILITY_ERROR', error: articleResult.error }); 145 | } 146 | 147 | // if we can't parse the article or there is no article, not much we can do 148 | const article = articleResult.value; 149 | if (article === null || !article.title || !article.textContent) { 150 | return err({ type: 'NO_ARTICLE_FOUND', error: new Error('No article found') }); 151 | } 152 | 153 | return ok({ 154 | title: article.title, 155 | text: cleanString(article.textContent), 156 | publishedTime: article.publishedTime || undefined, 157 | }); 158 | } 159 | -------------------------------------------------------------------------------- /apps/backend/src/lib/rateLimiter.ts: -------------------------------------------------------------------------------- 1 | import type { WorkflowStep } from 'cloudflare:workers'; 2 | import { Logger } from './logger'; 3 | 4 | /** 5 | * Configuration options for the rate limiter 6 | */ 7 | type RateLimiterOptions = { 8 | maxConcurrent: number; 9 | globalCooldownMs: number; 10 | domainCooldownMs: number; 11 | }; 12 | 13 | /** 14 | * Represents a batch item with an ID and URL 15 | */ 16 | type BatchItem<IdType> = { 17 | id: IdType; 18 | url: string; 19 | }; 20 | 21 | /** 22 | * Rate limiter that respects per-domain cooldowns to prevent overloading specific domains 23 | * when making HTTP requests. Handles batching and throttling of requests. 24 | * 25 | * @template T Type of the batch items, must extend BatchItem 26 | * @template I Type of the ID field, defaults to number | string 27 | */ 28 | export class DomainRateLimiter<T extends BatchItem<I>, I = number | string> { 29 | private lastDomainAccess = new Map<string, number>(); 30 | private options: RateLimiterOptions; 31 | private logger: Logger; 32 | 33 | /** 34 | * Creates a new DomainRateLimiter instance 35 | * 36 | * @param options Configuration options for throttling 37 | */ 38 | constructor(options: RateLimiterOptions) { 39 | this.options = options; 40 | this.logger = new Logger({ service: 'DomainRateLimiter' }); 41 | } 42 | 43 | /** 44 | * Processes a batch of items with domain-aware rate limiting 45 | * 46 | * @param items Array of items to process 47 | * @param step Workflow step instance for handling sleeps/delays 48 | * @param processItem Function that processes a single item and returns a result 49 | * @returns Promise resolving to an array of results in the same order as input items 50 | * 51 | * @template R The return type of the processItem function 52 | */ 53 | async processBatch<R>( 54 | items: T[], 55 | step: WorkflowStep, 56 | processItem: (item: T, domain: string) => Promise<R> 57 | ): Promise<R[]> { 58 | const batchLogger = this.logger.child({ batch_size: items.length }); 59 | batchLogger.info('Starting batch processing'); 60 | 61 | const results: R[] = []; 62 | const remainingItems = [...items]; 63 | 64 | while (remainingItems.length > 0) { 65 | const currentBatch: T[] = []; 66 | const currentTime = Date.now(); 67 | 68 | // Select items for current batch based on domain cooldown 69 | for (const item of [...remainingItems]) { 70 | if (currentBatch.length >= this.options.maxConcurrent) break; 71 | 72 | try { 73 | const domain = new URL(item.url).hostname; 74 | const lastAccess = this.lastDomainAccess.get(domain) || 0; 75 | 76 | if (currentTime - lastAccess >= this.options.domainCooldownMs) { 77 | currentBatch.push(item); 78 | // Remove from remaining items 79 | const idx = remainingItems.findIndex(i => i.id ===
item.id); 80 | if (idx >= 0) remainingItems.splice(idx, 1); 81 | } 82 | } catch (e) { 83 | // Skip invalid URLs 84 | const idx = remainingItems.findIndex(i => i.id === item.id); 85 | if (idx >= 0) remainingItems.splice(idx, 1); 86 | } 87 | } 88 | 89 | if (currentBatch.length === 0) { 90 | // Nothing ready yet, wait for next domain to be ready 91 | const nextReady = Math.min( 92 | ...remainingItems 93 | .map(item => { 94 | try { 95 | const domain = new URL(item.url).hostname; 96 | const lastAccess = this.lastDomainAccess.get(domain) || 0; 97 | return this.options.domainCooldownMs - (currentTime - lastAccess); 98 | } catch { 99 | return Number.POSITIVE_INFINITY; // Skip invalid URLs 100 | } 101 | }) 102 | .filter(time => time > 0) // Only consider positive wait times 103 | ); 104 | batchLogger.debug('Waiting for domain cooldown', { wait_time_ms: Math.max(500, nextReady) }); 105 | await step.sleep(`waiting for domain cooldown (${Math.round(nextReady / 1000)}s)`, Math.max(500, nextReady)); 106 | continue; 107 | } 108 | 109 | batchLogger.debug('Processing batch', { batch_size: currentBatch.length, remaining: remainingItems.length }); 110 | 111 | // Process current batch in parallel 112 | const batchResults = await Promise.allSettled( 113 | currentBatch.map(async item => { 114 | try { 115 | const domain = new URL(item.url).hostname; 116 | this.lastDomainAccess.set(domain, Date.now()); 117 | return await processItem(item, domain); 118 | } catch (error) { 119 | const itemLogger = batchLogger.child({ item_id: item.id }); 120 | itemLogger.error( 121 | 'Error processing item', 122 | undefined, 123 | error instanceof Error ? error : new Error(String(error)) 124 | ); 125 | throw error; 126 | } 127 | }) 128 | ); 129 | 130 | // Add results 131 | for (const result of batchResults) { 132 | if (result.status === 'fulfilled') { 133 | results.push(result.value); 134 | } 135 | } 136 | 137 | // Apply global cooldown between batches if we have more items to process 138 | if (remainingItems.length > 0) { 139 | batchLogger.debug('Applying global rate limit', { cooldown_ms: this.options.globalCooldownMs }); 140 | await step.sleep( 141 | `global rate limit (${Math.round(this.options.globalCooldownMs / 1000)}s)`, 142 | this.options.globalCooldownMs 143 | ); 144 | } 145 | } 146 | 147 | batchLogger.info('Batch processing complete', { processed_count: results.length }); 148 | return results; 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /apps/briefs/src/llm.py: -------------------------------------------------------------------------------- 1 | import os 2 | from openai import OpenAI 3 | from dotenv import load_dotenv 4 | import json 5 | import re 6 | from typing import Dict, Optional, Any, Tuple, List 7 | 8 | load_dotenv() 9 | 10 | client = OpenAI( 11 | # This is the default and can be omitted 12 | api_key=os.environ.get("GOOGLE_API_KEY"), 13 | base_url="https://generativelanguage.googleapis.com/v1beta/openai/", 14 | ) 15 | 16 | # ---- NEW: Global list to store LLM calls ---- 17 | LLM_CALL_LOG = [] 18 | # -------------------------------------------- 19 | 20 | 21 | def call_llm( 22 | model: str, messages: list[dict], temperature: float = 0 23 | ) -> Tuple[Optional[str], Optional[Tuple[int, int]]]: 24 | """ 25 | Calls the LLM API, logs the interaction, and returns content and usage. 26 | Returns (None, None) on API error. 
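    On success, usage is the (prompt_tokens, completion_tokens) tuple reported by the API.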
27 | """ 28 | try: 29 | response = client.chat.completions.create( 30 | model=model, 31 | messages=messages, 32 | n=1, 33 | temperature=temperature, 34 | ) 35 | 36 | output_content = response.choices[0].message.content 37 | usage_stats = ( 38 | response.usage.prompt_tokens, 39 | response.usage.completion_tokens, 40 | ) 41 | 42 | # ---- NEW: Log the call details ---- 43 | log_entry = { 44 | "model": model, 45 | "messages": messages, 46 | "temperature": temperature, 47 | "output": output_content, 48 | "usage": { 49 | "prompt_tokens": usage_stats[0], 50 | "completion_tokens": usage_stats[1], 51 | }, 52 | } 53 | LLM_CALL_LOG.append(log_entry) 54 | # ----------------------------------- 55 | 56 | return output_content, usage_stats 57 | 58 | except Exception as e: 59 | print(f"ERROR: LLM API call failed for model {model}: {e}") 60 | # Log the error attempt? Maybe not for the clean log requested. 61 | # You could add a separate error log if needed. 62 | # ---- NEW: Log the failed attempt ---- 63 | # log_entry = { 64 | # "model": model, 65 | # "messages": messages, 66 | # "temperature": temperature, 67 | # "output": None, 68 | # "error": str(e), 69 | # "usage": None 70 | # } 71 | # LLM_CALL_LOG.append(log_entry) 72 | # ------------------------------------ 73 | return None, None # Indicate failure 74 | 75 | 76 | # ---- NEW: Function to save the log ---- 77 | def save_llm_log_to_json(filename: str = "llm_calls_log.json"): 78 | """Saves the accumulated LLM call log to a JSON file.""" 79 | print(f"Attempting to save {len(LLM_CALL_LOG)} LLM calls to {filename}...") 80 | try: 81 | with open(filename, "w", encoding="utf-8") as f: 82 | json.dump(LLM_CALL_LOG, f, indent=2, ensure_ascii=False) 83 | print(f"LLM call log successfully saved to {filename}") 84 | except Exception as e: 85 | print(f"ERROR: Failed to save LLM call log to {filename}: {e}") 86 | 87 | 88 | # Option 1: Automatically save on exit (use with caution in notebooks) 89 | # atexit.register(save_llm_log_to_json, filename="llm_calls_log_autosave.json") 90 | # --------------------------------------- 91 | 92 | 93 | def parse_llm_json_output( 94 | llm_output: Optional[str], 95 | expected_schema: Optional[ 96 | Dict 97 | ] = None, # Currently unused, but placeholder for future validation 98 | ) -> Optional[Dict]: 99 | """ 100 | Safely parses JSON from LLM output, handling markdown code fences. 101 | 102 | Args: 103 | llm_output: The raw string output from the LLM. 104 | expected_schema: Optional schema for basic validation (not implemented yet). 105 | 106 | Returns: 107 | The parsed dictionary or None if parsing fails or input is None. 108 | """ 109 | if llm_output is None: 110 | # print("Debug: parse_llm_json_output received None input.") # Optional debug print 111 | return None 112 | 113 | # Regex to find JSON within ```json ... ``` blocks 114 | match = re.search(r"```json\s*(\{.*?\})\s*```", llm_output, re.DOTALL) 115 | 116 | json_string = None 117 | if match: 118 | json_string = match.group(1).strip() 119 | # print("Debug: Found JSON within ```json fences.") # Optional debug print 120 | else: 121 | # print("Debug: No ```json fences found. 
Checking if entire string is JSON.") # Optional debug print 122 | # Fallback: Check if the entire string is valid JSON (maybe without fences) 123 | # Be cautious with this fallback as LLMs often add extra text 124 | temp_string = llm_output.strip() 125 | if temp_string.startswith("{") and temp_string.endswith("}"): 126 | # print("Debug: Entire string looks like JSON.") # Optional debug print 127 | json_string = temp_string 128 | # else: # Removed risky loose brace finding 129 | 130 | if json_string: 131 | try: 132 | parsed_json = json.loads(json_string) 133 | if isinstance(parsed_json, dict): 134 | # Optional: Add basic schema validation here if needed 135 | # if expected_schema and not all(key in parsed_json for key in expected_schema): 136 | # print("Warning: Parsed JSON missing expected keys.") 137 | # return None # Or handle differently 138 | # print("Debug: Successfully parsed JSON string into dict.") # Optional debug print 139 | return parsed_json 140 | else: 141 | print(f"Warning: Parsed JSON is not a dictionary: {type(parsed_json)}") 142 | return None 143 | except json.JSONDecodeError as e: 144 | print( 145 | f"ERROR: Failed to decode JSON: {e}\nInput string (first 500 chars): {json_string[:500]}..." 146 | ) 147 | return None 148 | except Exception as e: 149 | print(f"ERROR: Unexpected error parsing JSON: {e}") 150 | return None 151 | else: 152 | # print(f"Warning: No valid JSON structure found in LLM output (first 500 chars): {llm_output[:500]}...") # Keep this warning 153 | return None 154 | -------------------------------------------------------------------------------- /apps/frontend/src/composables/useTableOfContents.ts: -------------------------------------------------------------------------------- 1 | export interface TocItem { 2 | id: string; 3 | text: string; 4 | level: number; 5 | } 6 | 7 | export interface UseTableOfContentsOptions { 8 | contentRef: Ref<HTMLElement | null>; 9 | headerOffset?: number; 10 | selectors?: string; // e.g., 'h2, h3, u > strong' 11 | } 12 | 13 | const DEFAULT_HEADER_OFFSET = 80; 14 | const DEFAULT_SELECTORS = 'h2, h3, u > strong'; 15 | 16 | // Simple slugify, might need refinement depending on edge cases 17 | const generateSlug = (text: string): string => { 18 | return text 19 | .toLowerCase() 20 | .replace(/[^a-z0-9\s-]/g, '') // Remove special chars except space/hyphen 21 | .trim() 22 | .replace(/\s+/g, '-') // Replace spaces with hyphens 23 | .slice(0, 50); // Limit length 24 | }; 25 | 26 | export function useTableOfContents({ 27 | contentRef, 28 | headerOffset = DEFAULT_HEADER_OFFSET, 29 | selectors = DEFAULT_SELECTORS, 30 | }: UseTableOfContentsOptions) { 31 | const tocItems = ref<TocItem[]>([]); 32 | const activeHeadingId = ref<string | null>(null); 33 | const mobileMenuOpen = ref(false); // Keep mobile state here if tied to TOC display 34 | 35 | let observer: IntersectionObserver | null = null; 36 | 37 | const generateToc = () => { 38 | if (!contentRef.value) return; 39 | 40 | const elements = contentRef.value.querySelectorAll(selectors); 41 | const newTocItems: TocItem[] = []; 42 | const observedElements: Element[] = []; // Keep track of elements to observe 43 | 44 | elements.forEach((el, index) => { 45 | let level: number; 46 | const text = el.textContent?.trim() || ''; 47 | let targetElement: HTMLElement = el as HTMLElement; 48 | 49 | if (el.tagName === 'H2') level = 2; 50 | else if (el.tagName === 'H3') level = 3; 51 | else if (el.tagName === 'STRONG' && el.parentElement?.tagName === 'U') { 52 | level = 5; // Special level for topics 53 | targetElement = el.parentElement; //
Target the <u> tag 54 | } else { 55 | return; // Skip unrecognized elements 56 | } 57 | 58 | // Ensure unique ID even if slug is identical 59 | const id = `${level === 5 ? 'topic' : 'section'}-${index}-${generateSlug(text)}`; 60 | 61 | if (text && targetElement) { 62 | targetElement.id = id; // Assign ID 63 | newTocItems.push({ id, text, level }); 64 | observedElements.push(targetElement); // Add element for intersection observer 65 | } 66 | }); 67 | 68 | tocItems.value = newTocItems; 69 | setupIntersectionObserver(observedElements); // Setup observer after generating TOC 70 | }; 71 | 72 | const setupIntersectionObserver = (elements: Element[]) => { 73 | // Disconnect previous observer if exists 74 | if (observer) { 75 | observer.disconnect(); 76 | } 77 | 78 | // Observer options: trigger when heading is near the top of the viewport 79 | const options = { 80 | rootMargin: `-${headerOffset - 1}px 0px -${window.innerHeight - headerOffset - 50}px 0px`, // Adjust bottom margin as needed 81 | threshold: 0, // Trigger as soon as any part enters/leaves the rootMargin 82 | }; 83 | 84 | observer = new IntersectionObserver(entries => { 85 | // Find the topmost visible entry 86 | let topmostVisibleEntry: IntersectionObserverEntry | null = null; 87 | entries.forEach(entry => { 88 | if (entry.isIntersecting) { 89 | // Prioritize the entry closest to the top boundary defined by rootMargin 90 | if (!topmostVisibleEntry || entry.boundingClientRect.top < topmostVisibleEntry.boundingClientRect.top) { 91 | topmostVisibleEntry = entry; 92 | } 93 | } 94 | }); 95 | 96 | if (topmostVisibleEntry) { 97 | activeHeadingId.value = (topmostVisibleEntry as IntersectionObserverEntry).target.id; 98 | } else { 99 | // If no entry is intersecting within the top margin, check if we scrolled past the first item 100 | if (tocItems.value.length > 0 && window.scrollY > document.getElementById(tocItems.value[0].id)!.offsetTop) { 101 | // Potentially keep the last active ID, or find the last item scrolled past 102 | // For simplicity, let's just keep the *last* one that *was* active if nothing is currently in the top zone 103 | // activeHeadingId.value remains unchanged unless explicitly cleared or updated 104 | } else { 105 | // Scrolled to the very top above the first item 106 | activeHeadingId.value = null; 107 | } 108 | } 109 | }, options); 110 | 111 | elements.forEach(el => observer!.observe(el)); 112 | }; 113 | 114 | // Computed property for the "current section name" shown in mobile/dropdown 115 | const currentSectionName = computed(() => { 116 | if (!activeHeadingId.value) { 117 | return 'on this page'; // Default text 118 | } 119 | const activeItem = tocItems.value.find(item => item.id === activeHeadingId.value); 120 | // Maybe find the parent H2 if the active item is H3/topic? Depends on desired UX. 121 | // For now, just use the active item's text. 122 | return activeItem ?
activeItem.text.toLowerCase() : 'on this page'; 123 | }); 124 | 125 | const scrollToSection = (id: string) => { 126 | const el = document.getElementById(id); 127 | if (el) { 128 | const elementPosition = el.getBoundingClientRect().top; 129 | const offsetPosition = elementPosition + window.pageYOffset - headerOffset; 130 | window.scrollTo({ top: offsetPosition, behavior: 'smooth' }); 131 | mobileMenuOpen.value = false; // Close mobile menu on selection 132 | } 133 | }; 134 | 135 | onMounted(() => { 136 | // Ensure DOM is ready before querying elements 137 | nextTick(() => { 138 | generateToc(); 139 | }); 140 | }); 141 | 142 | onUnmounted(() => { 143 | if (observer) { 144 | observer.disconnect(); 145 | } 146 | }); 147 | 148 | // Optional: Watch for content changes if the article content could be dynamic 149 | // watch(contentRef, () => { nextTick(generateToc); }); 150 | 151 | return { 152 | tocItems, 153 | activeHeadingId, 154 | currentSectionName, 155 | mobileMenuOpen, 156 | generateToc, // Expose if manual regeneration is needed 157 | scrollToSection, 158 | }; 159 | } 160 | -------------------------------------------------------------------------------- /apps/backend/src/lib/articleFetchers.ts: -------------------------------------------------------------------------------- 1 | import { err, ok } from 'neverthrow'; 2 | import { z } from 'zod'; 3 | import type { Env } from '../index'; 4 | import { parseArticle } from './parsers'; 5 | import { tryCatchAsync } from './tryCatchAsync'; 6 | import { userAgents } from './utils'; 7 | 8 | /** 9 | * Schema for validating responses from the Cloudflare Browser Rendering API 10 | */ 11 | export const articleSchema = z.object({ 12 | status: z.coerce.boolean(), 13 | errors: z.array(z.object({ code: z.number(), message: z.string() })).optional(), 14 | result: z.string(), 15 | }); 16 | 17 | /** 18 | * Fetches an article using Cloudflare's Browser Rendering API 19 | * 20 | * This method simulates a real browser to handle modern websites with complex 21 | * JavaScript, cookie consent walls, paywalls, and other obstacles that might 22 | * prevent content scraping with a regular HTTP client. 
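 * It also injects the cleanup scripts defined below (cookie-consent auto-accept, paywall
 * removal, DOM stripping) into the page before the rendered HTML is returned.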
23 | * 24 | * @param env Application environment with Cloudflare credentials 25 | * @param url URL of the article to fetch 26 | * @returns Result containing either the parsed article content or an error object 27 | */ 28 | export async function getArticleWithBrowser(env: Env, url: string) { 29 | const response = await tryCatchAsync( 30 | fetch(`https://api.cloudflare.com/client/v4/accounts/${env.CLOUDFLARE_ACCOUNT_ID}/browser-rendering/content`, { 31 | method: 'POST', 32 | headers: { 33 | 'Content-Type': 'application/json', 34 | Authorization: `Bearer ${env.CLOUDFLARE_API_TOKEN}`, 35 | }, 36 | body: JSON.stringify({ 37 | url, 38 | userAgent: userAgents[Math.floor(Math.random() * userAgents.length)], 39 | setExtraHTTPHeaders: { 40 | Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', 41 | 'Accept-Encoding': 'gzip, deflate, br', 42 | Connection: 'keep-alive', 43 | DNT: '1', 44 | 'Accept-Language': 'en-US,en;q=0.5', 45 | 'Sec-Fetch-Dest': 'document', 46 | 'Sec-Fetch-Mode': 'navigate', 47 | 'Sec-Fetch-Site': 'none', 48 | 'Sec-Fetch-User': '?1', 49 | 'Upgrade-Insecure-Requests': '1', 50 | }, 51 | cookies: [], 52 | gotoOptions: { 53 | waitUntil: 'networkidle0', 54 | timeout: 30000, 55 | referer: 'https://www.google.com/', 56 | }, 57 | viewport: { 58 | width: 390, 59 | height: 844, 60 | deviceScaleFactor: 3, 61 | isMobile: true, 62 | hasTouch: true, 63 | isLandscape: false, 64 | }, 65 | rejectResourceTypes: ['image', 'media', 'font', 'websocket'], 66 | bestAttempt: true, 67 | // all of these are very brittle, like all script tag usage 68 | // this mostly works for now but good to revisit every once in a while 69 | addScriptTag: [ 70 | // Ensures consistent date formatting by overriding Intl.DateTimeFormat 71 | // to always use 'en-US' locale regardless of browser settings 72 | // This prevents inconsistent date parsing across different environments 73 | { 74 | content: 75 | "(() => { Object.defineProperty(Intl, 'DateTimeFormat', { \n writable: true, \n value: new Proxy(Intl.DateTimeFormat, { \n construct: (target, args) => new target('en-US', Object.assign({}, args[1])) \n })\n }); })();", 76 | }, 77 | // Automatically accepts cookie consent popups by finding buttons that contain 78 | // 'accept' and 'cookie'/'consent' text, then programmatically clicking the first match 79 | // This bypasses cookie walls that would otherwise block content access 80 | { 81 | content: 82 | "(() => { const cookieButtons = Array.from(document.querySelectorAll(\'button, a\')).filter(el => el.textContent.toLowerCase().includes(\'accept\') && (el.textContent.toLowerCase().includes(\'cookie\') || el.textContent.toLowerCase().includes(\'consent\'))); if(cookieButtons.length > 0) { cookieButtons[0].click(); } })();", 83 | }, 84 | // Circumvents paywalls by: 85 | // 1. Removing elements with paywall/subscribe identifiers in id/class 86 | // 2. Removing modal overlays and fixed position barriers 87 | // 3. 
Restoring normal page scroll behavior 88 | // This targets common paywall implementations across various sites 89 | { 90 | content: 91 | "(() => { const paywallElements = Array.from(document.querySelectorAll(\'div, section\')).filter(el => el.id.toLowerCase().includes(\'paywall\') || el.className.toLowerCase().includes(\'paywall\') || el.id.toLowerCase().includes(\'subscribe\') || el.className.toLowerCase().includes(\'subscribe\')); paywallElements.forEach(el => el.remove()); document.querySelectorAll(\'.modal, .modal-backdrop, body > div[style*=\"position: fixed\"]\').forEach(el => el.remove()); document.body.style.overflow = \'auto\'; })();", 92 | }, 93 | // Cleans up the DOM by removing non-content elements that interfere with article parsing: 94 | // - Scripts, styles, iframes that might contain tracking or ads 95 | // - Ad containers and advertisement blocks 96 | // - Social media widgets and sharing buttons 97 | // - Comments sections, navbars, headers, footers (except those within articles) 98 | // - Various UI elements not relevant to the core article content 99 | { 100 | content: 101 | '(() => { document.querySelectorAll(\'script, style, iframe, .ad, .ads, .advertisement, [class*="social"], [id*="social"], .share, .comments, aside, nav, header:not(article header), footer:not(article footer), [role="complementary"], [role="banner"], [role="navigation"], form, .related, .recommended, .newsletter, .subscription\').forEach(el => el.remove()); })();', 102 | }, 103 | // Simplifies the DOM by stripping all HTML attributes except essential ones: 104 | // - href: preserves links 105 | // - src: maintains images and embedded content 106 | // - alt: keeps accessibility text for images 107 | // - title: retains tooltip text 108 | // This reduces noise and potential tracking parameters in the parsed content 109 | { 110 | content: 111 | "(() => { const keepAttributes = [\'href\', \'src\', \'alt\', \'title\']; document.querySelectorAll(\'*\').forEach(el => { [...el.attributes].forEach(attr => { if (!keepAttributes.includes(attr.name.toLowerCase())) { el.removeAttribute(attr.name); }}); }); })();", 112 | }, 113 | // Recursively removes empty elements to clean up the DOM structure 114 | // Continues removing elements until no more empty ones are found 115 | // This eliminates spacing artifacts and layout containers that serve no content purpose 116 | { 117 | content: 118 | "(() => { function removeEmpty() { let removed = 0; document.querySelectorAll(\'div, span, p, section, article\').forEach(el => { if (!el.hasChildNodes() || el.textContent.trim() === \'\') { el.remove(); removed++; } }); return removed; } let pass; do { pass = removeEmpty(); } while(pass > 0); })();", 119 | }, 120 | // Removes simple meta tags that provide minimal information value 121 | // Meta tags with only one attribute are typically not useful for content analysis 122 | // This helps reduce noise in the document head 123 | { 124 | content: 125 | "(() => { document.querySelectorAll(\'meta\').forEach(meta => { if (meta.attributes.length <= 1) { meta.remove(); } }); })();", 126 | }, 127 | ], 128 | waitForSelector: { 129 | selector: 'article, .article, .content, .post, #article, main', 130 | timeout: 5000, 131 | }, 132 | }), 133 | }) 134 | ); 135 | if (response.isErr()) { 136 | return err({ type: 'FETCH_ERROR', error: response.error }); 137 | } 138 | 139 | const parsedPageContent = articleSchema.safeParse(await response.value.json()); 140 | if (parsedPageContent.success === false) { 141 | return err({ type: 'VALIDATION_ERROR', 
error: parsedPageContent.error }); 142 | } 143 | 144 | const articleResult = parseArticle({ html: parsedPageContent.data.result }); 145 | if (articleResult.isErr()) { 146 | return err({ type: 'PARSE_ERROR', error: articleResult.error }); 147 | } 148 | 149 | return ok(articleResult.value); 150 | } 151 | 152 | /** 153 | * Fetches an article using a simple HTTP request 154 | * 155 | * This is a lighter-weight alternative to browser rendering that works for 156 | * simpler websites that don't rely heavily on client-side JavaScript for content. 157 | * 158 | * @param url URL of the article to fetch 159 | * @returns Result containing either the parsed article content or an error object 160 | */ 161 | export async function getArticleWithFetch(url: string) { 162 | const response = await tryCatchAsync( 163 | fetch(url, { 164 | method: 'GET', 165 | headers: { 166 | 'User-Agent': userAgents[Math.floor(Math.random() * userAgents.length)], 167 | Referer: 'https://www.google.com/', 168 | }, 169 | }) 170 | ); 171 | if (response.isErr()) { 172 | return err({ type: 'FETCH_ERROR', error: response.error }); 173 | } 174 | 175 | const articleResult = parseArticle({ html: await response.value.text() }); 176 | if (articleResult.isErr()) { 177 | return err({ type: 'PARSE_ERROR', error: articleResult.error }); 178 | } 179 | 180 | return ok(articleResult.value); 181 | } 182 | -------------------------------------------------------------------------------- /apps/backend/test/rateLimiter.spec.ts: -------------------------------------------------------------------------------- 1 | import type { WorkflowStep } from 'cloudflare:workers'; 2 | import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; 3 | import { DomainRateLimiter } from '../src/lib/rateLimiter'; 4 | 5 | type BatchItem = { 6 | id: number | string; 7 | url: string; 8 | }; 9 | 10 | describe('DomainRateLimiter', () => { 11 | // Mock 'step.sleep' to track calls and resolve immediately or after checking delays 12 | let mockSleep: ReturnType<typeof vi.fn>; 13 | let step: WorkflowStep; // Mocked step object 14 | let rateLimiter: DomainRateLimiter; 15 | let processItem: ReturnType<typeof vi.fn>; 16 | 17 | beforeEach(() => { 18 | mockSleep = vi.fn().mockImplementation((reason, ms) => { 19 | // Advance time when sleep is called 20 | vi.advanceTimersByTime(ms); 21 | return Promise.resolve(); 22 | }); 23 | step = { sleep: mockSleep } as unknown as WorkflowStep; 24 | // Setup rateLimiter with specific options for testing 25 | rateLimiter = new DomainRateLimiter({ maxConcurrent: 2, globalCooldownMs: 100, domainCooldownMs: 200 }); 26 | processItem = vi.fn().mockImplementation(async (item: BatchItem) => `processed-${item.id}`); 27 | 28 | // Mock Date.now() to control time 29 | vi.useFakeTimers(); 30 | vi.setSystemTime(new Date(2023, 1, 1, 0, 0, 0)); 31 | }); 32 | 33 | afterEach(() => { 34 | vi.useRealTimers(); 35 | }); 36 | 37 | it('should process all items concurrently if limits are not hit', async () => { 38 | const items = [ 39 | { id: 1, url: 'https://domain1.com/page1' }, 40 | { id: 2, url: 'https://domain2.com/page1' }, 41 | ]; 42 | 43 | const results = await rateLimiter.processBatch(items, step, processItem); 44 | 45 | expect(results).toHaveLength(2); 46 | expect(results).toEqual(['processed-1', 'processed-2']); 47 | expect(processItem).toHaveBeenCalledTimes(2); 48 | expect(mockSleep).not.toHaveBeenCalled(); 49 | }); 50 | 51 | it('should not process more than maxConcurrent items simultaneously', async () => { 52 | const items = [ 53 | { id: 1, url: 'https://domain1.com/page1' }, 54
| { id: 2, url: 'https://domain2.com/page1' }, 55 | { id: 3, url: 'https://domain3.com/page1' }, 56 | ]; 57 | 58 | await rateLimiter.processBatch(items, step, processItem); 59 | 60 | // Should process first two items concurrently (maxConcurrent: 2), then apply global cooldown 61 | expect(mockSleep).toHaveBeenCalledWith(expect.any(String), 100); 62 | expect(processItem).toHaveBeenCalledTimes(3); 63 | }); 64 | 65 | it('should call step.sleep for globalCooldownMs between batches if needed', async () => { 66 | const items = [ 67 | { id: 1, url: 'https://domain1.com/page1' }, 68 | { id: 2, url: 'https://domain2.com/page1' }, 69 | { id: 3, url: 'https://domain3.com/page1' }, 70 | { id: 4, url: 'https://domain4.com/page1' }, 71 | { id: 5, url: 'https://domain5.com/page1' }, 72 | ]; 73 | 74 | await rateLimiter.processBatch(items, step, processItem); 75 | 76 | // Should have 3 batches: 2 items, 2 items, 1 item 77 | // Sleep should be called twice for global cooldown between batches 78 | expect(mockSleep).toHaveBeenCalledTimes(2); 79 | expect(mockSleep).toHaveBeenCalledWith(expect.stringContaining('global rate limit'), 100); 80 | }); 81 | 82 | it('should call step.sleep for domainCooldownMs if processing the same domain twice quickly', async () => { 83 | const items = [ 84 | { id: 1, url: 'https://domain1.com/page1' }, 85 | { id: 2, url: 'https://domain1.com/page2' }, // Same domain 86 | ]; 87 | 88 | // Process first item 89 | await rateLimiter.processBatch([items[0]], step, processItem); 90 | 91 | // Reset mock to track calls separately 92 | mockSleep.mockClear(); 93 | processItem.mockClear(); 94 | 95 | // Advance time but not enough to clear domain cooldown 96 | vi.advanceTimersByTime(100); 97 | 98 | // Process second item 99 | await rateLimiter.processBatch([items[1]], step, processItem); 100 | 101 | // Should wait for domain cooldown 102 | expect(mockSleep).toHaveBeenCalledWith(expect.stringContaining('waiting for domain cooldown'), expect.any(Number)); 103 | }); 104 | 105 | it('should allow different domains to be processed concurrently without domain cooldown', async () => { 106 | const items = [ 107 | { id: 1, url: 'https://domain1.com/page1' }, 108 | { id: 2, url: 'https://domain2.com/page1' }, 109 | ]; 110 | 111 | await rateLimiter.processBatch(items, step, processItem); 112 | 113 | // Should process both concurrently without domain cooldown 114 | expect(processItem).toHaveBeenCalledTimes(2); 115 | expect(mockSleep).not.toHaveBeenCalled(); 116 | }); 117 | 118 | it('should skip items with invalid URLs without throwing an error', async () => { 119 | const items = [ 120 | { id: 1, url: 'https://domain1.com/page1' }, 121 | { id: 2, url: 'invalid-url' }, // Invalid URL 122 | ]; 123 | 124 | const results = await rateLimiter.processBatch(items, step, processItem); 125 | 126 | // Should only process valid URLs 127 | expect(results).toHaveLength(1); 128 | expect(results).toEqual(['processed-1']); 129 | expect(processItem).toHaveBeenCalledTimes(1); 130 | }); 131 | 132 | it('should call step.sleep with calculated wait time if all available items are domain-limited', async () => { 133 | // Process first item 134 | await rateLimiter.processBatch([{ id: 1, url: 'https://domain1.com/page1' }], step, processItem); 135 | 136 | // Reset mocks 137 | mockSleep.mockClear(); 138 | processItem.mockClear(); 139 | 140 | // Advance time to 100ms 141 | vi.advanceTimersByTime(100); 142 | 143 | // Try to process the same domain again (should need to wait 100ms more) 144 | await rateLimiter.processBatch([{ id: 2, url: 
'https://domain1.com/page2' }], step, processItem); 145 | 146 | // Should wait for remaining time on domain cooldown (200ms - 100ms = 100ms) 147 | expect(mockSleep).toHaveBeenCalledWith(expect.stringContaining('waiting for domain cooldown'), expect.any(Number)); 148 | 149 | // Should eventually process the item 150 | expect(processItem).toHaveBeenCalledTimes(1); 151 | }); 152 | 153 | it('should call the processItem function with the correct item and extracted domain', async () => { 154 | const item = { id: 1, url: 'https://example.com/page1' }; 155 | 156 | await rateLimiter.processBatch([item], step, processItem); 157 | 158 | expect(processItem).toHaveBeenCalledWith(item, 'example.com'); 159 | }); 160 | 161 | it('should return results for all successfully processed items', async () => { 162 | const items = [ 163 | { id: 1, url: 'https://domain1.com/page1' }, 164 | { id: 2, url: 'https://domain2.com/page1' }, 165 | ]; 166 | 167 | const results = await rateLimiter.processBatch(items, step, processItem); 168 | 169 | expect(results).toEqual(['processed-1', 'processed-2']); 170 | }); 171 | 172 | it('should handle errors during processItem gracefully and continue processing others', async () => { 173 | const items = [ 174 | { id: 1, url: 'https://domain1.com/page1' }, 175 | { id: 2, url: 'https://domain2.com/page1' }, 176 | ]; 177 | 178 | // Make the first item fail 179 | processItem.mockImplementation(async (item: BatchItem) => { 180 | if (item.id === 1) throw new Error('Processing failed'); 181 | return `processed-${item.id}`; 182 | }); 183 | 184 | const results = await rateLimiter.processBatch(items, step, processItem); 185 | 186 | // Should have only the successful result 187 | expect(results).toEqual(['processed-2']); 188 | expect(processItem).toHaveBeenCalledTimes(2); 189 | }); 190 | 191 | it('should update internal lastDomainAccess times correctly', async () => { 192 | const items = [ 193 | { id: 1, url: 'https://domain1.com/page1' }, 194 | { id: 2, url: 'https://domain1.com/page2' }, // Same domain 195 | ]; 196 | 197 | // Process first item 198 | await rateLimiter.processBatch([items[0]], step, processItem); 199 | 200 | // Advance time past domain cooldown 201 | vi.advanceTimersByTime(250); 202 | 203 | // Reset mock to track calls separately 204 | mockSleep.mockClear(); 205 | processItem.mockClear(); 206 | 207 | // Process second item 208 | await rateLimiter.processBatch([items[1]], step, processItem); 209 | 210 | // Should not wait for domain cooldown since time has advanced past cooldown period 211 | expect(mockSleep).not.toHaveBeenCalled(); 212 | expect(processItem).toHaveBeenCalledTimes(1); 213 | }); 214 | 215 | it('should only wait for cooldowns of domains with pending items', async () => { 216 | // First, process items from two different domains 217 | await rateLimiter.processBatch( 218 | [ 219 | { id: 1, url: 'https://domain1.com/page1' }, 220 | { id: 2, url: 'https://domain2.com/page1' }, 221 | ], 222 | step, 223 | processItem 224 | ); 225 | 226 | // Reset mocks 227 | mockSleep.mockClear(); 228 | processItem.mockClear(); 229 | 230 | // Advance time partially through cooldown period 231 | vi.advanceTimersByTime(50); 232 | 233 | // Set up domain1 with a much longer remaining cooldown (by manipulating lastDomainAccess) 234 | // @ts-expect-error accessing private property for testing 235 | rateLimiter.lastDomainAccess.set('domain1.com', Date.now()); 236 | 237 | // Now process only domain2 item 238 | await rateLimiter.processBatch([{ id: 3, url: 'https://domain2.com/page2' }], step, 
processItem); 239 | 240 | // Should wait on domain2's remaining cooldown (150ms), not domain1's longer one (200ms) 241 | expect(mockSleep).toHaveBeenCalledWith(expect.stringContaining('waiting for domain cooldown'), expect.any(Number)); 242 | 243 | // The enforced 500ms minimum wait dominates domain2's remaining 150ms cooldown 244 | const sleepTime = mockSleep.mock.calls[0][1]; 245 | expect(sleepTime).toBe(500); // minimum wait time enforced by the limiter 246 | 247 | // Should eventually process the item 248 | expect(processItem).toHaveBeenCalledTimes(1); 249 | }); 250 | }); 251 | -------------------------------------------------------------------------------- /apps/backend/src/routers/durableObjects.router.ts: -------------------------------------------------------------------------------- 1 | import { zValidator } from '@hono/zod-validator'; 2 | import { $data_sources, $ingested_items, eq, isNull } from '@meridian/database'; 3 | import { Hono } from 'hono'; 4 | import { z } from 'zod'; 5 | import type { HonoEnv } from '../app'; 6 | import { Logger } from '../lib/logger'; 7 | import { tryCatchAsync } from '../lib/tryCatchAsync'; 8 | import { getDb, hasValidAuthToken } from '../lib/utils'; 9 | 10 | const logger = new Logger({ router: 'durable-objects' }); 11 | 12 | const route = new Hono<HonoEnv>() 13 | // handle DO-specific routes 14 | .get( 15 | '/source/:sourceId/*', 16 | zValidator( 17 | 'param', 18 | z.object({ 19 | sourceId: z.string().min(1, 'Source ID is required'), 20 | }) 21 | ), 22 | async c => { 23 | const { sourceId } = c.req.valid('param'); 24 | const doId = c.env.DATA_SOURCE_INGESTOR.idFromName(decodeURIComponent(sourceId)); 25 | const stub = c.env.DATA_SOURCE_INGESTOR.get(doId); 26 | 27 | // reconstruct path for the DO 28 | const url = new URL(c.req.url); 29 | const pathParts = url.pathname.split('/'); 30 | const doPath = `/${pathParts.slice(4).join('/')}`; 31 | const doUrl = new URL(doPath + url.search, 'http://do'); 32 | 33 | const doRequest = new Request(doUrl.toString(), c.req.raw); 34 | return stub.fetch(doRequest); 35 | } 36 | ) 37 | // admin endpoints 38 | .post( 39 | '/admin/source/:sourceId/init', 40 | zValidator( 41 | 'param', 42 | z.object({ 43 | sourceId: z.string().min(1, 'Source ID is required'), 44 | }) 45 | ), 46 | async c => { 47 | // auth check 48 | if (!hasValidAuthToken(c)) { 49 | return c.json({ error: 'Unauthorized' }, 401); 50 | } 51 | 52 | const initLogger = logger.child({ operation: 'init-source' }); 53 | const { sourceId } = c.req.valid('param'); 54 | 55 | const db = getDb(c.env.HYPERDRIVE); 56 | 57 | // Get the source first 58 | const sourceResult = await tryCatchAsync( 59 | db.query.$data_sources.findFirst({ 60 | where: eq($data_sources.id, Number(sourceId)), 61 | }) 62 | ); 63 | 64 | if (sourceResult.isErr()) { 65 | const error = sourceResult.error instanceof Error ?
sourceResult.error : new Error(String(sourceResult.error)); 66 | initLogger.error('Failed to fetch source', { sourceId }, error); 67 | return c.json({ error: 'Failed to fetch source' }, 500); 68 | } 69 | 70 | const source = sourceResult.value; 71 | if (!source) { 72 | return c.json({ error: 'Source not found' }, 404); 73 | } 74 | 75 | // Initialize the DO 76 | const doId = c.env.DATA_SOURCE_INGESTOR.idFromName(source.config.config.url); 77 | const stub = c.env.DATA_SOURCE_INGESTOR.get(doId); 78 | 79 | const initResult = await tryCatchAsync( 80 | stub.initialize({ 81 | id: source.id, 82 | source_type: source.source_type, 83 | config: source.config, 84 | config_version_hash: source.config_version_hash, 85 | scrape_frequency_tier: source.scrape_frequency_minutes, 86 | }) 87 | ); 88 | if (initResult.isErr()) { 89 | const error = initResult.error instanceof Error ? initResult.error : new Error(String(initResult.error)); 90 | initLogger.error('Failed to initialize source DO', { sourceId, url: source.config.config.url }, error); 91 | return c.json({ error: 'Failed to initialize source DO' }, 500); 92 | } 93 | 94 | initLogger.info('Successfully initialized source DO', { sourceId, url: source.config.config.url }); 95 | return c.json({ success: true }); 96 | } 97 | ) 98 | .post('/admin/initialize-dos', async c => { 99 | // auth check 100 | if (!hasValidAuthToken(c)) { 101 | return c.json({ error: 'Unauthorized' }, 401); 102 | } 103 | 104 | const initLogger = logger.child({ operation: 'initialize-dos' }); 105 | initLogger.info('Initializing SourceScraperDOs from database'); 106 | 107 | const db = getDb(c.env.HYPERDRIVE); 108 | 109 | // Get batch size from query params, default to 100 110 | const batchSize = Number(c.req.query('batchSize')) || 100; 111 | initLogger.info('Using batch size', { batchSize }); 112 | 113 | const allSourcesResult = await tryCatchAsync( 114 | db 115 | .select({ 116 | id: $data_sources.id, 117 | source_type: $data_sources.source_type, 118 | config: $data_sources.config, 119 | config_version_hash: $data_sources.config_version_hash, 120 | scrape_frequency_tier: $data_sources.scrape_frequency_minutes, 121 | }) 122 | .from($data_sources) 123 | .where(isNull($data_sources.do_initialized_at)) 124 | ); 125 | if (allSourcesResult.isErr()) { 126 | const error = 127 | allSourcesResult.error instanceof Error ? 
allSourcesResult.error : new Error(String(allSourcesResult.error)); 128 | initLogger.error('Failed to fetch sources from database', undefined, error); 129 | return c.json({ error: 'Failed to fetch sources from database' }, 500); 130 | } 131 | 132 | const allSources = allSourcesResult.value; 133 | initLogger.info('Sources fetched from database', { source_count: allSources.length }); 134 | 135 | // Process sources in batches 136 | let processedCount = 0; 137 | let successCount = 0; 138 | 139 | // Create batches of sources 140 | const batches = []; 141 | for (let i = 0; i < allSources.length; i += batchSize) { 142 | batches.push(allSources.slice(i, i + batchSize)); 143 | } 144 | 145 | // Process each batch sequentially 146 | for (let batchIndex = 0; batchIndex < batches.length; batchIndex++) { 147 | const batch = batches[batchIndex]; 148 | initLogger.info('Processing batch', { batchIndex: batchIndex + 1, batchSize: batch.length }); 149 | 150 | const batchResults = await Promise.all( 151 | batch.map(async source => { 152 | const sourceLogger = initLogger.child({ source_id: source.id, url: source.config.config.url }); 153 | const doId = c.env.DATA_SOURCE_INGESTOR.idFromName(source.config.config.url); 154 | const stub = c.env.DATA_SOURCE_INGESTOR.get(doId); 155 | 156 | sourceLogger.debug('Initializing DO'); 157 | const result = await tryCatchAsync(stub.initialize(source)); 158 | if (result.isErr()) { 159 | const error = result.error instanceof Error ? result.error : new Error(String(result.error)); 160 | sourceLogger.error('Failed to initialize DO', undefined, error); 161 | return false; 162 | } 163 | 164 | sourceLogger.debug('Successfully initialized DO'); 165 | return true; 166 | }) 167 | ); 168 | 169 | processedCount += batch.length; 170 | successCount += batchResults.filter(success => success).length; 171 | 172 | initLogger.info('Batch completed', { 173 | batchIndex: batchIndex + 1, 174 | batchSuccessful: batchResults.filter(success => success).length, 175 | totalProcessed: processedCount, 176 | totalSuccessful: successCount, 177 | }); 178 | } 179 | 180 | initLogger.info('Initialization process complete', { total: allSources.length, successful: successCount }); 181 | return c.json({ initialized: successCount, total: allSources.length }); 182 | }) 183 | .delete( 184 | '/admin/source/:sourceId', 185 | zValidator( 186 | 'param', 187 | z.object({ 188 | sourceId: z.string().min(1, 'Source ID is required'), 189 | }) 190 | ), 191 | async c => { 192 | // auth check 193 | if (!hasValidAuthToken(c)) { 194 | return c.json({ error: 'Unauthorized' }, 401); 195 | } 196 | 197 | const deleteLogger = logger.child({ operation: 'delete-source' }); 198 | const { sourceId } = c.req.valid('param'); 199 | 200 | const db = getDb(c.env.HYPERDRIVE); 201 | 202 | // Get the source first to get its URL 203 | const sourceResult = await tryCatchAsync( 204 | db.query.$data_sources.findFirst({ 205 | where: eq($data_sources.id, Number(sourceId)), 206 | }) 207 | ); 208 | 209 | if (sourceResult.isErr()) { 210 | const error = sourceResult.error instanceof Error ? 
sourceResult.error : new Error(String(sourceResult.error)); 211 | deleteLogger.error('Failed to fetch source', { sourceId }, error); 212 | return c.json({ error: 'Failed to fetch source' }, 500); 213 | } 214 | 215 | const source = sourceResult.value; 216 | if (!source) { 217 | return c.json({ error: 'Source not found' }, 404); 218 | } 219 | 220 | // Delete the durable object first 221 | const doId = c.env.DATA_SOURCE_INGESTOR.idFromName(source.config.config.url); 222 | const stub = c.env.DATA_SOURCE_INGESTOR.get(doId); 223 | 224 | const deleteResult = await tryCatchAsync( 225 | stub.fetch('http://do/delete', { 226 | method: 'DELETE', 227 | }) 228 | ); 229 | if (deleteResult.isErr()) { 230 | const error = deleteResult.error instanceof Error ? deleteResult.error : new Error(String(deleteResult.error)); 231 | deleteLogger.error('Failed to delete source DO', { sourceId, url: source.config.config.url }, error); 232 | return c.json({ error: 'Failed to delete source DO' }, 500); 233 | } 234 | 235 | // Then delete from database 236 | // delete the articles first 237 | const articlesResult = await tryCatchAsync( 238 | db.delete($ingested_items).where(eq($ingested_items.data_source_id, Number(sourceId))) 239 | ); 240 | if (articlesResult.isErr()) { 241 | const error = 242 | articlesResult.error instanceof Error ? articlesResult.error : new Error(String(articlesResult.error)); 243 | deleteLogger.error('Failed to delete articles', { sourceId }, error); 244 | return c.json({ error: 'Failed to delete articles' }, 500); 245 | } 246 | 247 | const dbDeleteResult = await tryCatchAsync( 248 | db.delete($data_sources).where(eq($data_sources.id, Number(sourceId))) 249 | ); 250 | if (dbDeleteResult.isErr()) { 251 | const error = 252 | dbDeleteResult.error instanceof Error ? 
dbDeleteResult.error : new Error(String(dbDeleteResult.error)); 253 | deleteLogger.error('Failed to delete source from database', { sourceId }, error); 254 | return c.json({ error: 'Failed to delete source from database' }, 500); 255 | } 256 | 257 | deleteLogger.info('Successfully deleted source', { sourceId, url: source.config.config.url }); 258 | return c.json({ success: true }); 259 | } 260 | ); 261 | 262 | export default route; 263 | -------------------------------------------------------------------------------- /apps/backend/test/fixtures/ft_com.xml: -------------------------------------------------------------------------------- 1 | 2 | <![CDATA[World]]>https://www.ft.com/stream/82645c31-4426-4ef5-99c9-9df6e0940c00RSS for NodeTue, 18 Mar 2025 23:53:48 GMT15<![CDATA[‘If Trump defies a Supreme Court order, will it matter to markets?’]]>https://www.ft.com/content/2e579290-fc0c-4b88-8703-f0bae45266d92e579290-fc0c-4b88-8703-f0bae45266d9Tue, 18 Mar 2025 23:34:47 GMT<![CDATA[Putin agrees 30-day halt to strikes on Ukrainian energy infrastructure in call with Trump]]>https://www.ft.com/content/75b37ad2-0f35-4fe7-b3cf-a36b965c9a7175b37ad2-0f35-4fe7-b3cf-a36b965c9a71Tue, 18 Mar 2025 23:17:18 GMT<![CDATA[Why has Netanyahu renewed Israel’s offensive against Hamas?]]>https://www.ft.com/content/f190b582-7b24-4e30-9d9a-024bff2c6f6ff190b582-7b24-4e30-9d9a-024bff2c6f6fTue, 18 Mar 2025 22:52:03 GMT<![CDATA[US chief justice rebukes Trump after president’s threat to impeach judges]]>https://www.ft.com/content/86b3b77a-2986-4f0f-9475-886c846dfd6886b3b77a-2986-4f0f-9475-886c846dfd68Tue, 18 Mar 2025 22:42:48 GMT<![CDATA[FirstFT: US chief justice issues rare rebuke to Donald Trump]]>https://www.ft.com/content/e615e03f-c0b8-4f79-9ad8-4763a22cff87e615e03f-c0b8-4f79-9ad8-4763a22cff87Tue, 18 Mar 2025 22:01:17 GMT<![CDATA[Reeves to squeeze public spending further in Spring Statement ]]>https://www.ft.com/content/2f72d3c9-4508-40f7-a4a9-07d7b0750c712f72d3c9-4508-40f7-a4a9-07d7b0750c71Tue, 18 Mar 2025 21:34:29 GMT<![CDATA[Howard Lutnick touts Elon Musk’s Starlink for US broadband scheme]]>https://www.ft.com/content/ae99e775-cc64-4831-9ace-6853d0f457edae99e775-cc64-4831-9ace-6853d0f457edTue, 18 Mar 2025 21:31:29 GMT<![CDATA[The Capital One shakedown]]>https://www.ft.com/content/3007e000-7e61-4e70-86e8-07c52df6b52f3007e000-7e61-4e70-86e8-07c52df6b52fTue, 18 Mar 2025 20:35:56 GMT<![CDATA[Wall Street stocks slide as sell-off in tech shares picks up pace]]>https://www.ft.com/content/ed38a070-38d3-4ba4-908c-aeaba8a8f185ed38a070-38d3-4ba4-908c-aeaba8a8f185Tue, 18 Mar 2025 20:05:36 GMT<![CDATA[Netanyahu says Israel has resumed ‘fighting with force’ against Hamas]]>https://www.ft.com/content/880503da-a915-4fe5-ad5e-deab8a00b669880503da-a915-4fe5-ad5e-deab8a00b669Tue, 18 Mar 2025 20:00:03 GMT<![CDATA[Erdoğan’s main rival risks ban from Turkish vote after degree annulled]]>https://www.ft.com/content/a7b053f8-9762-4dd5-a786-25071d4cc233a7b053f8-9762-4dd5-a786-25071d4cc233Tue, 18 Mar 2025 19:21:18 GMT<![CDATA[How the UK’s welfare cuts will change claimants’ lives]]>https://www.ft.com/content/131d963c-ca3d-40d5-a0f4-45ced7e72098131d963c-ca3d-40d5-a0f4-45ced7e72098Tue, 18 Mar 2025 18:30:39 GMT<![CDATA[Tennis stars accuse governing bodies of ‘cartel’ to deny them more prize money ]]>https://www.ft.com/content/06b2464a-d913-4ca0-9d50-cfd89f580b7206b2464a-d913-4ca0-9d50-cfd89f580b72Tue, 18 Mar 2025 18:25:08 GMT<![CDATA[Former Russian politician goes on trial in UK for breaching 
sanctions]]>https://www.ft.com/content/ad585753-e04b-4534-ae6b-85ff95619ed5ad585753-e04b-4534-ae6b-85ff95619ed5Tue, 18 Mar 2025 18:04:16 GMT<![CDATA[The White House war on federal statistics]]>https://www.ft.com/content/a2cbb4e6-c0d8-49ee-84db-a708fdfb7c52a2cbb4e6-c0d8-49ee-84db-a708fdfb7c52Tue, 18 Mar 2025 17:40:30 GMT<![CDATA[Sterling climbs above $1.30 for first time since November]]>https://www.ft.com/content/7e45cce6-e8bf-4a7b-baf6-2a0634f53c1e7e45cce6-e8bf-4a7b-baf6-2a0634f53c1eTue, 18 Mar 2025 17:02:50 GMT<![CDATA[Germany’s parliament approves Merz’s €1tn spending plan]]>https://www.ft.com/content/80742c32-1af3-4881-a935-f3045df12b1280742c32-1af3-4881-a935-f3045df12b12Tue, 18 Mar 2025 16:27:09 GMT<![CDATA[Will anybody buy a ‘Mar-a-Lago accord’?]]>https://www.ft.com/content/9fa4a76d-60bb-45cd-aba0-744973f98dea9fa4a76d-60bb-45cd-aba0-744973f98deaTue, 18 Mar 2025 16:06:56 GMT<![CDATA[Brussels seeks to purchase weapons for entire EU]]>https://www.ft.com/content/aedd1e6b-fb4f-41fd-af10-af9dce6f88dcaedd1e6b-fb4f-41fd-af10-af9dce6f88dcTue, 18 Mar 2025 15:49:36 GMT<![CDATA[Europe’s moment is more than reheated Gaullism]]>https://www.ft.com/content/af23ff94-5578-4d7a-b4db-47d010cb7b11af23ff94-5578-4d7a-b4db-47d010cb7b11Tue, 18 Mar 2025 15:26:41 GMT<![CDATA[Labour unveils disability benefits cuts that aim to save over £5bn]]>https://www.ft.com/content/0cc2d3f0-7ed8-4ee4-aa41-313fd3fb44630cc2d3f0-7ed8-4ee4-aa41-313fd3fb4463Tue, 18 Mar 2025 14:58:43 GMT<![CDATA[What auto bosses worry will be Trump’s next target in tariff war]]>https://www.ft.com/content/e50a432d-454b-4a13-924d-98657498ba81e50a432d-454b-4a13-924d-98657498ba81Tue, 18 Mar 2025 14:38:31 GMT<![CDATA[Hong Kong’s cargo sector faces a tariff test]]>https://www.ft.com/content/01e24e19-9987-40c1-a408-762f1d821c3b01e24e19-9987-40c1-a408-762f1d821c3bTue, 18 Mar 2025 13:18:23 GMT<![CDATA[UK’s red tape cut should follow business world’s dotted lines]]>https://www.ft.com/content/db69dc53-719b-4add-81f5-13543a7a839edb69dc53-719b-4add-81f5-13543a7a839eTue, 18 Mar 2025 13:12:49 GMT<![CDATA[Economists forecast slowing US growth and increased inflation ]]>https://www.ft.com/content/58a5a30a-485c-4ac3-9987-a3b7cef5da6c58a5a30a-485c-4ac3-9987-a3b7cef5da6cTue, 18 Mar 2025 13:00:50 GMT --------------------------------------------------------------------------------
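
The useTableOfContents composable above exposes a small, self-contained API: the generated TOC items, the currently active heading, and a smooth-scroll helper that accounts for a sticky header offset. A hedged component-side sketch follows; the composable's parameter list is not visible in this excerpt, so the no-argument call is an assumption:

```ts
// Hypothetical usage of useTableOfContents; the argument list is assumed.
import { defineComponent } from 'vue';
import { useTableOfContents } from '~/composables/useTableOfContents';

export default defineComponent({
  setup() {
    const { tocItems, activeHeadingId, currentSectionName, mobileMenuOpen, scrollToSection } =
      useTableOfContents();

    // scrollToSection(id) smooth-scrolls to the heading and closes the mobile menu.
    return { tocItems, activeHeadingId, currentSectionName, mobileMenuOpen, scrollToSection };
  },
});
```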
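The two fetchers in articleFetchers.ts form a natural escalation path: plain HTTP first, headless browser rendering only when that fails. A minimal caller sketch built only on the exports shown above; the fallback helper itself is hypothetical, written as if it lived next to articleFetchers.ts:

```ts
import type { Env } from '../index';
import { getArticleWithBrowser, getArticleWithFetch } from './articleFetchers';

// Hypothetical helper: try the cheap fetch first, escalate to the
// Browser Rendering API only when plain fetching or parsing fails.
export async function getArticleWithFallback(env: Env, url: string) {
  const simple = await getArticleWithFetch(url);
  if (simple.isOk()) return simple;

  // On FETCH_ERROR or PARSE_ERROR, a headless browser can often get past
  // consent walls, paywalls, and client-rendered markup.
  return getArticleWithBrowser(env, url);
}
```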
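The rateLimiter.spec.ts suite pins down the DomainRateLimiter contract: constructor options { maxConcurrent, globalCooldownMs, domainCooldownMs }, and a processBatch(items, step, processItem) method that hands each item to the callback along with its extracted domain, skips items with invalid URLs, and drops failed items from the results. A usage sketch inferred purely from those tests:

```ts
import type { WorkflowStep } from 'cloudflare:workers';
import { DomainRateLimiter } from '../src/lib/rateLimiter';

// Inferred usage: at most 2 items in flight, a 100ms pause between
// batches, and a 200ms cooldown between hits to the same domain.
async function crawl(step: WorkflowStep, items: { id: number; url: string }[]) {
  const limiter = new DomainRateLimiter({ maxConcurrent: 2, globalCooldownMs: 100, domainCooldownMs: 200 });

  return limiter.processBatch(items, step, async (item, domain) => {
    // Per the spec above, a throw here is swallowed: the failed item is
    // simply absent from the returned results.
    return `fetched ${item.url} (${domain})`;
  });
}
```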
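The admin endpoints in durableObjects.router.ts imply straightforward call shapes: initialize one source's DO, bulk-initialize with an optional batchSize query parameter, or delete a source (its DO first, then its ingested items, then the source row). A hedged client sketch; the base URL prefix and the bearer-token header are assumptions, since app.ts and hasValidAuthToken are not shown in this excerpt:

```ts
// Assumed base URL and auth scheme (not confirmed by the excerpt above).
const BASE = 'https://backend.example.com/do';

async function adminOps(token: string) {
  const headers = { Authorization: `Bearer ${token}` };

  // Bulk-initialize DOs for every source with do_initialized_at IS NULL, 50 per batch.
  await fetch(`${BASE}/admin/initialize-dos?batchSize=50`, { method: 'POST', headers });

  // Initialize a single source's DO by database id.
  await fetch(`${BASE}/admin/source/42/init`, { method: 'POST', headers });

  // Delete a source: removes the DO, its ingested items, then the source row.
  await fetch(`${BASE}/admin/source/42`, { method: 'DELETE', headers });
}
```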