├── .nvmrc ├── .dockerignore ├── nest-cli.json ├── .prettierrc ├── .vscode ├── extensions.json └── settings.json ├── docs └── assets │ ├── search-ui.jpg │ ├── bot-set-domain.gif │ ├── search-command.jpg │ └── search-and-jump.gif ├── tsconfig.build.json ├── src ├── app.service.ts ├── config │ ├── auth.config.ts │ ├── ocr.config.ts │ ├── meilisearch.config.ts │ ├── http.config.ts │ ├── queue.config.ts │ ├── bot.config.ts │ └── cache.config.ts ├── token │ ├── token.module.ts │ └── token.service.ts ├── object-id.ts ├── ocr │ ├── ocr.service.ts │ ├── google-ocr.service.spec.ts │ ├── azure-ocr.service.spec.ts │ ├── google-ocr.service.ts │ ├── ocr.module.ts │ ├── paddle-ocr-web.service.ts │ └── azure-ocr.service.ts ├── import │ ├── import.module.ts │ └── import.controller.ts ├── app.controller.ts ├── bot │ ├── bot.module.ts │ ├── webhook.controller.ts │ └── bot.service.ts ├── queue │ ├── queue.service.ts │ ├── meta.types.ts │ ├── queue.module.ts │ ├── memory-queue.service.ts │ └── bull-queue.service.ts ├── user │ ├── user.module.ts │ ├── profile.controller.ts │ └── auth.controller.ts ├── search │ ├── search.module.ts │ ├── image-index.service.ts │ ├── search.controller.ts │ ├── index.service.ts │ └── meili-search.service.ts ├── app.module.ts └── main.ts ├── ROADMAP.md ├── Dockerfile ├── .gitignore ├── tsconfig.json ├── .eslintrc.js ├── LICENSE ├── .github └── workflows │ └── ci.yaml ├── .env.example ├── package.json ├── README.md └── public └── index.html /.nvmrc: -------------------------------------------------------------------------------- 1 | 18 2 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | /.env 2 | /node_modules 3 | /dist 4 | /Dockerfile 5 | /secrets 6 | -------------------------------------------------------------------------------- /nest-cli.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "collection": "@nestjs/schematics", 3 | "sourceRoot": "src" 4 | } 5 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "singleQuote": true, 3 | "trailingComma": "all", 4 | "semi": false 5 | } 6 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": ["esbenp.prettier-vscode", "dbaeumer.vscode-eslint"] 3 | } 4 | -------------------------------------------------------------------------------- /docs/assets/search-ui.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oott123/telegram-archive-server/HEAD/docs/assets/search-ui.jpg -------------------------------------------------------------------------------- /docs/assets/bot-set-domain.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oott123/telegram-archive-server/HEAD/docs/assets/bot-set-domain.gif -------------------------------------------------------------------------------- /docs/assets/search-command.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oott123/telegram-archive-server/HEAD/docs/assets/search-command.jpg -------------------------------------------------------------------------------- /docs/assets/search-and-jump.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oott123/telegram-archive-server/HEAD/docs/assets/search-and-jump.gif -------------------------------------------------------------------------------- /tsconfig.build.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "extends": "./tsconfig.json", 3 | "exclude": ["node_modules", "test", "dist", "**/*spec.ts"] 4 | } 5 | -------------------------------------------------------------------------------- /src/app.service.ts: -------------------------------------------------------------------------------- 1 | import { Injectable } from '@nestjs/common' 2 | 3 | @Injectable() 4 | export class AppService { 5 | getHello(): string { 6 | return 'Hello World!' 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /src/config/auth.config.ts: -------------------------------------------------------------------------------- 1 | import { registerAs } from '@nestjs/config' 2 | 3 | export default registerAs('auth', () => ({ 4 | jwtSecret: process.env.AUTH_JWT_SECRET || '', 5 | importToken: process.env.AUTH_IMPORT_TOKEN || '', 6 | })) 7 | -------------------------------------------------------------------------------- /src/token/token.module.ts: -------------------------------------------------------------------------------- 1 | import { Module } from '@nestjs/common' 2 | import { TokenService } from './token.service' 3 | 4 | @Module({ 5 | exports: [TokenService], 6 | providers: [TokenService], 7 | }) 8 | export class TokenModule {} 9 | -------------------------------------------------------------------------------- /src/object-id.ts: -------------------------------------------------------------------------------- 1 | export const objectId = (() => { 2 | let currentId = 0 3 | const map = new WeakMap() 4 | 5 | return (object) => { 6 | if (!map.has(object)) { 7 | map.set(object, ++currentId) 8 | } 9 | 10 | return map.get(object) 11 | } 12 | })() 13 | -------------------------------------------------------------------------------- /src/ocr/ocr.service.ts: -------------------------------------------------------------------------------- 1 | export abstract class OCRService 
{ 2 | abstract recognize(image: Uint8Array): Promise 3 | } 4 | 5 | export type OCRResponse = Array<{ 6 | text: string 7 | vertices?: Array<{ x: number; y: number }> 8 | confidence?: number 9 | rotation?: number 10 | }> 11 | -------------------------------------------------------------------------------- /ROADMAP.md: -------------------------------------------------------------------------------- 1 | # Roadmap 2 | 3 | - [x] MeiliSearch 单条消息添加索引效率似乎十分不佳。需要有个打包机制,满 100 条或者 60 秒打包提交一次之类的。 4 | - [x] 如果有打包提交机制,那就需要有 graceful 退出,或者缓存 5 | - [x] 支持 OCR 图片搜索 6 | - [ ] 支持图片归档到 S3 7 | - [ ] 支持多条记录合并上下文搜索,应对说话喜欢换行的人 8 | - [ ] 为没有头像的人生成基于名字的默认头像 9 | - [ ] 配置消息队列分批大小和超时 10 | - [ ] 抓取链接归档,并进行索引(可能要新增一个搜索字段) 11 | -------------------------------------------------------------------------------- /src/config/ocr.config.ts: -------------------------------------------------------------------------------- 1 | import { registerAs } from '@nestjs/config' 2 | 3 | export default registerAs('ocr', () => ({ 4 | enable: process.env.OCR_ENABLE === 'true', 5 | driver: process.env.OCR_DRIVER || 'google', 6 | endpoint: process.env.OCR_ENDPOINT, 7 | credentials: process.env.OCR_CREDENTIALS, 8 | })) 9 | -------------------------------------------------------------------------------- /src/import/import.module.ts: -------------------------------------------------------------------------------- 1 | import { Module } from '@nestjs/common' 2 | import { SearchModule } from '../search/search.module' 3 | import { ImportController } from './import.controller' 4 | 5 | @Module({ 6 | controllers: [ImportController], 7 | imports: [SearchModule], 8 | }) 9 | export class ImportModule {} 10 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "typescript.tsdk": "./node_modules/typescript/lib", 3 | "editor.codeActionsOnSave": { 4 | "source.fixAll": "explicit" 5 | }, 6 | 
"editor.formatOnSave": true, 7 | "editor.defaultFormatter": "esbenp.prettier-vscode", 8 | "cSpell.words": ["Meili", "meilisearch", "reakit"] 9 | } 10 | -------------------------------------------------------------------------------- /src/config/meilisearch.config.ts: -------------------------------------------------------------------------------- 1 | import { registerAs } from '@nestjs/config' 2 | 3 | export default registerAs('meilisearch', () => ({ 4 | host: process.env.MEILISEARCH_HOST || 'http://localhost:7700', 5 | apiKey: process.env.MEILISEARCH_API_KEY || '', 6 | indexPrefix: process.env.MEILISEARCH_INDEX_PREFIX || '', 7 | })) 8 | -------------------------------------------------------------------------------- /src/app.controller.ts: -------------------------------------------------------------------------------- 1 | import { Controller, Get } from '@nestjs/common' 2 | import { AppService } from './app.service' 3 | 4 | @Controller() 5 | export class AppController { 6 | constructor(private readonly appService: AppService) {} 7 | 8 | @Get() 9 | getHello(): string { 10 | return this.appService.getHello() 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:18 AS builder 2 | WORKDIR /app 3 | COPY package.json yarn.lock /app/ 4 | RUN yarn 5 | COPY . 
/app 6 | RUN yarn build && yarn --production 7 | 8 | FROM gcr.io/distroless/nodejs:18 9 | WORKDIR /app 10 | COPY --from=builder /app/dist /app/dist 11 | COPY --from=builder /app/node_modules /app/node_modules 12 | COPY --from=builder /app/public /app/public 13 | CMD ["/app/dist/main.js"] 14 | -------------------------------------------------------------------------------- /src/bot/bot.module.ts: -------------------------------------------------------------------------------- 1 | import { Module } from '@nestjs/common' 2 | import { SearchModule } from '../search/search.module' 3 | import { BotService } from './bot.service' 4 | import { WebhookController } from './webhook.controller' 5 | 6 | @Module({ 7 | imports: [SearchModule], 8 | providers: [BotService], 9 | exports: [BotService], 10 | controllers: [WebhookController], 11 | }) 12 | export class BotModule {} 13 | -------------------------------------------------------------------------------- /src/queue/queue.service.ts: -------------------------------------------------------------------------------- 1 | import { QueueMeta, QueueProcessor, QueueTypes } from './meta.types' 2 | 3 | export abstract class QueueService { 4 | public abstract process( 5 | queue: T, 6 | handler: QueueProcessor, 7 | concurrency?: number, 8 | ): Promise 9 | 10 | public abstract add( 11 | queue: T, 12 | data: QueueMeta, 13 | ): Promise 14 | } 15 | -------------------------------------------------------------------------------- /src/user/user.module.ts: -------------------------------------------------------------------------------- 1 | import { Module } from '@nestjs/common' 2 | import { BotModule } from '../bot/bot.module' 3 | import { TokenModule } from '../token/token.module' 4 | import { AuthController } from './auth.controller' 5 | import { ProfileController } from './profile.controller' 6 | 7 | @Module({ 8 | controllers: [ProfileController, AuthController], 9 | imports: [BotModule, TokenModule], 10 | }) 11 | export class UserModule {} 
12 | -------------------------------------------------------------------------------- /src/config/http.config.ts: -------------------------------------------------------------------------------- 1 | import { registerAs } from '@nestjs/config' 2 | 3 | export default registerAs('http', () => ({ 4 | baseUrl: process.env.HTTP_BASE_URL || '', 5 | uiUrl: process.env.HTTP_UI_URL || '', 6 | host: process.env.HTTP_HOST || (process.env.PORT ? '0.0.0.0' : '127.0.0.1'), 7 | port: process.env.HTTP_PORT || process.env.PORT || 3100, 8 | jwtSecret: process.env.HTTP_JWT_SECRET || '', 9 | globalPrefix: '/api/v1', 10 | })) 11 | -------------------------------------------------------------------------------- /src/config/queue.config.ts: -------------------------------------------------------------------------------- 1 | import { registerAs } from '@nestjs/config' 2 | 3 | export default registerAs('queue', () => ({ 4 | enable: process.env.QUEUE_ENABLE === 'true', 5 | redis: { 6 | host: process.env.QUEUE_REDIS_HOST || 'localhost', 7 | port: Number(process.env.QUEUE_REDIS_PORT || 6379), 8 | password: process.env.QUEUE_REDIS_PASSWORD, 9 | db: Number(process.env.QUEUE_REDIS_DB || 0), 10 | }, 11 | keyPrefix: process.env.QUEUE_REDIS_KEY_PREFIX || '', 12 | })) 13 | -------------------------------------------------------------------------------- /src/config/bot.config.ts: -------------------------------------------------------------------------------- 1 | import { registerAs } from '@nestjs/config' 2 | 3 | export default registerAs('bot', () => ({ 4 | token: process.env.TELEGRAM_BOT_TOKEN || '', 5 | webhook: process.env.TELEGRAM_WEBHOOK === 'true', 6 | followEdit: process.env.TELEGRAM_FOLLOW_EDIT === 'true', 7 | followDelete: process.env.TELEGRAM_FOLLOW_DELETE === 'true', 8 | processImage: true, // If you don't want OCR, disable OCR module is enough 9 | updateToken: process.env.TELEGRAM_WEBHOOK_UPDATE_TOKEN || '', 10 | })) 11 | 
-------------------------------------------------------------------------------- /src/config/cache.config.ts: -------------------------------------------------------------------------------- 1 | import { registerAs } from '@nestjs/config' 2 | 3 | export default registerAs('cache', () => ({ 4 | cacheStore: process.env.CACHE_STORE || 'memory', 5 | redis: { 6 | host: process.env.CACHE_REDIS_HOST || 'localhost', 7 | port: Number(process.env.CACHE_REDIS_PORT || 6379), 8 | password: process.env.CACHE_REDIS_PASSWORD, 9 | db: Number(process.env.CACHE_REDIS_DB || 0), 10 | keyPrefix: process.env.CACHE_REDIS_KEY_PREFIX || '', 11 | }, 12 | ttl: 3600, 13 | })) 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # compiled output 2 | /dist 3 | /node_modules 4 | 5 | # Logs 6 | logs 7 | *.log 8 | npm-debug.log* 9 | pnpm-debug.log* 10 | yarn-debug.log* 11 | yarn-error.log* 12 | lerna-debug.log* 13 | 14 | # OS 15 | .DS_Store 16 | 17 | # Tests 18 | /coverage 19 | /.nyc_output 20 | 21 | # IDEs and editors 22 | /.idea 23 | .project 24 | .classpath 25 | .c9/ 26 | *.launch 27 | .settings/ 28 | *.sublime-workspace 29 | 30 | # IDE - VSCode 31 | .vscode/* 32 | !.vscode/settings.json 33 | !.vscode/tasks.json 34 | !.vscode/launch.json 35 | !.vscode/extensions.json 36 | 37 | /.env 38 | /secrets 39 | /.envrc 40 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "module": "commonjs", 4 | "declaration": true, 5 | "removeComments": true, 6 | "emitDecoratorMetadata": true, 7 | "experimentalDecorators": true, 8 | "allowSyntheticDefaultImports": true, 9 | "target": "es2017", 10 | "sourceMap": true, 11 | "outDir": "./dist", 12 | "baseUrl": "./", 13 | "incremental": true, 14 | "strict": true, 15 | 
"skipLibCheck": true, 16 | "strictNullChecks": true, 17 | "noImplicitAny": false, 18 | "strictBindCallApply": true, 19 | "forceConsistentCasingInFileNames": true, 20 | "noFallthroughCasesInSwitch": false 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/queue/meta.types.ts: -------------------------------------------------------------------------------- 1 | import type { 2 | MessageIndex, 3 | OptionalTextMessageIndex, 4 | } from '../search/meili-search.service' 5 | 6 | export type Image = { 7 | type: 'url' | 'base64' 8 | data: string 9 | } 10 | 11 | export type OCRMeta = { 12 | images: Image[] 13 | message: OptionalTextMessageIndex 14 | } 15 | 16 | export type MessageMeta = { 17 | message: MessageIndex 18 | } 19 | 20 | type QueueMetaMap = { 21 | ocr: OCRMeta 22 | message: MessageMeta 23 | } 24 | 25 | export type QueueTypes = keyof QueueMetaMap 26 | 27 | export type QueueMeta = QueueMetaMap[T] 28 | 29 | export type QueueProcessor = ( 30 | meta: QueueMeta, 31 | ) => Promise 32 | -------------------------------------------------------------------------------- /src/search/search.module.ts: -------------------------------------------------------------------------------- 1 | import { Module } from '@nestjs/common' 2 | import { TokenModule } from 'src/token/token.module' 3 | import { MeiliSearchService } from './meili-search.service' 4 | import { SearchController } from './search.controller' 5 | import { IndexService } from './index.service' 6 | import { ImageIndexService } from './image-index.service' 7 | import { QueueModule } from 'src/queue/queue.module' 8 | import { OCRModule } from 'src/ocr/ocr.module' 9 | 10 | @Module({ 11 | imports: [TokenModule, QueueModule, OCRModule], 12 | providers: [MeiliSearchService, IndexService, ImageIndexService], 13 | exports: [MeiliSearchService, IndexService, ImageIndexService], 14 | controllers: [SearchController], 15 | }) 16 | export class SearchModule {} 17 | 
-------------------------------------------------------------------------------- /src/bot/webhook.controller.ts: -------------------------------------------------------------------------------- 1 | import type { Update } from '@grammyjs/types' 2 | import { 3 | Body, 4 | Controller, 5 | ForbiddenException, 6 | Param, 7 | Post, 8 | } from '@nestjs/common' 9 | import { BotService } from './bot.service' 10 | 11 | @Controller('bot/webhook') 12 | export class WebhookController { 13 | public constructor(private botService: BotService) {} 14 | 15 | @Post(':updateToken/update') 16 | public async update( 17 | @Param('updateToken') updateToken: string, 18 | @Body() update: Update, 19 | ) { 20 | if (!this.botService.checkUpdateToken(updateToken)) { 21 | throw new ForbiddenException('invalid token') 22 | } 23 | 24 | void this.botService.handleUpdate(update) 25 | 26 | return true 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | parser: '@typescript-eslint/parser', 3 | parserOptions: { 4 | project: 'tsconfig.json', 5 | sourceType: 'module', 6 | }, 7 | plugins: ['@typescript-eslint/eslint-plugin'], 8 | extends: ['plugin:@typescript-eslint/recommended'], 9 | root: true, 10 | env: { 11 | node: true, 12 | jest: true, 13 | }, 14 | ignorePatterns: ['.eslintrc.js'], 15 | rules: { 16 | '@typescript-eslint/interface-name-prefix': 'off', 17 | '@typescript-eslint/explicit-function-return-type': 'off', 18 | '@typescript-eslint/explicit-module-boundary-types': 'off', 19 | '@typescript-eslint/no-explicit-any': 'off', 20 | '@typescript-eslint/no-floating-promises': 'error', 21 | '@typescript-eslint/explicit-member-accessibility': 'error', 22 | }, 23 | } 24 | -------------------------------------------------------------------------------- /src/ocr/google-ocr.service.spec.ts: 
-------------------------------------------------------------------------------- 1 | import { GoogleOCRService } from './google-ocr.service' 2 | import { readFile } from 'fs/promises' 3 | 4 | let googleOcr: GoogleOCRService 5 | let image: Buffer 6 | 7 | beforeEach(async () => { 8 | googleOcr = new GoogleOCRService({ 9 | enable: true, 10 | driver: 'google', 11 | endpoint: 'eu-vision.googleapis.com', 12 | credentials: '', 13 | }) 14 | image = await readFile('docs/assets/search-ui.jpg') 15 | }) 16 | 17 | test('simple ocr', async () => { 18 | const result = await googleOcr.recognize(image) 19 | const texts = result.map((x) => x.text).join('\n') 20 | expect(texts).toContain('搜索界面') 21 | expect(texts).toContain('Telegram') 22 | expect(texts).toContain('Archive') 23 | expect(texts).toContain('Server') 24 | expect(texts).toContain('宣传图') 25 | }) 26 | -------------------------------------------------------------------------------- /src/ocr/azure-ocr.service.spec.ts: -------------------------------------------------------------------------------- 1 | import { AzureOCRService } from './azure-ocr.service' 2 | import { readFile } from 'fs/promises' 3 | 4 | let azureOCR: AzureOCRService 5 | let image: Buffer 6 | 7 | beforeEach(async () => { 8 | azureOCR = new AzureOCRService({ 9 | enable: true, 10 | driver: 'azure', 11 | endpoint: process.env.AZURE_ENDPOINT, 12 | credentials: process.env.AZURE_CREDENTIALS, 13 | }) 14 | image = await readFile('docs/assets/search-ui.jpg') 15 | }) 16 | 17 | test('simple ocr', async () => { 18 | const result = await azureOCR.recognize(image) 19 | const texts = result.map((x) => x.text).join('\n') 20 | expect(texts).toContain('搜索界面') 21 | expect(texts).toContain('Telegram') 22 | expect(texts).toContain('Archive') 23 | expect(texts).toContain('Server') 24 | expect(texts).toContain('宣传图') 25 | }) 26 | -------------------------------------------------------------------------------- /src/queue/queue.module.ts: 
-------------------------------------------------------------------------------- 1 | import { Module } from '@nestjs/common' 2 | import { ConfigType } from '@nestjs/config' 3 | import { ModuleRef } from '@nestjs/core' 4 | import queueConfig from 'src/config/queue.config' 5 | import { BullQueueService } from './bull-queue.service' 6 | import { MemoryQueueService } from './memory-queue.service' 7 | import { QueueService } from './queue.service' 8 | 9 | @Module({ 10 | providers: [ 11 | { 12 | provide: QueueService, 13 | useFactory: async ( 14 | moduleRef: ModuleRef, 15 | queueCfg: ConfigType, 16 | ) => { 17 | if (queueCfg.enable) { 18 | return await moduleRef.create(BullQueueService) 19 | } else { 20 | return await moduleRef.create(MemoryQueueService) 21 | } 22 | }, 23 | inject: [ModuleRef, queueConfig.KEY], 24 | }, 25 | ], 26 | exports: [QueueService], 27 | }) 28 | export class QueueModule {} 29 | -------------------------------------------------------------------------------- /src/queue/memory-queue.service.ts: -------------------------------------------------------------------------------- 1 | import { Injectable } from '@nestjs/common' 2 | import { 3 | OCRMeta, 4 | MessageMeta, 5 | QueueProcessor, 6 | QueueMeta, 7 | QueueTypes, 8 | } from './meta.types' 9 | import { QueueService } from './queue.service' 10 | 11 | @Injectable() 12 | export class MemoryQueueService implements QueueService { 13 | private processors = new Map>() 14 | 15 | public async process( 16 | queue: T, 17 | handler: QueueProcessor, 18 | concurrency: number, 19 | ): Promise { 20 | this.processors.set(queue, handler) 21 | } 22 | 23 | public async add( 24 | queue: T, 25 | data: QueueMeta, 26 | ): Promise { 27 | const processor = this.processors.get(queue) 28 | if (!processor) { 29 | throw new Error(`Queue ${queue} processor not found`) 30 | } 31 | await processor(data) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/token/token.service.ts: 
-------------------------------------------------------------------------------- 1 | import { ForbiddenException, Inject, Injectable } from '@nestjs/common' 2 | import { ConfigType } from '@nestjs/config' 3 | import jwt = require('jsonwebtoken') 4 | import authConfig from '../config/auth.config' 5 | 6 | export type AppTokenPayload = { 7 | chatId: string 8 | userId: number 9 | } 10 | 11 | @Injectable() 12 | export class TokenService { 13 | private secret: string 14 | 15 | constructor(@Inject(authConfig.KEY) authCfg: ConfigType) { 16 | this.secret = authCfg.jwtSecret 17 | if (!this.secret) { 18 | throw new Error('please set AUTH_JWT_SECRET to keep your data safe') 19 | } 20 | } 21 | 22 | public sign(payload: AppTokenPayload) { 23 | return jwt.sign(payload, this.secret, { expiresIn: '1d' }) 24 | } 25 | 26 | public verify(token: string) { 27 | try { 28 | return jwt.verify(token, this.secret) as AppTokenPayload 29 | } catch (err) { 30 | throw new ForbiddenException('Invalid token') 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/ocr/google-ocr.service.ts: -------------------------------------------------------------------------------- 1 | import { OCRService, OCRResponse } from './ocr.service' 2 | import { ImageAnnotatorClient } from '@google-cloud/vision' 3 | import { Inject, Injectable } from '@nestjs/common' 4 | import ocrConfig from '../config/ocr.config' 5 | import { ConfigType } from '@nestjs/config' 6 | 7 | @Injectable() 8 | export class GoogleOCRService implements OCRService { 9 | private client!: ImageAnnotatorClient 10 | 11 | public constructor( 12 | @Inject(ocrConfig.KEY) ocrCfg: ConfigType, 13 | ) { 14 | this.client = new ImageAnnotatorClient({ apiEndpoint: ocrCfg.endpoint }) 15 | } 16 | 17 | public async recognize(image: Uint8Array): Promise { 18 | const imgBuffer = image instanceof Buffer ? 
image : Buffer.from(image) 19 | const detectResult = await this.client.textDetection(imgBuffer) 20 | const { fullTextAnnotation } = detectResult[0] 21 | if (!fullTextAnnotation?.text) { 22 | return [] 23 | } 24 | return [{ text: fullTextAnnotation.text }] 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/ocr/ocr.module.ts: -------------------------------------------------------------------------------- 1 | import { Module } from '@nestjs/common' 2 | import { ConfigType } from '@nestjs/config' 3 | import { ModuleRef } from '@nestjs/core' 4 | import ocrConfig from 'src/config/ocr.config' 5 | import { GoogleOCRService } from './google-ocr.service' 6 | import { OCRService } from './ocr.service' 7 | import { PaddleOCRWebService } from './paddle-ocr-web.service' 8 | import { AzureOCRService } from './azure-ocr.service' 9 | 10 | @Module({ 11 | providers: [ 12 | { 13 | provide: OCRService, 14 | useFactory: ( 15 | moduleRef: ModuleRef, 16 | ocrCfg: ConfigType, 17 | ) => { 18 | if (!ocrCfg.enable) { 19 | return null 20 | } 21 | if (ocrCfg.driver === 'google') { 22 | return moduleRef.create(GoogleOCRService) 23 | } else if (ocrCfg.driver === 'paddle-ocr-web') { 24 | return moduleRef.create(PaddleOCRWebService) 25 | } else if (ocrCfg.driver === 'azure') { 26 | return moduleRef.create(AzureOCRService) 27 | } 28 | 29 | try { 30 | // eslint-disable-next-line @typescript-eslint/no-var-requires 31 | const module = require(`tas-ocr-driver-${ocrCfg.driver}`) 32 | return moduleRef.create(module.OCRService) 33 | } catch (e) { 34 | console.error('====>> Failed to get OCR driver: ' + ocrCfg.driver) 35 | throw e 36 | } 37 | }, 38 | inject: [ModuleRef, ocrConfig.KEY], 39 | }, 40 | ], 41 | exports: [OCRService], 42 | }) 43 | export class OCRModule {} 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 
3-Clause License 2 | 3 | Copyright (c) 2021, oott123 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /src/ocr/paddle-ocr-web.service.ts: -------------------------------------------------------------------------------- 1 | import { OCRService, OCRResponse } from './ocr.service' 2 | import { Inject, Injectable } from '@nestjs/common' 3 | import ocrConfig from '../config/ocr.config' 4 | import { ConfigType } from '@nestjs/config' 5 | import Debug from 'debug' 6 | 7 | const debug = Debug('app:ocr:paddle-ocr-web') 8 | 9 | @Injectable() 10 | export class PaddleOCRWebService implements OCRService { 11 | private endpoint: string 12 | 13 | public constructor( 14 | @Inject(ocrConfig.KEY) ocrCfg: ConfigType, 15 | ) { 16 | this.endpoint = ocrCfg.endpoint! 17 | debug('init paddle-ocr-web with endpoint', this.endpoint) 18 | } 19 | 20 | public async recognize(image: Uint8Array): Promise { 21 | const imgBuffer = image instanceof Buffer ? image : Buffer.from(image) 22 | const imgBlob = new Blob([imgBuffer]) 23 | 24 | const form = new FormData() 25 | form.append('lang', 'zh-Hans') 26 | form.append('file', imgBlob) 27 | 28 | debug('uploading file to paddle-ocr-web') 29 | const res = await ( 30 | await fetch(this.endpoint, { 31 | method: 'POST', 32 | body: form, 33 | }) 34 | ).json() 35 | 36 | debug('paddle-ocr-web response', res?.result) 37 | const textParts = [] as OCRResponse 38 | 39 | if (Array.isArray(res.result)) { 40 | for (const item of res.result) { 41 | textParts.push({ 42 | text: item[1][0], 43 | vertices: item[0].map((v: [number, number]) => ({ 44 | x: v[0], 45 | y: v[1], 46 | })), 47 | confidence: item[1][1], 48 | }) 49 | } 50 | } 51 | 52 | return textParts 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: Docker 2 | 3 | on: 4 | push: 5 | branches: 6 | - '**' 7 | tags: 8 | - 'v*.*.*' 9 | paths-ignore: 10 | - '/README.md' 
11 | - '/ROADMAP.md' 12 | 13 | jobs: 14 | docker: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout 18 | uses: actions/checkout@v2 19 | - name: Docker meta 20 | id: meta 21 | uses: docker/metadata-action@v3 22 | with: 23 | # list of Docker images to use as base name for tags 24 | images: | 25 | quay.io/oott123/telegram-archive-server 26 | # generate Docker tags based on the following events/attributes 27 | tags: | 28 | type=ref,event=branch 29 | type=semver,pattern={{version}} 30 | type=semver,pattern={{major}}.{{minor}} 31 | type=semver,pattern={{major}} 32 | - name: Set up QEMU 33 | uses: docker/setup-qemu-action@v1 34 | - name: Set up Docker Buildx 35 | uses: docker/setup-buildx-action@v1 36 | - name: Login to Quay.io 37 | if: github.event_name != 'pull_request' 38 | uses: docker/login-action@v1 39 | with: 40 | registry: quay.io 41 | username: ${{ secrets.QUAY_ROBOT_USERNAME }} 42 | password: ${{ secrets.QUAY_ROBOT_TOKEN }} 43 | - name: Build and push 44 | uses: docker/build-push-action@v2 45 | with: 46 | context: . 
47 | push: ${{ github.event_name != 'pull_request' }} 48 | tags: ${{ steps.meta.outputs.tags }} 49 | labels: ${{ steps.meta.outputs.labels }} 50 | cache-from: type=gha 51 | cache-to: type=gha,mode=max 52 | -------------------------------------------------------------------------------- /src/user/profile.controller.ts: -------------------------------------------------------------------------------- 1 | import { 2 | Controller, 3 | Get, 4 | Param, 5 | Header, 6 | Inject, 7 | CACHE_MANAGER, 8 | } from '@nestjs/common' 9 | import { Cache } from 'cache-manager' 10 | import { join } from 'path' 11 | import { BotService } from '../bot/bot.service' 12 | 13 | @Controller('profile') 14 | export class ProfileController { 15 | constructor( 16 | private botService: BotService, 17 | @Inject(CACHE_MANAGER) private cache: Cache, 18 | ) {} 19 | 20 | @Get('/:userId/photo') 21 | @Header('Cache-Control', 'public, max-age=86400') 22 | @Header('Content-Type', 'image/jpeg') 23 | async getProfilePhoto(@Param('userId') userId: string) { 24 | userId = userId.replace(/^user/, '') 25 | const cacheKey = `photo_${userId}` 26 | const cached = await this.cache.get(cacheKey) 27 | if (cached) { 28 | return Buffer.from(cached, 'base64') 29 | } 30 | 31 | const photo = await this.botService.getProfilePhoto(Number(userId)) 32 | const buf = photo ? 
/**
 * Bridges incoming images to the OCR queue and turns OCR results into
 * searchable messages.
 *
 * `indexImage` enqueues the raw images (base64 encoded) on the `ocr` queue;
 * `handleOCR` consumes that queue, runs every image through the OCR service
 * and re-enqueues the enriched message on the `message` queue.
 */
@Injectable()
export class ImageIndexService {
  public constructor(private queue: QueueService, private ocr: OCRService) {}

  /**
   * Enqueues `images` for OCR together with the message they belong to.
   *
   * @param images raw image bytes; each one is serialised as base64 for the queue
   * @param message index entry the recognised text will be merged into
   */
  public async indexImage(
    images: Buffer[],
    message: OptionalTextMessageIndex,
  ): Promise<void> {
    // No OCR service injected (presumably OCR is disabled -- confirm in
    // OCRModule): there is nothing that could consume the job, so skip it.
    if (!this.ocr) {
      return
    }

    const payload = images.map((b) => ({
      type: 'base64',
      data: b.toString('base64'),
    }))
    await this.queue.add('ocr', { images: payload, message })
  }

  /** Registers `handleOCR` as the processor of the `ocr` queue. */
  public async startHandleOCR(): Promise<void> {
    await this.queue.process('ocr', this.handleOCR)
  }

  /**
   * Queue processor: recognises text in every image of the job, appends it to
   * the message text and forwards the result to the `message` queue.
   *
   * OCR is best-effort: any failure is logged and whatever text was collected
   * up to that point is still indexed. Nothing is enqueued when neither the
   * message nor the images produced any text.
   */
  private handleOCR: QueueProcessor<'ocr'> = async ({ images, message }) => {
    if (!this.ocr) {
      return
    }
    const textList = [message.text]
    const ocrRaw: unknown[] = []

    try {
      for (const image of images) {
        if (image.type !== 'base64') {
          // Only base64 payloads are produced by indexImage above.
          throw new Error(`unsupported image payload type: ${image.type}`)
        }
        const buf = Buffer.from(image.data, 'base64')
        debug(`getting image buffer, size ${buf.length}`, buf)

        const ocrResult = await this.ocr.recognize(buf)
        ocrRaw.push(ocrResult)

        // One line of searchable text per recognised fragment.
        textList.push(ocrResult.map((x) => x.text).join('\n'))
      }
    } catch (e) {
      debug('unable to process ocr, skipping it', e)
    }

    const searchable = textList
      .filter((x) => x)
      .join('\n')
      .trim()
    if (!searchable) {
      return
    }

    await this.queue.add('message', {
      message: {
        ...message,
        text: searchable,
        ocr: ocrRaw,
      },
    })
  }
}
/**
 * Returns a shallow copy of `obj` without the listed keys.
 *
 * Keys are compared case-insensitively. A key is dropped when it matches a
 * listed key under either `toLowerCase()` or `toLocaleLowerCase()`, so a
 * caller cannot smuggle a protected option through with different casing.
 * The input object is never mutated.
 *
 * @param obj source object
 * @param keyToRemove a single key or a list of keys to strip
 * @returns a new object containing only the remaining own enumerable entries
 */
// `any` keeps interface-typed option bags (no index signature) assignable.
function removeKeysFromObject<T extends Record<string, any>>(
  obj: T,
  keyToRemove: string | string[],
): T {
  const keys = Array.isArray(keyToRemove) ? keyToRemove : [keyToRemove]
  const blockedLower = new Set(keys.map((k) => k.toLowerCase()))
  const blockedLocaleLower = new Set(keys.map((k) => k.toLocaleLowerCase()))

  const kept: Record<string, unknown> = {}
  for (const [key, value] of Object.entries(obj)) {
    const blocked =
      blockedLocaleLower.has(key.toLocaleLowerCase()) ||
      blockedLower.has(key.toLowerCase())
    if (!blocked) {
      kept[key] = value
    }
  }
  // The result intentionally keeps the declared type so callers can re-assign
  // the removed (optional) properties afterwards.
  return kept as T
}
/**
 * Creates the Nest application and starts the components selected by the
 * first CLI argument, a comma separated role list (defaults to `bot,ocr`).
 *
 * - `bot`: starts the bot, migrates the search index, recovers the cached
 *   message queue, starts the async message handler and the HTTP listener.
 * - `ocr`: starts the OCR queue handler.
 */
async function bootstrap() {
  debug('bootstrapping app')

  const logger = new Logger('bootstrap')
  const [roleArg] = process.argv.slice(2)
  // Tolerate "bot, ocr" and stray commas: trim entries and drop empty ones.
  const roles = roleArg
    ? roleArg
        .split(',')
        .map((r) => r.trim())
        .filter((r) => r.length > 0)
    : ['bot', 'ocr']
  const runBot = roles.includes('bot')
  const runOcr = roles.includes('ocr')
  logger.log(`Starting roles ${roles.join(', ')}`)

  debug('creating app')
  const app = await NestFactory.create<NestFastifyApplication>(
    AppModule,
    new FastifyAdapter({
      // Request body limit of 800 MiB.
      bodyLimit: 800 * 1024 * 1024,
    }),
    {
      cors: {
        allowedHeaders: ['Content-Type', 'Authorization', 'X-Meili-API-Key'],
        origin: '*',
        maxAge: 86400,
      },
    },
  )

  const httpCfg = app.get<ConfigType<typeof httpConfig>>(httpConfig.KEY)
  app.setGlobalPrefix(httpCfg.globalPrefix)

  if (runBot) {
    debug('creating bot')
    const bot = app.get(BotService)
    await bot.start()

    debug('migrating search')
    const search = app.get(MeiliSearchService)
    await search.migrate()

    debug('recovering index')
    const index = app.get(IndexService)
    await index.recoverFromCache()

    debug('start async index handler')
    await index.startHandleAsyncMessage()
  }

  if (runOcr) {
    debug('start ocr handler')
    const imageIndex = app.get(ImageIndexService)
    await imageIndex.startHandleOCR()
  }

  debug('enable shutdown hooks')
  app.enableShutdownHooks()

  // Only the bot role serves HTTP; an ocr-only process is a pure queue worker.
  if (runBot) {
    debug('starting http')
    await app.listen(httpCfg.port, httpCfg.host)
  }

  logger.log('App bootstrap finished')
}
/**
 * Splits `array` into consecutive slices of at most `size` elements; the last
 * slice may be shorter. The input array is not modified.
 *
 * @param array items to split
 * @param size maximum length of each slice, a positive integer
 * @returns the slices in original order (empty when `array` is empty)
 * @throws RangeError when `size` is not a positive integer; a zero or
 *   negative step would otherwise never terminate the loop
 */
function chunk<T>(array: readonly T[], size: number): T[][] {
  if (!Number.isInteger(size) || size < 1) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`)
  }

  const chunkedArray: T[][] = []
  for (let i = 0; i < array.length; i += size) {
    chunkedArray.push(array.slice(i, i + size))
  }
  return chunkedArray
}
OCR Config ## 67 | 68 | # Enable OCR, if OCR is not enabled then texts in image will not be searchable 69 | OCR_ENABLE=false 70 | 71 | # OCR Driver, 'google' | 'paddle-ocr-web' 72 | OCR_DRIVER=google 73 | 74 | # OCR Enpoint 75 | # for google, use 'eu-vision.googleapis.com' or 'us-vision.googleapis.com' 76 | # for paddle-ocr-web, use http://localhost:1234/api 77 | OCR_ENDPOINT=eu-vision.googleapis.com 78 | 79 | # OCR Credentials 80 | # for google, this will be ignored, you should set GOOGLE_APPLICATION_CREDENTIALS below 81 | # for paddle-ocr-web, this will be ignored 82 | OCR_CREDENTIALS= 83 | 84 | ## Queue Config ## 85 | 86 | # Enable queue, if queue is not enabled then all task will be done in-process 87 | QUEUE_ENABLE=false 88 | 89 | # Redis related config 90 | QUEUE_REDIS_HOST=${CACHE_REDIS_HOST} 91 | QUEUE_REDIS_PORT=${CACHE_REDIS_PORT} 92 | QUEUE_REDIS_PASSWORD=${CACHE_REDIS_PASSWORD} 93 | QUEUE_REDIS_DB=${CACHE_REDIS_DB} 94 | QUEUE_REDIS_KEY_PREFIX=${CACHE_REDIS_KEY_PREFIX}queue_ 95 | 96 | ## Third Party Integrations ## 97 | 98 | # Set to the path of the JSON file that contains your service account key if you are using google cloud services 99 | GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials/of/google.json 100 | -------------------------------------------------------------------------------- /src/search/index.service.ts: -------------------------------------------------------------------------------- 1 | import { 2 | CACHE_MANAGER, 3 | Inject, 4 | Injectable, 5 | OnModuleDestroy, 6 | } from '@nestjs/common' 7 | import { MeiliSearchService, MessageIndex } from './meili-search.service' 8 | import Debug from 'debug' 9 | import { Cache } from 'cache-manager' 10 | import { QueueProcessor } from 'src/queue/meta.types' 11 | import { QueueService } from 'src/queue/queue.service' 12 | 13 | const debug = Debug('app:search:index') 14 | 15 | const MESSAGES_QUEUE_KEY = 'messages' 16 | const INSERT_BATCH = 100 17 | const INSERT_TIMEOUT = 60 * 1000 18 | 19 | 
/**
 * Batches messages before importing them into the search index.
 *
 * Messages are collected in memory and mirrored to the cache on every change.
 * The batch is imported once INSERT_BATCH messages are queued, or after
 * INSERT_TIMEOUT milliseconds without a new message, or on module shutdown.
 */
@Injectable()
export class IndexService implements OnModuleDestroy {
  private messagesQueue: MessageIndex[]
  private queueTimer: ReturnType<typeof setTimeout> | undefined

  public constructor(
    @Inject(CACHE_MANAGER) private cache: Cache,
    private search: MeiliSearchService,
    private asyncQueue: QueueService,
  ) {
    this.messagesQueue = []
  }

  /**
   * Loads a queue persisted by a previous run, puts it in front of anything
   * queued since startup, and imports the result immediately.
   */
  public async recoverFromCache() {
    const queue = await this.cache.get<MessageIndex[]>(MESSAGES_QUEUE_KEY)
    if (queue && Array.isArray(queue)) {
      this.messagesQueue = queue.concat(this.messagesQueue)
    }
    if (this.messagesQueue.length > 0) {
      debug(`${this.messagesQueue.length} items recovered from cache`)
      await this.importAllQueued()
    }
  }

  /** Persists the current in-memory queue to the cache (ttl 0). */
  public async writeToCache() {
    debug(`writing cache (${this.messagesQueue.length} items)`)
    await this.cache.set(MESSAGES_QUEUE_KEY, this.messagesQueue, { ttl: 0 })
  }

  /**
   * Imports every queued message into the search index.
   *
   * The queue is swapped out before the import so messages arriving meanwhile
   * start a fresh batch. On failure the batch is put back in front of them
   * and the error is rethrown. The cache is rewritten only after a success.
   */
  public async importAllQueued() {
    if (this.messagesQueue.length < 1) {
      return
    }
    debug('importing all queued message')
    const queue = this.messagesQueue
    this.messagesQueue = []
    try {
      await this.search.importMessages(queue)
    } catch (e) {
      this.messagesQueue = queue.concat(this.messagesQueue)
      throw e
    }
    await this.writeToCache()
  }

  /**
   * Adds one message to the batch and (re)arms the flush logic: import right
   * away when the batch is full, otherwise restart the idle timer.
   */
  public queueMessage(message: MessageIndex) {
    debug('adding message to queue', message)
    this.messagesQueue.push(message)

    // Fire-and-forget: a cache failure must not block indexing.
    this.writeToCache().catch(console.error)

    this.clearQueueTimer()

    if (this.messagesQueue.length >= INSERT_BATCH) {
      debug('message batch reached')
      this.importAllQueued().catch(console.error)
    } else {
      this.queueTimer = setTimeout(() => {
        this.queueTimer = undefined
        debug('insert timeout reached')
        this.importAllQueued().catch(console.error)
      }, INSERT_TIMEOUT)
    }
  }

  /**
   * Shutdown hook: cancels the pending idle flush so it cannot fire after the
   * module is destroyed, then imports whatever is still queued.
   */
  public async onModuleDestroy() {
    debug('app exiting, importing queued messages')
    this.clearQueueTimer()
    await this.importAllQueued()
  }

  /** Registers `handleAsyncMessage` as the processor of the `message` queue. */
  public async startHandleAsyncMessage() {
    await this.asyncQueue.process('message', this.handleAsyncMessage)
  }

  /** Queue processor: feeds each async message into the local batch. */
  private handleAsyncMessage: QueueProcessor<'message'> = async ({
    message,
  }) => {
    this.queueMessage(message)
  }

  /** Cancels the idle flush timer, if one is armed. */
  private clearQueueTimer() {
    if (this.queueTimer !== undefined) {
      clearTimeout(this.queueTimer)
      this.queueTimer = undefined
    }
  }
}
"^8.5.1", 43 | "meilisearch": "^0.34.2", 44 | "node-fetch": "2", 45 | "reflect-metadata": "^0.1.13", 46 | "rimraf": "^3.0.2", 47 | "rxjs": "^7.2.0" 48 | }, 49 | "devDependencies": { 50 | "@nestjs/cli": "^8.0.0", 51 | "@nestjs/schematics": "^8.0.0", 52 | "@nestjs/testing": "^8.0.0", 53 | "@types/cache-manager": "^3.4.2", 54 | "@types/debug": "^4.1.7", 55 | "@types/deep-equal": "^1.0.1", 56 | "@types/express": "^4.17.13", 57 | "@types/ioredis": "^4.27.7", 58 | "@types/jest": "^27.0.1", 59 | "@types/jsonwebtoken": "^8.5.5", 60 | "@types/node": "^16.0.0", 61 | "@types/node-fetch": "2", 62 | "@types/supertest": "^2.0.11", 63 | "@typescript-eslint/eslint-plugin": "^4.28.2", 64 | "@typescript-eslint/parser": "^4.28.2", 65 | "eslint": "^7.30.0", 66 | "eslint-config-prettier": "^8.3.0", 67 | "eslint-plugin-prettier": "^3.4.0", 68 | "jest": "^27.0.6", 69 | "prettier": "^2.3.2", 70 | "supertest": "^6.1.3", 71 | "ts-jest": "^27.0.3", 72 | "ts-loader": "^9.2.3", 73 | "ts-node": "^10.0.0", 74 | "tsconfig-paths": "^3.10.1", 75 | "typescript": "^4.3.5" 76 | }, 77 | "jest": { 78 | "moduleFileExtensions": [ 79 | "js", 80 | "json", 81 | "ts" 82 | ], 83 | "rootDir": "src", 84 | "testRegex": ".*\\.spec\\.ts$", 85 | "transform": { 86 | "^.+\\.(t|j)s$": "ts-jest" 87 | }, 88 | "collectCoverageFrom": [ 89 | "**/*.(t|j)s" 90 | ], 91 | "coverageDirectory": "../coverage", 92 | "testEnvironment": "node" 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/queue/bull-queue.service.ts: -------------------------------------------------------------------------------- 1 | import { Inject, Injectable, OnModuleDestroy } from '@nestjs/common' 2 | import { ConfigType } from '@nestjs/config' 3 | import queueConfig from '../config/queue.config' 4 | import { Queue, Worker } from 'bullmq' 5 | import Debug from 'debug' 6 | import { QueueMeta, QueueProcessor, QueueTypes } from './meta.types' 7 | import { QueueService } from './queue.service' 8 | 9 | const 
/**
 * QueueService implementation backed by BullMQ.
 *
 * One BullMQ queue is created per known queue name (`ocr`, `message`) at
 * construction time; workers are created lazily by `process()`.
 *
 * NOTE(review): the generic arguments in this class were reconstructed from
 * usage -- confirm them against `QueueTypes` / `QueueMeta` / `QueueProcessor`
 * in meta.types.ts.
 */
@Injectable()
export class BullQueueService implements OnModuleDestroy, QueueService {
  private readonly queueMap: Map<QueueTypes, Queue>
  private readonly workerMap: Map<QueueTypes, Worker>
  private readonly redisOptions: ConfigType<typeof queueConfig>['redis']
  private readonly redisKeyPrefix: string

  public constructor(
    @Inject(queueConfig.KEY) queueCfg: ConfigType<typeof queueConfig>,
  ) {
    this.queueMap = new Map()
    this.workerMap = new Map()
    this.redisOptions = queueCfg.redis
    this.redisKeyPrefix = queueCfg.keyPrefix
    for (const key of ['ocr', 'message'] as const) {
      this.setQueue(
        key,
        new Queue(key, {
          connection: this.redisOptions,
          prefix: this.redisKeyPrefix,
        }),
      )
    }
  }

  /** Registers the BullMQ queue used for `key`. */
  private setQueue<T extends QueueTypes>(key: T, queue: Queue<QueueMeta<T>>) {
    this.queueMap.set(key, queue)
  }

  /**
   * Returns the queue registered for `key`.
   *
   * @throws Error when no queue was registered under that name
   */
  private getQueue<T extends QueueTypes>(key: T): Queue<QueueMeta<T>> {
    const queue = this.queueMap.get(key)
    if (!queue) {
      throw new Error(`queue ${key} not found`)
    }
    return queue as Queue<QueueMeta<T>>
  }

  /**
   * Shutdown hook: closes workers first so no job is picked up while the
   * queues are going away, then closes the queues themselves.
   */
  public async onModuleDestroy() {
    for (const worker of this.workerMap.values()) {
      await worker.close()
    }
    this.workerMap.clear()

    for (const key of this.queueMap.keys()) {
      await this.getQueue(key).close()
    }
  }

  /**
   * Starts a worker that runs `handler` for every job of `queue`.
   *
   * @param queue name of a queue created in the constructor
   * @param handler called with the job payload; a rejection fails the job
   * @param concurrency number of jobs processed in parallel
   * @throws Error when the queue is unknown or already has a worker
   */
  public async process<T extends QueueTypes>(
    queue: T,
    handler: QueueProcessor<T>,
    concurrency = 1,
  ) {
    debug(
      `setup process handler for queue ${queue} with concurrency ${concurrency}`,
      handler,
    )
    if (!this.queueMap.has(queue)) {
      throw new Error(`Unknown queue ${queue}`)
    }
    if (this.workerMap.has(queue)) {
      throw new Error(`Already has worker for queue ${queue}`)
    }

    const worker = new Worker<QueueMeta<T>>(
      queue,
      async (job) => {
        // Per-job debug namespace so individual jobs can be traced.
        const jobDebug = Debug(`app:queue:bull:${queue}:${job.id}`)
        jobDebug(`running`)
        try {
          await handler(job.data)
          jobDebug('finished')
        } catch (err) {
          jobDebug('error', err)
          // Rethrow so BullMQ marks the job as failed.
          throw err
        }
      },
      {
        concurrency,
        connection: this.redisOptions,
        prefix: this.redisKeyPrefix,
      },
    )

    this.workerMap.set(queue, worker)
  }

  /** Appends a job carrying `data` to `queue`; the job is named after the queue. */
  public async add<T extends QueueTypes>(
    queue: T,
    data: QueueMeta<T>,
  ): Promise<void> {
    debug(`adding job to queue ${queue}`)
    await this.getQueue(queue).add(queue, data)
  }
}
rankingRules = [ 58 | 'words', 59 | 'sort', 60 | 'typo', 61 | 'proximity', 62 | 'exactness', 63 | 'timestamp:desc', 64 | ] 65 | 66 | try { 67 | const index = await this.client.getIndex(this.messagesIndex.uid) 68 | if (index.primaryKey == null) { 69 | await this.client.updateIndex(this.messagesIndex.uid, { 70 | primaryKey: 'id', 71 | }) 72 | } 73 | } catch (e) { 74 | await this.client.createIndex(this.messagesIndex.uid, { 75 | primaryKey: 'id', 76 | }) 77 | } 78 | await this.messagesIndex.fetchInfo() 79 | 80 | const currentSettings = await this.messagesIndex.getSettings() 81 | for (const key of Object.keys(settings)) { 82 | if (!deepEqual(currentSettings[key], settings[key])) { 83 | await this.messagesIndex.updateSettings(settings) 84 | break 85 | } 86 | } 87 | 88 | const currentRankingRules = await this.messagesIndex.getRankingRules() 89 | if (!deepEqual(currentRankingRules, rankingRules)) { 90 | await this.messagesIndex.updateRankingRules(rankingRules) 91 | } 92 | } 93 | 94 | public async importMessages(messages: MessageIndex[]): Promise { 95 | debug('importing messages', messages) 96 | await this.messagesIndex.addDocuments(messages) 97 | } 98 | 99 | public async search(query: string, chatId: string, fromId?: number) { 100 | const result = await this.messagesIndex.search(query, { 101 | filter: [ 102 | `chatId = ${chatId}`, 103 | ...[fromId == null ? 
[] : [`fromId = ${fromId}`]], 104 | ], 105 | }) 106 | return result 107 | } 108 | 109 | public getMessagesIndex() { 110 | return this.messagesIndex 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/user/auth.controller.ts: -------------------------------------------------------------------------------- 1 | import { 2 | Controller, 3 | ForbiddenException, 4 | Get, 5 | Header, 6 | Inject, 7 | Query, 8 | Redirect, 9 | } from '@nestjs/common' 10 | import { ConfigType } from '@nestjs/config' 11 | import { createHash, createHmac, timingSafeEqual } from 'crypto' 12 | import { BotService } from 'src/bot/bot.service' 13 | import { TokenService } from 'src/token/token.service' 14 | import botConfig from '../config/bot.config' 15 | import httpConfig from '../config/http.config' 16 | 17 | @Controller('user/auth') 18 | export class AuthController { 19 | private botToken: string 20 | private uiUrl: string 21 | private baseUrl: string 22 | 23 | constructor( 24 | @Inject(httpConfig.KEY) httpCfg: ConfigType, 25 | @Inject(botConfig.KEY) botCfg: ConfigType, 26 | private tokenService: TokenService, 27 | private botService: BotService, 28 | ) { 29 | this.botToken = botCfg.token 30 | this.uiUrl = httpCfg.uiUrl 31 | this.baseUrl = `${httpCfg.baseUrl}${httpCfg.globalPrefix}` 32 | } 33 | 34 | // chatId=supergroup1098355009&id=29947350&first_name=三三&username=stupid33&photo_url=https%3A%2F%2Ft.me%2Fi%2Fuserpic%2F320%2FOm2VdBoFI8c9cDtdtheFHpFK8c-W5rjEtTFolUe4O6I.jpg&auth_date=1634173284&hash=somehex 35 | @Get('viaTelegram') 36 | @Header('Cache-Control', 'no-cache, no-store') 37 | @Redirect('/', 302) 38 | public async authCallback( 39 | @Query('chatId') chatId: string, 40 | @Query('id') userId: string, 41 | @Query('first_name') firstName: string, 42 | @Query('last_name') lastName: string, 43 | @Query('username') username: string, 44 | @Query('photo_url') photoUrl: string, 45 | @Query('auth_date') authDate: string, 46 | @Query('hash') 
hash: string, 47 | ): Promise<{ url: string }> { 48 | const telegramLogin = { 49 | id: userId, 50 | first_name: firstName, 51 | last_name: lastName, 52 | username, 53 | photo_url: photoUrl, 54 | auth_date: authDate, 55 | } 56 | 57 | await this.verifyHash(telegramLogin, hash) 58 | 59 | const numberUserId = Number(userId) 60 | if (!(await this.botService.checkIfUserIsMember(numberUserId, chatId))) { 61 | throw new ForbiddenException('User is not a member of this chat') 62 | } 63 | 64 | const token = this.tokenService.sign({ userId: numberUserId, chatId }) 65 | 66 | const url = new URL(`${this.uiUrl}/index.html`) 67 | url.searchParams.append('tas_server', this.baseUrl) 68 | url.searchParams.append('tas_indexName', chatId) 69 | url.searchParams.append('tas_authKey', token) 70 | 71 | return { url: url.toString() } 72 | } 73 | 74 | private async verifyHash( 75 | loginObject: Record, 76 | inputHash: string, 77 | ) { 78 | const authDate = loginObject.auth_date 79 | if (!authDate) { 80 | throw new ForbiddenException('No auth_date in login') 81 | } 82 | 83 | if (Math.abs(Number(authDate) * 1000 - Date.now()) > 5 * 60 * 1000) { 84 | throw new ForbiddenException('Invalid auth date') 85 | } 86 | 87 | // Data-check-string is a concatenation of all received fields, sorted in alphabetical order, in the format key= with a line feed character ('\n', 0x0A) used as separator 88 | const keys = Object.keys(loginObject).sort() 89 | const dataCheckString = keys 90 | .reduce( 91 | (acc, key) => 92 | loginObject[key] ? `${acc}${key}=${loginObject[key]}\n` : acc, 93 | '', 94 | ) 95 | .trimEnd() 96 | 97 | // You can verify the authentication and the integrity of the data received by comparing the received hash parameter with the hexadecimal representation of the HMAC-SHA-256 signature of the data-check-string with the SHA256 hash of the bot's token used as a secret key. 
const debug = Debug('app:import:import.controller')

/** Number of buffered messages that triggers a hand-off to the search service. */
const IMPORT_BATCH_SIZE = 10000

/** Fields of an exported chat that this controller reads. */
type TelegramExportGroup = {
  name: string
  type: string
  id: number
  messages: TelegramExportMessageInGroup[]
}

/** One structured fragment of an exported message text. */
type TelegramExportTextEntity = { type: string; text: string }

/** Fields of an exported message that this controller reads. */
type TelegramExportMessageInGroup = {
  id: number
  type: string
  date: string
  from: string
  from_id: `${'user' | 'channel' | 'group'}${string}`
  // Either a plain string, or a list mixing bare strings and text entities.
  text: string | Array<string | TelegramExportTextEntity>
}

/** Maps the export's chat `type` to the prefix used when building chat ids. */
const TelegramExportGroupTypeMap: Readonly<Record<string, string>> = {
  private_supergroup: 'supergroup',
}

/**
 * Flattens an exported message `text` into one searchable string.
 *
 * @returns the concatenated text (possibly empty), or `undefined` when the
 *   value is malformed: not a string/array, or an array containing an item
 *   that is neither a string nor an object with a string `text`.
 */
function flattenExportText(
  text: TelegramExportMessageInGroup['text'] | null | undefined,
): string | undefined {
  if (text == null) {
    return ''
  }
  if (typeof text === 'string') {
    return text
  }
  if (!Array.isArray(text)) {
    return undefined
  }

  const parts: string[] = []
  for (const item of text) {
    const part = typeof item === 'string' ? item : item?.text
    if (typeof part !== 'string') {
      return undefined
    }
    parts.push(part)
  }
  return parts.join('')
}

/**
 * HTTP endpoint that imports a chat history export into the search index.
 * Requests must carry the configured import token in the Authorization header.
 */
@Controller('import')
export class ImportController {
  private importToken: string

  constructor(
    private readonly searchService: MeiliSearchService,
    @Inject(authConfig.KEY) authCfg: ConfigType<typeof authConfig>,
  ) {
    this.importToken = authCfg.importToken
    if (!this.importToken) {
      throw new Error('please set AUTH_IMPORT_TOKEN to keep your data safe')
    }
  }

  /**
   * Imports every `message`-typed entry of one exported chat.
   *
   * @param body parsed export JSON
   * @param authHeader raw Authorization header; the token is the second
   *   space-separated part (the scheme word itself is not inspected)
   * @returns `{ queued }`, the number of messages handed to the search service
   * @throws ForbiddenException when the token is missing or does not match
   * @throws Error when the export is malformed or its chat type is unsupported
   */
  @Post('fromTelegramGroupExport')
  async fromTelegramGroupExport(
    @Body() body: TelegramExportGroup,
    @Headers('Authorization') authHeader: string,
  ) {
    const auth = `${authHeader}`.split(' ')[1]
    if (!auth) {
      throw new ForbiddenException('auth not found')
    }

    if (!this.compareImportToken(auth)) {
      throw new ForbiddenException('invalid token')
    }

    // NOTE(review): the validation failures below are plain Errors, which
    // presumably reach the client as a 500 - consider BadRequestException.
    const {
      id: groupId,
      type: groupType,
      messages,
    }: Partial<TelegramExportGroup> = body ?? {}
    if (!groupId) {
      throw new Error('groupId is required')
    }
    if (!Array.isArray(messages)) {
      throw new Error('import data misformed: messages is not array')
    }
    debug(
      `import from telegram group export: ${groupType}:${groupId}, ${messages.length} messages`,
    )

    // Own-property check: a bare index lookup would also accept inherited
    // keys such as "constructor" or "toString" as a known chat type.
    const mappedGroupType =
      typeof groupType === 'string' &&
      Object.prototype.hasOwnProperty.call(
        TelegramExportGroupTypeMap,
        groupType,
      )
        ? TelegramExportGroupTypeMap[groupType]
        : undefined
    if (!mappedGroupType) {
      throw new Error(`import data misformed: unknown group type: ${groupType}`)
    }
    const chatId = `${mappedGroupType}${groupId}`

    const messageBuffer: MessageIndex[] = []
    let messageCount = 0
    for (const message of messages) {
      const { id, type, date, from, from_id, text } = message
      switch (type) {
        case 'message': {
          const searchable = flattenExportText(text)
          if (searchable === undefined) {
            console.warn('message text dont have text', message)
            break
          }
          if (!searchable) {
            // Nothing to search for (e.g. empty text): skip silently.
            break
          }
          messageBuffer.push({
            id: `${chatId}__${id}`,
            messageId: id,
            chatId,
            fromId: from_id,
            fromName: from,
            text: searchable,
            raw: message,
            from: 'import',
            timestamp: new Date(date).getTime(),
          })
          break
        }
        case 'service':
          break
        default:
          console.warn('unknown message type', type, message)
          break
      }

      if (messageBuffer.length >= IMPORT_BATCH_SIZE) {
        messageCount += await this.flushBuffer(messageBuffer)
      }
    }

    messageCount += await this.flushBuffer(messageBuffer)

    debug(
      `import from telegram group export: ${groupType}:${groupId}, ${messageCount} of ${messages.length} messages has been queued`,
    )

    return { queued: messageCount }
  }

  /**
   * Hands the buffered messages to the search service and empties the buffer.
   * The service receives its own array, so refilling the buffer afterwards
   * cannot alter a batch the service may still be holding on to.
   *
   * @returns the number of messages handed over
   */
  private async flushBuffer(buffer: MessageIndex[]): Promise<number> {
    if (buffer.length === 0) {
      return 0
    }
    const batch = buffer.splice(0)
    await this.searchService.importMessages(batch)
    return batch.length
  }

  /**
   * Compares a presented token with the configured one without
   * short-circuiting on the first differing byte. Tokens of different
   * length never match (timingSafeEqual would throw on them).
   */
  private compareImportToken(token: string) {
    const presented = Buffer.from(token)
    const expected = Buffer.from(this.importToken)
    return (
      presented.length === expected.length &&
      timingSafeEqual(presented, expected)
    )
  }
}
[![quay.io](https://img.shields.io/badge/Browse%20on-quay.io-blue?logo=docker&logoColor=white)](https://quay.io/repository/oott123/telegram-archive-server?tab=tags) [![BSD 3 Clause Licensed](https://img.shields.io/github/license/oott123/telegram-archive-server)](./LICENSE) 4 | 5 | 一个适合 CJK 环境的,Telegram 群聊搜索和归档机器人。 6 | 7 | ## 功能概览 8 | 9 | - 支持群成员鉴权,仅群友可以搜索 10 | - 支持导入历史聊天记录,自动去重 11 | - 使用 MeiliSearch 对中文进行搜索,索引效果好 12 | - 支持图片 OCR 纳入搜索结果(仅支持新增,尚未支持历史图片) 13 | - 有简单的网页界面,可以显示头像 14 | - 搜索结果可以跳转打开聊天界面 15 | 16 | ## 展示 17 | 18 | ### 聊天鉴权 19 | 20 | ![](./docs/assets/search-command.jpg) 21 | 22 | 点击【搜索】按钮即可自动鉴权打开搜索界面。 23 | 24 | ### 搜索界面 25 | 26 | ![](./docs/assets/search-ui.jpg) 27 | 28 | 点击时间链接即可跳转聊天界面。 29 | 30 | ![](./docs/assets/search-and-jump.gif) 31 | 32 | ## 部署 33 | 34 | ### 准备 35 | 36 | 你需要: 37 | 38 | - 一个 Bot 帐号,事先获取它的 token 39 | - 一个公网可及的 https 服务器,一定要有 https 40 | - 一个**超级群**,目前只支持超级群 41 | - 一个 MeiliSearch 实例,配不配置 key 都行 42 | - 一个 Redis 实例,没有也行,就是可能异常重启会丢消息 43 | 44 | ### 配置 45 | 46 | 下载 [`.env.example`](./.env.example) 文件,参考内部注释,进行相应配置。 47 | 48 | 你可以将它保存为 `.env` ,或是作为环境变量配置。 49 | 50 | ### 运行 51 | 52 | #### HTTPS 53 | 54 | TAS 并不提供内建的 https 服务,建议使用 Caddy 或类似软件反向代理 TAS。 55 | 56 | #### With Docker 57 | 58 | ```bash 59 | docker run -d --restart=always --env-file=.env quay.io/oott123/telegram-archive-server 60 | ``` 61 | 62 | 当然,也可以使用 Kubernetes 或者 docker-compose 运行。 63 | 64 | #### Using Source Code 65 | 66 | 如果没有 Docker 或者不想用 Docker,也可以从源码编译部署。此时你还需要: 67 | 68 | - git 69 | - node 18 70 | 71 | ```bash 72 | git clone https://github.com/oott123/telegram-archive-server.git 73 | cd telegram-archive-server 74 | # git checkout vX.X.X 75 | cp .env.example .env 76 | vim .env 77 | yarn 78 | yarn build 79 | yarn start 80 | ``` 81 | 82 | ### 使用 83 | 84 | 在群里发送 `/search`。Bot 可能会提示你设置 Domain,按提示设置即可。 85 | 86 | ![](./docs/assets/bot-set-domain.gif) 87 | 88 | #### 获取用户头像 89 | 90 | 用户必须满足以下条件,才能在搜索结果中展示头像: 91 | 92 | - 曾与 Bot 交互过(发送过消息,或是授权登录过) 93 | - 用户设置头像公开可见 94 | 95 | #### 新记录的索引规则 
96 | 97 | 由于 MeiliSearch 对新消息的索引效率较差,只有在满足如下任意条件时,消息才会进入索引: 98 | 99 | - 60 秒内没有收到新消息 100 | - 累计收到了 100 条没有进入索引的消息 101 | - 主进程接收到 SIGINT 信号 102 | 103 | 如果没有使用 redis 以持久化消息队列,在程序异常、服务器重启时可能会丢失未进入队列的消息。 104 | 105 | ### 导入老的聊天记录 106 | 107 | **当前仅支持超级群导入。** 108 | 109 | 在桌面客户端点击三点按钮 - Export chat history,等待导出完成,得到 `result.json`。 110 | 111 | 执行: 112 | 113 | ```bash 114 | curl \ 115 | -H "Content-Type: application/json" \ 116 | -H "Authorization: Bearer $AUTH_IMPORT_TOKEN" \ 117 | -XPOST -T result.json \ 118 | http://localhost:3100/api/v1/import/fromTelegramGroupExport 119 | ``` 120 | 121 | 即可导入记录。注意一次只能导入单个群的记录。 122 | 123 | ### OCR 识别文字(TBD) 124 | 125 | 如果启用 OCR 队列,那么 Redis 是必须的(可以和缓存共用一个实例),并配置第三方识别服务。识别流程如下: 126 | 127 | [![](https://mermaid.ink/img/eyJjb2RlIjoic2VxdWVuY2VEaWFncmFtXG4gIGF1dG9udW1iZXJcbiAgQm905a6e5L6LLT4-K09DUuWunuS-izog6YCa6L-HIE9DUiDpmJ_liJflj5HpgIHlm77niYdcbiAgT0NS5a6e5L6LLT4-K09DUuacjeWKoTog6K-G5Yir5Zu-54mHXG4gIE9DUuacjeWKoS0-Pi1PQ1Llrp7kvos6IOi_lOWbnue7k-aenFxuICBPQ1Llrp7kvostPj4tQm905a6e5L6LOiDpgJrov4flhaXlupPpmJ_liJflj5HpgIHor4bliKvnu5PmnpxcbiAgYWN0aXZhdGUgQm905a6e5L6LXG4gIEJvdOWunuS-iy0-Pi1NZWlsaVNlYXJjaDog5YWl5bqTIiwibWVybWFpZCI6eyJ0aGVtZSI6ImRlZmF1bHQifSwidXBkYXRlRWRpdG9yIjp0cnVlLCJhdXRvU3luYyI6dHJ1ZSwidXBkYXRlRGlhZ3JhbSI6dHJ1ZX0)](https://mermaid.live/edit/#eyJjb2RlIjoic2VxdWVuY2VEaWFncmFtXG4gIGF1dG9udW1iZXJcbiAgQm905a6e5L6LLT4-K09DUuWunuS-izog6YCa6L-HIE9DUiDpmJ_liJflj5HpgIHlm77niYdcbiAgT0NS5a6e5L6LLT4-K09DUuacjeWKoTog6K-G5Yir5Zu-54mHXG4gIE9DUuacjeWKoS0-Pi1PQ1Llrp7kvos6IOi_lOWbnue7k-aenFxuICBPQ1Llrp7kvostPj4tQm905a6e5L6LOiDpgJrov4flhaXlupPpmJ_liJflj5HpgIHor4bliKvnu5PmnpxcbiAgYWN0aXZhdGUgQm905a6e5L6LXG4gIEJvdOWunuS-iy0-Pi1NZWlsaVNlYXJjaDog5YWl5bqTIiwibWVybWFpZCI6IntcbiAgXCJ0aGVtZVwiOiBcImRlZmF1bHRcIlxufSIsInVwZGF0ZUVkaXRvciI6dHJ1ZSwiYXV0b1N5bmMiOnRydWUsInVwZGF0ZURpYWdyYW0iOnRydWV9) 128 | 129 | 识别和入库可以在不同的角色实例上完成:图片下载和文本入库将在 Bot 实例上完成,OCR 实例仅需访问 OCR 服务即可。 130 | 131 | 这样的设计使得维护者可以设计离线式的集中识别(例如使用抢占式实例运行识别服务,队列清空后关机),降低识别成本。 132 | 133 | 
如果你使用的是第三方云服务,可以直接关闭 OCR 队列,或是在同一个实例中开启 Bot 和 OCR 角色。 134 | 135 | #### 识别服务 136 | 137 | ##### Google Cloud Vision 138 | 139 | 参考 [Google Cloud Vision 文本识别文档](https://cloud.google.com/vision/docs/ocr) 和 [Google Cloud Vision 计费规则](https://cloud.google.com/vision/pricing)。配置如下: 140 | 141 | ```bash 142 | OCR_DRIVER=google 143 | OCR_ENDPOINT=eu-vision.googleapis.com # 或者 us-vision.googleapis.com ,决定 Google 在何处存储处理数据 144 | GOOGLE_APPLICATION_CREDENTIALS=/path/to/google/credentials.json # 从 GCP 后台下载的 json 鉴权文件 145 | ``` 146 | 147 | ##### PaddleOCR 148 | 149 | 你需要一个 [paddleocr-web](https://github.com/lilydjwg/paddleocr-web) 实例。配置如下: 150 | 151 | ```bash 152 | OCR_DRIVER=paddle-ocr-web 153 | OCR_ENDPOINT=http://127.0.0.1:8980/api 154 | ``` 155 | 156 | ##### Azure OCR 157 | 158 | 创建一个 [Azure Vision](https://portal.azure.com/#create/Microsoft.CognitiveServicesComputerVision) 资源,并将资源信息配置如下: 159 | 160 | ```bash 161 | OCR_DRIVER=azure 162 | OCR_ENDPOINT=https://tas.cognitiveservices.azure.com 163 | OCR_CREDENTIALS=000000000000000000000000000000000 164 | ``` 165 | 166 | #### 启动不同角色 167 | 168 | ```bash 169 | docker run [...] 
dist/main ocr,bot 170 | # or 171 | node dist/main ocr,bot 172 | ``` 173 | 174 | ## 开发 175 | 176 | ```bash 177 | DEBUG=app:*,grammy* yarn start:debug 178 | ``` 179 | 180 | ### 前端开发 181 | 182 | 搜索服务鉴权后,服务端会跳转到:`$HTTP_UI_URL/index.html` 并带上以下 URL 参数: 183 | 184 | - `tas_server` - 服务器基础 URL,形如 `http://localhost:3100/api/v1` 185 | - `tas_indexName` - 群号,形如 `supergroup1234567890` 186 | - `tas_authKey` - 服务器签发的 JWT,可以用来作为 MeiliSearch 的 api key 使用。 187 | 188 | ### MeiliSearch 兼容 189 | 190 | 在 `/api/v1/search/compilable/meili` 处可以当作普通的 MeiliSearch 实例进行搜索。 191 | 192 | 索引名应该使用形如 `supergroup1234567890` 的群号; API Key 则是服务端签发的 JWT。 193 | 194 | 请注意 filter 由于安全原因暂时不可使用。 195 | -------------------------------------------------------------------------------- /public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 搜索 8 | 10 | 51 | 52 | 53 | 54 |
55 | 56 |
57 |
58 |
const debug = Debug('app:bot:bot.service')

/**
 * Wraps the Telegram bot: registers the message and command handlers,
 * starts it in webhook or polling mode, and exposes the Bot API lookups
 * used by the rest of the application (membership check, profile photo).
 */
@Injectable()
export class BotService {
  private bot: Bot
  private useWebhook: boolean
  private baseUrl: string
  private updateToken: string
  // Agent passed to every outgoing request; left untyped as in the original.
  private agent: any

  public constructor(
    @Inject(botConfig.KEY)
    botCfg: ConfigType<typeof botConfig>,
    @Inject(httpConfig.KEY)
    httpCfg: ConfigType<typeof httpConfig>,
    private index: IndexService,
    private imageIndex: ImageIndexService,
  ) {
    this.useWebhook = botCfg.webhook
    this.baseUrl = `${httpCfg.baseUrl}${httpCfg.globalPrefix}`
    this.updateToken = botCfg.updateToken || botCfg.token

    // Test the configured value itself: the interpolated `this.baseUrl` is a
    // non-empty string even when the base url is missing ("undefined...").
    if (this.useWebhook && !httpCfg.baseUrl) {
      throw new Error(
        'You MUST set HTTP_BASE_URL if you have enabled TELEGRAM_WEBHOOK',
      )
    }

    this.agent = getProxyAgent()
    this.bot = new Bot(botCfg.token, {
      client: {
        baseFetchConfig: {
          agent: this.agent,
          compress: true,
        },
      },
    })

    this.bot.on('msg', this.botOnMessage)

    if (botCfg.followEdit) {
      this.bot.on('edit', this.botOnMessage)
    }

    this.bot.command('search', this.botOnSearchCommand)
    this.bot.command('flush', this.botOnFlushCommand)
  }

  /** Starts the bot: registers the webhook when enabled, otherwise polls. */
  public async start() {
    if (this.useWebhook) {
      await this.bot.init()
      return this.setWebhookUrl()
    } else {
      await this.startPolling()
    }
  }

  /**
   * Asks the Bot API for the user's status in the chat.
   *
   * @returns true for `member`, `creator` and `administrator`
   */
  public async checkIfUserIsMember(userId: number, chatId: string) {
    const id = this.chatId2ApiId(chatId)
    const { status } = await this.bot.api.getChatMember(id, userId)

    return (
      status === 'member' || status === 'creator' || status === 'administrator'
    )
  }

  /**
   * Converts an index chat id back to a numeric Bot API chat id by replacing
   * the `supergroup` prefix with `-100` and the `group` prefix with `-`.
   * NOTE(review): the handlers below only strip a leading `-100` when
   * building ids, so only the supergroup form round-trips - confirm before
   * relying on other chat types.
   */
  public chatId2ApiId(chatId: string) {
    return Number(chatId.replace(/^supergroup/, '-100').replace(/^group/, '-'))
  }

  /**
   * Downloads the smallest size of the user's first profile photo.
   *
   * @returns the download response, or null when no photo is available
   */
  public async getProfilePhoto(userId: number) {
    const { photos } = await this.tryGetPhotos(userId)
    if (photos.length < 1 || photos[0].length < 1) {
      return null
    }

    const { file_id: fileId } = getSmallestPhoto(photos[0])
    return await this.fetchFile(fileId)
  }

  /** Resolves a file id to its download path and fetches it. */
  private async fetchFile(fileId: string) {
    const { file_path: filePath } = await this.bot.api.getFile(fileId)
    const fileUrl = `https://api.telegram.org/file/bot${this.bot.token}/${filePath}`

    const res = await fetch(fileUrl, { agent: this.agent })

    return res
  }

  /** Builds the index chat id: chat type followed by the id without `-100`. */
  private toChatId(chat: { id: number; type: string }) {
    const realId = `${chat.id}`.replace(/^-100/, '')
    return `${chat.type}${realId}`
  }

  /**
   * Indexes an incoming (or edited) message. Downstream middleware runs
   * first via `next()`; afterwards the text or caption is queued, and any
   * attached photo is handed to the image index.
   */
  private botOnMessage = async (ctx: Context, next: NextFunction) => {
    await next()
    const { msg, chat, from } = ctx
    if (!chat || !msg || !from) {
      return
    }
    const chatId = this.toChatId(chat)
    const searchable = msg.text || msg.caption

    const baseMessage: OptionalTextMessageIndex = {
      id: `${chatId}__${msg.message_id}`,
      messageId: msg.message_id,
      chatId,
      fromId: `user${from.id}`,
      fromName: joinNames(from.first_name, from.last_name),
      text: searchable,
      raw: ctx.msg,
      from: 'bot',
      timestamp: msg.date * 1000,
    }

    if (searchable) {
      await this.index.queueMessage({
        ...baseMessage,
        text: searchable,
      })
    }

    if (msg.photo?.length) {
      await this.handlePhoto(msg.photo, baseMessage)
    }
  }

  /** Downloads the largest size of a photo and passes it to the image index. */
  private async handlePhoto(
    photoSize: PhotoSize[],
    baseMessage: OptionalTextMessageIndex,
  ) {
    const { file_id: fileId } = getLargestPhoto(photoSize)
    const res = await this.fetchFile(fileId)
    if (!res.ok) {
      // A failed download carries an error body, not image bytes.
      debug(`photo download failed with status ${res.status}, skipping`)
      return
    }
    const buf = await res.buffer()
    await this.imageIndex.indexImage([buf], baseMessage)
  }

  /** `/flush`: imports everything that is queued; refused in private chats. */
  private botOnFlushCommand = async (ctx: Context) => {
    const { chat } = ctx

    if (chat?.type === 'private') {
      await ctx.reply('?')
      return
    }

    await this.index.importAllQueued()
    await ctx.reply('🧽')
  }

  /**
   * `/search`: replies with a login button pointing at the auth endpoint for
   * this chat. When Telegram rejects the login URL, replies with the domain
   * that has to be configured instead.
   */
  private botOnSearchCommand = async (ctx: Context) => {
    const { msg, chat, from } = ctx
    if (!chat || !msg || !from) {
      return
    }

    if (chat.type === 'private') {
      await ctx.reply('?')
      return
    }

    const chatId = this.toChatId(chat)
    const authUrl = new URL(this.baseUrl + '/user/auth/viaTelegram')
    authUrl.searchParams.append('chatId', chatId)

    try {
      await ctx.reply('🔍群内消息搜索试运行中,有问题请点我头像', {
        reply_markup: {
          inline_keyboard: [
            [
              {
                text: '搜索',
                login_url: {
                  url: authUrl.toString(),
                  request_write_access: true,
                  forward_text: `搜索「${chat.title}」`,
                },
              },
            ],
          ],
        },
      })
    } catch (e: any) {
      if (
        e instanceof GrammyError &&
        (e.description.includes('login URL is invalid') ||
          e.description.includes('BOT_DOMAIN_INVALID'))
      ) {
        await ctx.reply(
          `当前无法使用登录,请联系 @BotFather 在 Settings / Domain 处将域名修改为 ${authUrl.hostname}`,
          { parse_mode: 'HTML' },
        )
      } else {
        throw e
      }
    }
  }

  /**
   * Fetches at most one profile photo set for the user. A "user not found"
   * failure is reported as an empty list; anything else is rethrown.
   */
  private async tryGetPhotos(userId: number) {
    try {
      return await this.bot.api.getUserProfilePhotos(userId, {
        limit: 1,
      })
    } catch (e: unknown) {
      // Only inspect the message of real Error objects; other thrown values
      // are rethrown untouched instead of failing on `.message`.
      if (e instanceof Error && e.message.includes('user not found')) {
        return { photos: [] }
      } else {
        throw e
      }
    }
  }

  /** Registers the update endpoint, which embeds the update token. */
  private async setWebhookUrl() {
    const url = `${this.baseUrl}/bot/webhook/${this.updateToken}/update`
    await this.bot.api.setWebhook(url)
  }

  /** Starts polling without awaiting the call to `bot.start()`. */
  private async startPolling() {
    void this.bot.start()
  }

  /** Feeds one update (e.g. received over the webhook) to the bot. */
  public handleUpdate(update: Update) {
    return this.bot.handleUpdate(update)
  }

  /**
   * Checks the token presented on the webhook path.
   * NOTE(review): plain equality is not constant-time; consider
   * crypto.timingSafeEqual as used for the import token.
   */
  public checkUpdateToken(tokenInput: string) {
    return tokenInput === this.updateToken
  }
}
/** Joins the name parts with a single space, leaving out empty or missing ones. */
function joinNames(firstName: string, lastName: string | undefined) {
  return [firstName, lastName].filter((x) => x).join(' ')
}

/**
 * Builds a proxy agent from the `https_proxy` (preferred) or `http_proxy`
 * environment variable.
 *
 * @returns the agent, or undefined when neither variable is set
 */
function getProxyAgent() {
  const proxy = process.env.https_proxy || process.env.http_proxy
  if (!proxy) {
    return
  }

  return createHttpsProxyAgent(proxy)
}

/**
 * Returns the element `isBetter` prefers, scanning left to right and keeping
 * the earliest element on ties. The input array is never modified.
 *
 * @throws Error when `photos` is empty
 */
function pickPhotoByWidth<T extends { width: number }>(
  photos: readonly T[],
  isBetter: (candidate: T, best: T) => boolean,
): T {
  if (photos.length === 0) {
    throw new Error('cannot pick a photo from an empty list')
  }
  return photos.reduce((best, candidate) =>
    isBetter(candidate, best) ? candidate : best,
  )
}

/**
 * Picks the narrowest photo size (the first one on ties) without reordering
 * the caller's array.
 *
 * @throws Error when `photos` is empty
 */
function getSmallestPhoto<T extends { width: number }>(
  photos: readonly T[],
): T {
  return pickPhotoByWidth(photos, (candidate, best) => candidate.width < best.width)
}

/**
 * Picks the widest photo size (the first one on ties) without reordering
 * the caller's array.
 *
 * @throws Error when `photos` is empty
 */
function getLargestPhoto<T extends { width: number }>(
  photos: readonly T[],
): T {
  return pickPhotoByWidth(photos, (candidate, best) => candidate.width > best.width)
}