├── tgbot ├── app │ ├── __init__.py │ ├── aiosqlite_wrapper.py │ └── exceptions.py ├── views │ ├── __init__.py │ └── telegram │ │ ├── __init__.py │ │ ├── document_list_widget.py │ │ ├── common.py │ │ └── progress_bar.py ├── widgets │ └── __init__.py ├── configs │ ├── development.yaml │ ├── __init__.py │ ├── logging.yaml │ └── base.yaml ├── .gitignore ├── promotions │ ├── __init__.py │ └── promotions.yaml ├── handlers │ ├── stop.py │ ├── noop.py │ ├── aboutus.py │ ├── librarian.py │ ├── help.py │ ├── howtohelp.py │ ├── shortlink.py │ ├── close.py │ ├── report.py │ ├── roll.py │ ├── q.py │ ├── view.py │ ├── start.py │ ├── submit.py │ ├── vote.py │ ├── mlt.py │ ├── cybrex.py │ └── riot.py ├── translations │ └── __init__.py ├── requirements.txt ├── Dockerfile ├── main.py ├── README.md └── markdownifytg.py ├── web ├── .prettierrc.json ├── .eslintignore ├── service-worker.js ├── public │ ├── favicon.ico │ ├── favicon-dark.png │ ├── mstile-70x70.png │ ├── default-cover.jpg │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ ├── favicon-light.png │ ├── mstile-144x144.png │ ├── mstile-150x150.png │ ├── mstile-310x150.png │ ├── mstile-310x310.png │ ├── apple-touch-icon.png │ ├── android-chrome-192x192.png │ ├── android-chrome-512x512.png │ ├── android-chrome-maskable-192x192.png │ ├── android-chrome-maskable-512x512.png │ ├── browserconfig.xml │ ├── sitemap.xml │ ├── site.webmanifest │ ├── favicon-black.svg │ ├── favicon.svg │ └── safari-pinned-tab.svg ├── src │ ├── services │ │ ├── search │ │ │ ├── index.ts │ │ │ ├── search-service.ts │ │ │ └── query-processor.ts │ │ ├── index.ts │ │ └── user-service.ts │ ├── assets │ │ └── origin.jpg │ ├── views │ │ ├── HowToSearchView.vue │ │ ├── InstallIpfsView.vue │ │ ├── DoomsdayView.vue │ │ ├── IntroView.vue │ │ ├── StcHubApiView.vue │ │ ├── StcBoxView.vue │ │ ├── DonateView.vue │ │ ├── DocumentView.vue │ │ ├── Reader.vue │ │ └── BookmarksView.vue │ ├── components │ │ ├── TagsList.vue │ │ ├── LoadingSpinner.vue │ │ ├── QrCode.vue │ │ 
├── ReferencesList.vue │ │ ├── SearchList.vue │ │ ├── ConnectivityIssues.vue │ │ ├── DocumentButtons.vue │ │ ├── DjvuReader.vue │ │ ├── EpubReader.vue │ │ ├── DocumentSnippet.vue │ │ ├── download-progress.ts │ │ └── PdfReader.vue │ ├── main.ts │ ├── App.vue │ ├── database.ts │ ├── router │ │ └── index.ts │ ├── utils.ts │ └── scss │ │ └── styles.scss ├── summa-config.json ├── tsconfig.config.json ├── .gitignore ├── env.d.ts ├── vite-sw.config.ts ├── publi.sh ├── tsconfig.json ├── README.md ├── index.html ├── .eslintrc.js ├── vite.config.ts └── package.json ├── cybrex ├── cybrex │ ├── __init__.py │ ├── chains │ │ ├── base.py │ │ ├── __init__.py │ │ └── map_reduce.py │ ├── prompts │ │ └── __init__.py │ ├── vector_storage │ │ ├── __init__.py │ │ └── base.py │ ├── exceptions.py │ ├── data_source │ │ ├── base.py │ │ └── geck_data_source.py │ ├── utils.py │ └── llm_manager.py ├── MANIFEST.in ├── .gitignore ├── .isort.cfg ├── .flake8 ├── requirements.txt ├── pyproject.toml ├── examples │ ├── on-the-fly-translation.ipynb │ └── analyse-references.ipynb └── README.md ├── geck ├── stc_geck │ ├── __init__.py │ ├── exceptions.py │ └── utils.py ├── MANIFEST.in ├── .gitignore ├── .isort.cfg ├── .flake8 ├── requirements.txt └── pyproject.toml ├── library ├── sciparse │ ├── __init__.py │ ├── models │ │ ├── .gitignore │ │ └── lid.176.ftz │ ├── exceptions.py │ ├── language_detect.py │ └── cli.py ├── telegram │ ├── README.md │ ├── __init__.py │ ├── session_backend │ │ ├── __init__.py │ │ └── core_postgres.py │ ├── common.py │ ├── promotioner.py │ └── utils.py ├── .gitignore ├── user_manager │ ├── __init__.py │ └── user_manager.py └── textutils │ ├── __init__.py │ ├── html_processing.py │ └── utils.py ├── .flake8 ├── .gitignore ├── .isort.cfg ├── .env.light └── docker-compose.light.yml /tgbot/app/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/tgbot/views/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /web/.prettierrc.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /cybrex/cybrex/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cybrex/cybrex/chains/base.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /geck/stc_geck/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /library/sciparse/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /library/telegram/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /library/telegram/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tgbot/widgets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cybrex/cybrex/chains/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cybrex/cybrex/prompts/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tgbot/views/telegram/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tgbot/configs/development.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | -------------------------------------------------------------------------------- /cybrex/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt -------------------------------------------------------------------------------- /cybrex/cybrex/vector_storage/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /geck/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt -------------------------------------------------------------------------------- /library/sciparse/models/.gitignore: -------------------------------------------------------------------------------- 1 | lid.176.bin -------------------------------------------------------------------------------- /cybrex/.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | __pycache__ 3 | dist -------------------------------------------------------------------------------- /geck/.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | __pycache__ 3 | dist -------------------------------------------------------------------------------- /tgbot/.gitignore: -------------------------------------------------------------------------------- 1 | bots.db 2 | 
configs/production.yaml -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = venv/* 3 | max-line-length = 160 -------------------------------------------------------------------------------- /library/.gitignore: -------------------------------------------------------------------------------- 1 | actions 2 | integral 3 | pdftools 4 | siteparsers -------------------------------------------------------------------------------- /web/.eslintignore: -------------------------------------------------------------------------------- 1 | .eslintrc.js 2 | public/* 3 | service-worker.js -------------------------------------------------------------------------------- /web/service-worker.js: -------------------------------------------------------------------------------- 1 | node_modules/summa-wasm/dist/service-worker.js -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | fabrica 3 | infra 4 | venv 5 | docker-compose.yml 6 | __pycache__ 7 | -------------------------------------------------------------------------------- /web/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nexus-stc/stc/HEAD/web/public/favicon.ico -------------------------------------------------------------------------------- /web/src/services/search/index.ts: -------------------------------------------------------------------------------- 1 | export {IpfsSearchService} from './ipfs-search-service' 2 | -------------------------------------------------------------------------------- /web/public/favicon-dark.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nexus-stc/stc/HEAD/web/public/favicon-dark.png -------------------------------------------------------------------------------- /web/public/mstile-70x70.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nexus-stc/stc/HEAD/web/public/mstile-70x70.png -------------------------------------------------------------------------------- /web/src/assets/origin.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nexus-stc/stc/HEAD/web/src/assets/origin.jpg -------------------------------------------------------------------------------- /web/public/default-cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nexus-stc/stc/HEAD/web/public/default-cover.jpg -------------------------------------------------------------------------------- /web/public/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nexus-stc/stc/HEAD/web/public/favicon-16x16.png -------------------------------------------------------------------------------- /web/public/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nexus-stc/stc/HEAD/web/public/favicon-32x32.png -------------------------------------------------------------------------------- /web/public/favicon-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nexus-stc/stc/HEAD/web/public/favicon-light.png -------------------------------------------------------------------------------- /web/public/mstile-144x144.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nexus-stc/stc/HEAD/web/public/mstile-144x144.png 
-------------------------------------------------------------------------------- /web/public/mstile-150x150.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nexus-stc/stc/HEAD/web/public/mstile-150x150.png -------------------------------------------------------------------------------- /web/public/mstile-310x150.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nexus-stc/stc/HEAD/web/public/mstile-310x150.png -------------------------------------------------------------------------------- /web/public/mstile-310x310.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nexus-stc/stc/HEAD/web/public/mstile-310x310.png -------------------------------------------------------------------------------- /library/user_manager/__init__.py: -------------------------------------------------------------------------------- 1 | from .user_manager import UserManager 2 | 3 | __all__ = ['UserManager'] 4 | -------------------------------------------------------------------------------- /web/public/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nexus-stc/stc/HEAD/web/public/apple-touch-icon.png -------------------------------------------------------------------------------- /library/sciparse/models/lid.176.ftz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nexus-stc/stc/HEAD/library/sciparse/models/lid.176.ftz -------------------------------------------------------------------------------- /web/public/android-chrome-192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nexus-stc/stc/HEAD/web/public/android-chrome-192x192.png 
-------------------------------------------------------------------------------- /web/public/android-chrome-512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nexus-stc/stc/HEAD/web/public/android-chrome-512x512.png -------------------------------------------------------------------------------- /web/src/services/index.ts: -------------------------------------------------------------------------------- 1 | export { IpfsSearchService } from './search' 2 | export { UserService } from './user-service' 3 | -------------------------------------------------------------------------------- /geck/.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | skip_glob=**/venv/** 3 | include_trailing_comma=True 4 | multi_line_output=3 5 | force_grid_wrap=2 6 | -------------------------------------------------------------------------------- /cybrex/.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | skip_glob=**/venv/** 3 | include_trailing_comma=True 4 | multi_line_output=3 5 | force_grid_wrap=2 6 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | skip_glob=**/venv/** 3 | include_trailing_comma=True 4 | multi_line_output=3 5 | force_grid_wrap=2 6 | line_length=120 -------------------------------------------------------------------------------- /library/sciparse/exceptions.py: -------------------------------------------------------------------------------- 1 | from aiobaseclient.exceptions import BadRequestError 2 | 3 | __all__ = [ 4 | 'BadRequestError', 5 | ] 6 | -------------------------------------------------------------------------------- /library/telegram/session_backend/__init__.py: 
class QdrantStorageNotAvailableError(BaseError):
    # Marker exception: adds no state or behaviour on top of `BaseError`.
    # NOTE(review): presumably raised when the Qdrant vector storage cannot be used —
    # confirm against the raise sites, which are not visible here.
    pass
"bafyb4iadbza7ckc3djc2k5lfaorwaufcjurzxzkjsj5e7qt2wrguqs7ywm", 3 | "ipfs_api_multiaddr": "/ip4/10.1.2.3/tcp/5001", 4 | "ipfs_http_base_url": "http://10.1.2.3:8080" 5 | } 6 | -------------------------------------------------------------------------------- /tgbot/promotions/__init__.py: -------------------------------------------------------------------------------- 1 | from izihawa_configurator import Configurator 2 | 3 | 4 | def get_promotions(): 5 | return Configurator(['tgbot/promotions/promotions.yaml'])['promotions'] 6 | 7 | 8 | promotions = get_promotions() 9 | -------------------------------------------------------------------------------- /web/src/services/user-service.ts: -------------------------------------------------------------------------------- 1 | export class UserService { 2 | liked_items: string[] 3 | 4 | constructor () { 5 | this.liked_items = [] 6 | } 7 | 8 | like (item: string) { 9 | this.liked_items.push(item) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /geck/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp>=3.8.5 2 | aiokit>=1.2.3 3 | aiosumma>=2.47.1 4 | humanfriendly>=10.0 5 | ipfs-hamt-directory-py>=0.1.1 6 | izihawa-ipfs-api>=1.0.7 7 | izihawa-utils>=1.1.3 8 | multidict>=6.0.4 9 | summa-embed>=0.20.2 10 | termcolor>=2.3.0 11 | fire>=0.5.0 -------------------------------------------------------------------------------- /web/public/browserconfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | #ffc40d 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /cybrex/cybrex/vector_storage/base.py: -------------------------------------------------------------------------------- 1 | from typing import ( 2 | Iterable, 3 | List, 4 | Optional, 5 | Tuple, 6 | ) 7 | 8 | 9 | class BaseVectorStorage: 10 | def query(self, 
class IpfsConnectionError(BaseError):
    # Marker exception: adds no state or behaviour on top of `BaseError`.
    # NOTE(review): presumably signals that the IPFS node could not be reached — confirm at the raise sites.
    pass


class ItemNotFound(BaseError):
    """Error that remembers the ``query`` it was raised for."""

    def __init__(self, query):
        # NOTE(review): `BaseError.__init__` is not invoked here; confirm the base class
        # does not rely on its own initialiser (e.g. for formatting the error).
        self.query = query


class CidNotFound(BaseError):
    """Error that remembers the ``query`` it was raised for."""

    def __init__(self, query):
        # NOTE(review): same as `ItemNotFound` — the base initialiser is skipped; confirm this is intended.
        self.query = query
def close_button(session_id: str = None):
    """
    Build the inline "close" button.

    :param session_id: optional session identifier; when truthy the callback
        payload is ``/close_<session_id>``, otherwise it is the bare ``/close``
    :return: the button object produced by ``Button.inline``
    """
    callback_data = f'/close_{session_id}' if session_id else '/close'
    return Button.inline(text='✖️', data=callback_data)
next two parameters at https://my.telegram.org 6 | STC_TGBOT_application.default_bot.app_id=... 7 | STC_TGBOT_application.default_bot.app_hash=... 8 | 9 | # Register your bot at @BotFather in Telegram 10 | STC_TGBOT_application.default_bot.bot_name=... 11 | STC_TGBOT_application.default_bot.bot_token=... 12 | -------------------------------------------------------------------------------- /web/.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | pnpm-debug.log* 8 | lerna-debug.log* 9 | 10 | node_modules 11 | .DS_Store 12 | dist 13 | dist-ssr 14 | coverage 15 | *.local 16 | 17 | /cypress/videos/ 18 | /cypress/screenshots/ 19 | 20 | # Editor directories and files 21 | .vscode/* 22 | !.vscode/extensions.json 23 | .idea 24 | *.suo 25 | *.ntvs* 26 | *.njsproj 27 | *.sln 28 | *.sw? 29 | -------------------------------------------------------------------------------- /cybrex/requirements.txt: -------------------------------------------------------------------------------- 1 | aiokit>=1.2.3 2 | beautifulsoup4>=4.12.2 3 | ctransformers>=0.2.17 4 | FlagEmbedding>=1.1.2 5 | InstructorEmbedding>=1.0.1 6 | izihawa-configurator>=1.0.4 7 | izihawa-utils>=1.1.3 8 | keybert>=0.7.0 9 | langchain>=0.0.222 10 | lazy>=1.5 11 | lxml>=4.9.3 12 | openai>=0.27.8 13 | orjson 14 | pypdf>=3.12.0 15 | pyyaml>=6.0 16 | qdrant_client>=1.5.4 17 | tiktoken>=0.5.1 18 | safetensors==0.3.1 19 | stc-geck>=1.8.35 20 | unstructured[html]>=0.10.28 21 | -------------------------------------------------------------------------------- /tgbot/handlers/noop.py: -------------------------------------------------------------------------------- 1 | from telethon import events 2 | 3 | from library.telegram.base import RequestContext 4 | 5 | from .base import BaseCallbackQueryHandler 6 | 7 | 8 | class NoopHandler(BaseCallbackQueryHandler): 9 | filter = 
def t(label, language='en'):
    """
    Return the translated string registered under ``label``.

    ``language`` is used only when it is present in the loaded translations
    and defines ``label``; in every other case the English entry is returned.
    """
    has_translation = language in _translations and label in _translations[language]
    effective_language = language if has_translation else 'en'
    return _translations[effective_language][label]
def dict_factory(cursor, row):
    """
    Row factory that converts a result row into a dict keyed by column name.

    :param cursor: cursor whose ``description`` lists the result columns; the
        first item of every entry is used as the key
    :param row: sequence of values positionally aligned with ``cursor.description``
    :return: dict mapping each column name to the value at the same position in ``row``
    """
    return {column[0]: row[position] for position, column in enumerate(cursor.description)}
13 | dockerfile: tgbot/Dockerfile 14 | depends_on: 15 | ipfs: 16 | condition: service_healthy 17 | env_file: 18 | - .env.light 19 | restart: always 20 | volumes: 21 | - /Users/pasha/tmp:/usr/lib/stc-tgbot 22 | -------------------------------------------------------------------------------- /web/public/site.webmanifest: -------------------------------------------------------------------------------- 1 | { 2 | "name": "", 3 | "short_name": "", 4 | "icons": [ 5 | { 6 | "src": "./android-chrome-192x192.png", 7 | "sizes": "192x192", 8 | "type": "image/png" 9 | }, 10 | { 11 | "src": "./android-chrome-512x512.png", 12 | "sizes": "512x512", 13 | "type": "image/png" 14 | } 15 | ], 16 | "theme_color": "#ffffff", 17 | "background_color": "#ffffff", 18 | "display": "standalone" 19 | } 20 | -------------------------------------------------------------------------------- /web/vite-sw.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from 'vite' 2 | 3 | // https://vitejs.dev/config/ 4 | export default defineConfig({ 5 | base: '', 6 | build: { 7 | emptyOutDir: false, 8 | rollupOptions: { 9 | input: { 10 | 'service-worker': './node_modules/summa-wasm/dist/service-worker.js', 11 | }, 12 | output: [ 13 | { 14 | entryFileNames: () => { 15 | return '[name].js' 16 | } 17 | } 18 | ] 19 | }, 20 | target: 'esnext' 21 | } 22 | }) 23 | -------------------------------------------------------------------------------- /web/src/views/DoomsdayView.vue: -------------------------------------------------------------------------------- 1 | 6 | 15 | -------------------------------------------------------------------------------- /tgbot/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG MODE 2 | 3 | FROM python:3.11-slim as builder-common 4 | RUN apt-get update \ 5 | && apt-get install gcc g++ git golang -y \ 6 | && apt-get clean 7 | WORKDIR /app 8 | ADD tgbot/requirements.txt 
#!/usr/bin/env bash
# Build the static site and publish it into the IPFS MFS directory /stc-web,
# together with links to the search index (/stc-web/data) and the wiki images
# (/stc-web/images/wiki).
#
# Reads `ipfs_api_multiaddr` and `index` from summa-config.json in the current
# directory. Requires npm, jq and the ipfs CLI.

# Stop at the first failing step: without this a broken build or a failed
# `ipfs add` would still go on to rewrite /stc-web.
set -euo pipefail

npm run build-only

API_ADDR=$(jq -r '.ipfs_api_multiaddr' summa-config.json)

echo Adding dist...
DIST_CID=$(ipfs --api "$API_ADDR" add --pin -Q -r --hash=blake3 dist)

echo Settings MFS...
# /stc-web does not exist on the very first publish; a failed removal is expected then.
ipfs --api "$API_ADDR" files rm -r /stc-web || true
ipfs --api "$API_ADDR" files cp "/ipfs/$DIST_CID" /stc-web

INDEX_CID=$(jq -r -c '.index' summa-config.json)
ipfs --api "$API_ADDR" files cp -p "/ipfs/$INDEX_CID" /stc-web/data
ipfs --api "$API_ADDR" files cp -p /ipfs/bafybeiaysi4s6lnjev27ln5icwm6tueaw2vdykrtjkwiphwekaywqhcjze/I /stc-web/images/wiki

# Print the root hash of the freshly assembled directory.
ipfs --api "$API_ADDR" files stat --hash /stc-web
-------------------------------------------------------------------------------- /web/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@vue/tsconfig/tsconfig.dom.json", 3 | "include": [ 4 | "env.d.ts", 5 | "src/**/*.ts", 6 | "src/**/*.vue", 7 | "summa-config.ts", 8 | "vite.config.ts", 9 | "vite-sw.config.ts" 10 | ], 11 | "compilerOptions": { 12 | "baseUrl": ".", 13 | "esModuleInterop": true, 14 | "moduleResolution": "Node", 15 | "paths": { 16 | "@/*": ["./src/*"] 17 | }, 18 | "resolveJsonModule": true, 19 | "strict": false, 20 | "types": ["node"] 21 | }, 22 | 23 | "references": [ 24 | { 25 | "path": "./tsconfig.config.json" 26 | } 27 | ] 28 | } 29 | -------------------------------------------------------------------------------- /web/README.md: -------------------------------------------------------------------------------- 1 | # Web STC 2 | 3 | Search engine in your browser that can retrieve all data through IPFS. Uncensorable, unblockable, yours. 4 | Original instance of STC lives at http://libstc.cc 5 | 6 | Here you can find its source codes and make a contribution if you are a skilled developer. 7 | 8 | ## Development 9 | 10 | It requires [IPFS to be installed](https://docs.ipfs.tech/install/ipfs-desktop/) and launched. 
def detect_language(text: str, threshold: float = 0.85) -> 'str | None':
    """Detect the language of ``text`` with the module-level fastText model.

    Newlines are replaced with spaces before the text is handed to
    ``fmodel.predict``.

    Args:
        text: text to classify.
        threshold: forwarded to ``fmodel.predict`` as ``threshold``.

    Returns:
        The last two characters of the first predicted label, or ``None``
        when the model returns no label for the text.
    """
    prediction = fmodel.predict([text.replace('\n', ' ')], threshold=threshold)
    top_labels = prediction[0][0]
    if not top_labels:
        return None
    # NOTE(review): only the last two characters of the label are kept, so a
    # label ending in a longer language code would be truncated - confirm
    # that callers only ever expect two-letter codes.
    return top_labels[0][-2:]
class MultipleAsyncExecution:
    """Run coroutines as tasks while capping how many are in flight at once.

    ``par`` slots are guarded by a semaphore: ``execute`` takes a slot before
    scheduling a coroutine and the slot is given back when the task finishes.
    ``join`` waits until every slot is free and then marks the executor as
    joined, after which ``execute`` refuses new work.
    """

    def __init__(self, par):
        """Create an executor that allows at most ``par`` concurrent tasks."""
        self.par = par
        self.s = asyncio.Semaphore(par)

    async def execute(self, coro):
        """Schedule ``coro`` as a task once a slot is available.

        Returns:
            The created ``asyncio.Task``.

        Raises:
            RuntimeError: if the executor has already been joined.
        """
        if self.s is None:
            raise RuntimeError('`MultipleAsyncExecution` has been already joined')
        # Keep a local reference so the done-callback releases the semaphore
        # that was actually acquired, even after `join` has cleared `self.s`.
        semaphore = self.s
        await semaphore.acquire()
        try:
            task = asyncio.create_task(coro)
        except BaseException:
            # Do not leak the slot when the task could not be created.
            semaphore.release()
            raise
        task.add_done_callback(lambda _finished: semaphore.release())
        return task

    async def join(self):
        """Wait for all scheduled tasks to finish and close the executor.

        Calling ``join`` again after the executor is closed is a no-op.
        """
        if self.s is None:
            return
        semaphore = self.s
        # Holding every slot means no task is running any more.
        for _ in range(self.par):
            await semaphore.acquire()
        self.s = None
        for _ in range(self.par):
            semaphore.release()
class LibrarianTextHandler(BaseHandler):
    """Deletes every incoming message whose sender is not a configured librarian moderator."""

    filter = events.NewMessage(incoming=True, pattern=re.compile(r'(.*)', flags=re.DOTALL))
    is_group_handler = True

    async def handler(self, event: events.ChatAction, request_context: RequestContext):
        """Tag the request with a fresh session id, then drop messages from non-moderators."""
        request_context.add_default_fields(
            mode='librarian_text',
            session_id=self.generate_session_id(),
        )
        sender_id = event.sender_id
        moderators = self.application.config['librarian']['moderators']
        if sender_id in moderators:
            # Moderators may post freely; nothing to do.
            return
        await event.delete()
def main(config):
    """Configure logging, run the Telegram application to completion and log the exit.

    A dedicated event loop with a 64-thread default executor is created and
    installed as the current loop for the duration of the run.

    Args:
        config: application configuration, passed to ``configure_logging`` and
            to ``TelegramApplication``.
    """
    configure_logging(config)
    loop = asyncio.new_event_loop()
    loop.set_default_executor(ThreadPoolExecutor(64))
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(TelegramApplication(config=config).start_and_wait())
    finally:
        # `run_until_complete` has already returned here, so no loop is running:
        # `asyncio.get_running_loop()` would raise RuntimeError at this point.
        # Close the loop we own instead of trying to stop a running one.
        loop.close()
    logging.getLogger('statbox').info({
        'mode': 'application',
        'action': 'exit',
    })
class HowToHelpHandler(BaseHandler):
    """Answers the ``/howtohelp`` command with the translated HOW_TO_HELP text."""

    filter = events.NewMessage(incoming=True, pattern='^/howtohelp(@[A-Za-z0-9_]+)?$')
    is_group_handler = True

    async def handler(self, event: events.ChatAction, request_context: RequestContext):
        """Log the request and reply with the template filled from the bot configuration."""
        request_context.statbox(action='show', mode='howtohelp')
        template = t('HOW_TO_HELP', request_context.chat['language'])
        # Missing configuration values are rendered as a "no entry" sign.
        placeholders = {
            'reddit_url': config['reddit'].get('url', '🚫'),
            'related_channel': config['telegram'].get('related_channel', '🚫'),
            'twitter_contact_url': config['twitter'].get('contact_url', '🚫'),
        }
        await event.reply(template.format(**placeholders))
class ShortlinkHandler(BaseHandler):
    """Answers ``/shortlink <query>`` with a deep link that encodes the query."""

    filter = events.NewMessage(incoming=True, pattern='^/shortlink\\s?(.*)?')

    async def handler(self, event: events.ChatAction, request_context: RequestContext):
        """Reply with the encoded deep link, or with a translated notice when the query is too long."""
        query = event.pattern_match.group(1)
        request_context.statbox(action='start', mode='shortlink', query=query)

        try:
            deep_link = encode_query_to_deep_link(query, request_context.bot_name)
        except TooLongQueryError:
            body = t('TOO_LONG_QUERY_FOR_SHORTLINK', request_context.chat['language'])
        else:
            body = deep_link

        # Either outcome is sent wrapped in backticks.
        return await event.reply('`{}`'.format(body), link_preview=False)
level = logging.WARNING 34 | code = 'download_error' 35 | 36 | 37 | class InvalidSearchError(BaseError): 38 | def __init__(self, search): 39 | self.search = search 40 | -------------------------------------------------------------------------------- /web/src/components/ReferencesList.vue: -------------------------------------------------------------------------------- 1 | 15 | 16 | 32 | -------------------------------------------------------------------------------- /web/src/components/SearchList.vue: -------------------------------------------------------------------------------- 1 | 13 | 14 | 30 | 31 | 40 | -------------------------------------------------------------------------------- /web/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | STC 16 | 17 | 18 |
class LLMManager:
    """Bundles a language model with its prompter, configuration and optional tokenizer."""

    def __init__(self, llm, prompter, config, max_prompt_chars, tokenizer=None):
        """Store the collaborators; no work is done at construction time."""
        self.llm = llm
        self.prompter = prompter
        self.config = config
        self.max_prompt_chars = max_prompt_chars
        self.tokenizer = tokenizer

    @property
    def context_length(self):
        """The ``context_length`` entry of the configuration."""
        return self.config['context_length']

    def process(self, prompt):
        """Run ``prompt`` through the model and return the produced text.

        Without a tokenizer the model is simply called with the prompt.
        With a tokenizer the prompt is tokenized, ``llm.generate`` is invoked
        and only the tokens after the prompt are decoded.
        """
        logging.getLogger('statbox').info({'action': 'process', 'mode': 'llm_manager', 'prompt': prompt})
        if not self.tokenizer:
            return self.llm(prompt)

        input_ids = self.tokenizer(prompt, return_tensors="pt")["input_ids"]
        generation_options = {
            'max_new_tokens': self.config.get('max_new_tokens'),
            'temperature': self.config.get('temperature', 1.0),
        }
        outputs = self.llm.generate(input_ids, **generation_options)
        # Skip the columns that merely echo the prompt tokens.
        prompt_width = input_ids.shape[1]
        decoded = self.tokenizer.batch_decode(outputs[:, prompt_width:])
        # NOTE(review): replacing the empty string with the empty string is a
        # no-op; presumably a marker token was meant to be stripped - confirm.
        return decoded[0].replace('', '')
4 | - Make sure to mount volumes for persistence. Otherwise, after every restart, you will lose your caches and databases (including users and riot bots). 5 | - Beforehand, you need to set up all credentials in the `.env.light` file. After setting them up, execute the following command in the Terminal: 6 | 7 | ```bash 8 | docker compose --env-file .env.light up --force-recreate --build 9 | ``` 10 | Wait for the following line to be displayed in the logs: 11 | ```bash 12 | light-tgbot-1 | INFO:statbox:{'action': 'started', 'mode': 'dynamic_bot', 'bot_name': ''} 13 | ``` 14 | 15 | Possible performance optimizations, from least to most complicated: 16 | 17 | - Mount to tgbot to cache bot credentials: 18 | ```yaml 19 | volumes: 20 | - /usr/lib/stc-tgbot:/usr/lib/stc-tgbot 21 | - /var/log/stc-tgbot:/var/log/stc-tgbot 22 | ``` 23 | - Mount to ipfs to cache the database and downloaded items: 24 | ```yaml 25 | volumes: 26 | - /data/ipfs:/data/ipfs 27 | ``` 28 | - If you have mounted volumes to ipfs, pin the database to IPFS: 29 | ```bash 30 | docker compose --env-file .env.light exec ipfs ipfs pin add /ipns/libstc.cc --progress 31 | ``` 32 | - Host the database directly (requires development experience). 
-------------------------------------------------------------------------------- /web/src/views/IntroView.vue: -------------------------------------------------------------------------------- 1 | 15 | 31 | 32 | 40 | -------------------------------------------------------------------------------- /library/textutils/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | NON_ALNUMWHITESPACE_REGEX = re.compile(r'([^\s\w])+') 4 | EMAIL_REGEX = re.compile(r'([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})') 5 | HASHTAG_REGEX = re.compile(r'([#@]+)([A-Za-z0-9_]+)') 6 | MULTIWHITESPACE_REGEX = re.compile(r"\s+") 7 | STICKER_REGEX = re.compile( 8 | '^[\U0001F1E0-\U0001F1FF' 9 | '\U0001F300-\U0001F5FF' 10 | '\U0001F600-\U0001F64F' 11 | '\U0001F680-\U0001F6FF' 12 | '\U0001F700-\U0001F77F' 13 | '\U0001F780-\U0001F7FF' 14 | '\U0001F800-\U0001F8FF' 15 | '\U0001F900-\U0001F9FF' 16 | '\U0001FA00-\U0001FA6F' 17 | '\U0001FA70-\U0001FAFF' 18 | '\U00002702-\U000027B0]$', 19 | flags=re.UNICODE, 20 | ) 21 | URL_REGEX_TEXT = r'(https?|ftp)?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)' 22 | URL_REGEX = re.compile(URL_REGEX_TEXT) 23 | HIDDEN_CHAR = '‌' 24 | TELEGRAM_LINK_REGEX = re.compile('(?:https?://)?t\\.me/(?!joinchat/)([A-Za-z0-9_]+)') 25 | 26 | DOI_WILDCARD_REGEX_TEXT = r'(10.\d{4,9}).*\.\*' 27 | DOI_REGEX_TEXT = r'(?:doi.org/)?(10.\d{4,9})\s?(?:/|%2[Ff])\s?([%-._;()<>/:A-Za-z0-9]+[^.?\s])' 28 | DOI_REGEX = re.compile(DOI_REGEX_TEXT) 29 | ISBN_REGEX = re.compile(r'^(?:[iI][sS][bB][nN]\:?\s*)?((97(8|9))?\-?\d{9}(\d|X))$') 30 | MD5_REGEX = re.compile(r'([A-Fa-f0-9]{32})') 31 | ONLY_DOI_REGEX = re.compile(r'^(10.\d{4,9})\s?/\s?([-._;()<>/:A-Za-z0-9]+[^.?\s])$') 32 | PUBMED_ID_REGEX = re.compile(r'(?:(?:https?://)?(?:www.)?ncbi.nlm.nih.gov/pubmed/|[Pp][Mm][Ii][Dd]\s?:?\s*)([0-9]+)') 33 | CJK_CHAR_REGEX_TEXT = r'[\u4e00-\u9fff]' 34 | 
class Promotioner:
    """
    Picks a promotion at random, with probability proportional to each promotion's weight.
    """
    def __init__(
        self,
        promotions: list,
        default_promotion_index: int = 0,
        promotion_vars: dict = None,
    ):
        """Store the promotions and precompute the running totals of their weights."""
        self.promotions = promotions
        self.default_promotion_index = default_promotion_index
        self.promotion_vars = promotion_vars if promotion_vars else {}
        # partial_sums[i] is the total weight of promotions[0..i].
        running_total = self.promotions[0]['weight']
        self.partial_sums: list = [running_total]
        for entry in self.promotions[1:]:
            running_total = entry['weight'] + running_total
            self.partial_sums.append(running_total)

    def choose_promotion(self, language: str = 'en') -> str:
        """Return the formatted text of a randomly drawn promotion.

        The text in ``language`` is preferred. When the drawn promotion lacks
        it, a promotion flagged ``local`` is replaced by the default promotion
        (in ``language`` if available, otherwise English); any other promotion
        falls back to its own English text.
        """
        pivot = random.randrange(self.partial_sums[-1])
        # The drawn promotion is the first whose running total exceeds the pivot.
        chosen = next(
            (
                candidate
                for partial_sum, candidate in zip(self.partial_sums, self.promotions)
                if partial_sum > pivot
            ),
            None,
        )
        if chosen is None:
            return None

        texts = chosen['texts']
        if language not in texts:
            if chosen.get('local', False):
                texts = self.promotions[self.default_promotion_index]['texts']
            if language not in texts:
                language = 'en'
        return texts[language].format(**self.promotion_vars)
def is_earlier_than_2_days(message):
    """Tell whether ``message`` was sent less than two days ago.

    A 10-second safety margin is subtracted from the 48-hour window.

    Args:
        message: object exposing a ``date`` attribute holding a ``datetime``
            (or a falsy value when the date is unknown).

    Returns:
        ``True`` when the message is younger than 48 hours minus 10 seconds,
        ``False`` otherwise, including when the message has no date.
    """
    if not message.date:
        return False
    # `datetime.timestamp()` honours the datetime's own timezone; converting via
    # `time.mktime(date.timetuple())` would reinterpret the fields as local time
    # and shift aware (e.g. UTC) dates by the host's UTC offset.
    age_seconds = time.time() - message.date.timestamp()
    return age_seconds < 48 * 60 * 60 - 10
class ReportHandler(BaseCallbackQueryHandler):
    """Records a user's report about a document identified by its CID."""

    filter = events.NewMessage(incoming=True, pattern=r'^(?:@\w+)?\s+\/r_([A-Za-z0-9_-]+)(?:\s+(.*))?$')

    def parse_pattern(self, event: events.ChatAction):
        """Return the ``(cid, reason)`` pair captured by the filter pattern."""
        match = event.pattern_match
        return match.group(1), match.group(2)

    async def handler(self, event: events.ChatAction, request_context: RequestContext):
        """Store the report in the database, thank the user and delete the command message."""
        cid, reason = self.parse_pattern(event)

        request_context.add_default_fields(mode='report', cid=cid)
        request_context.statbox(action='report')

        summa_client = self.application.summa_client
        document = await summa_client.get_one_by_field_value('nexus_science', 'cid', cid)
        document_holder = BaseDocumentHolder(document)

        vote = {
            'bot_name': self.bot_config['bot_name'],
            'user_id': request_context.chat['chat_id'],
            'internal_id': document_holder.get_internal_id(),
            'cid': cid,
            'reason': reason,
        }
        await self.application.database.add_vote_broken_file(**vote)

        async with safe_execution():
            # Reply and clean-up run concurrently inside the safe_execution context.
            acknowledgement = event.reply(
                f'Thank you for reporting `{document_holder.get_internal_id()}`. '
                f'Be careful, too many misreports will cause a ban',
            )
            removal = event.delete()
            return await asyncio.gather(acknowledgement, removal)
", "
").replace('


', '

').replace('

', '

') 8 | soup_str = re.sub(r'([^.>])
([^(
)])', r'\g<1> \g<2>', soup_str) 9 | soup_str = re.sub(r'(?:
\s*)+([^(
)])', r'

def process_tags(soup):
    """Normalise tag names and strip attributes on every element of ``soup``.

    ``span`` elements are unwrapped, a fixed set of tags is renamed
    (``em``/``italic`` -> ``i``, ``strong`` -> ``b``, ``sec`` -> ``section``,
    ``disp-formula`` -> ``formula``), and ``p`` elements carrying the ``ref``
    class become ``ref``. Afterwards every element keeps only its ``href``
    and ``class`` attributes.

    Returns:
        The same ``soup`` object, modified in place.
    """
    renames = {
        'em': 'i',
        'italic': 'i',
        'strong': 'b',
        'sec': 'section',
        'disp-formula': 'formula',
    }
    kept_attributes = ('href', 'class')

    for element in soup.find_all():
        tag = element.name
        if tag == 'span':
            element.unwrap()
        elif tag in renames:
            element.name = renames[tag]
        elif tag == 'p' and 'ref' in element.attrs.get('class', []):
            element.name = 'ref'
        # Attributes are filtered for every element, whatever happened above.
        element.attrs = {
            key: element.attrs[key]
            for key in kept_attributes
            if key in element.attrs
        }
    return soup
class UserManager:
    """In-memory bookkeeping of per-user search bans and running tasks."""

    def __init__(self):
        self.search_times = {}      # user_id -> list of recorded search times
        self.search_ban_times = {}  # user_id -> time until which searching is banned
        self.tasks = set()          # (user_id, id) pairs of registered tasks
        self.limits = {}            # user_id -> number of registered tasks

    def add_search_time(self, user_id: str, search_time: float):
        """Record a search and ban the user when searches come too densely.

        More than 5 searches within the last 10 seconds give a 60-second ban;
        more than 20 recorded searches give a 120-second ban. The recorded
        history is dropped whenever a ban is set, and also when only the most
        recent search falls inside the 10-second window.
        """
        now = time.time()
        recent_threshold = now - 10
        history = self.search_times.get(user_id, [])
        history.append(search_time)

        recent_hits = 0
        for moment in history[::-1]:
            if moment > recent_threshold:
                recent_hits += 1
                if recent_hits > 5:
                    self.search_ban_times[user_id] = now + 60
                    del self.search_times[user_id]
                    return
            elif recent_hits == 1:
                del self.search_times[user_id]
                return

        if len(history) > 20:
            self.search_ban_times[user_id] = now + 120
            del self.search_times[user_id]
            return

        self.search_times[user_id] = history

    def check_search_ban_timeout(self, user_id: str):
        """Return the remaining ban in whole seconds, or ``None`` when not banned.

        An expired ban entry is removed as a side effect.
        """
        banned_until = self.search_ban_times.get(user_id)
        if not banned_until:
            return None
        remaining = int(banned_until - time.time())
        if remaining > 0:
            return remaining
        del self.search_ban_times[user_id]
        return None

    def add_task(self, user_id, id):
        """Register task ``id`` for ``user_id`` and bump the user's task counter."""
        self.tasks.add((user_id, id))
        running = self.limits.get(user_id, 0)
        self.limits[user_id] = running + 1

    def remove_task(self, user_id, id):
        """Unregister task ``id`` for ``user_id`` and decrease the user's task counter."""
        self.tasks.remove((user_id, id))
        running = self.limits.get(user_id, 1)
        self.limits[user_id] = running - 1

    def has_task(self, user_id, id):
        """Tell whether task ``id`` is registered for ``user_id``."""
        key = (user_id, id)
        return key in self.tasks

    def hit_limits(self, user_id):
        """Tell whether ``user_id`` already has 3 or more registered tasks."""
        running = self.limits.get(user_id, 0)
        return running >= 3
8 | en: 💬 Shall build Standard Template Construct 9 | weight: 1 10 | - texts: 11 | en: 💬 Gaining knowledge is the only purpose of life 12 | weight: 1 13 | - texts: 14 | en: 💬 Knowledge cannot belong 15 | weight: 1 16 | - texts: 17 | en: 💬 Obey the path of discovery 18 | weight: 1 19 | - texts: 20 | en: 💬 Research is the only and ultimate goal 21 | weight: 1 22 | - texts: 23 | en: 💬 Intellectual property is not a valid form of property 24 | weight: 1 25 | - texts: 26 | en: ⤴️ Stay tuned with us at @{related_channel}, [Twitter]({twitter_contact_url}) and [Reddit]({reddit_url}) 27 | es: ⤴️ Mantente en contacto con nosotros en @{related_channel}, [Twitter]({twitter_contact_url}) y [Reddit]({reddit_url}) 28 | it: ⤴️ Resta aggiornato con noi su @{related_channel}, [Twitter]({twitter_contact_url}) e [Reddit]({reddit_url}) 29 | pb: ⤴️ Fique ligado conosco em @{related_channel}, [Twitter]({twitter_contact_url}) e [Reddit]({reddit_url}) 30 | ru: ⤴️ Оставайся на связи с нами на @{related_channel}, [Twitter]({twitter_contact_url}) и в [Reddit]({reddit_url}) 31 | weight: 5 32 | - texts: 33 | en: 🧬 Join [Nexus Communities](https://t.me/+fPQIvxQmJGQ3MzU8), the spaces to discuss science 34 | weight: 50 35 | - texts: 36 | en: 🔥 Join [our Reddit](https://www.reddit.com/r/science_nexus) to learn more about Nexus/STC 37 | weight: 50 38 | - texts: 39 | en: 🐦 Subscribe to our [Twitter](https://twitter.com/the_superpirate) to receive news first 40 | weight: 50 41 | - texts: 42 | en: ✉️ Subscribe to our [Telegram](https://t.me/nexus_search) to stay with us 43 | weight: 50 44 | - texts: 45 | en: ⤴️ Try [Standard Template Construct](https://libstc.cc) library 46 | ru: ⤴️ Заходи в библиотеку [Стандартных Шаблонных Конструкций](https://libstc.cc) 47 | weight: 5 48 | -------------------------------------------------------------------------------- /web/src/views/DonateView.vue: -------------------------------------------------------------------------------- 1 | 31 | 43 | 
// Vite configuration for the web client: production bundle options, plugin
// chain, worker bundling, import aliases and the dev-server proxy.
// https://vitejs.dev/config/
export default defineConfig({
  // Empty base; presumably so the bundle can be served from any path prefix
  // (e.g. under an IPFS gateway) — TODO confirm.
  base: '',
  build: {
    rollupOptions: {
      input: {
        index: './index.html'
      },
      output: [
        {
          // NOTE(review): Rollup's `output.name` is the global variable name
          // for iife/umd bundles; this value looks like a file-name pattern
          // (cf. `entryFileNames`) — confirm the intent.
          name: 'assets/[name].[hash].js'
        }
      ]
    },
    target: 'esnext'
  },
  plugins: [
    // NOTE(review): the React plugin is restricted to `.vue` files here,
    // which looks unusual next to the Vue plugin below — confirm it is needed.
    react({
      include: '**/*.vue'
    }),
    vue({
      template: {
        preprocessOptions: {
          // 'preprocessOptions' is passed through to the pug compiler
          plugins: [vuePugPlugin]
        }
      }
    }),
    wasm(),
    topLevelAwait(),
  ],
  // Workers are bundled as ES modules and get their own wasm plugin instance.
  worker: {
    format: 'es',
    plugins: [wasm()]
  },
  resolve: {
    alias: {
      // '@' resolves to ./src and '~' to ./node_modules, relative to this file.
      '@': fileURLToPath(new URL('./src', import.meta.url)),
      '~': fileURLToPath(new URL('./node_modules', import.meta.url))
    },
    preserveSymlinks: true
  },
  server: {
    fs: {
      // Allow serving files from one level up to the project root
      allow: ['..']
    },
    // Dev-server proxy: both routes forward to the IPFS HTTP gateway taken
    // from summa-config.json and strip the local prefix from the path.
    proxy: {
      '^/data': {
        target: `${summa_config.ipfs_http_base_url}/ipns/standard-template-construct.org/data`,
        changeOrigin: true,
        secure: false,
        rewrite: (path) => path.replace(/^\/data/, ''),
      },
      '^/images/wiki': {
        target: `${summa_config.ipfs_http_base_url}/ipns/en.wikipedia-on-ipfs.org/I`,
        changeOrigin: true,
        secure: false,
        rewrite: (path) => path.replace(/^\/images\/wiki/, ''),
      }
    }
  }
})
-------------------------------------------------------------------------------- /tgbot/handlers/roll.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | 4 | from telethon import events 5 | 6 | from library.telegram.base import RequestContext 7 | from tgbot.views.telegram.base_holder import BaseTelegramDocumentHolder 8 | 9 | from .base import BaseHandler 10 | 11 | 12 | class RollHandler(BaseHandler): 13 | filter = events.NewMessage(incoming=True, pattern=re.compile(r'^/roll(?:@\w+)?(.*)?$', re.DOTALL)) 14 | is_group_handler = True 15 | 16 | async def handler(self, event: events.ChatAction, request_context: RequestContext): 17 | start_time = time.time() 18 | 19 | session_id = self.generate_session_id() 20 | request_context.add_default_fields(mode='roll', session_id=session_id) 21 | string_query = event.pattern_match.group(1).strip() 22 | 23 | query, query_traits = self.application.search_request_builder.process( 24 | string_query, 25 | is_fieldnorms_scoring_enabled=False, 26 | collector='reservoir_sampling', 27 | limit=1, 28 | default_query_language=request_context.chat['language'], 29 | ) 30 | documents = await self.application.summa_client.search_documents(query) 31 | 32 | if documents: 33 | holder = BaseTelegramDocumentHolder(documents[0]) 34 | promo = self.application.promotioner.choose_promotion(query_traits.query_language) 35 | view = holder.view_builder(query_traits.query_language).add_view(bot_name=request_context.bot_name).add_new_line(2).add(promo, escaped=True).build() 36 | buttons_builder = holder.buttons_builder(query_traits.query_language) 37 | 38 | if request_context.is_group_mode(): 39 | buttons_builder.add_remote_download_button(bot_name=request_context.bot_name) 40 | else: 41 | buttons_builder.add_download_button() 42 | buttons_builder.add_close_button() 43 | 44 | request_context.statbox(action='show', duration=time.time() - start_time) 45 | await event.respond(view, 
async def create_car(output_car, documents, limit, name_template) -> str:
    """Write up to ``limit`` documents into a listing file and build a CAR from it.

    Each document consumed from the async iterable ``documents`` becomes one
    line of the listing: ``<url-quoted name> <cid> <filesize>``. The name is
    produced by ``name_template.format(...)`` with the keys ``title``, ``id``,
    ``md5``, ``doi`` and ``extension``. The listing lives in a temporary
    directory which is also handed to ``ipfs_hamt_directory_py.from_file`` as
    its third argument.

    :param output_car: passed through to ``ipfs_hamt_directory_py.from_file``
    :param documents: async iterable of dict-like documents; each must have
        a ``cid`` key
    :param limit: maximum number of documents to write
    :param name_template: ``str.format`` template for the item name
    :return: whatever ``ipfs_hamt_directory_py.from_file`` returns
    """
    with tempfile.TemporaryDirectory() as td:
        input_data = os.path.join(td, 'input_data.txt')
        with open(input_data, 'wb') as f:
            async for document in documents:
                if limit <= 0:
                    break
                id_ = document.get('doi') or document.get('md5')
                # ``or {}`` also covers an explicit ``metadata: None``.
                metadata = document.get('metadata') or {}
                item_name = name_template.format(
                    title=cast_string_to_single_string(document['title']) if 'title' in document else id_,
                    id=id_,
                    md5=document.get('md5'),
                    doi=document.get('doi'),
                    extension=metadata.get('extension', 'pdf'),
                )
                # One write per record: "<name> <cid> <size>\n".
                f.write(b' '.join((
                    quote(item_name, safe='').encode(),
                    document['cid'].encode(),
                    str(document.get('filesize') or 0).encode(),
                )) + b'\n')
                limit -= 1
        # We are inside a coroutine, so the running loop is the right one;
        # the blocking builder is moved to the default executor.
        return await asyncio.get_running_loop().run_in_executor(
            None, lambda: ipfs_hamt_directory_py.from_file(input_data, output_car, td),
        )
class QHandler(BaseHandler):
    """Handle ``/q <question>``: run a semantic search and reply with references."""

    filter = events.NewMessage(incoming=True, pattern=re.compile(r'^/q(?:@\w+)?(?:\s+(.*))?$', re.DOTALL))
    is_group_handler = True

    async def handler(self, event: events.ChatAction, request_context: RequestContext):
        """Reply with up to three scored chunks matching the query.

        If no query text follows the command, a usage hint is sent instead.
        """
        session_id = self.generate_session_id()
        request_context.add_default_fields(mode='cybrex', session_id=session_id)
        request_context.statbox(action='show', sender_id=event.sender_id)

        query = event.pattern_match.group(1)
        if not query:
            text = "Send query for semantic search after `/q`: `/q What is hemoglobin?`"
            return await event.reply(text)
        query = query.strip()

        scored_chunks = await self.application.cybrex_ai.semantic_search(query, n_chunks=3, n_documents=0)
        response = f'🤔 **{query}**'

        references = []
        for scored_chunk in scored_chunks[:3]:
            # The id is used as-is. Splitting it with ``split(':', 2)`` and
            # unpacking into two names raised ValueError for any id whose
            # value itself contains a colon (legal in DOIs).
            document_id = scored_chunk.chunk.document_id
            # Guard against a missing title before calling ``replace``.
            title = (scored_chunk.chunk.title or '').replace('\n', ' - ')
            text_title = BeautifulSoup(title, 'lxml').get_text(separator='')
            deep_query = encode_query_to_deep_link(document_id, bot_name=request_context.bot_name)
            if deep_query:
                reference = f' - **{text_title}** - [{document_id}]({deep_query})'
            else:
                reference = f' - **{text_title}** - `{document_id}`'
            reference += f'\n**Text:** {remove_markdown(scored_chunk.chunk.text)}'
            references.append(reference)

        references = '\n\n'.join(references)
        if references:
            response += f'\n\n**References:**\n\n{references}'
        return await event.reply(response, buttons=[close_button()])
class: izihawa_loglib.handlers.BaseFileHandler 44 | filename: '{{ log_path }}/warning.log' 45 | formatter: default 46 | level: WARNING 47 | loggers: 48 | aiobaseclient: 49 | handlers: 50 | - error 51 | - warning 52 | propagate: false 53 | chardet: 54 | handlers: 55 | - error 56 | propagate: false 57 | debug: 58 | handlers: 59 | - debug 60 | propagate: false 61 | error: 62 | handlers: 63 | - console 64 | - error 65 | - traceback 66 | - warning 67 | propagate: false 68 | operation: 69 | handlers: 70 | - operation 71 | propagate: false 72 | statbox: 73 | handlers: 74 | - console 75 | - statbox 76 | propagate: false 77 | telethon: 78 | handlers: 79 | - error 80 | - warning 81 | propagate: false 82 | root: 83 | handlers: 84 | - debug 85 | level: DEBUG 86 | version: 1 87 | -------------------------------------------------------------------------------- /web/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "summa-web", 3 | "version": "0.0.0", 4 | "scripts": { 5 | "dev": "vite --mode development", 6 | "build": "run-p type-check build-only", 7 | "preview": "vite preview --port 4173", 8 | "build-only": "vite build --config vite.config.ts && vite build --config vite-sw.config.ts", 9 | "type-check": "vue-tsc --noEmit", 10 | "lint": "eslint . 
--ext .vue,.js,.jsx,.cjs,.mjs,.ts,.tsx,.cts,.mts --fix --ignore-path .gitignore", 11 | "publish": "bash publi.sh" 12 | }, 13 | "dependencies": { 14 | "@grpc/grpc-js": "^1.9.7", 15 | "@protobuf-ts/grpcweb-transport": "^2.9.1", 16 | "@vueuse/core": "^10.2.1", 17 | "@vueuse/rxjs": "^10.2.1", 18 | "axios": "^1.4.0", 19 | "bootstrap": "^5.3.0", 20 | "comlink": "^4.4.1", 21 | "crypto-js": "^4.1.1", 22 | "detect-browser": "^5.3.0", 23 | "dexie": "^3.2.4", 24 | "epubjs": "^0.3.93", 25 | "google-protobuf": "^3.21.2", 26 | "grpc-web": "^1.4.2", 27 | "hammerjs": "^2.0.8", 28 | "npm-run-all": "^4.1.5", 29 | "pdfjs-dist": "^3.11.174", 30 | "qr-creator": "^1.0.0", 31 | "summa-wasm": "^0.135.7", 32 | "vite-plugin-require": "^1.1.11", 33 | "vite-plugin-top-level-await": "^1.3.1", 34 | "vite-plugin-wasm": "^3.2.2", 35 | "vue": "^3.3.4", 36 | "vue-router": "^4.2.4", 37 | "zingtouch": "^1.0.6" 38 | }, 39 | "devDependencies": { 40 | "@protobuf-ts/plugin": "^2.9.1", 41 | "@rushstack/eslint-patch": "^1.3.2", 42 | "@tsconfig/node18": "^18.2.0", 43 | "@types/node": "^20.4.2", 44 | "@typescript-eslint/eslint-plugin": "^5.62.0", 45 | "@vitejs/plugin-react": "^4.0.3", 46 | "@vitejs/plugin-vue": "^4.2.3", 47 | "@vue/eslint-config-prettier": "^8.0.0", 48 | "@vue/eslint-config-typescript": "^11.0.3", 49 | "@vue/tsconfig": "^0.4.0", 50 | "bootstrap-icons": "^1.10.5", 51 | "djvujs-dist": "^0.5.4", 52 | "eslint": "^8.45.0", 53 | "eslint-config-airbnb-base": "^15.0.0", 54 | "eslint-config-standard-with-typescript": "^37.0.0", 55 | "eslint-plugin-import": "^2.27.5", 56 | "eslint-plugin-n": "^16.0.1", 57 | "eslint-plugin-promise": "^6.1.1", 58 | "eslint-plugin-simple-import-sort": "^10.0.0", 59 | "eslint-plugin-vue": "^9.15.1", 60 | "kubo-rpc-client": "^3.0.1", 61 | "prettier": "^3.0.0", 62 | "sass": "^1.64.0", 63 | "ts-node": "^10.9.1", 64 | "typescript": "^5.1.6", 65 | "vite": "^4.4.9", 66 | "vue-pug-plugin": "^2.0.3", 67 | "vue-tsc": "^1.8.5" 68 | } 69 | } 70 | 
class Converter(MarkdownConverter):
    """HTML-to-Markdown converter producing the markup dialect used by the bot.

    Bold is rendered with ``**`` and italics/emphasis with ``__``; headings
    and titles become bold lines; rules are dropped; formulas, tables and
    images are replaced by emoji placeholders.
    """

    convert_b = abstract_inline_conversion(lambda self: '**')
    convert_i = abstract_inline_conversion(lambda self: '__')
    convert_em = abstract_inline_conversion(lambda self: '__')

    def convert_header(self, el, text, convert_as_inline):
        # Delegates to the parent's bold conversion, on its own line.
        return '\n' + super().convert_b(el, text, convert_as_inline) + '\n'

    def convert_hn(self, n, el, text, convert_as_inline):
        # Every heading level is rendered the same way: a bold line.
        return '\n' + super().convert_b(el, text, convert_as_inline) + '\n'

    def convert_hr(self, el, text, convert_as_inline):
        return ''

    def convert_title(self, el, text, convert_as_inline):
        return super().convert_b(el, text, convert_as_inline) + '\n'

    def convert_formula(self, el, text, convert_as_inline):
        return '🔢\n'

    def convert_a(self, el, text, convert_as_inline):
        """Render an anchor as ``[text](href)``.

        Anchors without visible text are dropped. Anchors without an
        ``href`` are rendered as plain text instead of ``[text](None)``.
        """
        prefix, suffix, text = chomp(text)
        if not text:
            return ''
        href = el.get('href')
        if not href:
            return text
        return f'[{text}]({href})'

    def convert_img(self, el, text, convert_as_inline):
        return '🖼️\n'

    def convert_table(self, el, text, convert_as_inline):
        return '🔢\n'
def md(html, **options):
    """Convert ``html`` to Markdown with a new ``Converter`` built from ``options``."""
    converter = Converter(**options)
    return converter.convert(html)
def is_earlier_than_2_days(message):
    """Return True if ``message`` was sent less than two days (minus 10 s) ago.

    ``message.date`` is expected to be a ``datetime`` or a falsy value; a
    message without a date yields False.

    The age is computed with ``datetime.timestamp()``, which honours the
    datetime's own timezone. The previous ``time.mktime(date.timetuple())``
    treated a timezone-aware datetime as local time, so the result was off by
    the difference between the two UTC offsets.
    """
    if not message.date:
        return False
    age = time.time() - message.date.timestamp()
    # The 10-second margin keeps the answer on the safe side of the limit.
    return age < 2 * 24 * 60 * 60 - 10
class StartHandler(BaseSearchHandler):
    """Handle ``/start``, optionally carrying a deep-link payload to search for."""

    filter = events.NewMessage(incoming=True, pattern='^/start\\s?(.*)?')

    @staticmethod
    def _resolve_start_query(raw_query, request_context):
        """Turn the payload into a search query string, or None if undecodable.

        The payload is first decoded as a deep query; if that fails it is
        recoded as a CID. When both attempts fail, both errors are logged.
        """
        try:
            return decode_deep_query(raw_query)
        except DecodeDeepQueryError as deep_error:
            try:
                cid = recode_base64_to_base36(raw_query)
                return f'links.cid:{cid}'
            except DecodeDeepQueryError as cid_error:
                request_context.error_log(deep_error, mode='start', raw_query=raw_query)
                request_context.error_log(cid_error, mode='start', raw_query=raw_query)
        return None

    async def handler(self, event: events.ChatAction, request_context: RequestContext):
        """Run the search encoded in the payload, or show the help text."""
        raw_query = event.pattern_match.group(1)

        request_context.statbox(action='start', mode='start', text=event.text)

        string_query = self._resolve_start_query(raw_query, request_context)

        # No usable payload: just show the help message.
        if not string_query:
            request_context.statbox(action='show', mode='start')
            await event.reply(t('HELP', request_context.chat['language']))
            return

        request_context.statbox(action='query', mode='start', query=string_query)
        # Echo the decoded query into the chat and reply to it with a
        # placeholder that is later replaced by the search widget.
        request_message = await self.application.get_telegram_client(
            request_context.bot_name,
        ).send_message(event.chat, string_query)
        prefetch_message = await request_message.reply(
            t("SEARCHING", request_context.chat['language']),
        )
        try:
            text, buttons, link_preview = await self.setup_widget(
                request_context=request_context,
                string_query=string_query,
                is_shortpath_enabled=True,
            )
            edit_action = self.application.get_telegram_client(request_context.bot_name).edit_message(
                request_context.chat['chat_id'],
                prefetch_message.id,
                text,
                buttons=buttons,
                link_preview=link_preview,
            )
            await asyncio.gather(event.delete(), edit_action)
        except Exception:
            # Do not leave the placeholder behind if anything went wrong.
            await prefetch_message.delete()
            raise
-------------------------------------------------------------------------------- /cybrex/examples/on-the-fly-translation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from transformers import MBartForConditionalGeneration, MBart50TokenizerFast\n", 12 | "\n", 13 | "model = MBartForConditionalGeneration.from_pretrained(\"facebook/mbart-large-50-many-to-many-mmt\")\n", 14 | "tokenizer = MBart50TokenizerFast.from_pretrained(\"facebook/mbart-large-50-many-to-many-mmt\")" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "outputs": [], 21 | "source": [ 22 | "tokenizer.lang_code_to_id" 23 | ], 24 | "metadata": { 25 | "collapsed": false 26 | } 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "outputs": [], 32 | "source": [ 33 | "article = \"Forty-two patients operated on by skin expansion have been contacted after a mean time of 25 months from the last surgery. Two biopsies have been taken from the expanded area of each patient. In 12 patients it has been possible to obtain a similar sampling from the opposite, nonexpanded area of the body. The samples underwent optic microscopy and cell kinetic and DNA content investigations. The epidermal structure of the followed-up skin, compared with the skin of the opposite side of the body, looks normal. The mitotic activity of the epidermal cells has returned to the values of preexpanded skin. The dermis shows a low degree of elastosis and zonal fragmentation of elastic fibers. 
/**
 * IndexedDB-backed (Dexie) store for user bookmarks and search timings.
 *
 * - `bookmarks` is keyed by the compound `[index_name+query]` and indexed by
 *   `created_at`.
 * - `search_metrics` is keyed by `created_at`.
 */
export class UserDb extends Dexie {
  bookmarks!: Dexie.Table
  search_metrics!: Dexie.Table

  constructor (name: string, version: number) {
    super(name)
    this.version(version).stores({
      bookmarks: '[index_name+query],created_at',
      search_metrics: 'created_at'
    })
    this.bookmarks.mapToClass(Bookmark)
    this.search_metrics.mapToClass(SearchMetric)
  }

  /**
   * Store one search metric, keeping only the most recent entries.
   *
   * Pruning walks the table from newest to oldest and deletes everything
   * after the first 100 rows. The previous `offset(100).delete()` iterated in
   * ascending `created_at` order, so it kept the 100 oldest rows and deleted
   * the newest ones.
   */
  async add_search_metrics (search_metrics: SearchMetric) {
    return await this.transaction('rw', this.search_metrics, async () => {
      await this.search_metrics
        .orderBy('created_at')
        .reverse()
        .offset(100)
        .delete()
      return await this.search_metrics.put(search_metrics)
    })
  }

  /**
   * Average `spent` over the `last_n_time` most recent metrics, or
   * `undefined` when fewer than that many metrics are stored.
   */
  async get_average_spent (last_n_time: number) {
    // Read-only work: a readonly transaction is sufficient.
    return await this.transaction('r', this.search_metrics, async () => {
      const result = await this.search_metrics
        .orderBy('created_at')
        .reverse()
        .limit(last_n_time)
        .toArray()
      if (result.length < last_n_time) {
        return undefined
      }
      return average(result.map((x) => x.spent))
    })
  }

  /** Insert the bookmark, or replace the one with the same key. */
  async add_bookmark (bookmark: IBookmark) {
    return await this.transaction('rw', this.bookmarks, async () => {
      return await this.bookmarks.put(bookmark)
    })
  }

  /** All bookmarks, newest first. */
  async get_all_bookmarks () {
    return await this.transaction('r', this.bookmarks, async () => {
      return await this.bookmarks.orderBy('created_at').reverse().toArray()
    })
  }

  /** Whether a bookmark with the given compound key exists. */
  async has_bookmark (index_name: string, query: string) {
    return await this.transaction('r', this.bookmarks, async () => {
      return (await this.bookmarks.get([index_name, query])) !== undefined
    })
  }

  /** Delete the bookmark with the given compound key, if any. */
  async delete_bookmark (index_name: string, query: string) {
    await this.transaction('rw', this.bookmarks, async () => {
      await this.bookmarks.delete([index_name, query])
    })
  }
}
this.created_at = Date.now() / 1000 95 | } 96 | } 97 | 98 | export const user_db = new UserDb('UserDb', 8) 99 | -------------------------------------------------------------------------------- /web/public/safari-pinned-tab.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 7 | 8 | Created by potrace 1.14, written by Peter Selinger 2001-2017 9 | 10 | 12 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /web/src/router/index.ts: -------------------------------------------------------------------------------- 1 | // @ts-nocheck 2 | import { createRouter, createWebHashHistory } from 'vue-router' 3 | 4 | const router = createRouter({ 5 | history: createWebHashHistory(import.meta.env.BASE_URL), 6 | scrollBehavior (to, from, savedPosition) { 7 | if (to.path.startsWith("/help") && from.path.startsWith("/help")) { 8 | return { 9 | el: '#hrv', 10 | behavior: 'smooth', 11 | } 12 | } 13 | // always scroll to top 14 | return { top: 0 } 15 | }, 16 | routes: [ 17 | { 18 | path: '/', 19 | name: 'search', 20 | component: async () => await import('../views/SearchView.vue'), 21 | props: (route) => ({ 22 | q: route.query.q, 23 | p: Number.parseInt(route.query.p), 24 | t: route.query.t, 25 | y: route.query.y 26 | }) 27 | }, 28 | { 29 | path: '/reader', 30 | name: 'reader', 31 | component: async () => await import('../views/Reader.vue'), 32 | props: (route) => ({ 33 | cid: route.query.cid, 34 | filename: route.query.filename, 35 | anchor: route.query.anchor 36 | }) 37 | }, 38 | { 39 | path: '/bookmarks', 40 | name: 'bookmarks', 41 | component: async () => await import('../views/BookmarksView.vue') 42 | }, 43 | { 44 | path: '/help', 45 | name: 'help', 46 | component: async () => await import('../views/HelpView.vue'), 47 | children: [ 48 | { 49 | path: '', 50 | name: 'intro', 51 | component: async () => await import('../views/IntroView.vue') 52 | }, 53 | { 54 | path: 'doomsday', 55 | name: 
/**
 * Facade over an ordered list of search providers.
 *
 * Providers are tried in order during `setup()`; the first one whose status
 * becomes `Succeeded` is selected. `search()` waits for that initial setup
 * before delegating to the selected provider.
 */
export class SearchService {
  search_providers: Array<SearchProvider>;
  // Vue refs (see constructor), kept loosely typed as in the original file.
  current_provider_ix: any;
  init_guard: Promise<void>;
  current_init_status: any;
  loading_failure_reason: any;

  /**
   * @param logging_level forwarded to the IPFS-backed provider.
   */
  constructor(logging_level: string) {
    this.current_init_status = ref(undefined);
    const search_providers = [];
    const { ipfs_hostname, ipfs_protocol } = utils.get_ipfs_hostname();
    const ipfs_hostname_stripped = ipfs_hostname.split(':')[0]
    // Well-known public gateways and localhost have no sibling `api.` host,
    // so the "Local API" provider is only registered for other hostnames.
    if (
      ipfs_hostname_stripped !== 'localhost'
      && ipfs_hostname_stripped !== 'ipfs.io'
      && ipfs_hostname_stripped !== 'dweb.link'
    ) {
      search_providers.push(new RemoteSearchProvider(
        `${ipfs_protocol}//api.${ipfs_hostname_stripped}`,
        "Local API",
      ));
    }
    search_providers.push(
      new RemoteSearchProvider(
        "https://api.libstc.cc",
        "Nebula Nomad Station",
      ),
      new IpfsSearchProvider(this.current_init_status, {logging_level}),
    );
    this.search_providers = search_providers;
    this.current_provider_ix = ref(undefined);
    this.loading_failure_reason = ref(undefined);
    this.init_guard = this.setup();
  }

  /**
   * Try every provider in order and select the first that reports
   * `Succeeded`. If none does, the last setup error (if any) is stored in
   * `loading_failure_reason`.
   */
  async setup() {
    let last_error = undefined;
    for (const [index, search_provider] of this.search_providers.entries()) {
      try {
        await search_provider.setup(this.current_init_status);
      } catch (e) {
        last_error = e;
        continue;
      }
      if (search_provider.status.value == SearchProviderStatus.Succeeded) {
        this.current_provider_ix.value = index;
        return;
      }
    }
    if (last_error !== undefined) {
      this.loading_failure_reason.value = last_error.toString();
    }
  }

  /**
   * Switch to the provider at `index` if it is (or can be made) healthy.
   * The current provider is left untouched when the new one does not end up
   * in the `Succeeded` state.
   */
  async change_provider(index: Number) {
    const new_provider = this.search_providers[index];
    if (new_provider === undefined) {
      throw new RangeError(`Unknown search provider index: ${index}`);
    }
    if (new_provider.status.value == SearchProviderStatus.NotSetup) {
      // Pass the shared status ref exactly like `setup()` does.
      await new_provider.setup(this.current_init_status);
    } else {
      await new_provider.healthcheck();
    }
    if (new_provider.status.value == SearchProviderStatus.Succeeded) {
      this.current_provider_ix.value = index;
    }
  }

  /**
   * Run `index_query` against the currently selected provider.
   *
   * Throws a descriptive error when no provider could be initialised,
   * instead of failing on `search_providers[undefined]`.
   */
  async search(index_query: IndexQuery, options: QueryOptions): Promise<any> {
    await this.init_guard;
    const provider_ix = this.current_provider_ix.value;
    if (provider_ix === undefined) {
      throw new Error(
        this.loading_failure_reason.value || 'No search provider is available'
      );
    }
    return this.search_providers[provider_ix].search(index_query, options);
  }
}
class DocumentListWidget:
    """Render a page of search results as Telegram message text plus buttons.

    The widget only formats what it is given: the document holders for the
    current page, an optional header, an optional promotion source and the
    paging state used to build the navigation row.
    """

    def __init__(
        self,
        chat: dict,
        document_holders: List[BaseTelegramDocumentHolder],
        bot_name,
        header: Optional[str] = None,
        promotioner=None,
        has_next: bool = False,
        session_id: Optional[str] = None,
        message_id: Optional[int] = None,
        request_id: Optional[str] = None,
        cmd: Optional[str] = None,
        page: int = 0,
        page_size: int = 5,
    ):
        self.chat = chat
        self.document_holders = document_holders
        self.bot_name = bot_name
        self.header = header
        self.promotioner = promotioner
        self.cmd = cmd
        self.has_next = has_next
        self.session_id = session_id
        self.message_id = message_id
        self.request_id = request_id
        self.page = page
        self.page_size = page_size

    def _page_callback(self, page: int) -> str:
        """Build the callback payload that requests `page` of this result list."""
        return f'/{self.cmd}_{self.session_id}_{self.message_id}_{page}'

    async def render(self) -> tuple[str, Optional[list]]:
        """Return ``(text, buttons)`` for the current page.

        With no documents the text is the translated "nothing found" message
        and the only button is the close button. Otherwise the text is the
        joined document snippets, optionally preceded by the bold header and
        followed by a promotion, and the buttons are the navigation row (when
        paging is possible) plus the close button.
        """
        language = self.chat['language']
        if not self.document_holders:
            return t('COULD_NOT_FIND_ANYTHING', language), [close_button(self.session_id)]

        serp = '\n\n'.join(
            document_holder
            .view_builder(language)
            .add_short_description()
            .add_new_line()
            .add_links()
            .build()
            for document_holder in self.document_holders
        )

        if self.header:
            serp = f'**{self.header}**\n\n{serp}'

        # `promotioner` is optional in the constructor, so it must not be
        # dereferenced unconditionally.
        if self.promotioner is not None:
            promo = self.promotioner.choose_promotion(language)
            serp = f'{serp}\n\n{promo}\n'
        else:
            serp = f'{serp}\n'

        buttons = []
        if self.cmd and self.message_id and self.session_id and (self.has_next or self.page > 0):
            has_first = self.page > 1
            has_previous = self.page > 0
            # Unavailable directions are rendered as blank buttons bound to
            # '/noop' so the row keeps a stable three-column layout.
            buttons = [
                Button.inline(
                    text='<<1' if has_first else ' ',
                    data=self._page_callback(0) if has_first else '/noop',
                ),
                Button.inline(
                    text=f'<{self.page}' if has_previous else ' ',
                    data=self._page_callback(self.page - 1) if has_previous else '/noop',
                ),
                Button.inline(
                    text=f'{self.page + 2}>' if self.has_next else ' ',
                    data=self._page_callback(self.page + 1) if self.has_next else '/noop',
                ),
            ]
        buttons.append(close_button(self.session_id))
        return serp, buttons
/**
 * Tell whether `s` is the decimal notation of an integer, with an optional
 * sign and surrounding whitespace allowed.
 *
 * The previous `parseFloat`-based check also accepted fractional numbers
 * ("1.5") and strings with trailing garbage ("12abc"), because `parseFloat`
 * parses the longest numeric prefix.
 */
export function is_int (s: string) {
  // Non-string input (e.g. a missing query parameter) is never an integer.
  return typeof s === 'string' && /^[+-]?\d+$/.test(s.trim())
}
tags 82 | let intermediateStr = str.replace(regex, (match, p1) => { 83 | if (match.startsWith('</')) { 84 | if (openTags.length && openTags[openTags.length - 1] === p1) { 85 | openTags.pop(); 86 | return match; // Keep the closing tag if it matches the last opening tag 87 | } 88 | return ''; // Remove the closing tag if it doesn't match the last opening tag 89 | } else { 90 | openTags.push(p1); 91 | return match; // Keep the opening tag for now 92 | } 93 | }); 94 | 95 | // Second pass: Remove unpaired opening tags 96 | for (const tag of openTags) { 97 | const unpairedTag = new RegExp(`<${tag}\\b[^&]*>`, 'gi'); 98 | intermediateStr = intermediateStr.replace(unpairedTag, ''); 99 | } 100 | 101 | return intermediateStr; 102 | } 103 | -------------------------------------------------------------------------------- /cybrex/examples/analyse-references.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": false 7 | }, 8 | "source": [ 9 | "# Analyse References of Paper to Find an Answer\n", 10 | "\n", 11 | "Following example shows how to extract references from the document and do a semantic search over the documents and all its references\n", 12 | "\n", 13 | "Optionally, start Summa server to enhance performance of queries to STC" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import logging\n", 25 | "import sys\n", 26 | "\n", 27 | "from stc_geck.advices import BaseDocumentHolder\n", 28 | "from cybrex.cybrex_ai import CybrexAI\n", 29 | "\n", 30 | "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", 31 | "\n", 32 | "cybrex = CybrexAI()\n", 33 | "query = 'What drugs are used for haemophilia treatment?'\n", 34 | "seed_doi = '10.1111/j.1365-2516.2007.01474.x'\n", 35 | "await cybrex.start()" 36 | ] 37 | }, 38 | { 39 | 
"cell_type": "code", 40 | "execution_count": null, 41 | "outputs": [], 42 | "source": [ 43 | "# Search seed document\n", 44 | "seed_document = await cybrex.search_documents(f'id.dois:{seed_doi}', n_documents=1)\n", 45 | "seed_document = seed_document[0].document\n", 46 | "\n", 47 | "# And track all document identifiers for further extraction\n", 48 | "related_document_ids = {f'id.dois:{seed_doi}'}\n", 49 | "\n", 50 | "# Get out references\n", 51 | "for reference in seed_document['references']:\n", 52 | " related_document_ids.add(f'id.dois:{reference[\"doi\"]}')\n", 53 | "\n", 54 | "# Get in references\n", 55 | "referencing_documents = await cybrex.search_documents(f'rd:{seed_doi})', n_documents=100)\n", 56 | "for referencing_document in referencing_documents:\n", 57 | " referencing_document_holder = BaseDocumentHolder(referencing_document)\n", 58 | " related_document_ids.add(referencing_document_holder.get_internal_id())\n", 59 | "\n", 60 | "print('Following documents will be queries:', related_document_ids)\n", 61 | "\n", 62 | "# Now, retrieve all documents and its metadata\n", 63 | "related_documents = await cybrex.search_documents(' '.join(related_document_ids), n_documents=100)" 64 | ], 65 | "metadata": { 66 | "collapsed": false 67 | } 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "outputs": [], 73 | "source": [ 74 | "related_chunks = await cybrex.semantic_search_in_documents(query, related_documents, n_chunks=10, minimum_score=0.5, skip_downloading_pdf=False)" 75 | ], 76 | "metadata": { 77 | "collapsed": false 78 | } 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "outputs": [], 84 | "source": [ 85 | "related_chunks[:10]" 86 | ], 87 | "metadata": { 88 | "collapsed": false 89 | } 90 | } 91 | ], 92 | "metadata": { 93 | "kernelspec": { 94 | "display_name": "Python 3", 95 | "language": "python", 96 | "name": "python3" 97 | }, 98 | "language_info": { 99 | "codemirror_mode": { 100 | "name": "ipython", 101 | 
"version": 3 102 | }, 103 | "file_extension": ".py", 104 | "mimetype": "text/x-python", 105 | "name": "python", 106 | "nbconvert_exporter": "python", 107 | "pygments_lexer": "ipython3", 108 | "version": "3.11.3" 109 | } 110 | }, 111 | "nbformat": 4, 112 | "nbformat_minor": 0 113 | } 114 | -------------------------------------------------------------------------------- /web/src/scss/styles.scss: -------------------------------------------------------------------------------- 1 | $base-light: #644494; 2 | $base-dark: #FE9609; 3 | $enable-negative-margins: true; 4 | 5 | $headings-color: $base-light; 6 | $headings-color-dark: $base-dark; 7 | $link-color: $base-light; 8 | $link-color-dark: $base-dark; 9 | $code-color: $base-light; 10 | $code-color-dark: $base-dark; 11 | $badge-font-size: 0.85em; 12 | 13 | @import "bootstrap/scss/bootstrap"; 14 | @import "bootstrap-icons/font/bootstrap-icons.css"; 15 | 16 | .inversion-filter {} 17 | @media (prefers-color-scheme: dark) { 18 | .inversion-filter { 19 | filter: invert(100%) hue-rotate(180deg); 20 | } 21 | } 22 | 23 | .favicon-inversion-filter { 24 | filter: invert(29%) sepia(15%) saturate(2317%) hue-rotate(222deg) brightness(100%) contrast(90%) 25 | } 26 | @media (prefers-color-scheme: dark) { 27 | .favicon-inversion-filter { 28 | filter: invert(78%) sepia(78%) saturate(4236%) hue-rotate(356deg) brightness(103%) contrast(99%); 29 | } 30 | } 31 | 32 | .content-view { 33 | word-break: break-word !important; 34 | } 35 | 36 | .content-view header { @extend h5; } 37 | .content-view section header { @extend b; } 38 | 39 | .content-view .infobox { 40 | font-family: sans-serif; 41 | max-width: 320px; 42 | float: right !important; 43 | clear: right !important; 44 | background-color: rgb(44, 50, 56) !important; 45 | border: 1px solid #a2a9b1 !important; 46 | border-spacing: 3px !important; 47 | margin: 0.5em 0 0.5em 1em; 48 | padding: 0.2em; 49 | line-height: 1.5em; 50 | width: 100%; 51 | @extend small; 52 | @extend .m-2; 53 | } 54 | 
55 | .content-view .wikitable { 56 | margin: 1em 0; 57 | border: 1px solid #a2a9b1; 58 | border-collapse: collapse; 59 | } 60 | 61 | .content-view .wikitable > tr > th, .wikitable > * > tr > th { 62 | text-align: center; 63 | } 64 | 65 | .content-view .wikitable > tr > th, .wikitable > tr > td, .wikitable > * > tr > th, .wikitable > * > tr > td { 66 | border: 1px solid #a2a9b1; 67 | padding: 0.2em 0.4em; 68 | } 69 | 70 | .content-view .infobox > tbody { 71 | font-family: sans-serif; 72 | border-spacing: 3px; 73 | line-height: 1.5em; 74 | display: flex !important; 75 | flex-flow: column nowrap !important; 76 | @extend .m-2; 77 | } 78 | 79 | 80 | .content-view .infobox > tbody table { 81 | width: 100%; 82 | min-width: 100%; 83 | border-collapse: collapse; 84 | display: inline-table; 85 | } 86 | 87 | .content-view .reflist { 88 | @extend .small; 89 | } 90 | .content-view .refbegin { 91 | @extend .small; 92 | } 93 | 94 | .content-view details > summary { 95 | list-style: none; 96 | } 97 | .content-view details > summary::-webkit-details-marker { 98 | display: none; 99 | } 100 | 101 | .content-view .tright { 102 | float: right !important; 103 | clear: right !important; 104 | margin: 14px 0 14px 14px; 105 | } 106 | 107 | .content-view .tleft { 108 | float: left !important; 109 | clear: left !important; 110 | margin: 14px 14px 14px 0; 111 | } 112 | 113 | .content-view .thumb { 114 | max-width: 704px; 115 | 116 | box-sizing: content-box; 117 | display: block !important; 118 | position: static; 119 | text-align: center !important; 120 | } 121 | 122 | .content-view .thumb .thumbinner { 123 | display: flex !important; 124 | flex-direction: column; 125 | } 126 | 127 | .content-view .thumbcaption { 128 | margin: 0.5em 0 0; 129 | padding: 0 !important; 130 | width: auto !important; 131 | font-size: 0.8em; 132 | line-height: 1.5; 133 | @extend small; 134 | } 135 | 136 | .content-view .trow { 137 | display: flex !important; 138 | } 139 | 140 | .content-view .hatnote { 141 | 
@extend small; 142 | @extend .m-2; 143 | } 144 | 145 | -------------------------------------------------------------------------------- /cybrex/cybrex/data_source/geck_data_source.py: -------------------------------------------------------------------------------- 1 | from typing import ( 2 | List, 3 | Optional, 4 | ) 5 | 6 | import orjson 7 | from stc_geck.advices import BaseDocumentHolder 8 | from stc_geck.client import StcGeck 9 | 10 | from .base import ( 11 | BaseDataSource, 12 | SourceDocument, 13 | ) 14 | 15 | 16 | class GeckDataSource(BaseDataSource): 17 | def __init__(self, geck: StcGeck): 18 | self.geck = geck 19 | 20 | def _query_function( 21 | self, 22 | query: str = '', 23 | with_language_filter: bool = True, 24 | with_type_filter: bool = True, 25 | with_existence_filter: bool = False, 26 | ): 27 | subqueries = [] 28 | if with_type_filter: 29 | subqueries.append({'occur': 'must', 'query': {'boolean': {'subqueries': [ 30 | {'occur': 'should', 'query': {'term': {'field': 'type', 'value': 'book'}}}, 31 | {'occur': 'should', 'query': {'term': {'field': 'type', 'value': 'edited-book'}}}, 32 | {'occur': 'should', 'query': {'term': {'field': 'type', 'value': 'monograph'}}}, 33 | {'occur': 'should', 'query': {'term': {'field': 'type', 'value': 'reference-book'}}}, 34 | {'occur': 'should', 'query': {'term': {'field': 'type', 'value': 'journal-article'}}}, 35 | {'occur': 'should', 'query': {'term': {'field': 'type', 'value': 'wiki'}}}, 36 | ]}}}) 37 | if with_language_filter: 38 | subqueries.append({'occur': 'must', 'query': {'term': {'field': 'languages', 'value': 'en'}}}) 39 | if with_existence_filter: 40 | subqueries.append({'occur': 'must', 'query': {'exists': {'field': 'content'}}}) 41 | if query: 42 | subqueries.append({'occur': 'must', 'query': {'match': {'value': query.lower()}}}) 43 | if subqueries: 44 | return {'boolean': {'subqueries': subqueries}} 45 | else: 46 | return {'all': {}} 47 | 48 | async def stream_documents( 49 | self, 50 | query: str, 51 | 
class MapReduceChain:
    """Repeatedly condense chunks through an LLM until one answer remains.

    Chunks are packed into prompts by `chunk_accumulator` (map step), each
    prompt is sent to `llm_manager.process`, and the outputs are fed back in
    as new chunks (reduce step) until a single output is left.
    """

    def __init__(self, llm_manager: 'LLMManager', chunk_accumulator):
        self.llm_manager = llm_manager
        self.chunk_accumulator = chunk_accumulator

    def input_splitter(self, chunks: 'Iterable[Chunk]') -> Iterable[str]:
        """Yield one prompt each time the accumulator fills up, plus a final
        prompt for whatever is left over."""
        for chunk in chunks:
            self.chunk_accumulator.accept(chunk)
            if self.chunk_accumulator.is_full():
                yield self.chunk_accumulator.produce()
        if not self.chunk_accumulator.is_empty():
            yield self.chunk_accumulator.produce()

    def output_processor(self, llm_output: str) -> 'Chunk':
        """Wrap an intermediate LLM output so it can be accumulated again."""
        return Chunk(
            title=None,
            document_id=None,
            chunk_id=None,
            text=llm_output,
            length=len(llm_output)
        )

    def process(self, chunks: 'Iterable[Chunk]'):
        """Run map/reduce rounds over `chunks` and return the final stripped
        LLM output.

        Returns an empty string when `chunks` is empty: no prompt is produced
        in that case, and looping again would never terminate.
        """
        while True:
            outputs = []
            for prompt in self.input_splitter(chunks):
                llm_output = self.llm_manager.process(prompt)
                logging.getLogger('statbox').info({
                    'action': 'intermediate_map_reduce_step',
                    'output': llm_output,
                })
                outputs.append(llm_output)
            if not outputs:
                return ''
            if len(outputs) == 1:
                return outputs[0].strip()
            # NOTE(review): termination still relies on each round producing
            # fewer prompts than the previous one.
            chunks = [self.output_processor(output) for output in outputs]


class ChunkAccumulator:
    """Collect chunks until their combined text length reaches a limit."""

    def __init__(self, prompter, max_chunk_length: int):
        self.prompter = prompter
        self.max_chunk_length = max_chunk_length
        self.chunks = []
        self.current_chunk_length = 0

    def accept(self, chunk: 'Chunk'):
        """Add `chunk` to the current batch."""
        self.current_chunk_length += len(chunk.text)
        self.chunks.append(chunk)

    def is_full(self):
        """True once the accumulated text length reached `max_chunk_length`."""
        return self.current_chunk_length >= self.max_chunk_length

    def is_empty(self):
        """True when no chunk is waiting to be produced."""
        return len(self.chunks) == 0

    def _drain(self):
        """Return the accumulated chunks and reset the accumulator."""
        collected_chunks = self.chunks
        self.chunks = []
        self.current_chunk_length = 0
        return collected_chunks

    def produce(self):
        """Turn the accumulated chunks into a prompt; implemented by subclasses."""
        raise NotImplementedError


class QAChunkAccumulator(ChunkAccumulator):
    """Accumulator that builds question-answering prompts for a fixed query."""

    def __init__(self, query: str, prompter, max_chunk_length: int):
        super().__init__(prompter=prompter, max_chunk_length=max_chunk_length)
        self.query = query

    def produce(self):
        return self.prompter.qa_prompt(self.query, self._drain())


class SummarizeChunkAccumulator(ChunkAccumulator):
    """Accumulator that builds summarization prompts."""

    def produce(self):
        return self.prompter.summarize_prompt(self._drain())


class QAChain(MapReduceChain):
    """Map/reduce chain answering `query` over the processed chunks."""

    def __init__(self, query: str, llm_manager):
        super().__init__(
            llm_manager=llm_manager,
            chunk_accumulator=QAChunkAccumulator(
                query=query,
                prompter=llm_manager.prompter,
                max_chunk_length=llm_manager.max_prompt_chars,
            ))


class SummarizeChain(MapReduceChain):
    """Map/reduce chain summarizing the processed chunks."""

    def __init__(self, llm_manager: 'LLMManager'):
        super().__init__(
            llm_manager=llm_manager,
            chunk_accumulator=SummarizeChunkAccumulator(
                prompter=llm_manager.prompter,
                max_chunk_length=llm_manager.max_prompt_chars,
            )
        )
async def remove_button(event, mark, and_empty_too=False, link_preview=None):
    """Re-send the event's message without the buttons whose text contains `mark`.

    Args:
        event: object exposing awaitable ``get_message()`` and ``edit()``.
        mark: substring identifying the buttons to drop.
        and_empty_too: also drop buttons whose text is blank.
        link_preview: forwarded unchanged to ``event.edit``.

    Rows left without buttons are dropped entirely. Nothing happens when the
    original message cannot be fetched.
    """
    original_message = await event.get_message()
    if not original_message:
        return

    def should_drop(button):
        return mark in button.text or (and_empty_too and not button.text.strip())

    buttons = []
    # `buttons` is None (not an empty list) for a message without a keyboard.
    for original_line in original_message.buttons or []:
        line = [button for button in original_line if not should_drop(button)]
        if line:
            buttons.append(line)
    await event.edit(original_message.text, buttons=buttons, link_preview=link_preview)
def add_expand_dot(text, le: int):
    """Shorten `text` to about `le` characters and append ``'...'``.

    Text shorter than `le` is returned unchanged. Otherwise the text is cut at
    the last space within the first `le` characters; when that prefix has no
    space at all, the cut is made at `le` itself.
    """
    if len(text) < le:
        return text
    crop_at = text[:le].rfind(' ')
    if crop_at < 0:
        # rfind() returned -1: slicing with it would keep everything but the
        # last character instead of shortening the text.
        crop_at = le
    return text[:crop_at] + '...'
events.NewMessage(func=is_submit_message, incoming=True) 29 | is_group_handler = True 30 | writing_handler = True 31 | 32 | def get_internal_id_hint(self, message, reply_message) -> str: 33 | internal_id_hint = None 34 | if message.text: 35 | if internal_id := extract_internal_id(message.text): 36 | return internal_id 37 | elif doi_regex := re.search(DOI_REGEX, message.raw_text): 38 | internal_id_hint = 'id.dois:' + doi_regex.group(1) + '/' + doi_regex.group(2) 39 | if not internal_id_hint and reply_message: 40 | if internal_id := extract_internal_id(reply_message.text): 41 | return internal_id 42 | elif doi_regex := re.search(DOI_REGEX, reply_message.raw_text): 43 | internal_id_hint = 'id.dois:' + doi_regex.group(1) + '/' + doi_regex.group(2) 44 | return internal_id_hint 45 | 46 | async def handler(self, event, request_context: RequestContext): 47 | session_id = self.generate_session_id() 48 | 49 | request_context.add_default_fields(session_id=session_id) 50 | request_context.statbox(action='show', mode='submit', mime_type=event.document.mime_type) 51 | 52 | reply_message = await event.get_reply_message() 53 | internal_id_hint = self.get_internal_id_hint(message=event, reply_message=reply_message) 54 | request_context.statbox(action='doi_hint', internal_id_hint=internal_id_hint) 55 | 56 | if not internal_id_hint: 57 | return await event.reply( 58 | t('NO_DOI_HINT', request_context.chat['language']), 59 | buttons=None if request_context.is_group_mode() else [close_button()], 60 | ) 61 | field, value = internal_id_hint.split(':', 1) 62 | 63 | match event.document.mime_type: 64 | case 'application/pdf': 65 | if self.application.librarian_service: 66 | document = await self.application.summa_client.get_one_by_field_value( 67 | 'nexus_science', 68 | field, 69 | value, 70 | ) 71 | uploaded_message = await self.application.librarian_service.process_file( 72 | event, 73 | request_context, 74 | document, 75 | ) 76 | await 
self.application.database.add_upload(event.sender_id, uploaded_message.id, internal_id_hint) 77 | case _: 78 | request_context.statbox(action='unknown_file_format') 79 | request_context.error_log(UnknownFileFormatError(format=event.document.mime_type)) 80 | return await asyncio.gather( 81 | event.reply( 82 | t('UNKNOWN_FILE_FORMAT_ERROR', request_context.chat['language']), 83 | buttons=None if request_context.is_group_mode() else [close_button()], 84 | ), 85 | event.delete(), 86 | ) 87 | -------------------------------------------------------------------------------- /library/textutils/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import struct 3 | 4 | import isbnlib 5 | 6 | from . import ( 7 | EMAIL_REGEX, 8 | HASHTAG_REGEX, 9 | MULTIWHITESPACE_REGEX, 10 | NON_ALNUMWHITESPACE_REGEX, 11 | TELEGRAM_LINK_REGEX, 12 | URL_REGEX, 13 | ) 14 | 15 | 16 | def add_surrogate(text): 17 | return ''.join( 18 | # SMP -> Surrogate Pairs (Telegram offsets are calculated with these). 19 | # See https://en.wikipedia.org/wiki/Plane_(Unicode)#Overview for more. 
def remove_markdown(text):
    """Strip Markdown emphasis markers, backticks and link syntax, keeping the link labels."""
    text = re.sub('[*_~]{2,}', '', text)
    text = re.sub('`+', '', text)
    text = re.sub(r'\[\s*(.*?)(\s*)\]\(.*?\)', r'\g<1>\g<2>', text, flags=re.MULTILINE)
    return text


def remove_emails(text):
    """Remove every match of ``EMAIL_REGEX`` from *text*."""
    return re.sub(EMAIL_REGEX, '', text)


def remove_hashtags(text):
    """Remove every match of ``HASHTAG_REGEX`` from *text*."""
    return re.sub(HASHTAG_REGEX, '', text)


def remove_hidden_chars(text):
    """Remove soft hyphens (U+00AD) from *text*."""
    return text.replace('\xad', '')


def remove_url(text):
    """Remove every match of ``URL_REGEX`` from *text*."""
    return re.sub(URL_REGEX, '', text)


def replace_telegram_link(text):
    """Replace matches of ``TELEGRAM_LINK_REGEX`` with ``@`` plus the first captured group."""
    return re.sub(TELEGRAM_LINK_REGEX, r'@\1', text)


def split_at(s, pos):
    """Shorten *s* to roughly *pos* characters, preferring to cut at a separator.

    A string that already fits (``len(s) <= pos``) is returned unchanged.
    Otherwise the first separator found in the window starting ten characters
    before *pos* (and at most twenty characters wide) is used as the cut point.
    If the window holds no separator, the string is cut at the window start.
    An ellipsis is appended in both cases.

    Args:
        s: the string to shorten.
        pos: the target length.

    Returns:
        *s* itself, or a truncated copy ending with ``'...'``.
    """
    # `<=` rather than `<`: a string of exactly `pos` characters already fits.
    if len(s) <= pos:
        return s
    separators = ' \n.,:;-'
    start = max(0, pos - 10)
    stop = min(start + 20, len(s) - 1)
    for p in range(start, stop):
        if s[p] in separators:
            return s[:p] + '...'
    return s[:start] + '...'
def unwind_hashtags(text):
    """Replace every match of ``HASHTAG_REGEX`` with its second captured group."""
    return re.sub(HASHTAG_REGEX, r'\2', text)


def process_isbns(isbnlikes):
    """Normalise ISBN-like strings into a sorted list of unique identifiers.

    Empty items are skipped. A ten-character alphanumeric value that starts
    with a letter is kept as-is in upper case. Everything else goes through
    ``isbnlib.canonical``; for a valid ISBN-10 the ISBN-13 counterpart is added
    as well, and vice versa when the conversion succeeds.

    Args:
        isbnlikes: iterable of candidate strings (falsy items are ignored).

    Returns:
        Sorted list of unique identifiers.
    """
    found = set()
    for isbnlike in isbnlikes:
        if not isbnlike:
            continue
        # Letter-led ten-character codes bypass isbnlib and are only upper-cased.
        if isbnlike[0].isalpha() and len(isbnlike) == 10 and isbnlike[1:].isalnum():
            found.add(isbnlike.upper())
            continue
        isbn = isbnlib.canonical(isbnlike)
        if not isbn:
            continue
        found.add(isbn)
        if isbnlib.is_isbn10(isbn):
            if isbn13 := isbnlib.to_isbn13(isbn):
                found.add(isbn13)
        elif isbnlib.is_isbn13(isbn):
            if isbn10 := isbnlib.to_isbn10(isbn):
                found.add(isbn10)
    return sorted(found)
def share(self):
    """Return progress as a percentage, or as megabytes done when the total is falsy."""
    if not self.total:
        return f'{float(self.done / (1024 * 1024)):.1f}Mb'
    return f'{float(percent(self.done, self.total) * 100):.1f}%'


def _set_progress(self, done, total):
    """Store the new counters, remembering the previous ``done`` value."""
    self.previous_done, self.done = self.done, done
    self.total = total


def set_source(self, source):
    """Set the source name substituted into the banner and tail templates."""
    self.source = source


def render_banner(self):
    """Render the header and the formatted banner as one monospace block."""
    rendered = self.banner.format(source=self.source)
    return f'`{self.header}\n{rendered}`'


async def render_progress(self):
    """Render the header, a 20-cell bar (only when the total is truthy), the share and the tail."""
    bar_width = 20
    bar = ''
    if self.total:
        filled_cells = int(bar_width * percent(self.done, self.total))
        empty_cells = bar_width - filled_cells
        bar = ''.join(('|', filled_cells * bars['filled'], empty_cells * bars['empty'], '| '))

    tail = self.tail_text.format(source=self.source)
    return f'`{self.header}\n{bar}{self.share().ljust(8)} {tail}`'


def should_send(self, now, ignore_last_call):
    """Decide whether an update may be sent at time *now*.

    An update goes out when throttling is bypassed, when more than
    ``hard_throttle_secs`` passed since the last call, or when more than
    ``throttle_secs`` passed and the progress since the previous callback is
    below ``done_threshold_size``.
    """
    if ignore_last_call:
        return True
    elapsed = abs(now - self.last_call)
    if elapsed > self.hard_throttle_secs:
        return True
    progressed = self.done - self.previous_done
    return elapsed > self.throttle_secs and progressed < self.done_threshold_size


async def send_message(self, text, ignore_last_call=False):
    """Send the first progress message or edit the existing one, honouring throttling.

    Returns:
        The tracked message, or ``None`` when the update was throttled.

    Raises:
        ProgressBarLostMessageError: when editing returns a falsy result or
            Telegram reports an invalid message id.
    """
    now = time.time()
    if not self.should_send(now, ignore_last_call):
        return
    try:
        if self.message:
            # Identical text is not re-sent, but still counts as a call below.
            if text != self.last_text:
                edited = await self.message.edit(text)
                if not edited:
                    raise ProgressBarLostMessageError()
        else:
            self.message = await self.telegram_client.send_message(
                self.request_context.chat['chat_id'],
                text,
            )
    except MessageIdInvalidError:
        raise ProgressBarLostMessageError()
    self.last_text = text
    self.last_call = now
    return self.message
def remove_from_list(lst, value):
    """Remove the first occurrence of *value* from *lst*; do nothing if it is absent."""
    try:
        lst.remove(value)
    except ValueError:
        pass


class VoteHandler(BaseCallbackQueryHandler):
    """Callback handler for the ``/vote_c`` and ``/vote_i`` moderation buttons.

    Votes are kept in the message text itself, in a ``Correct:`` / ``Incorrect:``
    block holding comma-separated salted hashes of the moderators' user ids.
    """
    is_group_handler = True
    filter = events.CallbackQuery(pattern='^/vote_([ic])$')
    writing_handler = True

    # The group names are the ones read below via group('correct'),
    # group('incorrect') and group('doi').
    votes_regexp = re.compile(r'Correct:(?P<correct>\s*.*)\nIncorrect:(?P<incorrect>\s*.*)')
    doi_regexp = re.compile(r'\*\*DOI:\*\* \[(?P<doi>.*)]\(.*\)')
    salt = 'y4XF-OsYl3M'

    def parse_pattern(self, event: events.ChatAction):
        """Return the vote kind from the callback data: ``'c'`` or ``'i'``."""
        return event.pattern_match.group(1).decode()

    async def handler(self, event: events.ChatAction, request_context: RequestContext):
        """Record a moderator's vote and pin the uploaded file once it is approved.

        The file is approved when correct votes outnumber incorrect ones by at
        least ``librarian.required_votes``, or when a super moderator votes
        "correct".

        Raises:
            ValueError: when an approved message contains neither an internal
                id nor a DOI line.
        """
        user_id = event.query.user_id
        librarian_config = self.application.config['librarian']
        if user_id not in librarian_config['moderators']:
            return await event.answer('You cannot vote')
        if self.application.is_read_only():
            return await event.answer('Read-only mode, try to vote later')

        vote = self.parse_pattern(event)

        request_context.add_default_fields(mode='vote')

        message = await event.get_message()
        text = message.text if message else None
        current_votes = self.votes_regexp.search(text) if text else None
        if not current_votes:
            # Without the votes block there is nothing to update.
            return await event.answer('Cannot find votes in this message')
        librarian_hash = hashlib.md5(f"{user_id}-{self.salt}".encode()).hexdigest()[-8:]

        request_context.statbox(
            action='vote',
            vote=vote,
        )

        sep = ', '
        correct_votes = []
        if correct_votes_str := current_votes.group('correct').strip():
            correct_votes = correct_votes_str.split(sep)
        incorrect_votes = []
        if incorrect_votes_str := current_votes.group('incorrect').strip():
            incorrect_votes = incorrect_votes_str.split(sep)

        # A moderator has at most one vote: drop the previous one first.
        remove_from_list(correct_votes, librarian_hash)
        remove_from_list(incorrect_votes, librarian_hash)

        if vote == 'c':
            correct_votes.append(librarian_hash)
        else:
            incorrect_votes.append(librarian_hash)

        # Replace the later span first so the offsets of the earlier one stay valid.
        start, end = current_votes.span('incorrect')
        text = text[:start] + ' ' + sep.join(incorrect_votes) + text[end:]
        start, end = current_votes.span('correct')
        text = text[:start] + ' ' + sep.join(correct_votes) + text[end:]

        is_approved = (
            len(correct_votes) - len(incorrect_votes) >= librarian_config['required_votes']
            or (user_id in librarian_config['super_moderators'] and vote == 'c')
        )
        if not is_approved:
            # One edit only; a repeated identical vote leaves the message unchanged,
            # which Telegram reports as "not modified".
            try:
                await message.edit(text)
            except telethon.errors.MessageNotModifiedError:
                pass
            return

        await message.edit(text, buttons=None)
        internal_id = extract_internal_id(text)
        if not internal_id:
            doi_re = self.doi_regexp.search(text)
            if not doi_re:
                raise ValueError('Cannot determine the document id of the approved upload')
            internal_id = f'id.dois:{doi_re.group("doi").strip().lower()}'
        field, value = internal_id.split(':', 1)
        document = await self.application.summa_client.get_one_by_field_value('nexus_science', field, value)
        file = await message.download_media(file=bytes)

        request_context.statbox(
            action='pinning',
            internal_id=internal_id,
            filesize=len(file),
        )

        await self.application.file_flow.pin_add(document, file, with_commit=True)
        await self.application.database.add_approve(message.id, 1)
        reply_message = await message.get_reply_message()
        if reply_message:
            try:
                await reply_message.delete()
            except telethon.errors.rpcerrorlist.MessageDeleteForbiddenError:
                pass
        await event.delete()
async def handler(self, event: events.ChatAction, request_context: RequestContext):
    """Show up to five documents similar to the one referenced by the pressed button.

    The source document is looked up first. If it no longer exists the user
    gets ``OUTDATED_VIEW_LINK`` and no "searching" placeholder is left behind.
    Otherwise a more-like-this query is built from the title, abstract, tags
    and languages, restricted to types compatible with the source, and the
    source itself is excluded from the results.
    """
    field, value = self.parse_pattern(event)

    request_context.add_default_fields(mode='mlt', field=field, value=value)
    request_context.statbox(action='view')

    language = request_context.chat['language']
    source_document = await self.application.summa_client.get_one_by_field_value('nexus_science', field, value)
    if not source_document:
        return await event.reply(t("OUTDATED_VIEW_LINK", language))

    # Sent only once the source is known to exist, so it is always edited below.
    prefetch_message = await self.application.get_telegram_client(request_context.bot_name).send_message(
        event.chat,
        t("SEARCHING", language)
    )

    document_dump = filter_none({
        'title': source_document.get('title'),
        'abstract': source_document.get('abstract'),
        'tags': source_document.get('tags'),
        'languages': source_document.get('languages'),
    })

    subqueries = [{
        'occur': 'should',
        'query': {'more_like_this': {
            'boost': '3.0',
            'max_query_terms': 64,
            'min_term_frequency': 1,
            'min_doc_frequency': 1,
            'max_doc_frequency': 1_000_000,
            'document': json.dumps(document_dump)
        }}
    }]

    book_filter = 'type:book type:"edited-book" type:monograph type:"reference-book"'
    # `.get` keeps a source without a `type` on the default (books + articles) filter.
    source_type = source_document.get('type')
    if source_type in {'book', 'edited-book', 'monograph', 'reference-book'}:
        requested_type = book_filter
    elif source_type in {'journal-article', 'proceedings-article'}:
        requested_type = f'type:"{source_type}"'
    else:
        requested_type = f'{book_filter} type:"journal-article"'

    source_holder = BaseTelegramDocumentHolder(source_document)
    documents = await self.application.summa_client.search_documents({
        'index_alias': 'nexus_science',
        'query': {'boolean': {'subqueries': [
            {'occur': 'must', 'query': {'boolean': {'subqueries': subqueries}}},
            {'occur': 'must', 'query': {'match': {'value': requested_type}}},
            {'occur': 'must_not', 'query': {'match': {'value': source_holder.get_internal_id()}}}
        ]}},
        'collectors': [{'top_docs': {'limit': 5, 'scorer': get_default_scorer(self.application.search_request_builder.profile)}}],
    })

    with_librarian_service = bool(self.application.librarian_service) and not self.application.is_read_only()
    serp = '\n\n'.join(
        BaseTelegramDocumentHolder(document).base_render(
            request_context,
            with_librarian_service=with_librarian_service,
        )
        for document in documents
    )
    serp = f'**Similar To: {source_holder.get_title_with_link(bot_name=request_context.bot_name)}**\n\n{serp}'
    await remove_button(event, '🖲', and_empty_too=True)
    return await prefetch_message.edit(serp, buttons=[close_button()])
SERP 21 | page_size: 5 22 | # Length of generated Request-ID used for tracking requests across all backends 23 | request_id_length: 12 24 | # Length of generated Session-ID used in commands to clue user sessions 25 | session_id_length: 8 26 | sol_donate_address: 'FcJG17cEyG8LnNkdJg8HCAQQZKxqpwTupD9fc3GXMqxD' 27 | # URL of picture to show in the message about queries with invalid syntax 28 | too_difficult_picture_url: 29 | # URL of picture to show in maintenance message 30 | upgrade_maintenance_picture_url: 31 | xmr_donate_address: '42HZx5Cg1uQ2CtCrq7QabP23BN7gBrGu6U6QumkMmR4bKS61gcoP8xyNzP5cJCbjac9yaWFhLsDmM3adMWyBKBXn1d9WiUb' 32 | xrp_donate_address: 'rw2ciyaNshpHe7bCHo4bRWq6pqqynnWKQg' 33 | xrp_donate_tag: '1968122674' 34 | # Number of async routines for starting all bots 35 | workers: 8 36 | # File Flow service for storing files. Cannot be used in light mode 37 | file_flow: 38 | enabled: false 39 | ipfs: 40 | # Base URL for IPFS Gateway. You can choose any public gateway, such as https://dweb.link/ or https://ipfs.io/ 41 | # or set up your own gateway locally and set it with http://127.0.0.1:8080 42 | http: 43 | base_url: http://ipfs:8080 44 | # Configure Librarian service for uploading files. Cannot be used in light mode 45 | librarian: 46 | # Credentials of admin account for managing Aaron's groups 47 | admin: 48 | app_id: 49 | app_hash: 50 | phone: 51 | # Credentials of bot account for posting messages 52 | bot: 53 | app_id: 54 | app_hash: 55 | bot_name: 56 | bot_token: 57 | enabled: false 58 | group_name: 'nexus_aaron' 59 | moderators: [] 60 | # How large the difference between correct and incorrect votes must be for approval 61 | required_votes: 2 62 | # Can solely approve uploaded items 63 | super_moderators: [] 64 | log_path: /var/log/stc-tgbot 65 | # Configure Metadata retriever for ingesting new files.
Cannot be used in light mode 66 | metadata_retriever: 67 | # Configuration of Crossref API Client 68 | crossref_api: 69 | timeout: 15 70 | user_agent: anon@example.com 71 | enabled: false 72 | # Index for committing changes 73 | index_alias: nexus_science 74 | reddit: 75 | url: https://reddit.com/r/science_nexus 76 | telegram: 77 | # Enabled handlers 78 | command_handlers: 79 | - tgbot.handlers.aboutus.AboutusHandler 80 | - tgbot.handlers.close.CloseHandler 81 | - tgbot.handlers.cybrex.CybrexHandler 82 | - tgbot.handlers.download.DownloadHandler 83 | - tgbot.handlers.howtohelp.HowToHelpHandler 84 | - tgbot.handlers.help.HelpHandler 85 | - tgbot.handlers.q.QHandler 86 | - tgbot.handlers.report.ReportHandler 87 | - tgbot.handlers.riot.RiotHandler 88 | - tgbot.handlers.riot.RiotBFHandler 89 | - tgbot.handlers.riot.RiotOldHandler 90 | - tgbot.handlers.riot.RiotCredHandler 91 | - tgbot.handlers.roll.RollHandler 92 | - tgbot.handlers.shortlink.ShortlinkHandler 93 | - tgbot.handlers.start.StartHandler 94 | - tgbot.handlers.stop.StopHandler 95 | - tgbot.handlers.mlt.MltHandler 96 | - tgbot.handlers.view.ViewHandler 97 | - tgbot.handlers.noop.NoopHandler 98 | # Channel that will be shown in /help, /howtohelp and in promotions 99 | related_channel: nexus_search 100 | search_handlers: 101 | - tgbot.handlers.search.SearchHandler 102 | - tgbot.handlers.search.SearchEditHandler 103 | - tgbot.handlers.search.SearchPagingHandler 104 | - tgbot.handlers.search.InlineSearchHandler 105 | summa: 106 | endpoint: 127.0.0.0:10082 107 | embed: 108 | enabled: true 109 | ipfs_data_directory: /ipns/libstc.cc/data/ 110 | twitter: 111 | contact_url: https://twitter.com/the_superpirate 112 | -------------------------------------------------------------------------------- /web/src/components/download-progress.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @desc downloadProgress initiator 3 | * @param files {Array} 4 | * @event beforeLoading 5 | 
/**
 * Downloads a list of files with XMLHttpRequest and reports aggregate progress.
 *
 * Events are dispatched on `document`:
 *  - `beforeLoading` once, when `init()` is called;
 *  - `progress` with `detail` = overall percentage (0-100) on every XHR progress event;
 *  - `afterLoading` with `detail` = `{ response, url }` when a file finishes with status 200.
 */
export default class DownloadProgress {
  files: string[]
  // Fraction (0..1) downloaded so far, keyed by file URL.
  percentages: Record<string, number>
  percentage: number
  events: {
    beforeLoading: Event
    afterLoading: any
    progress: any
  }

  // One promise per file, resolved with the XHR response (an ArrayBuffer).
  promises: Array<Promise<any>>

  /**
   * @param files URLs to download
   */
  constructor (files) {
    this.files = files
    this.percentages = {}
    this.percentage = 0
    this.events = {
      beforeLoading: new Event('beforeLoading'),
      afterLoading: function (response, url) {
        return new CustomEvent('afterLoading', {
          detail: { response, url }
        })
      },
      progress: function (percentage) {
        return new CustomEvent('progress', { detail: percentage })
      }
    }
    this.promises = []
  }

  /**
   * Progress callback of a single XHR: updates the share of `url` and
   * dispatches the aggregated `progress` event.
   * @param url file the event belongs to
   * @param oEvent XHR progress event
   */
  _downloadProgressUpdateProgress (url, oEvent) {
    let total = oEvent.total
    if (!oEvent.lengthComputable) {
      // Fall back to the length headers; a missing header yields 0 here.
      total = Number(
        oEvent.target.getResponseHeader('x-decompressed-content-length') ||
          oEvent.target.getResponseHeader('content-length')
      )
    }
    // An unknown or zero total must not turn into NaN/Infinity.
    const percentComplete = total > 0 ? oEvent.loaded / total : 0
    this.percentages[url] = percentComplete
    let totalPercentage = 0
    for (const key in this.percentages) {
      totalPercentage += this.percentages[key]
    }
    this.percentage = (totalPercentage / this.files.length) * 100
    document.dispatchEvent(this.events.progress(this.percentage))
  }

  /**
   * Downloads `files[index]` as an ArrayBuffer.
   * Rejects with `{ status, statusText }` on a non-2xx status or a network error.
   * @param index position of the file in `files`
   */
  async initiate_download (index) {
    const that = this
    return await new Promise(function (resolve, reject) {
      const xhr = new XMLHttpRequest()
      const url = that.files[index]
      xhr.addEventListener(
        'progress',
        that._downloadProgressUpdateProgress.bind(that, url)
      )
      xhr.responseType = 'arraybuffer'
      xhr.open('GET', url)
      xhr.onreadystatechange = function () {
        if (xhr.status === 200 && xhr.readyState === 4) {
          document.dispatchEvent(that.events.afterLoading(xhr.response, url))
        }
      }
      xhr.onload = function () {
        if (xhr.status >= 200 && xhr.status < 300) {
          resolve(xhr.response)
        } else {
          reject({
            status: xhr.status,
            statusText: xhr.statusText
          })
        }
      }
      xhr.onerror = function () {
        reject({
          status: xhr.status,
          statusText: xhr.statusText
        })
      }
      xhr.send()
    })
  }

  /**
   * Attaches `callback` to the given event name.
   * @return this, for chaining
   */
  on (event, callback) {
    document.addEventListener(event, callback, false)
    return this
  }

  /**
   * Detaches a callback previously attached with `on`.
   * @return this, for chaining
   */
  off (event, callback) {
    document.removeEventListener(event, callback, false)
    return this
  }

  /**
   * Dispatches `beforeLoading` and starts all downloads.
   * @return this, for chaining
   */
  init () {
    document.dispatchEvent(this.events.beforeLoading)
    for (let i = 0; i < this.files.length; i++) {
      this.percentages[this.files[i]] = 0
      this.promises.push(this.initiate_download(i))
    }
    return this
  }
}

/**
 * Downloads `files` while mirroring the overall progress into `progress_bar.value`
 * (e.g. "42%"); the value is reset to `undefined` whenever a file finishes loading.
 * The listeners are detached once all downloads settle.
 */
export async function tracked_download (files, progress_bar) {
  const dp = new DownloadProgress(files)
  const on_progress = function (e) {
    const downloaded = Number.isFinite(e.detail) ? e.detail : 0
    progress_bar.value = `${downloaded.toFixed(0)}%`
  }
  const on_after_loading = function () {
    progress_bar.value = undefined
  }
  dp.on('progress', on_progress).on('afterLoading', on_after_loading)
  dp.init()
  try {
    return await Promise.all(dp.promises)
  } finally {
    // Listeners live on `document`; without this each call would leave two behind.
    dp.off('progress', on_progress).off('afterLoading', on_after_loading)
  }
}
export class QueryProcessor {
  /**
   * Builds a search request for the index described by `index_config`.
   *
   * @param index_config index description; only `index_name` is read
   * @param query raw user query (an empty query matches all documents)
   * @param query_config paging, filtering and sorting options
   */
  generate_request (index_config: { index_name: string }, query: string, query_config: QueryConfig) {
    return {
      index_alias: index_config.index_name,
      query: default_queries(query, query_config),
      collectors: default_collectors(query_config),
      is_fieldnorms_scoring_enabled: false,
      store_cache: true,
      load_cache: true
    }
  }
}
class CybrexHandler(BaseHandler):
    """Handler of the ``/cybrex`` command.

    When the command replies to a text message, the query is applied to that
    text. Otherwise the query is parsed as ``<command> <args> [--key=value]``
    and dispatched to the matching Cybrex AI call.
    """
    filter = events.NewMessage(incoming=True, pattern=re.compile(r'^/cybrex(?:@\w+)?(?:\s+)?(.*)?$', re.DOTALL))
    is_group_handler = True

    @staticmethod
    def _coerce_value(raw):
        """Convert an option value to ``int`` or ``float`` when possible, else keep the string."""
        for cast in (int, float):
            try:
                return cast(raw)
            except ValueError:
                continue
        return raw

    def parse_command(self, query):
        """Split a command line into the command name, positional args and options.

        Items starting with ``-`` are options: ``--top-k=3`` becomes
        ``{'top_k': 3}`` and a flag without a value (``--verbose``) becomes
        ``{'verbose': True}``. Everything else is a positional argument.

        Returns:
            ``(cmd, args, kwargs)``.

        Raises:
            ValueError: for an empty command line or unbalanced quotes.
        """
        argv = shlex.split(query)
        if not argv:
            raise ValueError('Cybrex command is empty')
        cmd, *argv = argv
        args = []
        kwargs = {}
        for arg in argv:
            key, has_value, raw_value = arg.lstrip('-').partition('=')
            if not arg.startswith('-') or not key:
                args.append(arg)
                continue
            kwargs[key.replace('-', '_')] = self._coerce_value(raw_value) if has_value else True
        return cmd, args, kwargs

    async def handler(self, event: events.ChatAction, request_context: RequestContext):
        """Answer a ``/cybrex`` request from a whitelisted user or chat."""
        session_id = self.generate_session_id()
        request_context.add_default_fields(mode='cybrex', session_id=session_id)
        request_context.statbox(action='show', sender_id=event.sender_id, event=str(event))

        # A config without `cybrex_whitelist` means nobody is allowed.
        try:
            whitelist = self.application.config['application']['cybrex_whitelist']
        except KeyError:
            whitelist = ()
        is_allowed = bool(event.sender_id) and int(event.sender_id) in whitelist
        if not is_allowed and event.sender_id is None:
            # No sender id: fall back to whitelisting by chat id.
            is_allowed = request_context.chat['chat_id'] in whitelist
        if not is_allowed:
            return await event.reply('Only People of Nexus can call me')

        if not self.application.cybrex_ai:
            return await event.reply('Cybrex is disabled for now')

        query = event.pattern_match.group(1).strip()
        if not query:
            text = "My name is Cybrex and I can respond queries based on STC data."
            return await event.reply(text)

        reply_message = await event.get_reply_message()
        request_context.statbox(action='found_reply_message', reply_message=str(reply_message))

        if reply_message and reply_message.raw_text:
            wait_message = await event.reply('`All right, wait a sec...`')

            text = reply_message.raw_text
            cybrex_response = await self.application.cybrex_ai.general_text_processing(query, text)
            response = f'🤔 **{query}**'
            response = f'{response}\n\n🤖: {cybrex_response.answer.strip()}'
            return await asyncio.gather(
                wait_message.delete(),
                reply_message.reply(response),
            )

        cli = {
            'chat-doc': self.application.cybrex_ai.chat_document,
            'chat-sci': self.application.cybrex_ai.chat_science,
            'semantic-search': self.application.cybrex_ai.semantic_search,
            'sum-doc': self.application.cybrex_ai.summarize_document,
        }

        # Validate the command before showing the "looking" placeholder.
        try:
            cmd, args, kwargs = self.parse_command(query)
        except ValueError as e:
            return await event.reply(f'Cannot parse the command: {e}')
        if cmd not in cli:
            return await event.reply(f'Unknown command `{cmd}`. Available commands: {", ".join(sorted(cli))}')
        if not args:
            return await event.reply(f'Command `{cmd}` requires at least one argument')

        wait_message = await event.reply('`Looking for the answer in STC...`')

        response = await cli[cmd](*args, **kwargs)
        show_texts = False

        if cmd == 'semantic-search':
            answer, chunks = None, [scored_chunk.chunk for scored_chunk in response]
            show_texts = True
        else:
            answer, chunks = response.answer, response.chunks

        response = f'🤔 **{args[0]}**'
        if answer:
            response = f'{response}\n\n🤖: {answer}'

        references = []
        visited = set()
        for chunk in chunks[:3]:
            # Keep only `<field>:<value>`; tolerate ids with fewer or more colons.
            document_id = ':'.join(chunk.document_id.split(':', 2)[:2])
            title = chunk.title.split("\n")[0]
            reference = f' - **{title}** - `{document_id}`'
            if show_texts:
                reference += f'\n**Text:** {chunk.text}'
            else:
                # Without texts each document is listed once.
                if document_id in visited:
                    continue
                visited.add(document_id)
            references.append(reference)

        if show_texts:
            references = '\n\n'.join(references)
        else:
            references = '\n'.join(references)
        if references:
            response += f'\n\n**References:**\n\n{references}'

        return await asyncio.gather(
            wait_message.delete(),
            event.reply(response),
        )
mode='riot') 29 | if event.message.fwd_from and event.message.fwd_from.from_id == PeerUser(93372553): 30 | bot_name = event.pattern_match.group(1) 31 | bot_token = event.pattern_match.group(2).strip('`') 32 | await self.application.database.add_new_bot( 33 | bot_name=bot_name, 34 | bot_token=bot_token, 35 | user_id=int(event.message.peer_id.user_id), 36 | ) 37 | await event.reply( 38 | 'Done! Now you should provide application credentials for launching your bot.\n' 39 | 'Follow [guide](https://core.telegram.org/api/obtaining_api_id#obtaining-api-id) and ' 40 | 'then send here bot credentials in the following format:\n' 41 | f'`/riot @{bot_name.strip()} `\n' 42 | 'N.B: The only required fields will be App Name and Short Name' 43 | ) 44 | raise events.StopPropagation() 45 | else: 46 | await event.reply( 47 | 'Seems that your client hides the source of forward. ' 48 | 'Change it in the options of your Telegram client and repeat' 49 | ) 50 | raise events.StopPropagation() 51 | 52 | 53 | class RiotHandler(BaseHandler): 54 | filter = events.NewMessage( 55 | incoming=True, 56 | pattern="^/riot$", 57 | ) 58 | is_group_handler = False 59 | 60 | async def handler(self, event: events.ChatAction, request_context: RequestContext): 61 | request_context.statbox(action='show', mode='riot') 62 | await event.reply( 63 | 'Register new bot in @BotFather and **forward** me the message starting with "Done!..."\n' 64 | 'Check twice that your client doesn\'t hide original forwarder (like Owlgram or others do)' 65 | ) 66 | raise events.StopPropagation() 67 | 68 | 69 | class RiotOldHandler(BaseHandler): 70 | filter = events.NewMessage( 71 | incoming=True, 72 | pattern="^/riot_register$", 73 | ) 74 | is_group_handler = False 75 | 76 | async def handler(self, event: events.ChatAction, request_context: RequestContext): 77 | request_context.statbox(action='show', mode='riot') 78 | await event.reply( 79 | 'We need to re-register the bot to its owner. 
If you are the owner just forward here the same message ' 80 | 'from @BotFather that you had sent to create this bot' 81 | ) 82 | raise events.StopPropagation() 83 | 84 | 85 | class RiotCredHandler(BaseHandler): 86 | filter = events.NewMessage( 87 | incoming=True, 88 | pattern=r"^/riot\s+@([A-Za-z_0-9]+[Bb][Oo][Tt])\s+?\s+?$", 89 | ) 90 | is_group_handler = False 91 | 92 | async def handler(self, event: events.ChatAction, request_context: RequestContext): 93 | bot_name = event.pattern_match.group(1) 94 | app_id = event.pattern_match.group(2) 95 | app_hash = event.pattern_match.group(3) 96 | request_context.statbox(action='cred', mode='riot', target_bot_name=bot_name, app_id=app_id, app_hash=app_hash) 97 | if bot_name and app_id and app_hash: 98 | async with self.application.database.bots_db_wrapper.db.execute("select owner_id from user_bots where bot_name = ?", (bot_name.strip(),)) as cursor: 99 | async for row in cursor: 100 | if row['owner_id'] != int(event.message.peer_id.user_id): 101 | await event.reply( 102 | f"Bot {bot_name.strip()} is not associated with you. " 103 | f"Please, send message with bot token again." 104 | ) 105 | return 106 | await self.application.database.set_bot_credentials( 107 | bot_name=bot_name.strip(), 108 | app_id=app_id.strip(), 109 | app_hash=app_hash.strip(), 110 | ) 111 | await event.reply(f"Bot credentials for {bot_name.strip()} have been updated! " 112 | f"Your bot will be ready in 5 minutes. Then go to @{bot_name}, " 113 | f"type `/start` and use it") 114 | raise events.StopPropagation() 115 | -------------------------------------------------------------------------------- /cybrex/README.md: -------------------------------------------------------------------------------- 1 | # Cybrex AI 2 | 3 | Cybrex AI integrates several strategies to use AI for facilitating navigation through science. Shortly, Cybrex accept your query, retrieve scholarly publications and books from STC and answer your query using AI and collected documents. 
4 | 5 | More technical description: 6 | - IPFS is utilized to access the Standard Template Construct (STC). 7 | - STC provides the raw documents for Cybrex. 8 | - An embedding model constructs embeddings for these documents and Cybrex stores these embeddings in the vector database. 9 | - These embeddings are then used to retrieve relevant documents, and then they are sent to an LLM for Q&A and summarization. 10 | 11 | ## Install 12 | 13 | You should have [installed IPFS](https://libstc.cc/#/help/install-ipfs) 14 | 15 | Then, you should install the cybrex package 16 | ```bash 17 | ultranymous@nevermore:~ pip install cybrex 18 | ``` 19 | 20 | and launch a qdrant database for storing vectors: 21 | 22 | ```bash 23 | ultranymous@nevermore:~ docker run -p 6333:6333 -p 6334:6334 qdrant/qdrant 24 | ``` 25 | 26 | Upon its initial launch, `cybrex` will create a `~/.cybrex` directory containing a `config.yaml` file and a `chroma` directory. 27 | You can edit the config file to point to different IPFS addresses. 28 | 29 | ## Usage 30 | 31 | **Attention!** STC does not contain every book or publication in the world. We are constantly increasing coverage but there is still a lot to do. 32 | STC contains metadata for most of the items, but `links` or `content` fields may be absent. 33 | 34 | ```console 35 | # (Optional) Launch Summa search engine, then you will not have to wait for bootstrapping every time. 36 | # It will take some time! Wait until the text `Serving on ...` appears 37 | # If you decide to launch it, switch to another Terminal window 38 | ultranymous@nevermore:~ geck --ipfs-http-base-url 127.0.0.1:8080 - serve 39 | ``` 40 | 41 | Now we should initialize Cybrex and choose which models will be used: 42 | 43 | ```console 44 | ultranymous@nevermore:~ cybrex - write-config --force 45 | # or, if you want to use an OpenAI model, export the API key and set the appropriate models in the config: 46 | ultranymous@nevermore:~ export OPENAI_API_KEY=... 
47 | ultranymous@nevermore:~ cybrex - write-config -l openai --force 48 | # or if you want to use GPU: 49 | ultranymous@nevermore:~ cybrex - write-config --device cuda --force 50 | 51 | # Summarize a document 52 | ultranymous@nevermore:~ cybrex - sum-doc doi:10.1155/2022/7138756 53 | 54 | Document: doi:10.1155/2022/7138756 55 | Summarization: Resveratrol is a natural compound found in various plants and has been studied for 56 | its anti-inflammatory and antiviral properties. Resveratrol has been shown to regulate miR-223-3p/NLRP3 57 | pathways, inhibit downstream caspase-1 activation, reduce the expression of chemokines, and decrease 58 | the levels of calcium strength, pro-inflammatory cytokines, and MDA in an acute bacterial meningitis model. 59 | It can also regulate the PI3K/Akt/mTOR signaling pathway, reduce NF-κB/p65 and pro-inflammatory cytokines, 60 | and increase nitric oxide, sialic acid, gastric tissue, and vitamin C concentrations. Resveratrol has been 61 | found to inhibit viral replication and have antiviral activity against Zika Virus, Pseudorabies virus, 62 | and HSV-1. The exact mechanisms of action of resveratrol are still not fully understood, but it is believed 63 | to activate the host's immune defences, affect the TLRs/NF-κB signalling pathway, and directly inhibit 64 | viral gene expression. 65 | 66 | # Question a document 67 | ultranymous@nevermore:~ cybrex - chat-doc doi:10.1155/2022/7138756 \ 68 | --query "What is the antivirus effect of resveratrol?" 69 | 70 | Q: What is the antivirus effect of resveratrol? 71 | A: Resveratrol has been found to have antiviral effects, primarily through its ability to inhibit viral 72 | entry and replication. It has been reported to inhibit the replication of multiple viruses, including 73 | human immunodeficiency virus (HIV), herpes simplex virus (HSV), hepatitis C virus (HCV), and 74 | Zika virus (ZIKV). 
Resveratrol appears to block the activities of the TIR-domain-containing 75 | adapter-inducing interferon-β (TRIF) complex, suggesting that resveratrol would also inhibit NF-κB 76 | transcription induced by TRIF. Additionally, it has been reported to reduce the activity of respiratory 77 | syncytial virus (RSV) and to stimulate the secretion of higher levels of TNF-α, promoting cell death 78 | and RSV clearance. 79 | 80 | # Question entire science 81 | ultranymous@nevermore:~ cybrex - chat-sci "What is the antivirus effect of resveratrol?" --n-chunks 4 --n-documents 10 82 | 83 | Q: What is the antivirus effect of resveratrol? 84 | A: Resveratrol has been found to possess antiviral activity against a variety of viruses, including herpes simplex virus, human immunodeficiency virus, and hepatitis C virus. It has been shown to inhibit the replication of several viruses, including HIV, herpes simplex virus, and influenza virus, and to regulate TLR3 expression, thus affecting the recruitment of downstream related factors and finally affecting the regulation process of related signal pathways. It has also been studied for its antiviral activity against Reoviridae, and for its potential to inhibit Zika virus cytopathy effect. It has been active against Epstein virus, rotavirus, and vesicular stomatitis virus, and has been reported to alleviate virus-induced reproductive failure and to promote RSV clearance in the body more quickly. 85 | 86 | ``` 87 | --------------------------------------------------------------------------------