├── py-origin ├── assets ├── aifw │ ├── __init__.py │ └── __main__.py ├── ui │ ├── requirements.txt │ └── desktop_app.py ├── cli │ └── requirements.txt ├── services │ ├── app │ │ ├── __init__.py │ │ ├── presidio_filters.json │ │ ├── local_api.py │ │ ├── aifw_utils.py │ │ ├── test_restore.py │ │ ├── llm_client.py │ │ ├── main.py │ │ └── one_aifw_api.py │ ├── requirements.txt │ └── fake_llm │ │ └── echo_server.py ├── Dockerfile └── README.md ├── cli └── python │ ├── assets │ ├── requirements.txt │ ├── services │ ├── app │ │ ├── __init__.py │ │ ├── local_api.py │ │ ├── aifw_utils.py │ │ ├── test_restore.py │ │ ├── llm_client.py │ │ ├── main.py │ │ └── one_aifw_api.py │ ├── requirements.txt │ └── fake_llm │ │ └── echo_server.py │ └── Dockerfile ├── .dockerignore ├── pnpm-workspace.yaml ├── docker-compose.yml ├── assets ├── local-fake-llm-apikey.json ├── oneaifw_assets_hashes.json └── aifw.yaml ├── libs ├── aifw-py │ ├── requirements.txt │ └── __init__.py ├── regex │ ├── Cargo.toml │ ├── Cargo.lock │ └── src │ │ └── lib.rs └── aifw-js │ ├── vite.config.js │ ├── package.json │ └── scripts │ └── copy-assets.mjs ├── core ├── wasm_shims.zig ├── recog_entity.zig ├── SpanMerger.zig └── NerRecognizer.zig ├── browser_extension ├── offscreen.html ├── options.html ├── content.js ├── popup.html ├── popup.js ├── manifest.json ├── README.md ├── offscreen.js ├── background.js ├── aifw-extension-sample.js └── indexeddb-models.js ├── .gitignore ├── web ├── requirements.txt ├── run.py ├── README.md ├── Dockerfile ├── app.py └── static │ └── css │ └── style.css ├── package.json ├── tools ├── requirements.txt ├── fetch_hf_models.py └── gen_assets_sha3.py ├── apps └── webapp │ ├── package.json │ ├── index.html │ ├── scripts │ ├── prepare-offline.mjs │ └── serve-coi.mjs │ ├── README.md │ ├── vite.config.js │ └── src │ └── main.js ├── tests ├── transformer-js │ ├── package.json │ ├── main.js │ ├── index.html │ └── vite.config.js ├── test_zh_pii.txt ├── test_zh_pii.anonymized.expected.txt ├── 
zh_address_dataset.txt ├── test_en_pii.txt ├── test_en_pii.anonymized.expected.txt └── test-aifw-core │ └── test_session.zig ├── MIT-LICENSE.txt ├── .github └── workflows │ ├── aifw-web.yml │ └── aifw-ci.yml ├── architecture.svg ├── README-GUIDE.md └── docs ├── zh_address_design.md └── oneaifw_services_api_cn.md /py-origin/assets: -------------------------------------------------------------------------------- 1 | ../assets -------------------------------------------------------------------------------- /cli/python/assets: -------------------------------------------------------------------------------- 1 | ../../assets -------------------------------------------------------------------------------- /py-origin/aifw/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [] 2 | 3 | -------------------------------------------------------------------------------- /cli/python/requirements.txt: -------------------------------------------------------------------------------- 1 | -r services/requirements.txt 2 | -------------------------------------------------------------------------------- /py-origin/ui/requirements.txt: -------------------------------------------------------------------------------- 1 | -r ../services/requirements.txt 2 | -------------------------------------------------------------------------------- /py-origin/cli/requirements.txt: -------------------------------------------------------------------------------- 1 | -r ../services/requirements.txt 2 | 3 | 4 | -------------------------------------------------------------------------------- /cli/python/services/app/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['main','local_api','llm_translation'] 2 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 
| env/ 3 | .venv 4 | .DS_Store 5 | __pycache__ 6 | 7 | *~ 8 | *.swp 9 | -------------------------------------------------------------------------------- /pnpm-workspace.yaml: -------------------------------------------------------------------------------- 1 | packages: 2 | - apps/webapp 3 | - tests/transformer-js 4 | - libs/aifw-js 5 | -------------------------------------------------------------------------------- /py-origin/services/app/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['main','analyzer','anonymizer','local_api','llm_translation'] 2 | -------------------------------------------------------------------------------- /py-origin/aifw/__main__.py: -------------------------------------------------------------------------------- 1 | from cli.oneaifw_cli import main 2 | 3 | 4 | if __name__ == "__main__": 5 | raise SystemExit(main()) 6 | 7 | 8 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | services: 3 | build: ./ 4 | ports: 5 | - '8844:8844' 6 | environment: 7 | - API_KEY=changeme-please 8 | -------------------------------------------------------------------------------- /assets/local-fake-llm-apikey.json: -------------------------------------------------------------------------------- 1 | { 2 | "openai-api-key": "test-local-echo", 3 | "openai-base-url": "http://127.0.0.1:8801/v1", 4 | "openai-model": "echo-001" 5 | } 6 | -------------------------------------------------------------------------------- /cli/python/services/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi>=0.95.0 2 | uvicorn[standard]>=0.22.0 3 | pydantic>=1.10.0 4 | python-multipart>=0.0.5 5 | socksio>=1.0.0 6 | litellm>=1.45.0 7 | langdetect>=1.0.9 8 | 
-------------------------------------------------------------------------------- /libs/aifw-py/requirements.txt: -------------------------------------------------------------------------------- 1 | langdetect>=1.0.9 2 | transformers>=4.46.0 3 | onnxruntime>=1.18.0 4 | numpy>=1.26.0 5 | # Optional for better zh Hans/Hant detection: 6 | opencc-python-reimplemented>=0.1.7 7 | -------------------------------------------------------------------------------- /core/wasm_shims.zig: -------------------------------------------------------------------------------- 1 | // Minimal C runtime shims for wasm32-freestanding linking 2 | // Only compiled when imported by freestanding targets. 3 | const std = @import("std"); 4 | 5 | pub export fn strlen(s: [*:0]const u8) usize { 6 | return std.mem.len(s); 7 | } 8 | -------------------------------------------------------------------------------- /browser_extension/offscreen.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | OneAIFW Offscreen 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /py-origin/services/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi>=0.95.0 2 | uvicorn[standard]>=0.22.0 3 | presidio-analyzer>=2.2.352 4 | presidio-anonymizer>=2.2.352 5 | pydantic>=1.10.0 6 | python-multipart>=0.0.5 7 | socksio>=1.0.0 8 | litellm>=1.45.0 9 | langdetect>=1.0.9 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.swp 3 | 4 | .DS_Store 5 | *.tar.gz 6 | 7 | __pycache__ 8 | .venv 9 | 10 | .zig-cache 11 | zig-out 12 | target 13 | 14 | ner-models 15 | 16 | package-lock.json 17 | pnpm-lock.yaml 18 | node_modules 19 | dist 20 | tests/transformer-js/public 21 | apps/webapp/public 22 | browser_extension/vendor 23 | 24 | 
-------------------------------------------------------------------------------- /web/requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==2.3.3 2 | requests==2.31.0 3 | Werkzeug==2.3.7 4 | pip 5 | 6 | # aifw-py runtime dependencies used by web service 7 | langdetect>=1.0.9 8 | transformers>=4.46.0 9 | onnxruntime>=1.18.0 10 | numpy>=1.26.0 11 | # Optional for better zh Hans/Hant detection: 12 | opencc-python-reimplemented>=0.1.7 13 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "oneaifw-workspace", 3 | "private": true, 4 | "version": "0.0.0", 5 | "type": "module", 6 | "packageManager": "pnpm@8.15.4", 7 | "scripts": { 8 | "build:zig": "zig build -Doptimize=Debug web:wasm", 9 | "build:lib": "pnpm --filter @oneaifw/aifw-js build", 10 | "build": "pnpm build:zig && pnpm build:lib" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /browser_extension/options.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

OneAIFW Extension Options

4 | Service URL:

5 | 6 | 9 | 10 | -------------------------------------------------------------------------------- /tools/requirements.txt: -------------------------------------------------------------------------------- 1 | # Python deps for exporting Hugging Face models to ONNX and INT8 quantization 2 | # Install with: pip install -r tools/requirements.txt 3 | 4 | # Core DL stack 5 | torch>=2.2.0 6 | transformers>=4.41.0 7 | tokenizers<0.20 8 | safetensors>=0.4.2 9 | huggingface_hub>=0.24.0 10 | numpy<2 11 | 12 | # ONNX export and tooling 13 | onnx>=1.15.0 14 | onnxruntime>=1.18.0 15 | onnxsim>=0.4.36 16 | -------------------------------------------------------------------------------- /libs/regex/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "aifw_regex" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [lib] 7 | # Generate only static library (.a) for all targets (native + wasm) 8 | crate-type = ["staticlib"] 9 | 10 | [dependencies] 11 | regex-automata = { version = "0.4", default-features = false, features = ["alloc", "meta", "syntax", "unicode"] } 12 | 13 | [profile.release] 14 | panic = "abort" 15 | lto = true 16 | codegen-units = 1 17 | -------------------------------------------------------------------------------- /browser_extension/content.js: -------------------------------------------------------------------------------- 1 | // Ctrl+Shift+A to anonymize selection 2 | document.addEventListener('keydown', (e)=>{ if(e.ctrlKey && e.shiftKey && e.code==='KeyA'){ const sel = window.getSelection().toString(); if(!sel) return; chrome.runtime.sendMessage({type:'ANON', text: sel}, (resp)=>{ if(resp && resp.ok){ navigator.clipboard.writeText(resp.data.text); alert('Anonymized text copied to clipboard'); } else { alert('Error: ' + (resp?.error || 'unknown')); } }); } }); 3 | -------------------------------------------------------------------------------- /libs/aifw-js/vite.config.js: 
-------------------------------------------------------------------------------- 1 | import { defineConfig } from 'vite' 2 | import path from 'node:path' 3 | import fs from 'node:fs' 4 | 5 | export default defineConfig({ 6 | build: { 7 | lib: { 8 | entry: path.resolve(__dirname, 'libaifw.js'), 9 | name: 'libaifw-js', 10 | fileName: () => 'aifw-js.js', 11 | formats: ['es'], 12 | }, 13 | outDir: 'dist', 14 | emptyOutDir: true, 15 | rollupOptions: { 16 | // Bundle all deps for static usage (no externals) 17 | }, 18 | }, 19 | }) 20 | -------------------------------------------------------------------------------- /apps/webapp/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "oneaifw-webapp", 3 | "private": true, 4 | "version": "0.1.0", 5 | "type": "module", 6 | "scripts": { 7 | "dev": "vite", 8 | "build": "vite build", 9 | "prepare:offline": "pnpm -w --filter @oneaifw/aifw-js build && node scripts/prepare-offline.mjs", 10 | "offline": "pnpm run prepare:offline", 11 | "serve:coi": "node scripts/serve-coi.mjs" 12 | }, 13 | "dependencies": { 14 | "@oneaifw/aifw-js": "workspace:*", 15 | "js-sha3": "^0.9.3" 16 | }, 17 | "devDependencies": { 18 | "vite": "^7.1.6" 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /py-origin/services/app/presidio_filters.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity_filters": { 3 | "all": { 4 | "*": { "min_score": 0.55 } 5 | } 6 | }, 7 | "entity_whitelist": { 8 | "all": [ 9 | "EMAIL_ADDRESS", 10 | "PHONE_NUMBER", 11 | "IP_ADDRESS", 12 | "CN_ID", 13 | "PERSON", 14 | "ORGANIZATION", 15 | "PHYSICAL_ADDRESS", 16 | "USER_NAME", 17 | "BANK_NUMBER", 18 | "PAYMENT", 19 | "VERIFY_CODE", 20 | "PASSWORD", 21 | "RANDOM_SEED", 22 | "PRIVATE_KEY", 23 | "URL" 24 | ] 25 | } 26 | } 27 | 28 | 29 | -------------------------------------------------------------------------------- 
/libs/aifw-js/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@oneaifw/aifw-js", 3 | "version": "0.1.0", 4 | "private": false, 5 | "type": "module", 6 | "main": "dist/aifw-js.js", 7 | "module": "dist/aifw-js.js", 8 | "exports": { 9 | ".": "./dist/aifw-js.js" 10 | }, 11 | "files": [ 12 | "dist", 13 | "models" 14 | ], 15 | "scripts": { 16 | "build": "vite build && node scripts/copy-assets.mjs" 17 | }, 18 | "dependencies": { 19 | "@xenova/transformers": "^2.17.2", 20 | "opencc-js": "^1.0.5", 21 | "js-sha3": "^0.9.3" 22 | }, 23 | "devDependencies": { 24 | "vite": "^7.1.6" 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /tests/transformer-js/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "oneaifw-transformers-browser-demo", 3 | "private": true, 4 | "version": "0.1.0", 5 | "type": "module", 6 | "scripts": { 7 | "predev": "node scripts/prep-models.mjs --offline --strict", 8 | "prebuild": "node scripts/prep-models.mjs --offline --strict", 9 | "dev": "vite", 10 | "build": "vite build", 11 | "preview": "vite preview", 12 | "prep:online": "ALLOW_REMOTE=1 node scripts/prep-models.mjs" 13 | }, 14 | "dependencies": { 15 | "@xenova/transformers": "^2.17.2", 16 | "https-proxy-agent": "^7.0.6", 17 | "tokenizers": "^0.13.3" 18 | }, 19 | "devDependencies": { 20 | "vite": "^7.1.4" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /assets/oneaifw_assets_hashes.json: -------------------------------------------------------------------------------- 1 | { 2 | "models": { 3 | "ckiplab/bert-tiny-chinese-ner": { 4 | "onnx/model_quantized.onnx": "0x0d723d495d0365236e12e51abbcb97407e8d1f51ec3154656e9267de31fc9ce6" 5 | }, 6 | "funstory-ai/neurobert-mini": { 7 | "onnx/model_quantized.onnx": "0xa7c4bfc5e2b7cfdfce2012b38e6eca712b433c4ed47ffc973ee9e3964056834a" 8 | } 9 
| }, 10 | "source": "/Users/liuchangsheng/Work/funstory-ai/OneAIFW-Assets", 11 | "version": "0.3.1", 12 | "wasm": { 13 | "ort-wasm-simd-threaded.wasm": "0x74ccfd137d5b3ae7bcc2e951e2418078abfa58cf444f69502efb7bc52d6c12d4", 14 | "ort-wasm-simd.wasm": "0x0c1482593eb573d11e6e6c5539cf5436a323e4d49b843135317f053ab0523277" 15 | } 16 | } -------------------------------------------------------------------------------- /cli/python/services/app/local_api.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from .one_aifw_api import OneAIFWAPI 4 | 5 | 6 | class OneAIFWLocalAPI(OneAIFWAPI): 7 | """Local in-process API used by CLI/UI. Wraps OneAIFWAPI.""" 8 | pass 9 | 10 | 11 | # Singleton instance to be shared across imports 12 | api = OneAIFWLocalAPI() 13 | 14 | 15 | def call( 16 | text: str, 17 | api_key_file: Optional[str] = None, 18 | model: Optional[str] = None, 19 | temperature: float = 0.0, 20 | ) -> str: 21 | return api.call( 22 | text=text, 23 | api_key_file=api_key_file, 24 | model=model, 25 | temperature=temperature, 26 | ) 27 | -------------------------------------------------------------------------------- /py-origin/services/app/local_api.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from .one_aifw_api import OneAIFWAPI 4 | 5 | 6 | class OneAIFWLocalAPI(OneAIFWAPI): 7 | """Local in-process API used by CLI/UI. 
Wraps OneAIFWAPI.""" 8 | pass 9 | 10 | 11 | # Singleton instance to be shared across imports 12 | api = OneAIFWLocalAPI() 13 | 14 | 15 | def call( 16 | text: str, 17 | api_key_file: Optional[str] = None, 18 | model: Optional[str] = None, 19 | temperature: float = 0.0, 20 | ) -> str: 21 | return api.call( 22 | text=text, 23 | api_key_file=api_key_file, 24 | model=model, 25 | temperature=temperature, 26 | ) 27 | -------------------------------------------------------------------------------- /libs/regex/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 4 4 | 5 | [[package]] 6 | name = "aifw_regex" 7 | version = "0.1.0" 8 | dependencies = [ 9 | "regex-automata", 10 | ] 11 | 12 | [[package]] 13 | name = "regex-automata" 14 | version = "0.4.10" 15 | source = "registry+https://github.com/rust-lang/crates.io-index" 16 | checksum = "6b9458fa0bfeeac22b5ca447c63aaf45f28439a709ccd244698632f9aa6394d6" 17 | dependencies = [ 18 | "regex-syntax", 19 | ] 20 | 21 | [[package]] 22 | name = "regex-syntax" 23 | version = "0.8.6" 24 | source = "registry+https://github.com/rust-lang/crates.io-index" 25 | checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" 26 | -------------------------------------------------------------------------------- /web/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | AIFW Web Module Runner 4 | 启动 AIFW Web 模块的脚本 5 | """ 6 | 7 | import os 8 | import sys 9 | 10 | def main(): 11 | print("=== AIFW Web Module ===") 12 | print("正在启动 AIFW Web 模块...") 13 | 14 | # 检查是否在正确的目录 15 | if not os.path.exists('app.py'): 16 | print("错误:请在 web 目录下运行此脚本") 17 | sys.exit(1) 18 | 19 | # 检查依赖 20 | # 启动应用 21 | print("\n启动 Web 服务器...") 22 | print("访问地址: http://localhost:5001") 23 | print("按 Ctrl+C 停止服务器") 24 | print("-" * 50) 25 | 
26 | try: 27 | from app import app 28 | app.run(debug=True, host='0.0.0.0', port=5001) 29 | except KeyboardInterrupt: 30 | print("\n服务器已停止") 31 | except Exception as e: 32 | print(f"启动失败: {e}") 33 | sys.exit(1) 34 | 35 | if __name__ == '__main__': 36 | main() 37 | -------------------------------------------------------------------------------- /libs/aifw-py/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Public API for aifw-py. 3 | 4 | This module mirrors the high-level API of aifw-js: 5 | - init(options) 6 | - deinit() 7 | - config(mask_cfg) 8 | - detect_language(text) 9 | - mask_text(text, language) 10 | - restore_text(masked_text, mask_meta) 11 | - mask_text_batch(items) 12 | - restore_text_batch(items) 13 | - get_pii_spans(text, language) 14 | """ 15 | 16 | from .libaifw import ( 17 | init, 18 | deinit, 19 | config, 20 | detect_language, 21 | mask_text, 22 | restore_text, 23 | mask_text_batch, 24 | restore_text_batch, 25 | get_pii_spans, 26 | MatchedPIISpan, 27 | ) 28 | 29 | __all__ = [ 30 | "init", 31 | "deinit", 32 | "config", 33 | "detect_language", 34 | "mask_text", 35 | "restore_text", 36 | "mask_text_batch", 37 | "restore_text_batch", 38 | "get_pii_spans", 39 | "MatchedPIISpan", 40 | ] 41 | 42 | 43 | -------------------------------------------------------------------------------- /core/recog_entity.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | 3 | const MAX_RECOG_SCORE: f32 = 1.0; 4 | const MIN_RECOG_SCORE: f32 = 0.0; 5 | 6 | pub const EntityType = enum(u8) { 7 | None, // for normal text, not a PII entity 8 | PHYSICAL_ADDRESS, 9 | EMAIL_ADDRESS, 10 | ORGANIZATION, 11 | USER_NAME, 12 | PHONE_NUMBER, 13 | BANK_NUMBER, 14 | PAYMENT, 15 | VERIFICATION_CODE, 16 | PASSWORD, 17 | RANDOM_SEED, 18 | PRIVATE_KEY, 19 | URL_ADDRESS, 20 | }; 21 | 22 | /// The kind of the entity, for example, .Begin, .Inside, etc. 
23 | /// Corresponds to the strings "B-", "I-", etc. in the external NER output. 24 | pub const EntityBioTag = enum(u8) { 25 | None, // Outside of the entity 26 | Begin, // Begin of the entity 27 | Inside, // Inside of the entity 28 | }; 29 | 30 | pub const RecogEntity = struct { 31 | entity_type: EntityType = .None, 32 | start: u32, 33 | end: u32, 34 | score: f32, 35 | description: ?[]const u8, 36 | }; 37 | -------------------------------------------------------------------------------- /assets/aifw.yaml: -------------------------------------------------------------------------------- 1 | # If you want to use aifw, you must set api_key_file for yourself. 2 | # The JSON format of the API key file is shown below 3 | # { 4 | # "openai-model": "your_model_name", 5 | # "openai-base-url": "api base url", 6 | # "openai-api-key": "your model api key" 7 | # } 8 | # api_key_file: 9 | 10 | port: 8844 11 | log_level: "INFO" 12 | log_scopes: "app,uvicorn" 13 | log_dest: "file" 14 | log_file: "~/.aifw/aifw_server.log" 15 | temperature: 0.0 16 | 17 | # The number of months to keep log files; out-of-date log files will be deleted. 
18 | log_months_to_keep: 6 19 | 20 | # (Optional) filters selection 21 | filters: 22 | whitelist: all 23 | 24 | # (Optional) mask configuration 25 | mask_config: 26 | maskAddress: true 27 | maskEmail: true 28 | maskOrganization: true 29 | maskUserName: true 30 | maskPhoneNumber: true 31 | maskBankNumber: true 32 | maskPayment: true 33 | maskVerificationCode: true 34 | maskPassword: true 35 | maskRandomSeed: true 36 | maskPrivateKey: true 37 | maskUrl: true -------------------------------------------------------------------------------- /MIT-LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Funstory.ai Limited 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /tests/transformer-js/main.js: -------------------------------------------------------------------------------- 1 | import { initEnv, buildNerPipeline } from '/@fs/Users/liuchangsheng/Work/funstory-ai/OneAIFW/libs/aifw-js/libner.js'; 2 | 3 | // Configure environment 4 | initEnv({ wasmBase: '/wasm/' }); 5 | 6 | const runBtn = document.getElementById('run'); 7 | const textEl = document.getElementById('text'); 8 | const modelEl = document.getElementById('model'); 9 | const quantizedEl = document.getElementById('quantized'); 10 | const outEl = document.getElementById('out'); 11 | 12 | runBtn.addEventListener('click', async () => { 13 | try { 14 | const modelId = modelEl.value; 15 | const quantized = !!quantizedEl.checked; 16 | const text = textEl.value || ''; 17 | 18 | const ner = await buildNerPipeline(modelId, { quantized }); 19 | const t0 = performance.now(); 20 | const output = await ner.run(text); 21 | const timeMs = Math.round(performance.now() - t0); 22 | 23 | outEl.textContent = JSON.stringify({ time_ms: timeMs, model: modelId, quantized, output }, null, 2); 24 | } catch (e) { 25 | outEl.textContent = `Error: ${e?.message || e}`; 26 | } 27 | }); 28 | 29 | // Auto-run once on load 30 | runBtn.click(); 31 | 32 | -------------------------------------------------------------------------------- /tests/test_zh_pii.txt: -------------------------------------------------------------------------------- 1 | 亲爱的客服团队: 2 | 您好!我是来自宏信科技公司的约翰·A·杜(用户名:johndoe_1984)。抱歉这封邮件有点长 🙏,我想反馈一个小小的账户问题。 3 | 我的家庭住址是: 4 | 中国北京市朝阳区建国路88号国贸中心A座1208室(对,就是之前表格里那个旧地址——上次拼错了,是我的疏忽😅)。 5 | 您可以通过以下方式联系我: 6 | 邮箱:test.user+alias@example.com 7 | 电话(美国):+1 415-555-2671 8 | 手机(中国):18744325579(目前仍然有效)。 9 | 关于您提到的退款问题,我的银行账户号是:1234 5678 9012 3456。 10 | 另外,也可以使用我的测试信用卡(仅供测试使用): 11 | Visa卡号:4242-4242-4242-4242,安全码(CVV):123,到期时间:12/34 —— 请不要实际扣款,仅供QA测试用途。 12 | 若要登录测试环境,请使用以下临时验证码:9F4T2A。 13 | 
测试系统的密码为:S3cure!Passw0rd(测试完我会重置的,放心!)。 14 | 在钱包演示系统中(同样是测试数据),以下是助记词: 15 | river apple orange cable window magnet winter fee bonus ladder camera peach 16 | 此外是一个伪造的私钥块(仅供解析测试使用,非真实密钥): 17 | -----BEGIN PRIVATE KEY----- 18 | MIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQC1w8P1x0kQbZpx 19 | uH7u2/1aYgH0b8oE4R2H3yV2gJg0f2oTg9zZQ97lP0JqR9x8Xx4j6ya2q4Z3Xx2F 20 | m2T1z+qk0a5Dq7mWkKX8rJq6fQIDAQABAoIBAQCd9E2J4K1Y9uRFGk1V3k1kGm3T 21 | l9aZKqv0o4h5zY+G7n8Gg3PzKj3e3lG7K1f5m0zV3Y1gM3s1V9r7YjX2Z2c9uL8k 22 | -----END PRIVATE KEY----- 23 | 如果您需要更多信息,可以访问以下链接: 24 | 🔗 https://example.com/support 25 | 或我们的文档网站:www.example.org/guide?lang=zh-CN 26 | (有时候公司内网链接 http://intranet.local/login 会自动跳转,有点奇怪,仅供参考。) 27 | 非常感谢您的帮助!如有任何问题,请随时联系我。 28 | 此致 29 | 敬礼 30 | 约翰·杜 31 | -------------------------------------------------------------------------------- /tests/test_zh_pii.anonymized.expected.txt: -------------------------------------------------------------------------------- 1 | 亲爱的客服团队: 2 | 您好!我是来自宏信科技公司的约翰·A·杜(用户名:johndoe_1984)。抱歉这封邮件有点长 🙏,我想反馈一个小小的账户问题。 3 | 我的家庭住址是: 4 | 中国北京市朝阳区建国路88号国贸中心A座1208室(对,就是之前表格里那个旧地址——上次拼错了,是我的疏忽😅)。 5 | 您可以通过以下方式联系我: 6 | 邮箱:__PII_EMAIL_ADDRESS_1__ 7 | 电话(美国):__PII_PHONE_NUMBER_2__ 8 | 手机(中国):__PII_PHONE_NUMBER_3__(目前仍然有效)。 9 | 关于您提到的退款问题,我的银行账户号是:__PII_PHONE_NUMBER_4__。 10 | 另外,也可以使用我的测试信用卡(仅供测试使用): 11 | Visa卡号:__PII_PHONE_NUMBER_5__,安全码(CVV):123,到期时间:12/34 —— 请不要实际扣款,仅供QA测试用途。 12 | 若要登录测试环境,请使用以下临时验证码:9F4T2A。 13 | 测试系统的密码为:S3cure!Passw0rd(测试完我会重置的,放心!)。 14 | 在钱包演示系统中(同样是测试数据),以下是助记词: 15 | river apple orange cable window magnet winter fee bonus ladder camera peach 16 | 此外是一个伪造的私钥块(仅供解析测试使用,非真实密钥): 17 | -----BEGIN PRIVATE KEY----- 18 | MIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQC1w8P1x0kQbZpx 19 | uH7u2/1aYgH0b8oE4R2H3yV2gJg0f2oTg9zZQ97lP0JqR9x8Xx4j6ya2q4Z3Xx2F 20 | m2T1z+qk0a5Dq7mWkKX8rJq6fQIDAQABAoIBAQCd9E2J4K1Y9uRFGk1V3k1kGm3T 21 | l9aZKqv0o4h5zY+G7n8Gg3PzKj3e3lG7K1f5m0zV3Y1gM3s1V9r7YjX2Z2c9uL8k 22 | -----END PRIVATE KEY----- 23 | 如果您需要更多信息,可以访问以下链接: 
24 | 🔗 __PII_URL_ADDRESS_6__ 25 | 或我们的文档网站:www.example.org/guide?lang=zh-CN 26 | (有时候公司内网链接 __PII_URL_ADDRESS_7__ 会自动跳转,有点奇怪,仅供参考。) 27 | 非常感谢您的帮助!如有任何问题,请随时联系我。 28 | 此致 29 | 敬礼 30 | 约翰·杜 31 | -------------------------------------------------------------------------------- /apps/webapp/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | OneAIFW WebApp 7 | 15 | 16 | 17 |

OneAIFW WebApp

18 |
19 | 20 | 21 |
22 |
23 | 24 | 25 |
26 |
27 |

Masked

28 |

29 |     
30 |
31 |

Restored

32 |

33 |     
34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /browser_extension/popup.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | OneAIFW 7 | 15 | 16 | 17 |
18 | 19 | 20 |
21 |
22 | 23 | 24 | 25 |
26 |
27 | 28 |

29 |     
30 |
31 | 32 |

33 |     
34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /browser_extension/popup.js: -------------------------------------------------------------------------------- 1 | // popup.js 2 | const input = document.getElementById('input') 3 | const btnMask = document.getElementById('btn-mask') 4 | const btnRestore = document.getElementById('btn-restore') 5 | const statusEl = document.getElementById('status') 6 | const maskedEl = document.getElementById('masked') 7 | const restoredEl = document.getElementById('restored') 8 | 9 | function setStatus(s) { statusEl.textContent = s || '' } 10 | 11 | async function callBg(type, text) { 12 | return new Promise((resolve) => { 13 | chrome.runtime.sendMessage({ type, text }, (resp) => resolve(resp)) 14 | }) 15 | } 16 | 17 | btnMask.addEventListener('click', async () => { 18 | setStatus('Masking...') 19 | maskedEl.textContent = '' 20 | const resp = await callBg('ANON', input.value || '') 21 | if (resp?.ok) { 22 | maskedEl.textContent = resp.data.text 23 | setStatus('Done') 24 | } else { 25 | setStatus('Error: ' + (resp?.error || 'unknown')) 26 | } 27 | }) 28 | 29 | btnRestore.addEventListener('click', async () => { 30 | setStatus('Restoring...') 31 | restoredEl.textContent = '' 32 | const resp = await callBg('RESTORE', maskedEl.textContent || '') 33 | if (resp?.ok) { 34 | restoredEl.textContent = resp.data.text 35 | setStatus('Done') 36 | } else { 37 | setStatus('Error: ' + (resp?.error || 'unknown')) 38 | } 39 | }) 40 | -------------------------------------------------------------------------------- /browser_extension/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest_version": 3, 3 | "name": "OneAIFW Anonymizer", 4 | "version": "0.1.0", 5 | "description": "Anonymize selected text using OneAIFW (local WASM + cached models)", 6 | "permissions": [ 7 | "storage", 8 | "activeTab", 9 | "scripting", 10 | "clipboardWrite", 11 | 
"contextMenus", 12 | "offscreen" 13 | ], 14 | "host_permissions": [ 15 | "" 16 | ], 17 | "action": { 18 | "default_title": "OneAIFW Anonymizer", 19 | "default_popup": "popup.html" 20 | }, 21 | "background": { 22 | "service_worker": "background.js", 23 | "type": "module" 24 | }, 25 | "content_scripts": [ 26 | { 27 | "matches": [""], 28 | "js": ["content.js"], 29 | "run_at": "document_idle" 30 | } 31 | ], 32 | "web_accessible_resources": [ 33 | { 34 | "resources": [ 35 | "vendor/aifw-js/aifw-js.js", 36 | "vendor/aifw-js/libner-*.js", 37 | "vendor/aifw-js/wasm/*" 38 | ], 39 | "matches": [""] 40 | } 41 | ], 42 | "cross_origin_opener_policy": { "value": "same-origin" }, 43 | "cross_origin_embedder_policy": { "value": "require-corp" }, 44 | "content_security_policy": { 45 | "extension_pages": "script-src 'self' 'wasm-unsafe-eval'; object-src 'self'" 46 | }, 47 | "options_page": "options.html" 48 | } 49 | -------------------------------------------------------------------------------- /tests/zh_address_dataset.txt: -------------------------------------------------------------------------------- 1 | 北京市朝阳区建国路 88 号 2 | 珠海市香洲路明月花园12栋508房 3 | 南京市鼓楼区中山北路50号之3金陵中心B座18层 4 | 深圳市南山区科技南十二路8-2号科兴科学园C座5层 5 | 上海市徐汇区肇嘉浜路1065弄7号锦都苑2号楼1803室 6 | 上海市浦东新区 7 | 请寄到北京市海淀区中关村大街27号,或者上海市黄浦区南京东路299号东方商厦18层 8 | 成都市高新区天府大道100号环球中心5楼 9 | 杭州市滨江区江南大道228号滨江大厦F3 305室 10 | 廣州市越秀區北京西路黃埔花園13棟806房 11 | 上海市浦東新區銀城中路501號陸家嘴金融廣場18層 12 | 香港中環皇后大道中99號中環中心18樓1803室 13 | 九龍尖沙咀彌敦道128號K11購物藝術館6樓 14 | 新界沙田銀城街8號新城市廣場一期12座18樓B室 15 | 香港特別行政區 16 | 臺北市信義區松壽路11號台北101大樓18樓 17 | 台中市西屯區文心路二段123號 18 | 高雄市鼓山區美術東二路75號B座18樓之3 19 | 台北市大安區和平東路三段20巷5弄7號3樓 20 | 台北市信義區 21 | 澳門新口岸北京街89號國際銀行大廈18樓 22 | 澳門氹仔新街坊花園6座18樓C室 23 | 中華人民共和國澳門特別行政區 24 | 重庆市渝中区解放碑步行街8号时代广场A座F3-305室 25 | 收件地址:江苏省南京市鼓楼区广州路12号,退件地址:浙江省杭州市上城区延安路88号西湖天地3层 26 | 收件地址:江苏省南京市鼓楼区广州路12号 退件地址:浙江省杭州市上城区延安路88号西湖天地3层 27 | 中国浙江省 28 | 香港上環德輔道中恒生大廈18樓 29 | 澳門新馬路新八佰伴廣場6樓 30 | 新北市板橋區文化路一段200巷10弄5之2號4樓 31 | 苏州市工业园区星海街星海广场2栋18层1802室 32 | 成都市青羊區光華大道二期88號光華中心3層 33 | 34 | 🏙 
// Create directory `dir` together with any missing parents.
const ensureDir = (dir) => {
  fs.mkdirSync(dir, { recursive: true })
}

// Copy the file `src` into the directory `destDir`, keeping its base name.
const copyFile = (src, destDir) => {
  ensureDir(destDir)
  const target = path.join(destDir, path.basename(src))
  fs.copyFileSync(src, target)
  console.log('[copy]', src, '->', target)
}

// Recursively copy the contents of directory `src` into directory `dest`.
const copyDir = (src, dest) => {
  ensureDir(dest)
  fs.readdirSync(src).forEach((name) => {
    const from = path.join(src, name)
    if (fs.statSync(from).isDirectory()) {
      copyDir(from, path.join(dest, name))
    } else {
      copyFile(from, dest)
    }
  })
}

// Locate the built aifw-js `dist` folder: the installed package is tried
// first, the workspace build second. Throws when neither exists.
async function resolveAifwJsDist() {
  const cwd = process.cwd()
  const candidates = [
    path.resolve(cwd, 'node_modules', '@oneaifw', 'aifw-js', 'dist'),
    path.resolve(cwd, '..', '..', 'libs', 'aifw-js', 'dist'),
  ]
  const found = candidates.find((dir) => fs.existsSync(dir))
  if (found === undefined) {
    throw new Error('cannot locate @oneaifw/aifw-js dist folder')
  }
  return found
}

// Stage the offline assets under ./public: the whole aifw-js dist goes to
// public/vendor/aifw-js (no top-level mirrors), plus aifw-offline.html.
async function main() {
  const distDir = await resolveAifwJsDist()
  const publicDir = path.resolve(process.cwd(), 'public')
  ensureDir(publicDir)

  copyDir(distDir, path.join(publicDir, 'vendor', 'aifw-js'))
  copyFile(path.resolve(process.cwd(), 'aifw-offline.html'), publicDir)
}

main().catch((e) => { console.error(e); process.exit(1); })
15 | And below is a dummy private key block (not real, just for parsing checks): 16 | -----BEGIN PRIVATE KEY----- 17 | MIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQC1w8P1x0kQbZpx 18 | uH7u2/1aYgH0b8oE4R2H3yV2gJg0f2oTg9zZQ97lP0JqR9x8Xx4j6ya2q4Z3Xx2F 19 | m2T1z+qk0a5Dq7mWkKX8rJq6fQIDAQABAoIBAQCd9E2J4K1Y9uRFGk1V3k1kGm3T 20 | l9aZKqv0o4h5zY+G7n8Gg3PzKj3e3lG7K1f5m0zV3Y1gM3s1V9r7YjX2Z2c9uL8k 21 | -----END PRIVATE KEY----- 22 | 23 | If you need more info, see https://example.com/support — or our docs at www.example.org/guide?lang=en-US (sometimes the intranet link http://intranet.local/login redirects weirdly, FYI). 24 | 25 | Thanks a ton for your help! If anything looks off, feel free to ping me back. 26 | 27 | Sincerely, 28 | John 29 | -------------------------------------------------------------------------------- /tests/test_en_pii.anonymized.expected.txt: -------------------------------------------------------------------------------- 1 | Please translate the following to Chinese: 2 | 3 | Dear Support Team, 4 | 5 | This is John A. Doe from Acme Corporation (Username: johndoe_1984). I'm reaching out about a small account issue—sorry if this message is a bit long 🙏. 6 | 7 | My home address is: __PII_VERIFICATION_CODE_1__ Elm Street, Suite 56, Springfield, IL __PII_VERIFICATION_CODE_2__ (yep, the old adress—spelled wrong on my last form, my bad!). 8 | You can reach me at my email: __PII_EMAIL_ADDRESS_3__, or call me at __PII_PHONE_NUMBER_4__. When I'm in China, my mobile is __PII_PHONE_NUMBER_5__ (still active). 9 | 10 | For the refund you mentioned, my bank account number is __PII_PHONE_NUMBER_6__. Alternatively, you can use my payment card (dummy for testing only): Visa __PII_PHONE_NUMBER_7__, CVV 123, exp 12/34 — please DO NOT actually charge this; it's just a placeholder for your QA case. 11 | 12 | To log in to the staging portal, use this temporary verification code: __PII_VERIFICATION_CODE_8__. 
For the sandbox box, the pwd: __PII_PASSWORD_9__ (I'll reset it after your tests, promise!). 13 | 14 | For our wallet demo (again, test data only), here's the seed phrase: river apple orange cable window magnet winter fee bonus ladder camera peach. 15 | And below is a dummy private key block (not real, just for parsing checks): 16 | -----BEGIN PRIVATE KEY----- 17 | MIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQC1w8P1x0kQbZpx 18 | uH7u2/1aYgH0b8oE4R2H3yV2gJg0f2oTg9zZQ97lP0JqR9x8Xx4j6ya2q4Z3Xx2F 19 | m2T1z+qk0a5Dq7mWkKX8rJq6fQIDAQABAoIBAQCd9E2J4K1Y9uRFGk1V3k1kGm3T 20 | l9aZKqv0o4h5zY+G7n8Gg3PzKj3e3lG7K1f5m0zV3Y1gM3s1V9r7YjX2Z2c9uL8k 21 | -----END PRIVATE KEY----- 22 | 23 | If you need more info, see __PII_URL_ADDRESS_10__ — or our docs at www.example.org/guide?lang=en-US (sometimes the intranet link __PII_URL_ADDRESS_11__ redirects weirdly, FYI). 24 | 25 | Thanks a ton for your help! If anything looks off, feel free to ping me back. 26 | 27 | Sincerely, 28 | John 29 | -------------------------------------------------------------------------------- /.github/workflows/aifw-web.yml: -------------------------------------------------------------------------------- 1 | name: aifw-web-release 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | docker: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | contents: read 11 | packages: write 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v4 15 | 16 | - name: Setup Rust (stable + wasm32 target) 17 | uses: dtolnay/rust-toolchain@stable 18 | with: 19 | targets: wasm32-unknown-unknown 20 | 21 | 22 | - name: Install Zig 23 | uses: mlugg/setup-zig@v2 24 | with: 25 | version: 0.15.2 26 | use-cache: true 27 | 28 | - name: Build Zig core native library 29 | run: zig build -Doptimize=ReleaseFast -Dtarget=x86_64-linux-gnu -Dcpu=haswell 30 | 31 | - name: Set up Docker Buildx 32 | uses: docker/setup-buildx-action@v3 33 | 34 | - name: Login to GHCR 35 | uses: docker/login-action@v3 36 | with: 37 | registry: ghcr.io 38 | 
def cleanup_monthly_logs(base_path: Optional[str], months_to_keep: Optional[int]) -> None:
    """Delete monthly-rotated logs older than months_to_keep.

    Rotated files are looked up in the directory of ``base_path``. For a base
    name ending in ``.log`` they are named ``<stem>-YYYY-MM.log``; otherwise
    ``<name>-YYYY-MM``. Only names that match exactly and carry a real month
    (01-12) are considered, so unrelated files are never removed.

    base_path: The base log path before monthly suffix, e.g., /var/log/aifw/server.log
    months_to_keep: Number of months to retain. 0 => never clean. None/negative => default 6.

    Cleanup is best-effort: this function never raises.
    """
    if not base_path:
        return
    try:
        keep = 6 if (months_to_keep is None or months_to_keep < 0) else months_to_keep
        if keep == 0:
            return
        base_path = os.path.expanduser(base_path)
        base_dir = os.path.dirname(base_path) or "."
        file_name = os.path.basename(base_path)
        if file_name.endswith('.log'):
            stem, suffix = file_name[:-4], r"\.log"
        else:
            stem, suffix = file_name, ""
        pattern = re.compile(rf"{re.escape(stem)}-([0-9]{{4}})-([0-9]{{2}}){suffix}")
        try:
            entries = os.listdir(base_dir)
        except OSError:
            return
        now = datetime.now()
        for entry in entries:
            # fullmatch: a '$' anchor would also accept a trailing newline.
            m = pattern.fullmatch(entry)
            if not m:
                continue
            year, month = int(m.group(1)), int(m.group(2))
            if not 1 <= month <= 12:
                # Not a file this rotation scheme could have produced.
                continue
            age_months = (now.year - year) * 12 + (now.month - month)
            if age_months >= keep:
                try:
                    os.remove(os.path.join(base_dir, entry))
                except OSError:
                    # Already gone, a directory, or not permitted: skip it.
                    pass
    except Exception:
        # Best-effort cleanup; do not raise
        pass
def _show_output(payload):
    """Replace the output box content with *payload* rendered as pretty JSON."""
    txt_out.delete("1.0", tk.END)
    txt_out.insert(tk.END, json.dumps(payload, ensure_ascii=False, indent=2))


def do_anonymize():
    """Send the input text through the local API and display the result."""
    source_text = txt_in.get("1.0", tk.END).strip()
    if source_text:
        try:
            _show_output({"text": local_api.call(text=source_text)})
        except Exception as e:
            messagebox.showerror("Error", str(e))


def do_restore():
    """Re-render the output box so that only its ``text`` field is shown."""
    try:
        payload = json.loads(txt_out.get("1.0", tk.END))
        # No-op in unified API; keep for compatibility to show final text only
        _show_output({"text": payload.get("text", "")})
    except Exception as e:
        messagebox.showerror("Error", str(e))


# Main window
root = tk.Tk()
root.title("OneAIFW - Local Client")
root.geometry("900x650")
frame = ttk.Frame(root, padding=12)
frame.pack(fill=tk.BOTH, expand=True)

# Input area
lbl = ttk.Label(frame, text="Input text:")
lbl.pack(anchor="w")
txt_in = tk.Text(frame, height=10)
txt_in.pack(fill=tk.BOTH, expand=True)

# Button row; the second button is kept as a placeholder
btn_frame = ttk.Frame(frame)
btn_frame.pack(fill=tk.X, pady=6)
for _label, _handler in (("Call →", do_anonymize), ("Show Text", do_restore)):
    ttk.Button(btn_frame, text=_label, command=_handler).pack(side=tk.LEFT, padx=6)

# Output area
lbl2 = ttk.Label(frame, text="Output:")
lbl2.pack(anchor="w")
txt_out = tk.Text(frame, height=18)
txt_out.pack(fill=tk.BOTH, expand=True)

root.mainloop()
4 | 5 | ## Build / Pack 6 | 7 | 1) Build the aifw-js library and stage assets into the extension: 8 | 9 | ```sh 10 | pnpm -w --filter @oneaifw/aifw-js build 11 | # copy vendor bundle + wasm into the extension 12 | mkdir -p browser_extension/vendor/aifw-js 13 | rsync -a --exclude 'models' libs/aifw-js/dist/* browser_extension/vendor/aifw-js 14 | ``` 15 | 16 | 2) Load extension in Chrome/Edge: 17 | - Open chrome://extensions 18 | - Enable Developer mode 19 | - Load unpacked → select `browser_extension` directory 20 | 21 | 3) First-run: 22 | - On install, the extension downloads the model files from the remote base (see `aifw-extension-sample.js`) and stores in IndexedDB 23 | - Right-click selection → “Anonymize with OneAIFW” or “Restore with OneAIFW” 24 | 25 | ## Config 26 | - Remote model base: `browser_extension/aifw-extension-sample.js` (`remoteBase`) 27 | - Model id: `defaultModelId` 28 | - WASM base is served from `vendor/aifw-js/wasm/` inside the extension 29 | 30 | ## How it works 31 | - `env.fetch` is overridden so requests to `modelsBase` come from IndexedDB instead of the network 32 | - The first installation populates IndexedDB via `ensureModelCached` 33 | 34 | ## Browser store policies (WASM) 35 | - Chrome Web Store and Firefox AMO generally require that executable code (including WASM binaries) be packaged with the extension and not downloaded at runtime for review and security reasons. 36 | - This project packages all ORT/AIFW WASM files under `vendor/aifw-js/wasm/` and declares them in `web_accessible_resources`. 37 | - Model files are large and dynamic; they are cached in IndexedDB by user action. If your store review requires models to be packaged, you can copy the desired model directory into `vendor/aifw-js/models/` and omit the remote download step. 
38 | 39 | ## Development Notes 40 | - If you change `@oneaifw/aifw-js`, rebuild and re-copy `libs/aifw-js/dist` into `browser_extension/vendor/aifw-js` 41 | - If you want to pin a different model, update `remoteBase` and `defaultModelId` 42 | -------------------------------------------------------------------------------- /tools/fetch_hf_models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import os 4 | import sys 5 | from pathlib import Path 6 | 7 | try: 8 | from huggingface_hub import hf_hub_download 9 | except Exception as e: 10 | print("Error: huggingface_hub is required. Install with: pip install huggingface_hub", file=sys.stderr) 11 | raise 12 | 13 | CANDIDATE_FILES = [ 14 | # tokenizer (fast preferred, fallback vocab) 15 | "tokenizer.json", 16 | "vocab.txt", 17 | # extra helper 18 | "tokenizer_config.json", 19 | # config 20 | "config.json", 21 | # ONNX (quantized preferred) 22 | os.path.join("onnx", "model_quantized.onnx"), 23 | os.path.join("onnx", "model.onnx"), 24 | ] 25 | 26 | 27 | def download_one(repo_id: str, filename: str, out_dir: Path, token: str | None) -> bool: 28 | dest = out_dir / filename 29 | dest.parent.mkdir(parents=True, exist_ok=True) 30 | try: 31 | local = hf_hub_download(repo_id=repo_id, filename=filename, token=token, local_dir=str(out_dir), local_dir_use_symlinks=False) 32 | # hf_hub_download already places file at local_dir/filename; ensure exists 33 | return os.path.exists(local) 34 | except Exception as e: 35 | # Not fatal; just report 36 | print(f"[fetch] skip {repo_id}/{filename}: {e}") 37 | return False 38 | 39 | 40 | def main(): 41 | ap = argparse.ArgumentParser(description="Fetch HF model artifacts (tokenizer/config/ONNX) to local dir") 42 | ap.add_argument("models", nargs="+", help="HF model repo ids, e.g. 
Xenova/bert-base-NER") 43 | ap.add_argument("--out-dir", default="ner-models", help="Output directory (default: ner-models)") 44 | ap.add_argument("--hf-token", default=os.environ.get("HF_TOKEN"), help="HF auth token for private models (or set HF_TOKEN)") 45 | args = ap.parse_args() 46 | 47 | base = Path(args.out_dir).resolve() 48 | base.mkdir(parents=True, exist_ok=True) 49 | 50 | for mid in args.models: 51 | print(f"[fetch] preparing: {mid}") 52 | out = base / mid 53 | for fname in CANDIDATE_FILES: 54 | download_one(mid, fname, out, args.hf_token) 55 | 56 | print(f"[fetch] done. Files stored under: {base}") 57 | 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /browser_extension/offscreen.js: -------------------------------------------------------------------------------- 1 | // offscreen.js (runs in a DOM context, module allowed) 2 | import * as aifw from './vendor/aifw-js/aifw-js.js' 3 | import { ensureModelCached, initAifwWithCache, defaultModelId } from './aifw-extension-sample.js' 4 | 5 | let ready = false 6 | let lastMetas = null 7 | 8 | async function ensureReady() { 9 | if (ready) return 10 | await ensureModelCached(defaultModelId) 11 | await initAifwWithCache() 12 | ready = true 13 | } 14 | 15 | chrome.runtime.onMessage.addListener((msg, sender, sendResponse) => { 16 | if (msg && msg._aifw) { 17 | if (msg.cmd === 'ping') { sendResponse({ ok: true }); return; } 18 | (async () => { 19 | try { 20 | await ensureReady() 21 | if (msg.cmd === 'mask') { 22 | const text = msg.text || '' 23 | const lines = text.split(/\r?\n/) 24 | const maskedLines = [] 25 | const metas = [] 26 | for (const line of lines) { 27 | const [masked, meta] = await aifw.maskText(line) 28 | maskedLines.push(masked) 29 | metas.push(meta) 30 | } 31 | lastMetas = metas 32 | sendResponse({ ok: true, text: maskedLines.join('\n'), meta: metas }) 33 | } else if (msg.cmd === 'restore') { 34 | const text = 
class DummyAnalyzer:
    """Analyzer stand-in that never reports an entity."""

    def analyze(self, text: str, language: str = 'en'):
        # Restore is exercised directly with hand-written placeholders, so no
        # analysis results are needed.
        return []


class TestRestore(unittest.TestCase):
    """Checks for ``AnonymizerWrapper.restore`` using crafted placeholder maps."""

    def setUp(self):
        self.wrapper = AnonymizerWrapper(DummyAnalyzer())

    def _assert_restored(self, text, placeholders, expected):
        """Restore *text* with *placeholders* and compare against *expected*."""
        self.assertEqual(self.wrapper.restore(text, placeholders), expected)

    def test_exact_placeholder_restore(self):
        self._assert_restored(
            "我的邮箱是 __PII_EMAIL_ADDRESS_761b3e66__",
            {"__PII_EMAIL_ADDRESS_761b3e66__": "test@example.com"},
            "我的邮箱是 test@example.com",
        )

    def test_missing_underscores_variant(self):
        # The placeholder comes back without its surrounding underscores.
        self._assert_restored(
            "我的邮箱是 PII_EMAIL_ADDRESS_761b3e66",
            {"__PII_EMAIL_ADDRESS_761b3e66__": "test@example.com"},
            "我的邮箱是 test@example.com",
        )

    def test_leaked_suffix_after_original(self):
        # The original value is already present, followed by the placeholder's tail.
        self._assert_restored(
            "我的邮箱是 test@example.com0b9df4b0__",
            {"__PII_EMAIL_ADDRESS_0b9df4b0__": "test@example.com"},
            "我的邮箱是 test@example.com",
        )

    def test_overlapping_entities_prefer_longer(self):
        # One placeholder's value ("example.com") is a substring of the
        # other's; restore must leave the plain occurrence untouched.
        self._assert_restored(
            "站点 example.com 和邮箱 __PII_EMAIL_ADDRESS_6fbb5771__",
            {
                "__PII_URL_a37ec55b__": "example.com",
                "__PII_EMAIL_ADDRESS_6fbb5771__": "test@example.com",
            },
            "站点 example.com 和邮箱 test@example.com",
        )


if __name__ == "__main__":
    unittest.main()
class DummyAnalyzer:
    """Analyzer stand-in that never reports an entity."""

    def analyze(self, text: str, language: str = 'en'):
        # Restore is exercised directly with hand-written placeholders, so no
        # analysis results are needed.
        return []


class TestRestore(unittest.TestCase):
    """Checks for ``AnonymizerWrapper.restore`` using crafted placeholder maps."""

    def setUp(self):
        self.wrapper = AnonymizerWrapper(DummyAnalyzer())

    def _assert_restored(self, text, placeholders, expected):
        """Restore *text* with *placeholders* and compare against *expected*."""
        self.assertEqual(self.wrapper.restore(text, placeholders), expected)

    def test_exact_placeholder_restore(self):
        self._assert_restored(
            "我的邮箱是 __PII_EMAIL_ADDRESS_761b3e66__",
            {"__PII_EMAIL_ADDRESS_761b3e66__": "test@example.com"},
            "我的邮箱是 test@example.com",
        )

    def test_missing_underscores_variant(self):
        # The placeholder comes back without its surrounding underscores.
        self._assert_restored(
            "我的邮箱是 PII_EMAIL_ADDRESS_761b3e66",
            {"__PII_EMAIL_ADDRESS_761b3e66__": "test@example.com"},
            "我的邮箱是 test@example.com",
        )

    def test_leaked_suffix_after_original(self):
        # The original value is already present, followed by the placeholder's tail.
        self._assert_restored(
            "我的邮箱是 test@example.com0b9df4b0__",
            {"__PII_EMAIL_ADDRESS_0b9df4b0__": "test@example.com"},
            "我的邮箱是 test@example.com",
        )

    def test_overlapping_entities_prefer_longer(self):
        # One placeholder's value ("example.com") is a substring of the
        # other's; restore must leave the plain occurrence untouched.
        self._assert_restored(
            "站点 example.com 和邮箱 __PII_EMAIL_ADDRESS_6fbb5771__",
            {
                "__PII_URL_a37ec55b__": "example.com",
                "__PII_EMAIL_ADDRESS_6fbb5771__": "test@example.com",
            },
            "站点 example.com 和邮箱 test@example.com",
        )


if __name__ == "__main__":
    unittest.main()
/// Filtering options applied by `merge`.
pub const Config = struct {
    whitelist: []const EntityType, // accept only these if non-empty
    blacklist: []const EntityType, // drop these if present
    threshold: f32, // min score
};

/// Returns true when `t` occurs in `set`.
fn inSet(set: []const EntityType, t: EntityType) bool {
    for (set) |member| {
        if (member == t) return true;
    }
    return false;
}

/// Returns true when `e` passes the threshold, whitelist and blacklist of `cfg`.
fn accepts(cfg: Config, e: RecogEntity) bool {
    if (e.score < cfg.threshold) return false;
    if (cfg.whitelist.len > 0 and !inSet(cfg.whitelist, e.entity_type)) return false;
    if (cfg.blacklist.len > 0 and inSet(cfg.blacklist, e.entity_type)) return false;
    return true;
}

/// Orders spans by start offset, then by end offset.
fn spanLessThan(_: void, lhs: RecogEntity, rhs: RecogEntity) bool {
    return if (lhs.start == rhs.start) lhs.end < rhs.end else lhs.start < rhs.start;
}

/// Combine the entities of `a` and `b` into one list.
///
/// Entities rejected by `cfg` are dropped; the rest are sorted by
/// (start, end). Neighbouring entities with an identical span collapse into
/// one, keeping the higher score (the earlier one wins a tie).
///
/// The returned slice is allocated with `allocator` and owned by the caller.
/// Both working lists are released by `defer`, so every exit path, including
/// allocation failures, frees each buffer exactly once.
pub fn merge(allocator: std.mem.Allocator, a: []const RecogEntity, b: []const RecogEntity, cfg: Config) ![]RecogEntity {
    var spans = try std.ArrayList(RecogEntity).initCapacity(allocator, a.len + b.len);
    defer spans.deinit(allocator);
    for ([_][]const RecogEntity{ a, b }) |src| {
        for (src) |e| {
            // Capacity for a.len + b.len items was reserved above.
            if (accepts(cfg, e)) spans.appendAssumeCapacity(e);
        }
    }

    std.sort.block(RecogEntity, spans.items, {}, spanLessThan);

    var out = try std.ArrayList(RecogEntity).initCapacity(allocator, spans.items.len);
    defer out.deinit(allocator);
    for (spans.items) |cur| {
        if (out.items.len > 0) {
            const last = &out.items[out.items.len - 1];
            if (cur.start == last.start and cur.end == last.end) {
                if (cur.score > last.score) last.* = cur;
                continue;
            }
        }
        // `out` never holds more items than `spans`.
        out.appendAssumeCapacity(cur);
    }
    return try out.toOwnedSlice(allocator);
}
res) => { 40 | const urlPath = req.url || '/' 41 | let filePath = safeJoin(root, urlPath) 42 | if (!filePath) return send(res, 403, 'Forbidden') 43 | 44 | fs.stat(filePath, (err, stat) => { 45 | if (err) { 46 | // default file 47 | const fallback = path.join(root, 'offline.html') 48 | return fs.readFile(fallback, (e2, buf) => { 49 | if (e2) return send(res, 404, 'Not found') 50 | send(res, 200, buf, '.html') 51 | }) 52 | } 53 | if (stat.isDirectory()) { 54 | const indexFile = path.join(filePath, 'index.html') 55 | fs.readFile(indexFile, (e3, buf) => { 56 | if (e3) { 57 | const fallback = path.join(filePath, 'offline.html') 58 | return fs.readFile(fallback, (e4, buf2) => { 59 | if (e4) return send(res, 404, 'Not found') 60 | send(res, 200, buf2, '.html') 61 | }) 62 | } 63 | send(res, 200, buf, '.html') 64 | }) 65 | } else { 66 | fs.readFile(filePath, (e5, buf) => { 67 | if (e5) return send(res, 404, 'Not found') 68 | send(res, 200, buf, path.extname(filePath)) 69 | }) 70 | } 71 | }) 72 | }) 73 | 74 | server.listen(port, () => { 75 | console.log(`Serving ${root} with COOP/COEP at http://127.0.0.1:${port}/offline.html`) 76 | }) 77 | -------------------------------------------------------------------------------- /apps/webapp/README.md: -------------------------------------------------------------------------------- 1 | # OneAIFW WebApp 2 | 3 | A browser demo based on aifw-js. It supports: 4 | - Online development with Vite 5 | - An offline demo page served with COOP/COEP (enables ORT threads/SIMD) 6 | - Production build 7 | 8 | ## Prerequisites 9 | - Monorepo managed by pnpm. This webapp depends on the local package `@oneaifw/aifw-js` via `workspace:*`. 10 | - Node.js 18+ and pnpm 8+. 
11 | 12 | ## Build aifw-js (in workspace) 13 | From the repository root (skip if already built): 14 | ```bash 15 | pnpm -w --filter @oneaifw/aifw-js build 16 | ``` 17 | 18 | ## Online development (Vite) 19 | From `apps/webapp`: 20 | ```bash 21 | pnpm run dev 22 | ``` 23 | Open the URL printed in the terminal (typically `http://127.0.0.1:5173/`). 24 | 25 | Notes: 26 | - Calling `await init()` uses the managed mode by default, which fetches NER models and ORT wasm from the GitHub-hosted assets and caches them. 27 | - To enable ORT threads/SIMD you need cross-origin isolation (COOP/COEP). Vite dev server doesn’t enable it by default; functionality works but might run with reduced performance. For full performance testing, use the “Offline demo” section below. 28 | 29 | ## Offline demo (with COOP/COEP) 30 | The offline page is `aifw-offline.html`. Copy assets into `public/` and serve with the built-in COOP/COEP server: 31 | ```bash 32 | cd apps/webapp 33 | pnpm run offline # copy @oneaifw/aifw-js dist to public/vendor/aifw-js, and copy aifw-offline.html into public/ 34 | pnpm run serve:coi # start the local static server with COOP/COEP (default port 5500) 35 | ``` 36 | Then open: 37 | ``` 38 | http://127.0.0.1:5500/aifw-offline.html 39 | ``` 40 | 41 | Troubleshooting: 42 | - If `http://127.0.0.1:5500/offline.html` returns 404, use `aifw-offline.html`, or run `pnpm run offline` to ensure the file has been copied into `public/`. 43 | 44 | ## Production build 45 | From `apps/webapp`: 46 | ```bash 47 | pnpm run build 48 | ``` 49 | Serve the generated `dist/` as static assets. It’s recommended to enable COOP/COEP response headers in production to fully leverage ORT threads/SIMD. You can adapt your own server or follow the idea from the offline demo server. 
50 | 51 | ## Managed assets (at runtime) 52 | - `@oneaifw/aifw-js` uses managed mode in `init()` by default: on first run it downloads models and ORT wasm from the hosted repository, verifies integrity (SHA3-256), and warms up browser Cache Storage for faster subsequent loads. 53 | - Resource hosting repository on Hugging Face 54 | 55 | ## Scripts 56 | - `pnpm run dev`: start the Vite dev server. 57 | - `pnpm run offline`: prepare offline demo assets into `public/`. 58 | - `pnpm run serve:coi`: start a local static server with COOP/COEP (default port 5500). 59 | - `pnpm run build`: production build into `dist/`. 60 | -------------------------------------------------------------------------------- /web/README.md: -------------------------------------------------------------------------------- 1 | # AIFW Web Module 2 | 3 | AIFW Web 模块提供了一个基于 Web 的界面来演示 OneAIFW 项目的隐私保护功能。 4 | 5 | ## 功能特性 6 | 7 | - 🌐 **Web 界面**:直观的 Web 界面介绍 AIFW 项目 8 | - 🔍 **敏感信息分析**:检测文本中的敏感信息实体 9 | - 🎭 **匿名化处理**:将敏感信息替换为占位符 10 | - 🔄 **文本恢复**:将匿名化文本恢复为原始内容 11 | - 🌍 **多语言支持**:支持中文和英文文本处理 12 | - 📱 **响应式设计**:适配桌面和移动设备 13 | 14 | ## 快速开始 15 | 16 | ### 1. 安装依赖 17 | 18 | ```bash 19 | pip install -r ../py-origin/services/requirements.txt 20 | pip install -r requirements.txt 21 | ``` 22 | 23 | ### 2. 启动服务 24 | 25 | ```bash 26 | python run.py 27 | ``` 28 | 29 | 或者直接运行: 30 | 31 | ```bash 32 | python app.py 33 | ``` 34 | 35 | ### 3. 
访问界面 36 | 37 | 打开浏览器访问:http://localhost:5000 38 | 39 | ## API 接口 40 | 41 | ### 健康检查 42 | ``` 43 | GET /api/health 44 | ``` 45 | 46 | ### 分析敏感信息 47 | ``` 48 | POST /api/analyze 49 | Content-Type: application/json 50 | 51 | { 52 | "text": "要分析的文本", 53 | "language": "zh" 54 | } 55 | ``` 56 | 57 | ### 匿名化处理 58 | ``` 59 | POST /api/mask 60 | Content-Type: application/json 61 | 62 | { 63 | "text": "要匿名化的文本", 64 | "language": "zh" 65 | } 66 | ``` 67 | 68 | ### 恢复文本 69 | ``` 70 | POST /api/restore 71 | Content-Type: application/json 72 | 73 | { 74 | "text": "匿名化文本", 75 | "placeholders_map": { 76 | "__PII_EMAIL_ADDRESS_12345678__": "test@example.com" 77 | } 78 | } 79 | ``` 80 | 81 | ### 调用 LLM(需要配置 API 密钥) 82 | ``` 83 | POST /api/call 84 | Content-Type: application/json 85 | 86 | { 87 | "text": "要处理的文本", 88 | "api_key_file": "/path/to/api-key.json", 89 | "model": "gpt-4o-mini", 90 | "temperature": 0.0 91 | } 92 | ``` 93 | 94 | ## 项目结构 95 | 96 | ``` 97 | web/ 98 | ├── app.py # Flask 应用主文件 99 | ├── run.py # 启动脚本 100 | ├── requirements.txt # Python 依赖 101 | ├── README.md # 说明文档 102 | ├── templates/ # HTML 模板 103 | │ └── index.html # 主页面 104 | └── static/ # 静态资源 105 | ├── css/ 106 | │ └── style.css # 样式文件 107 | └── js/ 108 | └── app.js # JavaScript 文件 109 | ``` 110 | 111 | ## 依赖说明 112 | 113 | - **Flask**: Web 框架 114 | - **requests**: HTTP 请求库 115 | - **py-origin 模块**: AIFW 核心功能(需要从上级目录导入) 116 | 117 | ## 注意事项 118 | 119 | 1. 确保 `py-origin` 目录在项目根目录下 120 | 2. 首次运行可能需要安装 spaCy 语言模型 121 | 3. LLM 功能需要配置有效的 API 密钥文件 122 | 4. 建议在虚拟环境中运行 123 | 124 | ## 故障排除 125 | 126 | ### 导入错误 127 | 如果遇到 `ImportError`,请确保: 128 | - 在正确的目录下运行 129 | - `py-origin` 目录存在且可访问 130 | - 已安装所有必要的依赖 131 | 132 | ### 服务不可用 133 | 如果 AIFW 服务不可用: 134 | - 检查 `py-origin` 目录结构 135 | - 确保所有依赖已正确安装 136 | - 查看控制台错误信息 137 | 138 | ## 开发说明 139 | 140 | ### 添加新功能 141 | 1. 在 `app.py` 中添加新的路由 142 | 2. 在 `templates/index.html` 中添加 UI 元素 143 | 3. 在 `static/js/app.js` 中添加前端逻辑 144 | 4. 
在 `static/css/style.css` 中添加样式 145 | 146 | ### 自定义样式 147 | 修改 `static/css/style.css` 文件来自定义界面样式。 148 | 149 | ### 添加新的 API 端点 150 | 在 `app.py` 中添加新的路由函数,遵循现有的模式。 151 | -------------------------------------------------------------------------------- /tests/transformer-js/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | OneAIFW NER (Transformers.js) 7 | 17 | 18 | 19 |

OneAIFW NER (Transformers.js)

20 |
21 | 22 | 37 |
38 |
39 | 40 | 41 | 42 | 43 |
44 |
45 | 46 | 47 |
48 |
49 | 50 |
51 |
52 | 53 |
(waiting)
54 |
55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /py-origin/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE=python:3.13-slim 2 | FROM ${BASE_IMAGE} 3 | 4 | ENV PYTHONDONTWRITEBYTECODE=1 \ 5 | PYTHONUNBUFFERED=1 \ 6 | AIFW_WORK_DIR=/data/aifw \ 7 | XDG_CONFIG_HOME=/data/config 8 | 9 | WORKDIR /opt/aifw 10 | 11 | # Build-time profile to control spaCy models 12 | ARG SPACY_PROFILE=minimal 13 | 14 | # Copy requirements first for better cache (context is repo root) 15 | COPY py-origin/services/requirements.txt /opt/aifw/services/requirements.txt 16 | COPY py-origin/cli/requirements.txt /opt/aifw/cli/requirements.txt 17 | 18 | RUN pip install --upgrade pip && \ 19 | pip install --no-cache-dir -r /opt/aifw/services/requirements.txt && \ 20 | pip install --no-cache-dir -r /opt/aifw/cli/requirements.txt && \ 21 | python -m pip cache purge || true 22 | 23 | # Install spaCy models per profile 24 | RUN set -e; \ 25 | python -m spacy download en_core_web_sm; \ 26 | python -m spacy download zh_core_web_sm || true; \ 27 | if [ "$SPACY_PROFILE" = "fr" ] || [ "$SPACY_PROFILE" = "multi" ]; then python -m spacy download fr_core_news_sm || true; fi; \ 28 | if [ "$SPACY_PROFILE" = "de" ] || [ "$SPACY_PROFILE" = "multi" ]; then python -m spacy download de_core_news_sm || true; fi; \ 29 | if [ "$SPACY_PROFILE" = "ja" ] || [ "$SPACY_PROFILE" = "multi" ]; then python -m spacy download ja_core_news_sm || true; fi; \ 30 | if [ "$SPACY_PROFILE" = "multi" ]; then python -m spacy download xx_ent_wiki_sm || true; fi; \ 31 | find /usr/local/lib -type d -name '__pycache__' -prune -exec rm -rf {} + || true && \ 32 | find /usr/local/lib -type f -name '*.pyc' -delete || true 33 | 34 | # Copy only necessary project files to minimize image size (context is repo root) 35 | COPY py-origin/cli/*.py /opt/aifw/cli/ 36 | COPY py-origin/aifw/*.py /opt/aifw/aifw/ 37 | COPY 
py-origin/services/app/*.py /opt/aifw/services/app/ 38 | COPY py-origin/services/app/*.json /opt/aifw/services/app/ 39 | COPY py-origin/services/fake_llm/*.py /opt/aifw/services/fake_llm/ 40 | # Copy default config template (no secrets) 41 | COPY py-origin/assets/*.yaml py-origin/assets/*.json /opt/aifw/assets/ 42 | 43 | # Ensure runtime dirs; no API keys baked in image 44 | RUN mkdir -p ${AIFW_WORK_DIR} /var/log/aifw && \ 45 | chmod -R 777 ${AIFW_WORK_DIR} /var/log/aifw 46 | 47 | # Entrypoint: prepare work dir and default config if missing 48 | RUN printf '#!/bin/sh\nset -e\n: "${AIFW_WORK_DIR:=/data/aifw}"\nmkdir -p "${AIFW_WORK_DIR}"\nif [ ! -f "${AIFW_WORK_DIR}/aifw.yaml" ] && [ -f "/opt/aifw/assets/aifw.yaml" ]; then\n cp /opt/aifw/assets/aifw.yaml "${AIFW_WORK_DIR}/aifw.yaml";\nfi\nexport PYTHONPATH="/opt/aifw:${PYTHONPATH:-}"\nexec "$@"\n' > /usr/local/bin/aifw-entrypoint.sh && \ 49 | chmod +x /usr/local/bin/aifw-entrypoint.sh 50 | 51 | # Set a sane default; append happens in entrypoint using ${PYTHONPATH:-} 52 | ENV PYTHONPATH=/opt/aifw 53 | 54 | # Expose default service port 55 | EXPOSE 8844 56 | 57 | ENTRYPOINT ["/usr/local/bin/aifw-entrypoint.sh"] 58 | # Default: run the OneAIFW in interactive mode; user must mount api key file and optionally override config 59 | CMD ["/bin/bash"] 60 | -------------------------------------------------------------------------------- /.github/workflows/aifw-ci.yml: -------------------------------------------------------------------------------- 1 | name: aifw-ci 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v4 15 | 16 | - name: Setup Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: '3.13' 20 | 21 | - name: Setup Rust (stable + wasm32 target) 22 | uses: dtolnay/rust-toolchain@stable 23 | with: 24 | targets: wasm32-unknown-unknown 25 | 26 | - name: 
Install Zig 27 | uses: mlugg/setup-zig@v2 28 | with: 29 | version: 0.15.2 30 | use-cache: true 31 | 32 | - name: Build Zig core 33 | run: zig build -Doptimize=ReleaseFast 34 | 35 | - name: Install dependencies 36 | run: | 37 | python -m pip install --upgrade pip 38 | pip install -r cli/python/requirements.txt 39 | pip install -r cli/python/services/requirements.txt 40 | pip install -r libs/aifw-py/requirements.txt 41 | 42 | - name: Run aifw-py tests 43 | env: 44 | PYTHONPATH: ${{ github.workspace }} 45 | run: | 46 | python tests/test-aifw-py/test_cli.py 47 | 48 | - name: Start fake LLM (echo) in background 49 | run: | 50 | cd py-origin 51 | python -m uvicorn services.fake_llm.echo_server:app --host 127.0.0.1 --port 8801 & 52 | echo $! > echo_llm.pid 53 | for i in $(seq 1 20); do curl -sf http://127.0.0.1:8801/v1/health && break || sleep 0.5; done 54 | 55 | - name: Prepare OpenAI-compatible key file 56 | run: | 57 | cat > $RUNNER_TEMP/echo-apikey.json << 'JSON' 58 | { 59 | "openai-api-key": "test-local-echo", 60 | "openai-base-url": "http://127.0.0.1:8801/v1", 61 | "openai-model": "echo-001" 62 | } 63 | JSON 64 | 65 | - name: Run tests (direct_call / launch / call / stop) 66 | env: 67 | PYTHONPATH: ${{ github.workspace }} 68 | run: | 69 | cd cli/python 70 | PROMPT="请把如下文本翻译为中文: My email address is test@example.com, and my phone number is 18744325579." 
71 | # direct_call (in-process) 72 | python aifw.py direct_call --api-key-file $RUNNER_TEMP/echo-apikey.json "My email is test@example.com" 73 | 74 | # launch HTTP (daemonized), call, then stop 75 | python aifw.py launch --api-key-file $RUNNER_TEMP/echo-apikey.json --log-dest stdout || (cat ~/.aifw/aifw-server-*.log || true; exit 1) 76 | # wait until HTTP server is ready 77 | for i in $(seq 1 40); do curl -sf http://127.0.0.1:8844/api/health && break || sleep 0.5; done 78 | python aifw.py call --api-key-file $RUNNER_TEMP/echo-apikey.json "$PROMPT" 79 | python aifw.py stop || true 80 | 81 | - name: Teardown fake LLM 82 | if: always() 83 | run: | 84 | if [ -f echo_llm.pid ]; then kill $(cat echo_llm.pid) || true; fi 85 | 86 | -------------------------------------------------------------------------------- /tests/transformer-js/vite.config.js: -------------------------------------------------------------------------------- 1 | import { defineConfig } from 'vite' 2 | import fs from 'node:fs' 3 | import path from 'node:path' 4 | 5 | export default defineConfig({ 6 | // Avoid SPA history fallback serving index.html for missing JSON/ONNX under /models 7 | appType: 'mpa', 8 | server: { 9 | host: '127.0.0.1', 10 | port: 5173, 11 | open: true, 12 | headers: { 13 | 'Cross-Origin-Opener-Policy': 'same-origin', 14 | 'Cross-Origin-Embedder-Policy': 'require-corp', 15 | }, 16 | configureServer(server) { 17 | const ROOT = process.cwd() 18 | const MODELS_ROOT = path.join(ROOT, 'public', 'models') 19 | // WASM assets are copied to public/wasm during prep; no need to read node_modules at runtime 20 | server.middlewares.use((req, res, next) => { 21 | const raw = req.url || '' 22 | if (!raw.startsWith('/models/')) return next() 23 | // strip query/hash 24 | let pathname = raw 25 | try { 26 | const u = new URL(raw, 'http://127.0.0.1') 27 | pathname = u.pathname 28 | } catch (_) {} 29 | const url = pathname 30 | const rel = decodeURIComponent(url.replace(/^\/models\//, '')) 31 | const 
abs = path.join(MODELS_ROOT, rel) 32 | if (fs.existsSync(abs) && fs.statSync(abs).isFile()) { 33 | const stat = fs.statSync(abs) 34 | const ext = path.extname(abs).toLowerCase() 35 | if (ext === '.json') res.setHeader('Content-Type', 'application/json') 36 | else if (ext === '.txt') res.setHeader('Content-Type', 'text/plain; charset=utf-8') 37 | else if (ext === '.onnx') res.setHeader('Content-Type', 'application/octet-stream') 38 | else if (ext === '.wasm') res.setHeader('Content-Type', 'application/wasm') 39 | else if (ext === '.js') res.setHeader('Content-Type', 'application/javascript') 40 | res.setHeader('Cache-Control', 'no-cache') 41 | res.setHeader('Accept-Ranges', 'bytes') 42 | 43 | const range = req.headers['range'] 44 | if (range) { 45 | const m = /bytes=(\d*)-(\d*)/.exec(String(range)) 46 | let start = 0 47 | let end = stat.size - 1 48 | if (m) { 49 | if (m[1]) start = parseInt(m[1], 10) 50 | if (m[2]) end = parseInt(m[2], 10) 51 | } 52 | if (start > end || isNaN(start) || isNaN(end)) { 53 | res.statusCode = 416 54 | res.setHeader('Content-Range', `bytes */${stat.size}`) 55 | return res.end() 56 | } 57 | res.statusCode = 206 58 | res.setHeader('Content-Range', `bytes ${start}-${end}/${stat.size}`) 59 | res.setHeader('Content-Length', String(end - start + 1)) 60 | fs.createReadStream(abs, { start, end }).pipe(res) 61 | return 62 | } 63 | 64 | res.statusCode = 200 65 | res.setHeader('Content-Length', String(stat.size)) 66 | fs.createReadStream(abs).pipe(res) 67 | return 68 | } 69 | res.statusCode = 404 70 | res.end('Not found') 71 | }) 72 | }, 73 | }, 74 | }) 75 | -------------------------------------------------------------------------------- /web/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE=python:3.13-slim 2 | FROM ${BASE_IMAGE} 3 | 4 | ENV PYTHONDONTWRITEBYTECODE=1 \ 5 | PYTHONUNBUFFERED=1 \ 6 | AIFW_WORK_DIR=/data/aifw \ 7 | XDG_CONFIG_HOME=/data/config \ 8 | 
AIFW_MODELS_BASE=/opt/aifw/ner-models 9 | 10 | WORKDIR /opt/aifw 11 | 12 | # Copy requirements first for better cache 13 | COPY web/requirements.txt /opt/aifw/web/requirements.txt 14 | COPY cli/python/requirements.txt /opt/aifw/cli/python/requirements.txt 15 | COPY libs/aifw-py/requirements.txt /opt/aifw/libs/aifw-py/requirements.txt 16 | 17 | # System deps (git for fetching model assets) and Python deps 18 | RUN apt-get update && \ 19 | apt-get install -y --no-install-recommends git ca-certificates && \ 20 | rm -rf /var/lib/apt/lists/* && \ 21 | pip install --upgrade pip && \ 22 | pip install --no-cache-dir -r /opt/aifw/web/requirements.txt && \ 23 | pip install --no-cache-dir -r /opt/aifw/cli/python/requirements.txt && \ 24 | pip install --no-cache-dir -r /opt/aifw/libs/aifw-py/requirements.txt && \ 25 | python -m pip cache purge || true 26 | 27 | # Copy web application files 28 | COPY web/*.py /opt/aifw/web/ 29 | COPY web/templates/ /opt/aifw/web/templates/ 30 | COPY web/static/ /opt/aifw/web/static/ 31 | 32 | # Copy CLI / services code and aifw-py library into image 33 | COPY cli/python /opt/aifw/cli/python 34 | COPY libs/aifw-py /opt/aifw/libs/aifw-py 35 | 36 | # Copy assets metadata and (optionally) pre-fetched models / wasm if present 37 | COPY assets/*.yaml assets/*.json /opt/aifw/assets/ 38 | 39 | # If OneAIFW-Assets repo is present in build context, copy its models; 40 | # otherwise clone from GitHub (public repo) at build time. 
41 | RUN set -e; \ 42 | mkdir -p "${AIFW_MODELS_BASE}"; \ 43 | if [ -d "/opt/aifw/OneAIFW-Assets/models" ]; then \ 44 | cp -R /opt/aifw/OneAIFW-Assets/models/* "${AIFW_MODELS_BASE}/"; \ 45 | else \ 46 | git clone --depth 1 https://github.com/funstory-ai/OneAIFW-Assets.git /opt/aifw-assets && \ 47 | cp -R /opt/aifw-assets/models/* "${AIFW_MODELS_BASE}/" && \ 48 | rm -rf /opt/aifw-assets; \ 49 | fi 50 | 51 | # Copy prebuilt Zig core shared library if provided by CI (zig-out/lib) 52 | # Expected to contain liboneaifw_core.so built for Linux. 53 | COPY zig-out/lib /opt/aifw/zig-out/lib 54 | 55 | # Ensure runtime dirs; no API keys baked in image 56 | RUN mkdir -p ${AIFW_WORK_DIR} /var/log/aifw && \ 57 | chmod -R 777 ${AIFW_WORK_DIR} /var/log/aifw 58 | 59 | # Entrypoint: prepare work dir and default config if missing 60 | RUN printf '#!/bin/sh\nset -e\n: "${AIFW_WORK_DIR:=/data/aifw}"\nmkdir -p "${AIFW_WORK_DIR}"\nif [ ! -f "${AIFW_WORK_DIR}/aifw.yaml" ] && [ -f "/opt/aifw/assets/aifw.yaml" ]; then\n cp /opt/aifw/assets/aifw.yaml "${AIFW_WORK_DIR}/aifw.yaml";\nfi\nexport PYTHONPATH="/opt/aifw:${PYTHONPATH:-}"\nexec "$@"\n' > /usr/local/bin/aifw-entrypoint.sh && \ 61 | chmod +x /usr/local/bin/aifw-entrypoint.sh 62 | 63 | # Set a sane default; append happens in entrypoint using ${PYTHONPATH:-} 64 | ENV PYTHONPATH=/opt/aifw 65 | 66 | # Expose web application port 67 | EXPOSE 5001 68 | 69 | ENTRYPOINT ["/usr/local/bin/aifw-entrypoint.sh"] 70 | # Default: run the web application 71 | WORKDIR /opt/aifw/web 72 | CMD ["./run.py"] 73 | -------------------------------------------------------------------------------- /browser_extension/background.js: -------------------------------------------------------------------------------- 1 | async function delay(ms){return new Promise(r=>setTimeout(r,ms))} 2 | async function pingOffscreenOnce(timeoutMs=200){ 3 | return new Promise((resolve)=>{ 4 | let done=false 5 | const t=setTimeout(()=>{ if(!done) resolve(false) }, timeoutMs) 6 | try { 
7 | chrome.runtime.sendMessage({ _aifw: true, cmd: 'ping' }, (resp)=>{ 8 | // Read lastError to consume and avoid "Unchecked runtime.lastError" logs 9 | void chrome.runtime.lastError 10 | clearTimeout(t) 11 | done=true 12 | resolve(!!(resp && resp.ok)) 13 | }) 14 | } catch { 15 | clearTimeout(t) 16 | resolve(false) 17 | } 18 | }) 19 | } 20 | 21 | async function ensureOffscreen() { 22 | // if already alive, return 23 | if (await pingOffscreenOnce(200)) return 24 | // create and wait until ready 25 | await chrome.offscreen.createDocument({ 26 | url: 'offscreen.html', 27 | reasons: ['BLOBS'], 28 | justification: 'Run WASM and heavy JS for aifw in DOM context', 29 | }) 30 | for (let i=0;i<15;i++){ // ~3s max 31 | if (await pingOffscreenOnce(200)) return 32 | await delay(200) 33 | } 34 | throw new Error('offscreen not ready') 35 | } 36 | 37 | async function offscreenCall(cmd, text, meta) { 38 | await ensureOffscreen() 39 | return new Promise((resolve) => { 40 | chrome.runtime.sendMessage({ _aifw: true, cmd, text, meta }, (resp) => resolve(resp)) 41 | }) 42 | } 43 | 44 | chrome.runtime.onInstalled.addListener(async () => { 45 | try { 46 | await ensureOffscreen() 47 | chrome.contextMenus.create({ id: 'aifw-mask', title: 'Anonymize with OneAIFW', contexts: ['selection'] }) 48 | } catch (e) { 49 | console.error('[aifw-ext] init failed', e) 50 | } 51 | }) 52 | 53 | chrome.contextMenus.onClicked.addListener(async (info, tab) => { 54 | const type = info.menuItemId 55 | if (type !== 'aifw-mask') return 56 | if (!tab?.id) return 57 | try { 58 | const [{ result: sel }] = await chrome.scripting.executeScript({ 59 | target: { tabId: tab.id }, 60 | func: () => window.getSelection()?.toString() || '' 61 | }) 62 | if (!sel) return 63 | const resp = await offscreenCall('mask', sel) 64 | if (resp?.ok) { 65 | await chrome.scripting.executeScript({ target: { tabId: tab.id }, func: (t) => navigator.clipboard.writeText(t), args: [resp.text] }) 66 | } else { 67 | console.error('[aifw-ext] 
offscreen error', resp?.error) 68 | } 69 | } catch (e) { 70 | console.error('[aifw-ext] action failed', e) 71 | } 72 | }) 73 | 74 | chrome.runtime.onMessage.addListener((msg, sender, sendResponse) => { 75 | if (msg.type === 'ANON') { 76 | (async () => { 77 | const resp = await offscreenCall('mask', msg.text || '') 78 | if (resp?.ok) sendResponse({ ok: true, data: { text: resp.text, meta: resp.meta } }) 79 | else sendResponse({ ok: false, error: resp?.error || 'unknown' }) 80 | })() 81 | return true 82 | } 83 | if (msg.type === 'RESTORE') { 84 | (async () => { 85 | const resp = await offscreenCall('restore', msg.text || '', msg.meta) 86 | if (resp?.ok) sendResponse({ ok: true, data: { text: resp.text } }) 87 | else sendResponse({ ok: false, error: resp?.error || 'unknown' }) 88 | })() 89 | return true 90 | } 91 | }) 92 | -------------------------------------------------------------------------------- /cli/python/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE=python:3.13-slim 2 | FROM ${BASE_IMAGE} 3 | 4 | ENV PYTHONDONTWRITEBYTECODE=1 \ 5 | PYTHONUNBUFFERED=1 \ 6 | AIFW_WORK_DIR=/data/aifw \ 7 | XDG_CONFIG_HOME=/data/config \ 8 | AIFW_MODELS_BASE=/opt/aifw/ner-models 9 | 10 | WORKDIR /opt/aifw 11 | 12 | # Copy requirements first for better cache 13 | COPY cli/python/requirements.txt /opt/aifw/cli/python/requirements.txt 14 | COPY cli/python/services/requirements.txt /opt/aifw/cli/python/services/requirements.txt 15 | COPY libs/aifw-py/requirements.txt /opt/aifw/libs/aifw-py/requirements.txt 16 | 17 | # System deps (git for fetching model assets) and Python deps 18 | RUN apt-get update && \ 19 | apt-get install -y --no-install-recommends git ca-certificates && \ 20 | rm -rf /var/lib/apt/lists/* && \ 21 | pip install --upgrade pip && \ 22 | pip install --no-cache-dir -r /opt/aifw/cli/python/requirements.txt && \ 23 | pip install --no-cache-dir -r /opt/aifw/libs/aifw-py/requirements.txt && \ 24 | python 
-m pip cache purge || true 25 | 26 | # Copy CLI / services code and aifw-py library into image 27 | COPY cli/python /opt/aifw/cli/python 28 | COPY libs/aifw-py /opt/aifw/libs/aifw-py 29 | 30 | # Copy assets metadata and (optionally) pre-fetched models / wasm if present 31 | COPY assets/*.yaml assets/*.json /opt/aifw/assets/ 32 | 33 | # If OneAIFW-Assets repo is present in build context, copy its models; 34 | # otherwise clone from GitHub (public repo) at build time. 35 | RUN set -e; \ 36 | mkdir -p "${AIFW_MODELS_BASE}"; \ 37 | if [ -d "/opt/aifw/OneAIFW-Assets/models" ]; then \ 38 | cp -R /opt/aifw/OneAIFW-Assets/models/* "${AIFW_MODELS_BASE}/"; \ 39 | else \ 40 | git clone --depth 1 https://github.com/funstory-ai/OneAIFW-Assets.git /opt/aifw-assets && \ 41 | cp -R /opt/aifw-assets/models/* "${AIFW_MODELS_BASE}/" && \ 42 | rm -rf /opt/aifw-assets; \ 43 | fi 44 | 45 | # Copy prebuilt Zig core shared library if provided by CI (zig-out/lib) 46 | # Expected to contain liboneaifw_core.so built for Linux. 47 | COPY zig-out/lib /opt/aifw/zig-out/lib 48 | 49 | # Ensure runtime dirs; no API keys baked in image 50 | RUN mkdir -p ${AIFW_WORK_DIR} /var/log/aifw && \ 51 | chmod -R 777 ${AIFW_WORK_DIR} /var/log/aifw 52 | 53 | # Runtime entrypoint: prepare work dir/config and PYTHONPATH 54 | RUN printf '#!/bin/sh\nset -e\n: "${AIFW_WORK_DIR:=/data/aifw}"\nmkdir -p "${AIFW_WORK_DIR}"\nif [ ! 
-f "${AIFW_WORK_DIR}/aifw.yaml" ] && [ -f "/opt/aifw/assets/aifw.yaml" ]; then\n cp /opt/aifw/assets/aifw.yaml "${AIFW_WORK_DIR}/aifw.yaml";\nfi\nexport PYTHONPATH="/opt/aifw/cli/python:/opt/aifw/libs/aifw-py:/opt/aifw:${PYTHONPATH:-}"\nexec "$@"\n' > /usr/local/bin/aifw-entrypoint.sh && \ 55 | chmod +x /usr/local/bin/aifw-entrypoint.sh 56 | 57 | # Simple CLI wrapper so `aifw` can be used inside the container 58 | RUN printf '#!/bin/sh\nexec python /opt/aifw/cli/python/aifw.py "$@"\n' > /usr/local/bin/aifw && \ 59 | chmod +x /usr/local/bin/aifw 60 | 61 | # Expose HTTP server port 62 | EXPOSE 8844 63 | 64 | ENTRYPOINT ["/usr/local/bin/aifw-entrypoint.sh"] 65 | 66 | # By default, run HTTP server; can be overridden to use `aifw launch/stop` manually. 67 | WORKDIR /opt/aifw/cli/python 68 | CMD ["python", "-m", "uvicorn", "services.app.main:app", "--host", "0.0.0.0", "--port", "8844"] 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /browser_extension/aifw-extension-sample.js: -------------------------------------------------------------------------------- 1 | // aifw-extension-sample.js 2 | // Initialize aifw-js using vendor bundle and serve model files from IndexedDB via a fetch shim. 
3 | 4 | import * as aifw from './vendor/aifw-js/aifw-js.js' 5 | import { getFromCache, putToCache } from './indexeddb-models.js' 6 | 7 | // Logical base used by aifw-js to request models 8 | export const modelsBase = 'https://aifw-js.local/models/' 9 | 10 | // Example remote base hosting the model assets (downloaded once, then cached) 11 | export const remoteModelsBase = 'https://s.immersivetranslate.com/assets/OneAIFW/Models/20250926/' 12 | 13 | export const defaultModelId = 'funstory-ai/neurobert-mini' 14 | 15 | // Set ORT global config before init 16 | const wasmBase = chrome.runtime.getURL('vendor/aifw-js/wasm/'); 17 | 18 | // If offscreen.html is already COOP/COEP (crossOriginIsolated=true), we can use multi-thread; 19 | // otherwise, automatically downgrade 20 | // const threads = (globalThis.crossOriginIsolated && navigator.hardwareConcurrency) ? Math.min(Math.max(2, navigator.hardwareConcurrency), 8) : 1; 21 | 22 | try { 23 | console.log('crossOriginIsolated=', globalThis.crossOriginIsolated); 24 | if (navigator.hardwareConcurrency && navigator.hardwareConcurrency > 1) { 25 | // Force setting navigator.hardwareConcurrency to 1 for avoid importScript errors 26 | Object.defineProperty(navigator, 'hardwareConcurrency', { value: 1, configurable: true }); 27 | } 28 | } catch {} 29 | 30 | export async function ensureModelCached(modelId = defaultModelId, base = remoteModelsBase) { 31 | const files = [ 32 | 'tokenizer.json', 33 | 'tokenizer_config.json', 34 | 'config.json', 35 | 'special_tokens_map.json', 36 | 'vocab.txt', 37 | 'onnx/model_quantized.onnx', 38 | ] 39 | for (const rel of files) { 40 | const url = base.replace(/\/?$/, '/') + rel 41 | const res = await fetch(url) 42 | if (!res.ok) throw new Error('download failed: ' + url) 43 | const ct = res.headers.get('Content-Type') || (rel.endsWith('.json') ? 
'application/json; charset=utf-8' : 'application/octet-stream') 44 | // Store under modelsBase + modelId + '/' + rel 45 | const cacheUrl = `${modelsBase}${modelId}/${rel}` 46 | await putToCache(cacheUrl, res, ct) 47 | } 48 | } 49 | 50 | function installModelsFetchShim() { 51 | const base = modelsBase.endsWith('/') ? modelsBase : modelsBase + '/' 52 | const origFetch = globalThis.fetch.bind(globalThis) 53 | globalThis.fetch = async (input, init) => { 54 | try { 55 | const url = typeof input === 'string' ? input : input.url 56 | if (String(url).startsWith(base)) { 57 | const data = await getFromCache(String(url)) 58 | if (data) { 59 | const u8 = data instanceof Uint8Array ? data : new Uint8Array(data) 60 | const ct = String(url).endsWith('.json') ? 'application/json; charset=utf-8' 61 | : String(url).endsWith('.onnx') ? 'application/octet-stream' 62 | : String(url).endsWith('.txt') ? 'text/plain; charset=utf-8' 63 | : 'application/octet-stream' 64 | return new Response(new Blob([u8], { type: ct }), { status: 200 }) 65 | } 66 | } 67 | } catch (e) { 68 | // fallthrough to network 69 | } 70 | return origFetch(input, init) 71 | } 72 | } 73 | 74 | export async function initAifwWithCache() { 75 | installModelsFetchShim() 76 | await aifw.init({ 77 | wasmBase: wasmBase, 78 | modelsBase 79 | }) 80 | return aifw 81 | } 82 | -------------------------------------------------------------------------------- /cli/python/services/fake_llm/echo_server.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, Header, HTTPException 2 | from pydantic import BaseModel, Field 3 | from typing import List, Optional, Any, Dict 4 | import time 5 | import uuid 6 | 7 | 8 | app = FastAPI(title="Fake Echo LLM (OpenAI-compatible)", version="0.1.0") 9 | 10 | 11 | # ---- Schemas (minimal) ---- 12 | class ChatMessage(BaseModel): 13 | role: str 14 | content: str 15 | 16 | 17 | class ChatCompletionsIn(BaseModel): 18 | model: Optional[str] = 
@app.post("/v1/chat/completions")
def chat_completions(inp: ChatCompletionsIn, x_api_key: Optional[str] = Header(None), authorization: Optional[str] = Header(None)):
    """Answer a chat completion request by echoing the caller's own text.

    The reply content is the most recent message whose role is ``"user"``.
    When the conversation has no user message, the contents of all messages
    are joined with blank lines instead. Token usage is always reported as
    zero.
    """
    _check_auth(authorization)

    # Walk the conversation backwards so the newest user turn wins.
    echoed = None
    for message in reversed(inp.messages):
        if message.role == "user":
            echoed = message.content
            break
    if echoed is None:
        echoed = "\n\n".join(message.content for message in inp.messages)

    assistant_choice = {
        "index": 0,
        "message": {"role": "assistant", "content": echoed},
        "finish_reason": "stop",
    }
    return {
        "id": f"chatcmpl-{uuid.uuid4().hex[:12]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": inp.model or "echo-001",
        "choices": [assistant_choice],
        "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
    }
_check_auth(authorization) 80 | resp_id = f"cmpl-{uuid.uuid4().hex[:12]}" 81 | now = int(time.time()) 82 | return { 83 | "id": resp_id, 84 | "object": "text_completion", 85 | "created": now, 86 | "model": inp.model or "echo-001", 87 | "choices": [ 88 | { 89 | "index": 0, 90 | "text": inp.prompt, 91 | "finish_reason": "stop", 92 | } 93 | ], 94 | "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}, 95 | } 96 | 97 | 98 | @app.get("/v1/health") 99 | def health(): 100 | return {"status": "ok"} 101 | 102 | 103 | -------------------------------------------------------------------------------- /py-origin/services/fake_llm/echo_server.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, Header, HTTPException 2 | from pydantic import BaseModel, Field 3 | from typing import List, Optional, Any, Dict 4 | import time 5 | import uuid 6 | 7 | 8 | app = FastAPI(title="Fake Echo LLM (OpenAI-compatible)", version="0.1.0") 9 | 10 | 11 | # ---- Schemas (minimal) ---- 12 | class ChatMessage(BaseModel): 13 | role: str 14 | content: str 15 | 16 | 17 | class ChatCompletionsIn(BaseModel): 18 | model: Optional[str] = Field(default="echo-001") 19 | messages: List[ChatMessage] 20 | temperature: Optional[float] = 0.0 21 | 22 | 23 | class CompletionsIn(BaseModel): 24 | model: Optional[str] = Field(default="echo-001") 25 | prompt: str 26 | temperature: Optional[float] = 0.0 27 | 28 | 29 | def _check_auth(authorization: Optional[str]): 30 | # Accept any Bearer token; require header to be present for realism 31 | if not authorization or not authorization.lower().startswith("bearer "): 32 | # stay permissive; do not hard fail to simplify local usage 33 | return 34 | 35 | 36 | @app.get("/v1/models") 37 | def list_models(x_api_key: Optional[str] = Header(None), authorization: Optional[str] = Header(None)): 38 | _check_auth(authorization) 39 | return { 40 | "object": "list", 41 | "data": [ 42 | { 43 | "id": 
@app.post("/v1/completions")
def completions(inp: CompletionsIn, x_api_key: Optional[str] = Header(None), authorization: Optional[str] = Header(None)):
    """Answer a text completion request by returning the prompt unchanged.

    The response has a single choice whose ``text`` is ``inp.prompt``.
    Token usage is always reported as zero.
    """
    _check_auth(authorization)

    echoed_choice = {"index": 0, "text": inp.prompt, "finish_reason": "stop"}
    zero_usage = dict.fromkeys(("prompt_tokens", "completion_tokens", "total_tokens"), 0)
    return {
        "id": "cmpl-" + uuid.uuid4().hex[:12],
        "object": "text_completion",
        "created": int(time.time()),
        "model": inp.model or "echo-001",
        "choices": [echoed_choice],
        "usage": zero_usage,
    }
class LLMClient:
    """LiteLLM-based generic LLM caller.

    Requires provider API key(s) via environment variables as per LiteLLM docs.
    The `model` parameter selects the provider/model (e.g., "gpt-4o-mini", "glm-4").
    For OpenAI-compatible gateways (e.g., Zhipu), configure OPENAI_API_KEY + OPENAI_API_BASE.
    """

    # Provider prefixes that are stripped from the model id, because every
    # request is sent with custom_llm_provider="openai".
    _OPENAI_COMPATIBLE_PREFIXES = frozenset({"zhipuai", "glm", "openai"})

    def __init__(self, default_model: str = "gpt-4o-mini"):
        """Remember the model used when `call` is not given one."""
        self.default_model = default_model

    @staticmethod
    def _normalize_model(model):
        """Return `model` without a known OpenAI-compatible provider prefix.

        "zhipuai/glm-4" becomes "glm-4"; ids with an unknown prefix, with an
        empty remainder, or without "/" are returned unchanged.
        """
        if isinstance(model, str) and "/" in model:
            prefix, _, remainder = model.partition("/")
            if prefix.lower() in LLMClient._OPENAI_COMPATIBLE_PREFIXES and remainder:
                return remainder
        return model

    @staticmethod
    def _extract_content(resp) -> str:
        """Return the text of the first choice of `resp`, or "" if there is none.

        The first choice and its message may each be an object exposing the
        field as an attribute or a dict-like value exposing it through `.get`.
        """
        choices = resp.choices
        if not choices:
            return ""
        first = choices[0]
        message = first.message if hasattr(first, "message") else first.get("message", {})
        if message is None:
            return ""
        if hasattr(message, "get"):
            content = message.get("content")
        else:
            content = getattr(message, "content", None)
        return content or ""

    def call(
        self,
        text: str,
        model: Optional[str] = None,
        temperature: float = 0.0,
    ) -> str:
        """Send `text` as a single user message and return the reply text.

        Args:
            text: Content of the user message.
            model: Model id; falls back to `self.default_model` when empty.
            temperature: Sampling temperature forwarded to LiteLLM.

        Returns:
            The content of the first choice, or "" when it is missing.

        Raises:
            RuntimeError: If `litellm` cannot be imported; the original
                import error is attached as the cause.
        """
        # Lazy import litellm here to surface precise import errors inside the active venv
        try:
            litellm = importlib.import_module("litellm")
        except Exception as exc:
            raise RuntimeError(
                f"Failed to import litellm. Please ensure it is installed in the current environment: {exc}"
            ) from exc

        chosen_model = self._normalize_model(model or self.default_model)

        provider_kwargs = {"custom_llm_provider": "openai"}
        api_base = os.environ.get("OPENAI_API_BASE")
        api_key = os.environ.get("OPENAI_API_KEY")
        if api_base:
            provider_kwargs["api_base"] = api_base
        if api_key:
            provider_kwargs["api_key"] = api_key

        resp = litellm.completion(
            model=chosen_model,
            messages=[
                {"role": "user", "content": text},
            ],
            temperature=temperature,
            **provider_kwargs,
        )
        return self._extract_content(resp)
def load_llm_api_config(file_path: str) -> Dict[str, Any]:
    """Load LLM config for LiteLLM from a JSON file.

    Supported keys (hyphen or underscore are both accepted):
    - openai-api-key / openai_api_key
    - openai-model / openai_model
    - openai-base-url / openai_base_url (OpenAI-compatible base URL)

    Side effects:
    - Sets OPENAI_API_KEY.
    - Sets OPENAI_API_BASE when the file provides a base URL. When it does
      not, a base URL written by an earlier call of this function is rolled
      back to the value the variable had before, so one config's endpoint is
      never reused with another config's key.

    Returns:
        ``{'model': <configured model or None>}``

    Raises:
        ValueError: If the file is not a JSON object, the API key is missing,
            or the API key / base URL is not a string.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    if not isinstance(data, dict):
        raise ValueError("LLM config file must contain a JSON object")

    def get_any(*keys):
        for k in keys:
            value = data.get(k)
            if value:
                return value
        return None

    api_key = get_any('openai-api-key', 'openai_api_key')
    model = get_any('openai-model', 'openai_model')
    base_url = get_any('openai-base-url', 'openai_base_url')

    # Validate everything before touching the environment.
    if not api_key:
        raise ValueError("openai-api-key not found in config file")
    if not isinstance(api_key, str):
        raise ValueError("openai-api-key must be a string")
    if base_url is not None and not isinstance(base_url, str):
        raise ValueError("openai-base-url must be a string")

    os.environ['OPENAI_API_KEY'] = api_key

    # `_base_override` remembers (value we wrote, value that was there before)
    # so that only our own override is ever rolled back.
    override = getattr(load_llm_api_config, '_base_override', None)
    current = os.environ.get('OPENAI_API_BASE')
    if base_url:
        if current != base_url:
            original = override[1] if override and override[0] == current else current
            os.environ['OPENAI_API_BASE'] = base_url
            load_llm_api_config._base_override = (base_url, original)
    elif override and override[0] == current:
        original = override[1]
        if original is None:
            os.environ.pop('OPENAI_API_BASE', None)
        else:
            os.environ['OPENAI_API_BASE'] = original
        load_llm_api_config._base_override = None

    return {'model': model}
13 | """ 14 | 15 | def __init__(self, default_model: str = "gpt-4o-mini"): 16 | self.default_model = default_model 17 | 18 | def call( 19 | self, 20 | text: str, 21 | model: Optional[str] = None, 22 | temperature: float = 0.0, 23 | ) -> str: 24 | # Lazy import litellm here to surface precise import errors inside the active venv 25 | try: 26 | litellm = importlib.import_module("litellm") 27 | except Exception as exc: 28 | raise RuntimeError( 29 | f"Failed to import litellm. Please ensure it is installed in the current environment: {exc}" 30 | ) 31 | 32 | chosen_model = model or self.default_model 33 | # Normalize common GLM naming to OpenAI-compatible model id 34 | if isinstance(chosen_model, str) and "/" in chosen_model: 35 | # e.g., "zhipuai/glm-4" -> "glm-4" 36 | provider_prefix, maybe_model = chosen_model.split("/", 1) 37 | if provider_prefix.lower() in {"zhipuai", "glm", "openai"} and maybe_model: 38 | chosen_model = maybe_model 39 | 40 | provider_kwargs = {"custom_llm_provider": "openai"} 41 | api_base = os.environ.get("OPENAI_API_BASE") 42 | api_key = os.environ.get("OPENAI_API_KEY") 43 | if api_base: 44 | provider_kwargs["api_base"] = api_base 45 | if api_key: 46 | provider_kwargs["api_key"] = api_key 47 | 48 | resp = litellm.completion( 49 | model=chosen_model, 50 | messages=[ 51 | {"role": "user", "content": text}, 52 | ], 53 | temperature=temperature, 54 | **provider_kwargs, 55 | ) 56 | content = ( 57 | resp.choices[0].message.get("content") 58 | if hasattr(resp.choices[0], "message") 59 | else resp.choices[0].get("message", {}).get("content") 60 | ) 61 | return content or "" 62 | 63 | 64 | def load_llm_api_config(file_path: str) -> Dict[str, Any]: 65 | """Load LLM config for LiteLLM from a JSON file. 
// Local models prepared under public/models are served from here.
const MODELS_ROOT = path.resolve(process.cwd(), 'public', 'models')

// Repository root, two levels above apps/webapp. Like MODELS_ROOT, this
// assumes the dev server is started from apps/webapp.
const REPO_ROOT = path.resolve(process.cwd(), '..', '..')

const MODEL_CONTENT_TYPES = {
  '.json': 'application/json',
  '.txt': 'text/plain; charset=utf-8',
  '.onnx': 'application/octet-stream',
  '.wasm': 'application/wasm',
  '.js': 'application/javascript',
}

/**
 * Interpret a single-range `Range` header for a file of `size` bytes.
 *
 * Returns `{ start, end }` (inclusive) for a satisfiable range, `null` for an
 * unsatisfiable one (caller answers 416), and `undefined` when the header is
 * not a single `bytes=` range and should be ignored.
 */
function resolveByteRange(header, size) {
  const m = /^bytes=(\d*)-(\d*)$/.exec(String(header).trim())
  if (!m || (m[1] === '' && m[2] === '')) return undefined

  let start
  let end
  if (m[1] === '') {
    // Suffix form "bytes=-N": the last N bytes of the file.
    const suffix = parseInt(m[2], 10)
    if (suffix === 0) return null
    start = Math.max(size - suffix, 0)
    end = size - 1
  } else {
    start = parseInt(m[1], 10)
    // An end beyond the file is clamped to the last byte.
    end = m[2] === '' ? size - 1 : Math.min(parseInt(m[2], 10), size - 1)
  }
  if (start > end || start >= size) return null
  return { start, end }
}

/**
 * Vite plugin that serves files under MODELS_ROOT at /models/ with Range
 * support. `configureServer` is a plugin hook; Vite does not call it when it
 * is nested inside the `server` options object.
 */
function serveLocalModels() {
  return {
    name: 'aifw-serve-local-models',
    configureServer(server) {
      server.middlewares.use((req, res, next) => {
        const raw = req.url || ''
        if (!raw.startsWith('/models/')) return next()

        let pathname = raw
        try { pathname = new URL(raw, 'http://127.0.0.1').pathname } catch {}

        let rel
        try {
          rel = decodeURIComponent(pathname.replace(/^\/models\//, ''))
        } catch {
          // Malformed percent-encoding.
          res.statusCode = 400
          return res.end('bad request')
        }

        // Refuse anything that resolves outside MODELS_ROOT (e.g. "..%2F").
        const abs = path.resolve(MODELS_ROOT, rel)
        if (!abs.startsWith(MODELS_ROOT + path.sep)) {
          res.statusCode = 403
          return res.end('forbidden')
        }

        let stat = null
        try { stat = fs.statSync(abs) } catch {}
        if (!stat || !stat.isFile()) {
          res.statusCode = 404
          return res.end('model not found')
        }

        const contentType = MODEL_CONTENT_TYPES[path.extname(abs).toLowerCase()]
        if (contentType) res.setHeader('Content-Type', contentType)
        res.setHeader('Cache-Control', 'no-cache')
        res.setHeader('Accept-Ranges', 'bytes')

        const send = (options) => {
          const stream = fs.createReadStream(abs, options)
          // Abort the response instead of crashing the dev server on read errors.
          stream.on('error', () => res.destroy())
          stream.pipe(res)
        }

        const rangeHeader = req.headers['range']
        const range = rangeHeader ? resolveByteRange(rangeHeader, stat.size) : undefined
        if (range === null) {
          res.statusCode = 416
          res.setHeader('Content-Range', `bytes */${stat.size}`)
          return res.end()
        }
        if (range) {
          res.statusCode = 206
          res.setHeader('Content-Range', `bytes ${range.start}-${range.end}/${stat.size}`)
          res.setHeader('Content-Length', String(range.end - range.start + 1))
          return send({ start: range.start, end: range.end })
        }

        res.statusCode = 200
        res.setHeader('Content-Length', String(stat.size))
        return send()
      })
    },
  }
}

export default defineConfig({
  // Avoid SPA history fallback serving index.html for missing JSON/ONNX under /models
  appType: 'mpa',
  root: '.',
  publicDir: 'public',
  plugins: [serveLocalModels()],
  server: {
    host: '127.0.0.1',
    port: 5174,
    headers: {
      'Cross-Origin-Opener-Policy': 'same-origin',
      'Cross-Origin-Embedder-Policy': 'require-corp',
    },
    // Keep offline: disable HMR client and file watching polling
    hmr: false,
    watch: { usePolling: false },
    fs: {
      // Resolved relative to the checkout instead of one developer's home directory.
      allow: [
        '..',
        path.join(REPO_ROOT, 'libs'),
        path.resolve(process.cwd(), 'node_modules'),
        path.join(REPO_ROOT, 'tests', 'transformer-js', 'public', 'models'),
      ],
    },
  },
  resolve: {
    alias: {
      '@xenova/transformers': path.resolve(process.cwd(), 'node_modules/@xenova/transformers')
    }
  },
  optimizeDeps: {
    include: ['@xenova/transformers'],
    exclude: [],
  },
});
// Put the model file to IndexedDB under the key `url`.
//
// `data` may be a Response, a Blob, a Uint8Array or an ArrayBuffer.
// - Response: the body is read fully; its Content-Type header wins over
//   `contentType`, which in turn wins over 'application/octet-stream'.
// - Blob: stored as-is (its own type is kept, `contentType` is ignored).
// - Otherwise: the bytes are wrapped in a Blob typed `contentType` or
//   'application/octet-stream'.
// The database connection opened for the write is always closed again.
export async function putToCache(url, data, contentType) {
  let blob;
  if (data instanceof Response) {
    const type = data.headers.get('Content-Type') || contentType || 'application/octet-stream';
    const buf = await data.arrayBuffer();
    blob = new Blob([buf], { type });
  } else if (data instanceof Blob) {
    blob = data;
  } else {
    const type = contentType || 'application/octet-stream';
    const bytes = data instanceof Uint8Array ? data : new Uint8Array(data);
    blob = new Blob([bytes], { type });
  }

  const db = await openDB();
  try {
    const tx = db.transaction(STORE, 'readwrite');
    // Attach the completion listeners before issuing the request so that no
    // transaction event can be missed.
    const done = txDone(tx);
    const stored = new Promise((resolve, reject) => {
      const req = tx.objectStore(STORE).put({ url, type: blob.type, blob });
      req.onsuccess = () => resolve();
      req.onerror = () => reject(req.error);
    });
    await Promise.all([stored, done]);
  } finally {
    // Each call opens its own connection; leaving it open leaks it and can
    // block a later version upgrade of the database.
    db.close();
  }
}
// Recursively copy the contents of `src` into `dest`, creating `dest` and
// any sub-directories on the way. Files are copied one by one via copyFile.
function copyDir(src, dest) {
  ensureDir(dest)
  fs.readdirSync(src).forEach((entryName) => {
    const from = path.join(src, entryName)
    if (fs.statSync(from).isDirectory()) {
      copyDir(from, path.join(dest, entryName))
      return
    }
    copyFile(from, dest)
  })
}
// Copy the quantized ONNX model and its tokenizer/config files of every
// requested model into `<outRoot>/models/<modelId>/`.
//
// Source directory: $AIFW_MODELS_DIR, or `ner-models` three levels above
// this script. Model ids: comma-separated $AIFW_MODEL_IDS, defaulting to
// 'funstory-ai/neurobert-mini'. A missing model directory or ONNX file is
// fatal; a missing config file only produces a warning.
function copyModels(outRoot) {
  const CONFIG_FILES = [
    'tokenizer.json',
    'tokenizer_config.json',
    'config.json',
    'special_tokens_map.json',
    'vocab.txt',
  ]

  const modelsDir = path.resolve(
    process.env.AIFW_MODELS_DIR
      ? process.env.AIFW_MODELS_DIR
      : path.join(__dirname, '../../..', 'ner-models')
  )

  const requestedIds = []
  for (const part of (process.env.AIFW_MODEL_IDS || 'funstory-ai/neurobert-mini').split(',')) {
    const id = part.trim()
    if (id) requestedIds.push(id)
  }

  requestedIds.forEach((id) => {
    const srcRoot = path.join(modelsDir, id)
    const outRootModel = path.join(outRoot, 'models', id)
    if (!fs.existsSync(srcRoot)) throw new Error('model dir missing: ' + srcRoot)

    // quantized onnx
    const quantized = path.join(srcRoot, 'onnx', 'model_quantized.onnx')
    if (!fs.existsSync(quantized)) throw new Error('quantized onnx missing: ' + quantized)
    copyFile(quantized, path.join(outRootModel, 'onnx'))

    // configs
    CONFIG_FILES.forEach((name) => {
      const candidate = path.join(srcRoot, name)
      if (fs.existsSync(candidate)) {
        copyFile(candidate, outRootModel)
      } else {
        console.warn('[warn] model config missing:', candidate)
      }
    })
  })
}
core WASM 122 | copyCoreWasm(outRoot) 123 | } 124 | 125 | main() 126 | -------------------------------------------------------------------------------- /README-GUIDE.md: -------------------------------------------------------------------------------- 1 | # **Note: This file is not for user, this file will be deleted** 2 | 3 | # OneAIFW - Local Presidio-based Reversible Anonymization Framework 4 | 5 | This repository provides a local Presidio-based service (OneAIFW) with: 6 | - FastAPI backend using `presidio-analyzer` and `presidio-anonymizer` 7 | - Reversible placeholders and unified API for anonymize → LLM → restore 8 | - Tkinter desktop UI client 9 | - Browser extension (Chrome/Edge MV3) 10 | - Dockerfile + docker-compose for easy local deployment 11 | 12 | ## Quickstart - Service (Docker) 13 | Build profiles for spaCy models via `--build-arg SPACY_PROFILE=...`: 14 | 15 | - minimal (default): en_core_web_sm, zh_core_web_sm, xx_ent_wiki_sm 16 | - fr: minimal + fr_core_news_sm 17 | - de: minimal + de_core_news_sm 18 | - ja: minimal + ja_core_news_sm 19 | - multi: minimal + fr/de/ja 20 | 21 | ```bash 22 | # Build minimal (default) 23 | docker build -t oneaifw:minimal . 24 | 25 | # Build French / German / Japanese 26 | docker build --build-arg SPACY_PROFILE=fr -t oneaifw:fr . 27 | docker build --build-arg SPACY_PROFILE=de -t oneaifw:de . 28 | docker build --build-arg SPACY_PROFILE=ja -t oneaifw:ja . 29 | 30 | # Build multi-language 31 | docker build --build-arg SPACY_PROFILE=multi -t oneaifw:multi . 32 | 33 | # Run (mount host work dir with config/logs and your api keys) 34 | docker run --rm -p 8844:8844 \ 35 | -v $HOME/.aifw:/data/aifw \ 36 | oneaifw:minimal 37 | ``` 38 | 39 | The container copies `/opt/aifw/assets/aifw.yaml` to `/data/aifw/aifw.yaml` if missing. Edit it to point to your API key file (not included in the image). 
40 | 41 | ## Unified API 42 | - In-process: `services/app/one_aifw_api.py` (class `OneAIFWAPI`) 43 | - Local wrapper: `services/app/local_api.py` exposes `call(text, api_key_file, model, temperature, language)` 44 | - HTTP endpoint: `POST /api/call` with body `{ text, apiKeyFile, model, temperature, language }` 45 | 46 | ## UI 47 | ```bash 48 | cd ui 49 | pip install -r requirements.txt 50 | python desktop_app.py 51 | ``` 52 | 53 | ## CLI 54 | ```bash 55 | # Unified call examples (module name changed to aifw) 56 | python -m aifw direct_call --api-key-file /path/to/api-key.json "Hello" 57 | python -m aifw launch --work-dir ~/.aifw --log-dest file 58 | python -m aifw call --url http://127.0.0.1:8844 --api-key-file /path/to/api-key.json "Hello" 59 | python -m aifw stop --work-dir ~/.aifw 60 | ``` 61 | 62 | ## Browser Extension 63 | Load `browser_extension` as an unpacked extension in Chrome/Edge developer mode. 64 | 65 | ## Notes 66 | - If you still want the HTTP service, start it as shown above; UI/CLI work with the in-process API and do not require the HTTP server. 67 | - spaCy 模型:首次使用请安装 `en_core_web_sm`。安装:`python -m spacy download en_core_web_sm`(在对应 venv 中执行)。 68 | - LLM 网关(OpenAI 兼容):在配置 JSON 中提供 `openai-api-key` / `openai-base-url` / `openai-model`,CLI 通过 `--api-key-file` 读取。 69 | - The anonymization uses placeholders that are robust to LLM round-trips. 70 | 71 | ## Local fake LLM 72 | The local fake LLM simply echoes the chat text back to the client. Launch it with the command below. 73 | ```bash 74 | python -m uvicorn services.fake_llm.echo_server:app --host 127.0.0.1 --port 8801 75 | ``` 76 | 77 | ## Validate anonymization correctness (using --stage anonymized) 78 | 79 | Use the provided test inputs under `test/` and the local fake LLM (echo) to verify the anonymization output exactly matches the expected anonymized text. 
//! Stand-alone executable that exercises the aifw_core session API:
//! mask two inputs with hand-written NER entities, restore each one from its
//! own mask metadata, and check that the restored text equals the input.

const std = @import("std");
const core = @import("aifw_core");

const Language = core.Language;

/// Entry point: runs the single test and calls `aifw_shutdown` on exit,
/// whether the test succeeded or returned an error.
pub fn main() !void {
    defer core.aifw_shutdown();

    try test_session_mask_and_restore_with_meta();
}

/// Mask a Chinese and an English input in the same session, then restore
/// both from their separately returned metadata.
///
/// Returns `error.TestFailed` when session creation, masking or restoring
/// reports an error; the `std.testing.expect` calls fail when a restored
/// pointer is null or a restored text differs from its input.
fn test_session_mask_and_restore_with_meta() !void {
    // Presumably enables every mask category — confirm against MaskConfig.
    const session = core.aifw_session_create(&.{
        .mask_config = core.MaskConfig.getEnableAllMaskConfig(),
        .ner_recog_type = .token_classification,
    });
    if (@intFromPtr(session) == 0) {
        std.log.err("failed to create session\n", .{});
        return error.TestFailed;
    }
    defer core.aifw_session_destroy(session);

    // Earlier English variant of the first input, kept for reference:
    // const input1 = "Hi, my email is example.test@funstory.com, my phone number is 13800138027, my name is John Doe";
    // const ner_entities1 = [_]core.NerRecognizer.NerRecogEntity{
    //     .{ .entity_type = .USER_NAME, .entity_tag = .Begin, .score = 0.98, .index = 14, .start = 86, .end = 90 },
    //     .{ .entity_type = .USER_NAME, .entity_tag = .Inside, .score = 0.98, .index = 15, .start = 91, .end = 94 },
    // };
    const input1 = "我的家庭住址:成都市高新区天府大道100号";
    // NOTE(review): start/end look like byte offsets into the UTF-8 input —
    // confirm against NerRecogEntity.
    const ner_entities1 = [_]core.NerRecognizer.NerRecogEntity{
        .{ .entity_type = .PHYSICAL_ADDRESS, .entity_tag = .Begin, .score = 0.98, .index = 6, .start = 21, .end = 30 },
        .{ .entity_type = .PHYSICAL_ADDRESS, .entity_tag = .Begin, .score = 0.98, .index = 16, .start = 51, .end = 57 },
    };
    // Both out-parameters are written by the mask call below.
    var masked_text1: [*:0]u8 = undefined;
    var mask_meta_data1: *anyopaque = undefined;
    var err_no = core.aifw_session_mask_and_out_meta(
        session,
        input1,
        &ner_entities1,
        ner_entities1.len,
        @intFromEnum(Language.zh),
        &masked_text1,
        &mask_meta_data1,
    );
    if (err_no != 0) {
        std.log.err("failed to mask, error={s}\n", .{core.getErrorString(err_no)});
        return error.TestFailed;
    }
    defer core.aifw_string_free(masked_text1);

    // Second input is masked before the first one is restored, so each
    // restore has to rely on its own metadata.
    const input2 = "Contact me: a.b+1@test.io and visit https://ziglang.org, my name is John Doe.";
    const ner_entities2 = [_]core.NerRecognizer.NerRecogEntity{
        .{ .entity_type = .USER_NAME, .entity_tag = .Begin, .score = 0.98, .index = 10, .start = 68, .end = 77 },
    };
    var masked_text2: [*:0]u8 = undefined;
    var mask_meta_data2: *anyopaque = undefined;
    err_no = core.aifw_session_mask_and_out_meta(
        session,
        input2,
        &ner_entities2,
        ner_entities2.len,
        @intFromEnum(Language.en),
        &masked_text2,
        &mask_meta_data2,
    );
    if (err_no != 0) {
        std.log.err("failed to mask, error={s}\n", .{core.getErrorString(err_no)});
        return error.TestFailed;
    }
    defer core.aifw_string_free(masked_text2);

    // Declared `allowzero` so the pointer can be checked for null before it
    // is cast to a regular sentinel-terminated pointer.
    var restored_text1: [*:0]allowzero u8 = undefined;
    err_no = core.aifw_session_restore_with_meta(
        session,
        masked_text1,
        mask_meta_data1,
        &restored_text1,
    );
    if (err_no != 0) {
        std.log.err("failed to restore, error={s}\n", .{core.getErrorString(err_no)});
        return error.TestFailed;
    }
    try std.testing.expect(@intFromPtr(restored_text1) != 0);
    const restored_text1_nonzero = @as([*:0]u8, @ptrCast(restored_text1));
    defer core.aifw_string_free(@as([*:0]u8, @ptrCast(restored_text1_nonzero)));
    std.debug.print("input_text1={s}\n", .{input1});
    std.debug.print("masked_text1={s}\n", .{masked_text1});
    std.debug.print("restored_text1={s}\n", .{restored_text1_nonzero});
    // Round trip must reproduce the input byte for byte.
    try std.testing.expect(std.mem.eql(u8, std.mem.span(restored_text1_nonzero), input1));

    var restored_text2: [*:0]allowzero u8 = undefined;
    err_no = core.aifw_session_restore_with_meta(
        session,
        masked_text2,
        mask_meta_data2,
        &restored_text2,
    );
    if (err_no != 0) {
        std.log.err("failed to restore, error={s}\n", .{core.getErrorString(err_no)});
        return error.TestFailed;
    }
    try std.testing.expect(@intFromPtr(restored_text2) != 0);
    const restored_text2_nonzero = @as([*:0]u8, @ptrCast(restored_text2));
    defer core.aifw_string_free(@as([*:0]u8, @ptrCast(restored_text2_nonzero)));
    std.debug.print("input_text2={s}\n", .{input2});
    std.debug.print("masked_text2={s}\n", .{masked_text2});
    std.debug.print("restored_text2={s}\n", .{restored_text2_nonzero});
    try std.testing.expect(std.mem.eql(u8, std.mem.span(restored_text2_nonzero), input2));
}
-------------------------------------------------------------------------------- 1 | This sub‑project provides the OneAIFW Python backend and CLI, built on Presidio and LiteLLM. It exposes a FastAPI HTTP service and a simple CLI for masking/restoring text around LLM calls. 2 | 3 | ## Getting Started (py-origin) 4 | OneAIFW anonymizes sensitive data before LLM calls and restores it afterward. See the root `README.md` for global prerequisites (Zig/Rust/Node/pnpm). Below are the minimal steps to run the service, followed by demos that call its APIs. 5 | 6 | ### Clone and create venv 7 | ```bash 8 | git clone https://github.com/funstory-ai/aifw.git 9 | cd aifw 10 | cd py-origin 11 | python -m venv .venv 12 | source .venv/bin/activate # Windows: .venv\Scripts\activate 13 | ``` 14 | 15 | ### Install dependencies 16 | ```bash 17 | cd py-origin 18 | pip install -r services/requirements.txt 19 | pip install -r cli/requirements.txt 20 | python -m spacy download en_core_web_sm 21 | python -m spacy download zh_core_web_sm 22 | python -m spacy download xx_ent_wiki_sm 23 | ``` 24 | 25 | ### Prepare config and LLM API key file 26 | The default `aifw.yaml` lives in the `assets` directory; copy it to `~/.aifw/` and edit the copy to suit your setup. 27 | 28 | ```bash 29 | cd py-origin 30 | mkdir -p ~/.aifw 31 | cp assets/aifw.yaml ~/.aifw/aifw.yaml 32 | # edit ~/.aifw/aifw.yaml and set api_key_file to your LLM API key JSON 33 | ``` 34 | 35 | ### Launch HTTP server 36 | Authentication uses the standard `Authorization` header. Configure the HTTP API key via an environment variable or a CLI flag: 37 | ```bash 38 | # Env var (example key) 39 | export AIFW_HTTP_API_KEY=8H234B 40 | ``` 41 | 42 | Start the server (logs go to `~/.aifw/`): 43 | ```bash 44 | cd py-origin 45 | python -m aifw launch # add --http-api-key KEY to override env 46 | ``` 47 | You should see output like: 48 | ``` 49 | aifw is running at http://localhost:8844.
50 | logs: ~/.aifw/aifw_server-2025-08.log 51 | ``` 52 | 53 | ## CLI demos for API usage 54 | 55 | The CLI calls the HTTP server to mask PII, optionally call an LLM, and restore text. Use `--http-api-key` if you set `AIFW_HTTP_API_KEY` on the server. 56 | 57 | ```bash 58 | cd py-origin 59 | python -m aifw call "请把如下文本翻译为中文: My email address is test@example.com, and my phone number is 18744325579." 60 | # With explicit HTTP API key: 61 | python -m aifw call --http-api-key 8H234B "..." 62 | ``` 63 | 64 | You can override the LLM API key file per call using `--api-key-file`: 65 | ```bash 66 | cd py-origin 67 | python -m aifw call --api-key-file /path/to/api-keys/your-key.json "..." 68 | ``` 69 | 70 | ### Direct in-process call (no HTTP) 71 | ```bash 72 | cd py-origin 73 | python -m aifw direct_call "请把如下文本翻译为中文: My email address is test@example.com, and my phone number is 18744325579." 74 | ``` 75 | 76 | You can also switch provider dynamically per call: 77 | ```bash 78 | cd py-origin 79 | python -m aifw direct_call --api-key-file /path/to/api-keys/your-key.json "..." 
80 | ``` 81 | 82 | ### Single mask + restore (mask_text → restore_text) 83 | Call mask and then restore a single text via the HTTP APIs: 84 | 85 | ```bash 86 | # One command pipeline (mask → restore) 87 | python -m aifw mask_restore "text 1" --http-api-key 8H234B 88 | ``` 89 | 90 | ### Batch mask + restore (mask_text_batch → restore_text_batch) 91 | Mask and restore a list of texts using the batch mode interface: 92 | 93 | ```bash 94 | # One command pipeline (batch mask → batch restore) 95 | python -m aifw mask_restore_batch "text 1" "text 2" --http-api-key 8H234B 96 | ``` 97 | 98 | ### Multi mask, then one restore (many × mask_text → one × restore_text_batch) 99 | Call `mask_text` multiple times, then restore all at once: 100 | 101 | ```bash 102 | # Call mask_text individually for multiple items, then restore all at once 103 | python -m aifw multi_mask_one_restore "text 1" "text 2" --http-api-key 8H234B 104 | ``` 105 | 106 | ### Stop the server 107 | ```bash 108 | cd py-origin 109 | python -m aifw stop 110 | ``` 111 | 112 | ### API documentation 113 | 114 | See `docs/oneaifw_services_api.md` for all API interfaces, request/response formats, and curl examples. All responses include `output` and `error`. The `Authorization` header accepts either `KEY` or `Bearer KEY` formats. 115 | 116 | 117 | ## Docker images for py-origin (spaCy profiles) 118 | 119 | You can build different Docker images for the `py-origin` service with various spaCy model profiles via `--build-arg SPACY_PROFILE=...`: 120 | 121 | - `minimal` (default): en_core_web_sm, zh_core_web_sm, xx_ent_wiki_sm 122 | - `fr`: minimal + fr_core_news_sm 123 | - `de`: minimal + de_core_news_sm 124 | - `ja`: minimal + ja_core_news_sm 125 | - `multi`: minimal + fr/de/ja 126 | 127 | From the repo root: 128 | 129 | ```bash 130 | cd py-origin 131 | 132 | # Build minimal 133 | docker build -t oneaifw:minimal . 134 | 135 | # Build French / German / Japanese 136 | docker build --build-arg SPACY_PROFILE=fr -t oneaifw:fr . 
def check_api_key(authorization: Optional[str] = Header(None)):
    """Validate the caller's HTTP API key taken from the ``Authorization`` header.

    Authentication is disabled when no server key is configured (``API_KEY`` is
    falsy); every request is then accepted.

    Args:
        authorization: Raw ``Authorization`` header value. Either a bare key or
            ``Bearer <key>``; the token is extracted by ``parse_auth_header``.

    Returns:
        ``True`` when the request is authorized.

    Raises:
        HTTPException: 401 when a key is configured and the supplied token is
            missing or does not match it.
    """
    import hmac  # local import keeps this block self-contained

    if not API_KEY:
        return True
    token = parse_auth_header(authorization)
    # Constant-time comparison so response timing does not reveal how much of
    # the key a guess got right. Bytes are compared because compare_digest
    # rejects non-ASCII str arguments.
    authorized = token is not None and hmac.compare_digest(
        token.encode("utf-8"), API_KEY.encode("utf-8")
    )
    if not authorized:
        # Never write the supplied token or the configured key to the log:
        # anyone able to read the log file would obtain valid credentials.
        logger.error(
            "check_api_key: unauthorized request (Authorization header %s)",
            "present" if authorization else "missing",
        )
        raise HTTPException(status_code=401, detail="Unauthorized")
    return True
@app.post("/api/mask_text_batch")
async def api_mask_text_batch(inp_array: List[MaskIn], authorization: Optional[str] = Header(None)):
    """Mask every item of a batch and return the results in request order.

    Each item is passed to ``api.mask_text`` with its own ``text`` and
    ``language``. The response is the usual envelope: ``output`` holds the list
    of per-item results and ``error`` is ``None``. If any item raises, the
    failure is logged and the whole batch is reported through ``error``.
    """
    check_api_key(authorization)
    try:
        masked_items = [
            api.mask_text(text=item.text, language=item.language)
            for item in inp_array
        ]
    except Exception as exc:
        logger.exception("/api/mask_text_batch failed")
        return {"output": None, "error": {"message": str(exc), "code": None}}
    return {"output": masked_items, "error": None}
def check_api_key(authorization: Optional[str] = Header(None)):
    """Validate the caller's HTTP API key taken from the ``Authorization`` header.

    Authentication is disabled when no server key is configured (``API_KEY`` is
    falsy); every request is then accepted.

    Args:
        authorization: Raw ``Authorization`` header value. Either a bare key or
            ``Bearer <key>``; the token is extracted by ``parse_auth_header``.

    Returns:
        ``True`` when the request is authorized.

    Raises:
        HTTPException: 401 when a key is configured and the supplied token is
            missing or does not match it.
    """
    import hmac  # local import keeps this block self-contained

    if not API_KEY:
        return True
    token = parse_auth_header(authorization)
    # Constant-time comparison so response timing does not reveal how much of
    # the key a guess got right. Bytes are compared because compare_digest
    # rejects non-ASCII str arguments.
    authorized = token is not None and hmac.compare_digest(
        token.encode("utf-8"), API_KEY.encode("utf-8")
    )
    if not authorized:
        # Never write the supplied token or the configured key to the log:
        # anyone able to read the log file would obtain valid credentials.
        logger.error(
            "check_api_key: unauthorized request (Authorization header %s)",
            "present" if authorization else "missing",
        )
        raise HTTPException(status_code=401, detail="Unauthorized")
    return True
@app.post("/api/call")
async def api_call(inp: CallIn, authorization: Optional[str] = Header(None)):
    """Run the anonymize -> LLM -> restore flow for one text.

    The LLM key file is taken from the request (``apiKeyFile``) and falls back
    to the ``AIFW_API_KEY_FILE`` environment variable. Before the call, monthly
    log cleanup is triggered using ``AIFW_LOG_FILE`` and
    ``AIFW_LOG_MONTHS_TO_KEEP`` (default 6).

    Returns:
        The response envelope: ``{"output": {"text": ...}, "error": None}`` on
        success, ``{"output": None, "error": {...}}`` when the call fails.

    Raises:
        HTTPException: 401 from ``check_api_key`` when the request is not
            authorized.
    """
    check_api_key(authorization)
    chosen_key_file = inp.apiKeyFile or os.environ.get("AIFW_API_KEY_FILE")

    # Server-side monthly log cleanup based on env config.
    try:
        months = int(os.environ.get("AIFW_LOG_MONTHS_TO_KEEP", "6"))
    except ValueError:
        # A non-numeric setting falls back to the default retention.
        months = 6
    try:
        cleanup_monthly_logs(os.environ.get("AIFW_LOG_FILE"), months)
    except Exception:
        # Housekeeping must not fail the request or bypass the response
        # envelope; record the problem and carry on with the call.
        logger.exception("/api/call: monthly log cleanup failed")

    try:
        out = api.call(
            text=inp.text,
            api_key_file=chosen_key_file,
            model=inp.model,
            temperature=inp.temperature or 0.0,
        )
        return {"output": {"text": out}, "error": None}
    except Exception as e:
        logger.exception("/api/call failed")
        return {"output": None, "error": {"message": str(e), "code": None}}
| check_api_key(authorization) 124 | try: 125 | res_array = [] 126 | for inp in inp_array: 127 | res_array.append(api.mask_text(text=inp.text, language=inp.language)) 128 | return {"output": res_array, "error": None} 129 | except Exception as e: 130 | logger.exception("/api/mask_text_batch failed") 131 | return {"output": None, "error": {"message": str(e), "code": None}} 132 | 133 | 134 | @app.post("/api/restore_text_batch") 135 | async def api_restore_text_batch(inp_array: List[RestoreIn], authorization: Optional[str] = Header(None)): 136 | check_api_key(authorization) 137 | try: 138 | restored_array = [] 139 | for inp in inp_array: 140 | restored = api.restore_text(text=inp.text, mask_meta=inp.maskMeta) 141 | restored_array.append({"text": restored}) 142 | return {"output": restored_array, "error": None} 143 | except Exception as e: 144 | logger.exception("/api/restore_text_batch failed") 145 | return {"output": None, "error": {"message": str(e), "code": None}} -------------------------------------------------------------------------------- /tools/gen_assets_sha3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Generate SHA3-256 hashes for model_quantized.onnx files under OneAIFW-Assets/models 5 | and ORT wasm files under OneAIFW-Assets/wasm. Output a JSON manifest to the 6 | project's assets directory. 7 | 8 | Usage: 9 | python tools/gen_assets_sha3.py --assets ../OneAIFW-Assets --out assets/oneaifw_assets_hashes.json 10 | 11 | If --assets is omitted, defaults to ../OneAIFW-Assets relative to this script. 12 | If --out is omitted, defaults to /assets/oneaifw_assets_hashes.json. 
def sha3_256_hex_prefixed(file_path: str) -> str:
    """Return the SHA3-256 digest of a file as a ``0x``-prefixed hex string.

    The file is read in 1 MiB chunks so large model files are never loaded
    into memory at once.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    h = hashlib.sha3_256()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            h.update(chunk)
    return '0x' + h.hexdigest()


def read_version_from_hello(assets_root: str) -> str:
    """Return the ``version`` field of ``<assets_root>/hello.json`` as a string.

    Returns an empty string when the file is missing, unreadable, not valid
    JSON, not a JSON object, or has no (or a null) ``version`` entry.
    """
    hello_path = os.path.join(assets_root, 'hello.json')
    if not os.path.isfile(hello_path):
        return ''
    try:
        with open(hello_path, 'r', encoding='utf-8') as f:
            obj = json.load(f)
    except (OSError, ValueError, RecursionError):
        # Unreadable file, malformed JSON / bad encoding (both ValueError
        # subclasses), or pathologically nested JSON: treat as "no version".
        return ''
    if not isinstance(obj, dict):
        # A top-level list/number/string has no "version" key to read.
        return ''
    v = obj.get('version')
    return str(v) if v is not None else ''


def collect_model_hashes(models_root: str) -> Dict[str, Dict[str, str]]:
    """Hash every quantized ONNX model found under ``models_root``.

    Scans a layout like::

        <models_root>/<org>/<model>/onnx/model_quantized.onnx

    Returns:
        ``{"<org>/<model>": {"onnx/model_quantized.onnx": "0x..."}, ...}``.
        Models without that file are omitted, a file that cannot be hashed is
        reported on stderr and skipped, and a missing ``models_root`` yields an
        empty dict.
    """
    result: Dict[str, Dict[str, str]] = {}
    if not os.path.isdir(models_root):
        return result
    # Sorted traversal keeps the processing order (and warnings) deterministic.
    for org in sorted(os.listdir(models_root)):
        org_dir = os.path.join(models_root, org)
        if not os.path.isdir(org_dir):
            continue
        for model in sorted(os.listdir(org_dir)):
            model_dir = os.path.join(org_dir, model)
            if not os.path.isdir(model_dir):
                continue
            onnx_path = os.path.join(model_dir, 'onnx', 'model_quantized.onnx')
            if not os.path.isfile(onnx_path):
                continue
            try:
                digest = sha3_256_hex_prefixed(onnx_path)
            except Exception as e:
                print(f'[WARN] hash failed for {onnx_path}: {e}', file=sys.stderr)
                continue
            result[f'{org}/{model}'] = {'onnx/model_quantized.onnx': digest}
    return result
} 78 | """ 79 | result: Dict[str, str] = {} 80 | if not os.path.isdir(wasm_root): 81 | return result 82 | for name in sorted(os.listdir(wasm_root)): 83 | if not name.endswith('.wasm'): 84 | continue 85 | p = os.path.join(wasm_root, name) 86 | if os.path.isfile(p): 87 | try: 88 | digest = sha3_256_hex_prefixed(p) 89 | except Exception as e: 90 | print(f'[WARN] hash failed for {p}: {e}', file=sys.stderr) 91 | continue 92 | result[name] = digest 93 | return result 94 | 95 | 96 | def main() -> int: 97 | script_dir = os.path.dirname(os.path.abspath(__file__)) 98 | project_root = os.path.abspath(os.path.join(script_dir, '..')) 99 | default_assets_dir = os.path.abspath(os.path.join(project_root, '..', 'OneAIFW-Assets')) 100 | default_out = os.path.join(project_root, 'assets', 'oneaifw_assets_hashes.json') 101 | 102 | parser = argparse.ArgumentParser(description='Generate SHA3-256 manifest for OneAIFW-Assets resources.') 103 | parser.add_argument('--assets', type=str, default=default_assets_dir, help='Path to OneAIFW-Assets directory') 104 | parser.add_argument('--out', type=str, default=default_out, help='Output JSON file path (under project assets)') 105 | args = parser.parse_args() 106 | 107 | assets_root = os.path.abspath(args.assets) 108 | models_root = os.path.join(assets_root, 'models') 109 | wasm_root = os.path.join(assets_root, 'wasm') 110 | out_path = os.path.abspath(args.out) 111 | 112 | if not os.path.isdir(assets_root): 113 | print(f'[ERROR] assets root not found: {assets_root}', file=sys.stderr) 114 | return 2 115 | 116 | version = read_version_from_hello(assets_root) 117 | models_hashes = collect_model_hashes(models_root) 118 | wasm_hashes = collect_wasm_hashes(wasm_root) 119 | 120 | manifest: Dict[str, Any] = { 121 | 'source': assets_root, 122 | 'version': version, 123 | 'models': models_hashes, 124 | 'wasm': wasm_hashes, 125 | } 126 | 127 | os.makedirs(os.path.dirname(out_path), exist_ok=True) 128 | with open(out_path, 'w', encoding='utf-8') as f: 129 | 
class OneAIFWAPI:
    """In-process facade for the anonymize -> LLM -> restore pipeline.

    Meant for local callers (UI/CLI) and for the HTTP server that wraps it.
    Public entry points are ``config``, ``mask_text``, ``restore_text`` and
    ``call``; the underscore-prefixed helpers are internal.
    """

    def __init__(self):
        self._analyzer_wrapper = AnalyzerWrapper()
        self._anonymizer_wrapper = AnonymizerWrapper(self._analyzer_wrapper)
        self._llm = LLMClient()
        # Start from an empty maskConfig override so behavior is stable.
        self.config(mask_config={})

    def config(self, mask_config: Dict[str, Any]) -> None:
        """Apply a runtime mask configuration to this instance.

        Used by the HTTP endpoint POST /api/config and by the local CLI
        direct_call mode. A falsy ``mask_config`` is applied as ``{}``. Errors
        raised while applying it are swallowed on purpose, so callers never
        crash on a bad configuration.
        """
        try:
            self._anonymizer_wrapper.set_mask_config(mask_config or {})
        except Exception:
            # Best effort: configuration problems must not propagate.
            pass

    # ---- internal helpers (not for external exposure) ----
    def _analyze(self, text: str, language: str = "en") -> List[EntitySpan]:
        """Delegate entity analysis to the analyzer wrapper."""
        return self._analyzer_wrapper.analyze(text=text, language=language)

    def _anonymize(
        self,
        text: str,
        operators: Optional[Dict[str, Dict[str, Any]]] = None,
        language: str = "en",
    ) -> Dict[str, Any]:
        """Delegate anonymization to the anonymizer wrapper."""
        return self._anonymizer_wrapper.anonymize(
            text=text, operators=operators, language=language
        )

    def _restore(self, text: str, placeholders_map: Dict[str, str]) -> str:
        """Delegate placeholder restoration to the anonymizer wrapper."""
        return self._anonymizer_wrapper.restore(text=text, placeholders_map=placeholders_map)

    # ---- public API ----
    def mask_text(self, text: str, language: Optional[str] = None) -> Dict[str, Any]:
        """Mask PII in ``text`` and return what is needed to undo it.

        When ``language`` is not given it is detected from the text.

        Returns:
            ``{"text": <masked text>, "maskMeta": <str>}`` where ``maskMeta`` is
            the base64 (ASCII) encoding of the UTF-8 JSON placeholders map.
        """
        if not language:
            language = self._analyzer_wrapper.detect_language(text)
        anon = self._anonymizer_wrapper.anonymize(text=text, operators=None, language=language)
        placeholders = anon.get("placeholdersMap", {}) or {}
        payload = json.dumps(placeholders, ensure_ascii=False).encode("utf-8")
        return {
            "text": anon["text"],
            "maskMeta": base64.b64encode(payload).decode("ascii"),
        }

    def restore_text(self, text: str, mask_meta: Any) -> str:
        """Put original values back into ``text`` using ``mask_meta``.

        ``bytes``/``bytearray`` metadata is parsed directly as UTF-8 JSON; any
        other value is converted with ``str()`` and base64-decoded first. If
        decoding or parsing fails, or the JSON is not an object, an empty
        placeholders map is used for the restore call.
        """
        placeholders_map: Dict[str, Any] = {}
        try:
            if isinstance(mask_meta, (bytes, bytearray)):
                raw = bytes(mask_meta)
            else:
                raw = base64.b64decode(str(mask_meta), validate=False)
            parsed = json.loads(raw.decode("utf-8"))
            if isinstance(parsed, dict):
                placeholders_map = parsed
        except Exception:
            placeholders_map = {}
        return self._anonymizer_wrapper.restore(text=text, placeholders_map=placeholders_map)

    def call(
        self,
        text: str,
        api_key_file: Optional[str] = None,
        model: Optional[str] = None,
        temperature: float = 0.0,
    ) -> str:
        """Anonymize ``text``, send it to the LLM, and restore the reply.

        The model is ``model`` if given, otherwise the ``model`` entry of the
        configuration loaded from ``api_key_file`` (when one is provided),
        otherwise ``None``.
        """
        detected = self._analyzer_wrapper.detect_language(text)

        # 1) anonymize input
        anon = self._anonymizer_wrapper.anonymize(text=text, operators=None, language=detected)
        masked = anon["text"]
        mapping = anon["placeholdersMap"]

        # 2) load LLM config if a key file was provided
        cfg = load_llm_api_config(api_key_file) if api_key_file else {"model": None}

        # 3) LLM call (no source language hint; anonymized text is sent as-is)
        reply = self._llm.call(
            text=masked,
            model=model or cfg.get("model") or None,
            temperature=temperature,
        )

        # 4) restore placeholders back to original values
        return self._anonymizer_wrapper.restore(text=reply, placeholders_map=mapping)
regex_automata::meta::{Builder, Regex}; 9 | use regex_automata::util::syntax; // syntax::parse 10 | 11 | // -------- minimal bump allocator (no dealloc; enough for compile/match) -------- 12 | use core::alloc::{GlobalAlloc, Layout}; 13 | use core::sync::atomic::{AtomicUsize, Ordering}; 14 | 15 | struct BumpAlloc; 16 | const HEAP_SIZE: usize = 4 * 1024 * 1024; 17 | static mut HEAP: [u8; HEAP_SIZE] = [0; HEAP_SIZE]; 18 | static OFF: AtomicUsize = AtomicUsize::new(0); 19 | 20 | unsafe impl GlobalAlloc for BumpAlloc { 21 | unsafe fn alloc(&self, layout: Layout) -> *mut u8 { 22 | let align = layout.align(); 23 | let size = layout.size(); 24 | let mut off = OFF.load(Ordering::Relaxed); 25 | let base = core::ptr::addr_of_mut!(HEAP) as usize; 26 | loop { 27 | let aligned = (base + off + (align - 1)) & !(align - 1); 28 | let new_off = aligned + size - base; 29 | if new_off > HEAP_SIZE { return core::ptr::null_mut(); } 30 | match OFF.compare_exchange(off, new_off, Ordering::SeqCst, Ordering::Relaxed) { 31 | Ok(_) => return aligned as *mut u8, 32 | Err(o) => off = o, 33 | } 34 | } 35 | } 36 | unsafe fn dealloc(&self, _ptr: *mut u8, _layout: Layout) { 37 | // no-op (bump) 38 | } 39 | } 40 | 41 | #[global_allocator] 42 | static GLOBAL: BumpAlloc = BumpAlloc; 43 | 44 | #[panic_handler] 45 | fn panic_handler(_: &core::panic::PanicInfo) -> ! { 46 | // Use abort to end the program for wasm/unix. 47 | loop { 48 | #[cfg(target_arch = "wasm32")] 49 | core::arch::wasm32::unreachable(); 50 | #[cfg(not(target_arch = "wasm32"))] 51 | core::hint::spin_loop(); 52 | } 53 | } 54 | 55 | // ---------------------- C ABI ---------------------- 56 | 57 | #[repr(C)] 58 | pub struct AifwRegex { 59 | re: Regex, 60 | } 61 | 62 | /// Compile the regular expression. 63 | /// Returns a handle; returns null on failure. 
64 | #[no_mangle] 65 | pub extern "C" fn aifw_regex_compile(pattern: *const c_char) -> *mut AifwRegex { 66 | if pattern.is_null() { return core::ptr::null_mut(); } 67 | 68 | // compute C string length 69 | let len = unsafe { 70 | let mut l = 0usize; 71 | while *pattern.add(l) != 0 { l += 1; } 72 | l 73 | }; 74 | let bytes = unsafe { slice::from_raw_parts(pattern as *const u8, len) }; 75 | let p = match str::from_utf8(bytes) { 76 | Ok(s) => s, 77 | Err(_) => return core::ptr::null_mut() 78 | }; 79 | 80 | let hir = match syntax::parse(p) { 81 | Ok(h) => h, 82 | Err(_) => return core::ptr::null_mut(), 83 | }; 84 | let re = match Builder::new().build_from_hir(&hir) { 85 | Ok(r) => r, 86 | Err(_) => return core::ptr::null_mut(), 87 | }; 88 | Box::into_raw(Box::new(AifwRegex { re })) 89 | } 90 | 91 | #[no_mangle] 92 | pub extern "C" fn aifw_regex_free(ptr_re: *mut AifwRegex) { 93 | if !ptr_re.is_null() { 94 | unsafe { drop(Box::from_raw(ptr_re)); } 95 | } 96 | } 97 | 98 | /// Find a match in the haystack. 99 | /// Returns 1 if a match was found, 0 if not, and < 0 on error. 100 | #[no_mangle] 101 | pub extern "C" fn aifw_regex_find( 102 | ptr_re: *mut AifwRegex, 103 | hay_ptr: *const c_uchar, 104 | hay_len: c_ulong, 105 | start: c_ulong, 106 | out_start: *mut c_ulong, 107 | out_end: *mut c_ulong, 108 | ) -> c_int { 109 | if ptr_re.is_null() || hay_ptr.is_null() || out_start.is_null() || out_end.is_null() { 110 | return -1; 111 | } 112 | let re = unsafe { &*ptr_re }; 113 | let hay = unsafe { slice::from_raw_parts(hay_ptr as *const u8, hay_len as usize) }; 114 | let s = core::cmp::min(start as usize, hay.len()); 115 | let sub = &hay[s..]; 116 | match re.re.find(sub) { 117 | Some(m) => { 118 | unsafe { 119 | *out_start = (s + m.start()) as c_ulong; 120 | *out_end = (s + m.end()) as c_ulong; 121 | } 122 | 1 123 | } 124 | None => 0, 125 | } 126 | } 127 | 128 | /// Find a specific capture group span in the haystack for the first match at or after `start`. 
129 | /// Returns 1 if a match with the requested group was found, 0 if not, and < 0 on error. 130 | #[no_mangle] 131 | pub extern "C" fn aifw_regex_find_group( 132 | ptr_re: *mut AifwRegex, 133 | hay_ptr: *const c_uchar, 134 | hay_len: c_ulong, 135 | start: c_ulong, 136 | group_index: c_ulong, 137 | out_start: *mut c_ulong, 138 | out_end: *mut c_ulong, 139 | ) -> c_int { 140 | if ptr_re.is_null() || hay_ptr.is_null() || out_start.is_null() || out_end.is_null() { 141 | return -1; 142 | } 143 | let re = unsafe { &*ptr_re }; 144 | let hay = unsafe { slice::from_raw_parts(hay_ptr as *const u8, hay_len as usize) }; 145 | let s = core::cmp::min(start as usize, hay.len()); 146 | let sub = &hay[s..]; 147 | let g = group_index as usize; 148 | let mut caps = re.re.create_captures(); 149 | re.re.captures(sub, &mut caps); 150 | match caps.get_group(0) { 151 | Some(_) => { 152 | match caps.get_group(g) { 153 | Some(m) => { 154 | unsafe { 155 | *out_start = (s + m.start) as c_ulong; 156 | *out_end = (s + m.end) as c_ulong; 157 | } 158 | 1 159 | } 160 | None => 0, 161 | } 162 | } 163 | None => 0, 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /web/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | AIFW Web Module 4 | Provides a web interface for the AIFW project with masking functionality. 
def initialize_aifw():
    """Create the module-level ``aifw_api`` instance.

    Returns:
        ``True`` when ``aifw_api`` was created; ``False`` when the
        ``OneAIFWAPI`` import failed earlier or its construction raised
        (the error is printed).
    """
    global aifw_api
    try:
        if OneAIFWAPI:
            aifw_api = OneAIFWAPI()
            return True
    except Exception as e:
        print(f"Error initializing AIFW: {e}")
    # Reached when OneAIFWAPI is unavailable or construction failed; report
    # failure explicitly instead of falling through with an implicit None.
    return False
| "placeholders_map": result["maskMeta"], 76 | "language": language 77 | }) 78 | 79 | except Exception as e: 80 | return jsonify({"error": f"Anonymization failed: {str(e)}"}), 500 81 | 82 | 83 | @app.route('/api/config', methods=['POST']) 84 | def update_config(): 85 | """API endpoint to update AIFW mask configuration.""" 86 | try: 87 | data = request.get_json() or {} 88 | mask_config = data.get('mask_config') or {} 89 | if not isinstance(mask_config, dict): 90 | return jsonify({"error": "'mask_config' must be an object"}), 400 91 | 92 | if not aifw_api or not hasattr(aifw_api, "config"): 93 | return jsonify({"error": "AIFW API not available"}), 500 94 | 95 | aifw_api.config(mask_config) 96 | return jsonify({"status": "ok"}) 97 | except Exception as e: 98 | return jsonify({"error": f"Config update failed: {str(e)}"}), 500 99 | 100 | @app.route('/api/restore', methods=['POST']) 101 | def restore_text(): 102 | """API endpoint to restore anonymized text""" 103 | try: 104 | data = request.get_json() 105 | if not data or 'text' not in data or 'placeholders_map' not in data: 106 | return jsonify({"error": "Missing 'text' or 'placeholders_map' field"}), 400 107 | 108 | text = data['text'] 109 | # For new aifw-py flow, placeholders_map actually carries base64-encoded maskMeta bytes 110 | mask_meta_b64 = data['placeholders_map'] 111 | 112 | # Perform restoration via OneAIFWAPI (expects base64 or raw bytes) 113 | restored_text = aifw_api.restore_text(text=text, mask_meta=mask_meta_b64) 114 | 115 | return jsonify({ 116 | "anonymized_text": text, 117 | "restored_text": restored_text, 118 | "placeholders_map": mask_meta_b64 119 | }) 120 | 121 | except Exception as e: 122 | return jsonify({"error": f"Restoration failed: {str(e)}"}), 500 123 | 124 | @app.route('/api/analyze', methods=['POST']) 125 | def analyze_text(): 126 | """API endpoint to analyze text for PII entities""" 127 | try: 128 | data = request.get_json() 129 | if not data or 'text' not in data: 130 | return 
jsonify({"error": "Missing 'text' field"}), 400 131 | 132 | text = data['text'] 133 | language = data.get('language', 'auto') 134 | 135 | if not aifw_api: 136 | return jsonify({"error": "AIFW API not available"}), 500 137 | 138 | # Perform analysis via OneAIFWAPI get_pii_entities 139 | entities = aifw_api.get_pii_entities(text=text, language=language) 140 | return jsonify({ 141 | "text": text, 142 | "language": language, 143 | "entities": entities 144 | }) 145 | 146 | except Exception as e: 147 | return jsonify({"error": f"Analysis failed: {str(e)}"}), 500 148 | 149 | @app.route('/api/call', methods=['POST']) 150 | def call_llm(): 151 | """API endpoint to call LLM with anonymization""" 152 | try: 153 | data = request.get_json() 154 | if not data or 'text' not in data: 155 | return jsonify({"error": "Missing 'text' field"}), 400 156 | 157 | text = data['text'] 158 | api_key_file = data.get('api_key_file') 159 | model = data.get('model') 160 | temperature = data.get('temperature', 0.0) 161 | 162 | if not aifw_api: 163 | return jsonify({"error": "AIFW API not available"}), 500 164 | 165 | # Call AIFW API 166 | result = aifw_api.call( 167 | text=text, 168 | api_key_file=api_key_file, 169 | model=model, 170 | temperature=temperature 171 | ) 172 | 173 | return jsonify({ 174 | "original_text": text, 175 | "result": result, 176 | "model": model, 177 | "temperature": temperature 178 | }) 179 | 180 | except Exception as e: 181 | return jsonify({"error": f"LLM call failed: {str(e)}"}), 500 182 | 183 | if __name__ == '__main__': 184 | print("Starting AIFW Web Module...") 185 | print(f"AIFW API available: {aifw_api is not None}") 186 | 187 | app.run(debug=False, host='0.0.0.0', port=5001) 188 | -------------------------------------------------------------------------------- /docs/zh_address_design.md: -------------------------------------------------------------------------------- 1 | ## 中文地址识别(优先级与位图驱动)的设计方案 2 | 3 | ### 1. 
目标与范围 4 | - **目标**:以 NER 输出的 `PHYSICAL_ADDRESS` 片段为种子,结合正则/启发式对“中文地址 token”进行类型化识别与“按优先级顺序拼接”,得到完整、边界正确且可控的中文地址;并满足“语言门控,仅在中文启用”。 5 | - **核心思想**:将地址拆解为有序的“token 类型层级”,用一个 `u32` 的位图表示一个地址片段当前已经覆盖的层级集合;左右扩展仅在“邻接层级”之间发生,跨层级跳跃被拒绝;同层级的互斥类型不共存(用于切断错误粘连)。 6 | - **隐私判定**:仅当片段的“最低层级位”达到或低于“隐私阈值(道路门牌号及其之后)”时,片段才被视为可掩码的“私人地址”。 7 | 8 | 9 | ### 2. 地址 token 类型与优先级层级(从左到右) 10 | 按照“从宏观到细化”的真实书写顺序定义层级(左高右低),每个层级对应一个 bit(最高位=最大优先次序,最低位=最小优先次序)。 11 | 12 | 建议使用 11 个层级(可扩展),自高到低如下: 13 | 14 | - L11 国家/地区:如“中国、中华人民共和国、中国大陆、台湾、香港、澳门、英国、美国、日本”等(含常见别称/简称;支持简繁) 15 | - L10 行政-省级:如“浙江省、四川省、特别行政区、自治区、自治州、州、盟、地区” 16 | - L9 行政-市级:如“杭州市、成都市、上海市” 17 | - L8 行政-区/县级(互斥组):如“浦东新区、越秀区、西湖区、红安县、××县、××旗” 18 | - L7 街乡镇里村:如“街道、镇、乡、里、村、/开发区/经济技术开发区等功能区” 19 | - L6 道路:如“路、街、道、巷、弄、里、胡同、段、期、大道、大街、环路、环线”等后缀的道路名 20 | - L5 道路门牌号(隐私阈值起点):如“88号、100号、501号、之3、-2(楼号/扩展号)” 21 | - L4 小区/POI(地点):如“花园、广场、中心、数码港、大厦、园、城、座、馆、廊、坊、府、湾”等 POI 词尾 22 | - L3 楼宇/楼栋/楼座(Building Block):如“号楼、栋、幢、座、B座、C栋、号館/號樓/館/樓”等 23 | - L2 楼层(Floor):如“18层、3层、3樓、F3(英文字母可选)” 24 | - L1 单元/房号(Unit/Room):如“单元、室、房、1403室、806房、401室” 25 | 26 | 说明: 27 | - L8 为互斥组,区(区级)与县(县级/旗级)不能同时出现(同一地址仅一种)。 28 | - “开发区/经济技术开发区”归入 L7(街乡镇里村层级),因为其语义在区/县之后、道路之前,作为功能性行政/片区层级。 29 | - “楼/楼层”的语义歧义处理: 30 | - 出现在“号楼/号館/號樓/××栋/××幢/××座”结构中作为 L3(楼宇); 31 | - 出现在“数字+层/樓”结构中作为 L2(楼层)。 32 | 33 | 34 | ### 3. 
位图表示(`u32 addr_priorities`) 35 | - 每个层级分配一个 bit 位,最高位对应 L11,最低位对应 L1。映射规范如下: 36 | - bit18 = L11(国家/地区) 37 | - bit17 = L10(省) 38 | - bit16 = L9(市) 39 | - bit15 = L8(区/县/旗) 40 | - bit14 = L7(街/镇/乡/里/村/功能区) 41 | - bit13 = L6(道路) 42 | - bit12 = L5(门牌号) 43 | - bit11 = L4(小区/POI) 44 | - bit10 = L3(楼宇/楼栋/座) 45 | - bit9 = L2(楼层) 46 | - bit8 = L1(单元/房号) 47 | - bit7 = 保留(reserved), bit0 - bit7 都是保留位 48 | - 对任何一个地址片段,`addr_priorities` 是它已覆盖层级的“并集”。例如: 49 | - “上海市浦东新区” → L9+L8(bit16+bit15) 50 | - “银城中路501号” → L6+L5(bit13+bit12) 51 | - “陆家嘴金融广场18层” → L4+L2(bit11+bit9) 52 | 53 | 辅助操作: 54 | - `highest_bit(addr_priorities)`:返回当前集合中最高层级(最左)bit。 55 | - `lowest_bit(addr_priorities)`:返回当前集合中最低层级(最右)bit。 56 | - `is_adjacent(higher, lower)`:判断两个层级是否相邻(如 L9 与 L8、L6 与 L5)。 57 | 58 | 59 | ### 4. 同层级互斥 60 | - 互斥组:L8(区/县/旗)。识别到一个 L8 后,禁止再把另一个 L8 并进同一地址片段。 61 | - 可扩展互斥:将来需要时,可将同层的不同细分类(例如同一地址中不能出现两个不同的道路 L6)设为互斥;当前需至少确保 L8 互斥即可避免“两个地址粘连”。 62 | 63 | 64 | ### 5. Token 识别方式(正则/规则) 65 | 为保证可维护性与性能,采用“小而明确”的规则集合,而非一个超大正则: 66 | - 国家/地区(L11):匹配常见国家/地区名称及别称(如“中国/中华人民共和国/中国大陆/台湾/香港/澳门/英国/美国/日本”等),支持简繁与常见简称。 67 | - 行政后缀(L10/L9/L8/L7):匹配明确后缀词表(简繁体兼容)。 68 | - 道路(L6):匹配“通用名段 + 道路后缀词表”,允许轻分隔(空格、逗号)。 69 | - 门牌号(L5):匹配“数字 + 号/號”,可选 “之数字”“-数字”。 70 | - POI(L4):匹配通用 POI 词尾词表。 71 | - 楼宇(L3):匹配“数字 + 号楼/栋/幢/座/号館/號樓/館/樓/楼”等;支持英文字母楼座(如“B座”)。 72 | - 楼层(L2):匹配“数字 + 层/層/樓/F数字”。 73 | - 单元/房号(L1):匹配“数字(可含-)+ 单元/室/房”。 74 | 75 | 识别策略: 76 | - 对 NER 输出的 `RecogEntity(PHYSICAL_ADDRESS)` 子串先做 token 化,得到它的 `addr_priorities`(可能包含多个层级)。 77 | - 对纯文本扫描同样采用上述规则,识别候选 token,获得类型与范围。 78 | - 避免使用复杂的 Unicode 巨集正则,尽量采用后缀词表 + 轻量匹配(或代码扫描)以规避编译/性能问题。 79 | 80 | 81 | ### 6. 
右扩/左扩的“邻接层级”拼接规则 82 | - 右扩规则:设当前片段集合最低层级为 `Lx`,候选 token 的最高层级为 `Ly`,从宏观往细化地扩展,满足任一即可右扩: 83 | - 严格相邻:`Ly = Lx - 1`。 84 | - 允许跳跃(白名单规则):在特定情形允许跳过一层。例如: 85 | - 道路 → 小区/POI(`L6 → L4`):可跳过 `L5(门牌号)`,以覆盖“未书写门牌号、直接进入小区/POI”的常见书写,如“珠海市香洲路明月花园12栋508房”。约束: 86 | - 两 token 间仅允许轻分隔(空格/逗号/换行等); 87 | - 距离上限 ≤ 4 字符(可调); 88 | - 小区/POI 需由已知 POI 词尾匹配确认。 89 | - 例如:已到 L6(道路),可右扩 L5(门牌号);或按白名单从 L6 跳到 L4;之后继续 L3/L2/L1(各自相邻)。 90 | - 左扩规则:设当前片段集合最高层级为 `Lx`,候选 token 的最低层级为 `Ly`,仅当 `Ly = Lx + 1` 时可左扩(从细化往宏观)。 91 | - 例如:已到 L6(道路),可左扩 L7(街镇),再左扩 L8(区/县),再 L9(市),再 L10(省),再 L11(国家/地区)。 92 | - 互斥校验:若候选 token 的层级与已包含层级相同且为互斥组(如 L8),则拒绝合并(认为另起一个地址)。 93 | 94 | 允许的连接符与轻分隔(统一以“字符”为计量单位): 95 | - 轻分隔:空格、Tab、换行、英文/中文逗号(`,`、`,`)等;允许出现在 token 与 token 之间。 96 | - 弱连接符:`之`、`-` 等可出现在 L5 之后的扩展中(如“之3”“-2”)。 97 | - 阈值:轻分隔/弱连接符的“距离”需在合理范围(如 ≤5 字符),超过则认为不相邻。 98 | 99 | 100 | ### 7. 扫描流程(总体管线) 101 | 1) 输入:原始文本 + NER 输出的 `RecogEntity(PHYSICAL_ADDRESS)`(按起点排序)。 102 | 2) 预处理:过滤非中文语言;仅在中文(统称 zh)启用该流程。 103 | 3) 种子选择:从左到右,取一个 `PHYSICAL_ADDRESS` 种子;对该子串 token 化,得到 `addr_priorities` 与初始范围。 104 | 4) 右扩: 105 | - 在当前范围右侧顺序查找“下一个候选”: 106 | - 文本候选:按规则识别最近的地址 token; 107 | - NER 候选:如果下一个 `PHYSICAL_ADDRESS` 片段紧邻或仅隔轻分隔,也纳入候选(先 token 化)。 108 | - 对候选:验证右扩邻接层级 + 互斥 + 距离阈值 + “重地址前瞻阻断”(见 §8),通过则合并并更新集合与范围,继续右扩;否则停止。 109 | - 隐私阈值:右扩完成后,若 `addr_priorities` 的“最低层级位” ≤ L5(门牌号)则视为达到隐私阈值; 110 | - 例外:若出现 “POI + 楼层/房号”(如 L4+L2/L1)且 POI 在合理距离内,也视为隐私地址(覆盖“西湖数码港2号楼401室”“××广场18层”等)。 111 | 5) 左扩: 112 | - 在当前范围左侧查找候选文本 token(通常不再存在左侧 NER 地址片段,因为我们按起点单调前进); 113 | - 验证左扩邻接层级 + 互斥 + 距离阈值,合并直至不能再左扩。 114 | 6) 产出该地址片段;跳到下一个未覆盖的种子,重复步骤 3~5。 115 | 7) 冲突消解(见 §10)并输出最终掩码片段。 116 | 117 | 118 | ### 8. 边界与阻断(防过度合并) 119 | - 重地址前瞻阻断:在右扩过程中,若在近距离(如 ≤ 12 字符)内探测到“新的地址开头形态”(如“名称 + 行政/道路后缀”),则停止当前地址继续右扩,避免把两个地址粘连成一个。 120 | - 重分隔符:句号、分号、问号、顿号、括号、斜杠、竖线等作为“硬边界”,立即停止扩展。 121 | - 总长度上限:右扩累计字符上限(如 ≤48 字符);超过视为异常书写,停止扩展。 122 | - 名称段长度上限:用于 POI 名称的泛化段(汉字/ASCII 字母)设定字符上限(如 ≤16 字符),避免吸入过长非地址文本。 123 | 124 | 125 | ### 9. 
与 NER 的集成 126 | - 映射:`LOC/GPE/FAC/ADDRESS` → `PHYSICAL_ADDRESS`(已在 JS 层完成映射)。 127 | - BIO 处理:支持 `B-`/`I-`/`E-`/`S-`;将 `S-` 视作单 token 的 Begin。 128 | - 用法:NER 片段作为“高置信种子”,其 token 化得到 bitset;在扩展过程中,若右侧下一个 NER 片段与当前片段“仅轻分隔相邻”,也可当作候选进行“邻接校验后并入”。 129 | 130 | 131 | ### 10. 冲突消解与排序策略 132 | 当两个 span 重叠: 133 | - 优先级 1:更“深层”的地址(包含更低层级 bit,如 L1/L2/L3),即最低层级位更小者优先; 134 | - 优先级 2:更长跨度; 135 | - 优先级 3:更早起点; 136 | - 优先级 4:更高 NER 置信分数。 137 | 这样确保“完整地址片段”覆盖“POI 碎片”。 138 | 139 | 140 | ### 11. 语言门控与字符处理 141 | - 仅当 `Language` 属于中文族(`zh/zh_cn/zh_tw/zh_hk/zh_hans/zh_hant`)时启用地址流程;其他语言完全跳过。 142 | - UTF-8 感知扫描,保证不会截断中文多字节字符。 143 | - 轻度归一:需要接受半角/全角空格与阿拉伯数字;支持中文数字(“一二三”)可作为增量工作。 144 | 145 | 146 | ### 12. 性能与实现建议 147 | - Token 识别尽量采用“后缀词表 + 小正则 + 代码扫描”,避免极复杂单体正则导致编译/性能问题。 148 | - 位图与邻接判定保证扩展逻辑 O(1) 判定,整体 O(n) 扫描。 149 | - 右扩优先,左扩补全;轻分隔/距离/前瞻阻断作为“刹车”机制。 150 | 151 | 152 | ### 13. 判定与阈值(建议值,可调优) 153 | - 轻分隔最大间距:4 字符(POI 与后续数字之间的距离上限) 154 | - 右扩总字符上限:≤48 字符 (可调优的编译期常量) 155 | - 前瞻阻断窗口:≤12 字符 (可调优的编译期常量) 156 | - 泛化名称段长度:≤16 字符 (可调优的编译期常量) 157 | - 隐私阈值:默认 L5(门牌号)及以下;例外允许 L4+L2/L1。 158 | 159 | 160 | ### 14. 典型用例(期望行为) 161 | - “上海市浦东新区银城中路501号陆家嘴金融广场18层” 162 | - L9+L8+L6+L5+L4+L2 → 达到隐私阈值(L5),整段掩码。 163 | - “我是吴光华,住在广州市越秀区北京西路黄埔花园13栋806房我的表哥在南昌市中山路2348号锦江花园6栋1403房” 164 | - 切成两个地址: 165 | - “广州市越秀区北京西路黄埔花园13栋806房”(L9+L8+L6+L4+L3+L1) 166 | - “南昌市中山路2348号锦江花园6栋1403房”(L9+L6+L5+L4+L3+L1) 167 | - 互斥与邻接规则避免粘连。 168 | - “北京市朝阳区建国路 88 号” 169 | - 允许轻分隔(空格);L9+L8+L6+L5,整段掩码。 170 | - “珠海市香洲路明月花园12栋508房” 171 | - 允许从 L6(道路)跳过 L5(门牌号)直达 L4(小区/POI),随后继续 L3(楼栋)→ L1(房号);最终 L9+L6+L4+L3+L1,整段掩码。 172 | 173 | 174 | ### 15. 开发里程碑(不含代码细节) 175 | - M1:实现 token 类型词表与小规则;为 NER 片段与文本窗口提供 token 化函数 → 输出 `addr_priorities` 与 token span。 176 | - M2:实现基于位图的右扩/左扩与邻接判定、互斥校验、前瞻阻断与距离阈值。 177 | - M3:实现隐私阈值判断与例外(POI+楼层/房号)。 178 | - M4:整合到现有管线(仅中文);实现冲突消解策略(更深层→更长→更早→更高分)。 179 | - M5:测试集覆盖(上述经典用例 + 难例),参数调优。 180 | 181 | 182 | ### 16. 
与现有逻辑的差异与收益 183 | - 从“具体 case 打补丁”转为“通用的层级/位图驱动”机制,逻辑稳定、可解释、可扩展。 184 | - 多语言无副作用(语言门控),与 NER 完全兼容(NER 作为种子、token 化后参与邻接扩展)。 185 | - 可通过参数(窗口、阈值、层级词表)快速调优,支撑更多复杂场景。 186 | -------------------------------------------------------------------------------- /cli/python/services/app/one_aifw_api.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Dict, Any, List 2 | import json 3 | import base64 4 | import os 5 | import sys 6 | import importlib 7 | import importlib.util 8 | 9 | from .llm_client import LLMClient, load_llm_api_config 10 | 11 | 12 | def _load_aifw_py(): 13 | """ 14 | Load libs/aifw-py as package 'aifw_py' so that we can import aifw_py.libaifw. 15 | """ 16 | # repo_root/cli/python/services/app/one_aifw_api.py -> go up 4 levels to repo root 17 | repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..")) 18 | pkg_dir = os.path.join(repo_root, "libs", "aifw-py") 19 | init_py = os.path.join(pkg_dir, "__init__.py") 20 | if not os.path.exists(init_py): 21 | raise RuntimeError("aifw-py package not found at: %s" % pkg_dir) 22 | if "aifw_py" not in sys.modules: 23 | spec = importlib.util.spec_from_file_location( 24 | "aifw_py", 25 | init_py, 26 | submodule_search_locations=[pkg_dir], 27 | ) 28 | mod = importlib.util.module_from_spec(spec) 29 | sys.modules["aifw_py"] = mod 30 | loader = spec.loader 31 | assert loader is not None 32 | loader.exec_module(mod) 33 | return importlib.import_module("aifw_py.libaifw") 34 | 35 | 36 | class OneAIFWAPI: 37 | """Unified in-process API for anonymize→LLM→restore flows. 38 | 39 | Intended to be used by local callers (UI/CLI) and wrapped by HTTP server. 40 | The exposed API function is list in below: 41 | - mask_text: mask a piece of text and return masked text plus metadata for restoration. 42 | - restore_text: restore the masked text plus matching metadata, return a restored text. 
43 | # - mask_text_batch: mask a batch of texts and return batch of masked texts plus matching metadatas for restoration. 44 | # - restore_text_batch: restore a batch of masked texts and matching metadatas, return a restored text. 45 | - call: mask a piece of text, process the masked text (e.g., translation), and then restore it. 46 | """ 47 | 48 | def __init__(self): 49 | self._llm = LLMClient() 50 | # Lazy-load aifw-py core 51 | self._aifw = _load_aifw_py() 52 | self._aifw.init() 53 | 54 | def __del__(self): 55 | self._aifw.deinit() 56 | self._aifw = None 57 | self._llm = None 58 | 59 | # Public API 60 | def config(self, mask_config: Dict[str, Any]) -> None: 61 | """ 62 | Configure AIFW core session (e.g. which entity types are masked). 63 | 64 | This delegates to aifw_py.libaifw.config, which calls aifw_session_config() 65 | in the Zig core. The mask_config schema mirrors the JS maskConfig: 66 | { 67 | "maskAddress": bool, 68 | "maskEmail": bool, 69 | "maskOrganization": bool, 70 | "maskUserName": bool, 71 | "maskPhoneNumber": bool, 72 | "maskBankNumber": bool, 73 | "maskPayment": bool, 74 | "maskVerificationCode": bool, 75 | "maskPassword": bool, 76 | "maskRandomSeed": bool, 77 | "maskPrivateKey": bool, 78 | "maskUrl": bool, 79 | "maskAll": bool 80 | } 81 | """ 82 | if not isinstance(mask_config, dict): 83 | return 84 | try: 85 | if hasattr(self._aifw, "config"): 86 | self._aifw.config(mask_config) # type: ignore[attr-defined] 87 | except Exception: 88 | # Configuration errors should not crash callers; keep previous config. 89 | return 90 | 91 | def mask_text(self, text: str, language: Optional[str] = None) -> Dict[str, Any]: 92 | """Mask PII in text and return masked text plus metadata for restoration. 93 | 94 | maskMeta is a base64 string of binary maskMeta bytes produced by aifw core. 
95 | """ 96 | # Let aifw-py handle language auto-detection if language is None or "auto" 97 | lang = None if (language is None or language == "" or language == "auto") else language 98 | masked_text, meta_bytes = self._aifw.mask_text(text, lang) 99 | mask_meta_b64 = base64.b64encode(meta_bytes).decode("ascii") 100 | return {"text": masked_text, "maskMeta": mask_meta_b64} 101 | 102 | def restore_text(self, text: str, mask_meta: Any) -> str: 103 | """Restore masked text using base64-encoded binary maskMeta produced by aifw core.""" 104 | try: 105 | if isinstance(mask_meta, (bytes, bytearray)): 106 | meta_bytes = bytes(mask_meta) 107 | else: 108 | meta_bytes = base64.b64decode(str(mask_meta), validate=False) 109 | except Exception: 110 | meta_bytes = b"" 111 | return self._aifw.restore_text(text, meta_bytes) 112 | 113 | def get_pii_entities(self, text: str, language: Optional[str] = None) -> List[Dict[str, Any]]: 114 | """ 115 | Analyze text and return PII spans using aifw core get_pii_spans(). 116 | Returns a list of dicts with {entity_id, entity_type, start, end, text}. 117 | """ 118 | lang = None if (language is None or language == "" or language == "auto") else language 119 | spans = self._aifw.get_pii_spans(text, lang) 120 | 121 | # get_pii_spans now returns character-based indices and string entity_type. 
122 | results: List[Dict[str, Any]] = [] 123 | for s in spans: 124 | start = int(getattr(s, "matched_start", 0)) 125 | end = int(getattr(s, "matched_end", 0)) 126 | frag = text[start:end] 127 | results.append( 128 | { 129 | "entity_id": int(getattr(s, "entity_id", 0)), 130 | "entity_type": str(getattr(s, "entity_type", "")), 131 | "start": start, 132 | "end": end, 133 | "score": float(getattr(s, "score", 0.0)), 134 | "text": frag, 135 | } 136 | ) 137 | return results 138 | 139 | # def mask_text_batch(self, texts: List[str], language: Optional[str] = None) -> List[Dict[str, Any]]: 140 | # """Mask a batch of texts and return batch of masked texts plus matching metadatas for restoration.""" 141 | # return [self.mask_text(text=text, language=language) for text in texts] 142 | 143 | # def restore_text_batch(self, texts: List[str], mask_metas: List[Any]) -> str: 144 | # """Restore a batch of masked texts and matching metadatas, return a restored text.""" 145 | # return [self.restore_text(text=text, mask_meta=mask_meta) for text, mask_meta in zip(texts, mask_metas)] 146 | 147 | def call( 148 | self, 149 | text: str, 150 | api_key_file: Optional[str] = None, 151 | model: Optional[str] = None, 152 | temperature: float = 0.0, 153 | ) -> str: 154 | language = self._aifw.detect_language(text) 155 | 156 | # 1) anonymize input 157 | anonymized_text, meta_bytes = self._aifw.mask_text(input_text=text, language=language) 158 | 159 | # 2) load LLM config if provided 160 | cfg = {"model": None} 161 | if api_key_file: 162 | cfg = load_llm_api_config(api_key_file) 163 | 164 | # 3) LLM call (no source language hint; use anonymized text as-is) 165 | output = self._llm.call( 166 | text=anonymized_text, 167 | model=model or cfg.get("model") or None, 168 | temperature=temperature, 169 | ) 170 | 171 | # 4) restore masked text plus matching metadata, return a restored text. 
172 | restored = self._aifw.restore_text(masked_text=output, mask_meta=meta_bytes) 173 | return restored 174 | -------------------------------------------------------------------------------- /core/NerRecognizer.zig: -------------------------------------------------------------------------------- 1 | const std = @import("std"); 2 | const entity = @import("recog_entity.zig"); 3 | const NerRecognizer = @This(); 4 | 5 | allocator: std.mem.Allocator, 6 | ner_recog_type: NerRecogType, 7 | 8 | pub const RecogEntity = entity.RecogEntity; 9 | pub const EntityType = entity.EntityType; 10 | pub const EntityBioTag = entity.EntityBioTag; 11 | 12 | pub const NerRecogType = enum(u8) { token_classification, sequence_classification }; 13 | 14 | pub const NerRecogData = extern struct { 15 | /// A constant pointer to the original text 16 | text: [*:0]const u8, 17 | /// The array of NER entities 18 | ner_entities: [*c]const NerRecogEntity, 19 | /// The count of NER entities 20 | ner_entity_count: u32, 21 | }; 22 | 23 | // pub const TokenOffset = extern struct { 24 | // /// The index of the token 25 | // index: usize, 26 | // /// The start index of the token 27 | // start: usize, 28 | // /// The end index of the token 29 | // end: usize, 30 | // }; 31 | 32 | pub const NerRecogEntity = extern struct { 33 | /// The type of the entity, for example, .USER_NAME, .ORGANIZATION, .PHYSICAL_ADDRESS, etc. 
34 | entity_type: EntityType, 35 | entity_tag: EntityBioTag, 36 | 37 | /// The score of the entity 38 | score: f32, 39 | /// The index of the token in tokenized tokens from text 40 | index: u32, 41 | /// The start index of the entity 42 | start: u32, 43 | /// The end index of the entity 44 | end: u32, 45 | }; 46 | 47 | pub fn create(allocator: std.mem.Allocator, ner_recog_type: NerRecogType) !*NerRecognizer { 48 | const ner_recognizer = allocator.create(NerRecognizer) catch return error.NerRecognizerCreateFailed; 49 | ner_recognizer.* = init(allocator, ner_recog_type); 50 | return ner_recognizer; 51 | } 52 | 53 | pub fn destroy(self: *const NerRecognizer) void { 54 | self.deinit(); 55 | self.allocator.destroy(self); 56 | } 57 | 58 | pub fn init(allocator: std.mem.Allocator, ner_recog_type: NerRecogType) NerRecognizer { 59 | return .{ .allocator = allocator, .ner_recog_type = ner_recog_type }; 60 | } 61 | 62 | pub fn deinit(self: *const NerRecognizer) void { 63 | _ = self; 64 | // do nothing 65 | } 66 | 67 | /// Convert external NER output (already decoded by caller) to RecogEntity list. 68 | /// token_classification: items = []struct{ start:usize, end:usize, score:f32, et:EntityType } 69 | /// sequence_classification: same as token_classification, but items have all tokens of the text, 70 | /// not just recognized tokens. 
71 | pub fn run(self: *const NerRecognizer, ner_data: NerRecogData) ![]RecogEntity { 72 | var pos: usize = 0; 73 | var idx: usize = 0; 74 | std.log.debug("NerRecognizer.run: ner_data.ner_entity_count={d}", .{ner_data.ner_entity_count}); 75 | if (@intFromEnum(std.options.log_level) >= @intFromEnum(std.log.Level.debug)) { 76 | for (ner_data.ner_entities[0..ner_data.ner_entity_count]) |ent| { 77 | std.log.debug("NerRecognizer.run: ner ent: {any}", .{ent}); 78 | } 79 | } 80 | var out = try std.ArrayList(RecogEntity).initCapacity(self.allocator, ner_data.ner_entity_count); 81 | defer out.deinit(self.allocator); 82 | const text = std.mem.span(ner_data.text); 83 | while (idx < ner_data.ner_entity_count) { 84 | std.log.debug("NerRecognizer.run: ner_data.ner_entities[{d}]={any}", .{ idx, ner_data.ner_entities[idx] }); 85 | const e = aggregateNerRecogEntityToRecogEntity(text, &pos, ner_data.ner_entities, ner_data.ner_entity_count, &idx); 86 | std.log.debug("NerRecognizer.run: ner_entity={any}, score={d}, start={d}, end={d}", .{ e.entity_type, e.score, e.start, e.end }); 87 | if (e.entity_type != .None) { 88 | try out.append(self.allocator, .{ 89 | .entity_type = e.entity_type, 90 | .start = e.start, 91 | .end = e.end, 92 | .score = e.score, 93 | .description = switch (self.ner_recog_type) { 94 | .token_classification => "token", 95 | .sequence_classification => "sequence", 96 | }, 97 | }); 98 | } 99 | } 100 | return try out.toOwnedSlice(self.allocator); 101 | } 102 | 103 | const none_recog_entity = RecogEntity{ 104 | .entity_type = .None, 105 | .start = 0, 106 | .end = 0, 107 | .score = 0.0, 108 | .description = null, 109 | }; 110 | 111 | /// Aggregate one or more same type NerRecogEntity to one RecogEntity 112 | /// for example, if the NER entities are: 113 | /// [ 114 | /// { entity_type: .PHYSICAL_ADDRESS, entity_tag: .Begin, start: 0, end: 10, score: 0.9 }, 115 | /// { entity_type: .PHYSICAL_ADDRESS, entity_tag: .Inside, start: 10, end: 20, score: 0.8 }, 116 | /// { 
entity_type: .PHYSICAL_ADDRESS, entity_tag: .Inside, start: 20, end: 30, score: 0.7 }, 117 | /// ] 118 | /// the function will return the aggregated RecogEntity: 119 | /// { entity_type: .PHYSICAL_ADDRESS, start: 0, end: 30, score: 0.8 } 120 | /// 121 | /// If the NER entities are not the same type, the function will return the first entity. 122 | fn aggregateNerRecogEntityToRecogEntity( 123 | text: []const u8, 124 | pos: *usize, 125 | entities: [*c]const NerRecogEntity, 126 | entities_count: usize, 127 | idx: *usize, 128 | ) RecogEntity { 129 | var i = idx.*; 130 | 131 | var have_entity = false; 132 | var recog_entity: RecogEntity = none_recog_entity; 133 | 134 | std.log.debug("aggregateNerRecogEntityToRecogEntity: entities_count={d}, idx={d}, pos={d}", .{ entities_count, i, pos.* }); 135 | while (i < entities_count) : (i += 1) { 136 | const tok = entities[i]; 137 | std.log.debug("aggregateNerRecogEntityToRecogEntity: i={d}, type={any}, tag={any}, start={d}, end={d}, word={s}", .{ i, tok.entity_type, tok.entity_tag, tok.start, tok.end, text[tok.start..tok.end] }); 138 | const t = tok.entity_type; 139 | const is_begin = tok.entity_tag == .Begin; 140 | if (t == .None) { 141 | if (have_entity) break else continue; 142 | } 143 | 144 | if (!have_entity) { 145 | if (!is_begin) continue; 146 | std.log.debug("aggregateNerRecogEntityToRecogEntity: is_begin=true, type={any}", .{t}); 147 | have_entity = true; 148 | recog_entity.entity_type = t; 149 | recog_entity.start = tok.start; 150 | recog_entity.end = tok.end; 151 | recog_entity.score = tok.score; 152 | recog_entity.description = null; 153 | continue; 154 | } 155 | 156 | if (t != recog_entity.entity_type) { 157 | // another different type entity is found, break the loop 158 | break; 159 | } 160 | 161 | const score = (recog_entity.score + tok.score) / 2; 162 | if (!is_begin) { 163 | std.log.debug("aggregateNerRecogEntityToRecogEntity: is_begin=false, score={d}", .{score}); 164 | recog_entity.end = tok.end; 165 | 
recog_entity.score = score; 166 | recog_entity.description = null; 167 | } else if (hasSubwordPrefix(text[tok.start..tok.end])) { 168 | std.log.debug("aggregateNerRecogEntityToRecogEntity: is_begin=true, hasSubwordPrefix=true, score={d}", .{score}); 169 | recog_entity.end = tok.end; 170 | recog_entity.score = score; 171 | recog_entity.description = null; 172 | } else { 173 | // another same type entity is found, break the loop 174 | std.log.debug("aggregateNerRecogEntityToRecogEntity: is_begin=true, hasSubwordPrefix=false, score={d}", .{score}); 175 | break; 176 | } 177 | } 178 | 179 | idx.* = i; 180 | if (!have_entity) { 181 | std.log.debug("aggregateNerRecogEntityToRecogEntity: !have_entity, pos={d}", .{pos.*}); 182 | pos.* = text.len; 183 | return none_recog_entity; 184 | } 185 | std.log.debug("aggregateNerRecogEntityToRecogEntity: return recog_entity, pos={d}", .{pos.*}); 186 | return recog_entity; 187 | } 188 | 189 | fn hasSubwordPrefix(word: []const u8) bool { 190 | if (word.len >= 2 and word[0] == '#' and word[1] == '#') return true; 191 | return false; 192 | } 193 | -------------------------------------------------------------------------------- /apps/webapp/src/main.js: -------------------------------------------------------------------------------- 1 | const statusEl = document.getElementById('status'); 2 | const textEl = document.getElementById('text'); 3 | const maskedEl = document.getElementById('masked'); 4 | const restoredEl = document.getElementById('restored'); 5 | const runBtn = document.getElementById('run'); 6 | // Create language selector just above the textarea if not present 7 | let langEl = document.getElementById('lang'); 8 | if (!langEl && textEl && textEl.parentElement) { 9 | const row = document.createElement('div'); 10 | row.className = 'row'; 11 | const label = document.createElement('label'); 12 | label.htmlFor = 'lang'; 13 | label.textContent = 'Language'; 14 | const select = document.createElement('select'); 15 | select.id = 'lang'; 
16 | // Supported: Simplified Chinese, Traditional Chinese, English 17 | const opts = [ 18 | { v: 'auto', t: 'Auto (detect)' }, 19 | { v: 'zh-CN', t: 'Chinese (Simplified)' }, 20 | { v: 'zh-TW', t: 'Chinese (Traditional)' }, 21 | { v: 'en', t: 'English' }, 22 | ]; 23 | for (const { v, t } of opts) { 24 | const o = document.createElement('option'); 25 | o.value = v; o.textContent = t; select.appendChild(o); 26 | } 27 | // default to Auto 28 | select.value = 'auto'; 29 | row.appendChild(label); 30 | row.appendChild(select); 31 | // detected language indicator 32 | const detSpan = document.createElement('span'); 33 | detSpan.id = 'lang-detected'; 34 | detSpan.style.marginLeft = '8px'; 35 | row.appendChild(detSpan); 36 | // insert before the textarea row 37 | textEl.parentElement.parentElement?.insertBefore(row, textEl.parentElement); 38 | langEl = select; 39 | } 40 | 41 | // Create batch mode toggle above textarea 42 | let batchEl = document.getElementById('use-batch'); 43 | if (!batchEl && textEl && textEl.parentElement) { 44 | const row = document.createElement('div'); 45 | row.className = 'row'; 46 | const label = document.createElement('label'); 47 | const input = document.createElement('input'); 48 | input.type = 'checkbox'; 49 | input.id = 'use-batch'; 50 | label.appendChild(input); 51 | label.appendChild(document.createTextNode(' Use batch (maskTextBatch)')); 52 | row.appendChild(label); 53 | textEl.parentElement.parentElement?.insertBefore(row, textEl.parentElement); 54 | batchEl = input; 55 | } 56 | 57 | // Create mask-config checkboxes above textarea (to the right of language/batch rows) 58 | const maskCheckboxes = {}; 59 | if (textEl && textEl.parentElement) { 60 | const row = document.createElement('div'); 61 | row.className = 'row'; 62 | const title = document.createElement('span'); 63 | title.textContent = 'Mask types:'; 64 | row.appendChild(title); 65 | const defs = [ 66 | { key: 'maskAddress', id: 'maskAddress', label: 'Address', checked: true }, 67 | 
{ key: 'maskEmail', id: 'maskEmail', label: 'Email', checked: true }, 68 | { key: 'maskOrganization', id: 'maskOrganization', label: 'Organization', checked: true }, 69 | { key: 'maskUserName', id: 'maskUserName', label: 'User name', checked: true }, 70 | { key: 'maskPhoneNumber', id: 'maskPhoneNumber', label: 'Phone', checked: true }, 71 | { key: 'maskBankNumber', id: 'maskBankNumber', label: 'Bank', checked: true }, 72 | { key: 'maskPayment', id: 'maskPayment', label: 'Payment', checked: true }, 73 | { key: 'maskVerificationCode', id: 'maskVerificationCode', label: 'Verification code', checked: true }, 74 | { key: 'maskPassword', id: 'maskPassword', label: 'Password', checked: true }, 75 | { key: 'maskRandomSeed', id: 'maskRandomSeed', label: 'Random seed', checked: true }, 76 | { key: 'maskPrivateKey', id: 'maskPrivateKey', label: 'Private key', checked: true }, 77 | { key: 'maskUrl', id: 'maskUrl', label: 'URL', checked: true }, 78 | ]; 79 | for (const def of defs) { 80 | const label = document.createElement('label'); 81 | label.style.marginLeft = '12px'; 82 | const input = document.createElement('input'); 83 | input.type = 'checkbox'; 84 | input.id = def.id; 85 | input.checked = def.checked; 86 | label.appendChild(input); 87 | label.appendChild(document.createTextNode(' ' + def.label)); 88 | row.appendChild(label); 89 | maskCheckboxes[def.key] = input; 90 | } 91 | textEl.parentElement.parentElement?.insertBefore(row, textEl.parentElement); 92 | } 93 | 94 | function getMaskConfigFromUI() { 95 | const cfg = {}; 96 | for (const [key, el] of Object.entries(maskCheckboxes)) { 97 | cfg[key] = !!el.checked; 98 | } 99 | return cfg; 100 | } 101 | 102 | let aifw; // wrapper lib 103 | 104 | async function main() { 105 | statusEl.textContent = 'Initializing AIFW...'; 106 | aifw = await import('@oneaifw/aifw-js'); 107 | // await aifw.init({ wasmBase: './wasm/' }); 108 | await aifw.init({ maskConfig: getMaskConfigFromUI() }); 109 | statusEl.textContent = 'AIFW 
initialized.'; 110 | 111 | // graceful shutdown on page exit (bfcache + unload) 112 | let shutdownCalled = false; 113 | function shutdownOnce() { 114 | if (shutdownCalled) return; 115 | shutdownCalled = true; 116 | aifw.deinit(); 117 | } 118 | window.addEventListener('pagehide', shutdownOnce, { once: true }); 119 | window.addEventListener('beforeunload', shutdownOnce, { once: true }); 120 | 121 | // When user toggles mask checkboxes, update config at runtime 122 | for (const el of Object.values(maskCheckboxes)) { 123 | el.addEventListener('change', () => { 124 | if (!aifw || typeof aifw.config !== 'function') return; 125 | const cfg = getMaskConfigFromUI(); 126 | aifw.config(cfg).catch((e) => console.warn('[webapp] config failed', e)); 127 | }); 128 | } 129 | 130 | runBtn.addEventListener('click', async () => { 131 | try { 132 | statusEl.textContent = 'Running...'; 133 | maskedEl.textContent = ''; 134 | restoredEl.textContent = ''; 135 | 136 | const textStr = textEl.value || ''; 137 | let language = (langEl && langEl.value) || 'auto'; 138 | const lines = textStr.split(/\r?\n/); 139 | const useBatch = !!(batchEl && batchEl.checked); 140 | let maskedLines = []; 141 | let metas = []; 142 | // detect language if auto (for display only). Library will also auto-detect per text when language is null/auto 143 | let displayLang = ''; 144 | if (language === 'auto') { 145 | try { 146 | const det = await aifw.detectLanguage(textStr); 147 | if (det.lang === 'zh') displayLang = det.script === 'Hant' ? 'zh-TW' : 'zh-CN'; else displayLang = det.lang || 'en'; 148 | } catch (_) {} 149 | const span = document.getElementById('lang-detected'); 150 | if (span) span.textContent = displayLang ? 
`(detected: ${displayLang})` : ''; 151 | language = null; // pass null to trigger library auto-detect 152 | } 153 | if (useBatch) { 154 | const inputs = lines.map((line) => ({ text: line, language })); 155 | const results = await aifw.maskTextBatch(inputs); 156 | maskedLines = results.map((r) => (r && r.text) || ''); 157 | metas = results.map((r) => r && r.maskMeta); 158 | } else { 159 | for (const line of lines) { 160 | const [masked, meta] = await aifw.maskText(line, language); 161 | maskedLines.push(masked); 162 | metas.push(meta); 163 | } 164 | } 165 | const maskedStr = maskedLines.join('\n'); 166 | maskedEl.textContent = maskedStr; 167 | 168 | const batchItems = maskedLines.map((m, i) => ({ text: m, maskMeta: metas[i] })); 169 | const restoredObjs = await aifw.restoreTextBatch(batchItems); 170 | const restoredStr = restoredObjs.map((o) => (o && o.text) || '').join('\n'); 171 | restoredEl.textContent = restoredStr; 172 | 173 | // Test restore with empty masked text for just freeing meta, should return empty string 174 | try { 175 | const test_text = "Hi, my email is example.test@funstory.com, my phone number is 13800138027, my name is John Doe"; 176 | const [masked, meta] = await aifw.maskText(test_text, language); 177 | const emptied = await aifw.restoreText('', meta); 178 | // Expect empty string; log for debug without affecting UI 179 | console.log('[webapp] empty-restore result length:', emptied.length); 180 | } catch (e) { 181 | console.warn('[webapp] empty-restore check failed:', e); 182 | } 183 | 184 | // Test getPiiSpans API on the original input 185 | try { 186 | const spans = await aifw.getPiiSpans(textStr, language); 187 | console.log('[webapp] getPiiSpans spans:', spans); 188 | } catch (e) { 189 | console.warn('[webapp] getPiiSpans failed:', e); 190 | } 191 | 192 | statusEl.textContent = 'Done'; 193 | } catch (e) { 194 | statusEl.textContent = `Error: ${e.message || e}`; 195 | } 196 | }); 197 | } 198 | 199 | main().catch((e) => statusEl.textContent 
= `Error: ${e.message || e}`);

--------------------------------------------------------------------------------
/web/static/css/style.css:
--------------------------------------------------------------------------------
/* AIFW Web Module Styles */

body {
    background-color: #f8f9fa;
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}

.navbar-brand {
    font-weight: bold;
    font-size: 1.5rem;
}

.card {
    box-shadow: 0 0.125rem 0.25rem rgba(0, 0, 0, 0.075);
    border: 1px solid rgba(0, 0, 0, 0.125);
}

.card-header {
    border-bottom: 1px solid rgba(0, 0, 0, 0.125);
}

#originalText, #processedText {
    font-family: 'Courier New', monospace;
    white-space: pre-wrap;
    word-wrap: break-word;
    background-color: #f8f9fa;
    border: 1px solid #dee2e6;
    text-align: left !important;
}

.btn {
    border-radius: 0.375rem;
    font-weight: 500;
}

.btn i {
    margin-right: 0.5rem;
}

.table th {
    background-color: #e9ecef;
    border-top: none;
    font-weight: 600;
}

.table td {
    vertical-align: middle;
}

.alert {
    border-radius: 0.5rem;
    border: none;
}

#statusAlert {
    animation: fadeIn 0.3s ease-in;
}

@keyframes fadeIn {
    from { opacity: 0; }
    to { opacity: 1; }
}

/* Sensitive-entity highlighting */
.entity-highlight {
    background-color: #fff3cd;
    border: 1px solid #ffeaa7;
    border-radius: 3px;
    padding: 1px 3px;
    margin: 0 1px;
    font-weight: bold;
}

.entity-email {
    background-color: #d1ecf1;
    border-color: #bee5eb;
}

.entity-phone {
    background-color: #f8d7da;
    border-color: #f5c6cb;
}

.entity-credit-card {
    background-color: #d4edda;
    border-color: #c3e6cb;
}

.entity-person {
    background-color: #e2e3e5;
    border-color: #d6d8db;
}

/* Responsive layout */
@media (max-width: 768px) {
    .container {
        padding: 0 15px;
    }

    .card-body {
        padding: 1rem;
    }

    .btn {
        margin-bottom: 0.5rem;
        width: 100%;
    }

    .btn:not(:last-child) {
        margin-right: 0;
    }
}

/* Loading shimmer animation */
.loading {
    position: relative;
    overflow: hidden;
}

.loading::after {
    content: '';
    position: absolute;
    top: 0;
    left: -100%;
    width: 100%;
    height: 100%;
    background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.4), transparent);
    animation: loading 1.5s infinite;
}

@keyframes loading {
    0% { left: -100%; }
    100% { left: 100%; }
}

/* Success / error status colors */
.text-success {
    color: #198754 !important;
}

.text-danger {
    color: #dc3545 !important;
}

.text-warning {
    color: #fd7e14 !important;
}

/* Custom scrollbar */
::-webkit-scrollbar {
    width: 8px;
}

::-webkit-scrollbar-track {
    background: #f1f1f1;
    border-radius: 4px;
}

::-webkit-scrollbar-thumb {
    background: #c1c1c1;
    border-radius: 4px;
}

::-webkit-scrollbar-thumb:hover {
    background: #a8a8a8;
}

/* "How it works" workflow animation styles */
.workflow-animation {
    display: flex;
    flex-direction: row;
    align-items: center;
    justify-content: center;
    gap: 20px;
    padding: 30px 20px;
    background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
    border-radius: 10px;
    border: 2px solid #dee2e6;
    position: relative;
    overflow: hidden;
}

.workflow-step {
    display: flex;
    flex-direction: column;
    align-items: center;
    gap: 10px;
    padding: 20px 15px;
    background: white;
    border-radius: 8px;
    box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
    border: 2px solid transparent;
    transition: all 0.3s ease;
    opacity: 0.6;
    transform: scale(0.95);
    min-width: 120px;
    max-width: 150px;
    flex-shrink: 0;
}

.workflow-step.active {
    opacity: 1;
    transform: scale(1);
    border-color: #007bff;
    box-shadow: 0 4px 15px rgba(0, 123, 255, 0.3);
}

.workflow-step.completed {
    opacity: 0.8;
    border-color: #28a745;
    background: linear-gradient(135deg, #d4edda 0%, #c3e6cb 100%);
}

.step-icon {
    width: 50px;
    height: 50px;
    border-radius: 50%;
    background: linear-gradient(135deg, #007bff 0%, #0056b3 100%);
    display: flex;
    align-items: center;
    justify-content: center;
    color: white;
    font-size: 20px;
    flex-shrink: 0;
    transition: all 0.3s ease;
}

.workflow-step.active .step-icon {
    background: linear-gradient(135deg, #28a745 0%, #1e7e34 100%);
    animation: pulse 1s infinite;
}

.workflow-step.completed .step-icon {
    background: linear-gradient(135deg, #28a745 0%, #1e7e34 100%);
}

.step-content h6 {
    margin: 0 0 5px 0;
    font-weight: 600;
    color: #333;
    font-size: 1rem;
}

.step-content p {
    margin: 0;
    font-size: 0.85rem;
    color: #666;
    text-align: center;
}

.workflow-arrow {
    font-size: 20px;
    color: #007bff;
    animation: bounce 2s infinite;
    z-index: 10;
    flex-shrink: 0;
}

.workflow-arrow.reverse {
    color: #28a745;
}

@keyframes pulse {
    0% { transform: scale(1); }
    50% { transform: scale(1.1); }
    100% { transform: scale(1); }
}

@keyframes bounce {
    0%, 20%, 50%, 80%, 100% { transform: translateY(0); }
    40% { transform: translateY(-10px); }
    60% { transform: translateY(-5px); }
}

/* Animation control button */
#startAnimation {
    transition: all 0.3s ease;
}

#startAnimation:hover {
    transform: translateY(-2px);
    box-shadow: 0 4px 12px rgba(0, 123, 255, 0.3);
}

/* GitHub button styles */
.github-btn {
    display: inline-flex;
    align-items: center;
    gap: 8px;
    padding: 8px 16px;
    background: rgba(255, 255, 255, 0.1);
    border: 1px solid rgba(255, 255, 255, 0.3);
    border-radius: 6px;
    color: white;
    text-decoration: none;
    font-size: 14px;
    font-weight: 500;
    transition: all 0.3s ease;
    backdrop-filter: blur(10px);
}

.github-btn:hover {
    background: rgba(255, 255, 255, 0.2);
    border-color: rgba(255, 255, 255, 0.5);
    color: white;
    text-decoration: none;
    transform: translateY(-1px);
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15);
}

.github-btn i.fab {
    font-size: 16px;
}

.github-text {
    font-weight: 600;
}

.github-stars {
    display: flex;
    align-items: center;
    gap: 4px;
    background: rgba(255, 255, 255, 0.15);
    padding: 4px 8px;
    border-radius: 12px;
    font-size: 12px;
    font-weight: 600;
    border: 1px solid rgba(255, 255, 255, 0.2);
}

.github-stars i.fas {
    color: #ffd700;
    font-size: 11px;
}

.star-count {
    color: #fff;
    font-weight: 600;
    transition: all 0.3s ease;
}

.star-count.loading {
    animation: pulse 1s infinite;
}

.star-count.success {
    color: #ffd700 !important;
    animation: star-bounce 0.6s ease;
}

/* Deliberately not named `bounce`: keyframe sets that share a name do not
   merge, the last one parsed wins, so reusing `bounce` here replaced the
   workflow arrow's larger bounce with this subtle one. */
@keyframes star-bounce {
    0%, 20%, 50%, 80%, 100% { transform: translateY(0); }
    40% { transform: translateY(-3px); }
    60% { transform: translateY(-1px); }
}

/* Responsive layout - animation */
@media (max-width: 768px) {
    .workflow-animation {
        padding: 15px;
        gap: 10px;
    }

    .workflow-step {
        padding: 12px 15px;
        gap: 12px;
    }

    .step-icon {
        width: 40px;
        height: 40px;
        font-size: 16px;
    }

    .workflow-arrow {
        font-size: 20px;
    }

    .github-btn {
        padding: 6px 12px;
        font-size: 13px;
    }

    .github-stars {
        padding: 3px 6px;
        font-size: 11px;
    }
}

--------------------------------------------------------------------------------
/docs/oneaifw_services_api_cn.md:
--------------------------------------------------------------------------------
# OneAIFW 后台服务 API 文档

本文档描述了 OneAIFW 本地后台服务的 HTTP API。

默认服务地址:`http://127.0.0.1:8844`

可选鉴权:请求头 `Authorization`(仅当服务端启用了 API_KEY 时生效),值可以是 `<API_KEY>` 或 `Bearer <API_KEY>`。

### 通用说明
- 字符编码:UTF-8
- 错误返回:
  - 401 Unauthorized:缺少或错误的 `Authorization`
  - 400 Bad Request:非法请求内容

## 健康检查
- 方法/路径:GET `/api/health`
- 请求体:无
- 响应(JSON):
```json
{ "status": "ok" }
```

示例(curl):
```bash
curl -s -X GET http://127.0.0.1:8844/api/health
```
响应:
```json
{ "status": "ok" }
```

## LLM匿名化调用(匿名化 → LLM → 反匿名化)
- 方法/路径:POST `/api/call`
- Content-Type:`application/json`
- 请求头:`Authorization: <API_KEY>`(若服务端启用鉴权)
- 请求体字段:
  - `text` (string, 必填):原始输入文本
  - `apiKeyFile` (string, 可选):后端读取的 LLM API 配置文件路径;若省略,使用环境变量 `AIFW_API_KEY_FILE`
  - `model` (string, 可选):自己提供的LLM模型名(透传给后端 LLM 客户端)
  - `temperature` (number, 可选):采样温度,默认 0.0
- 响应(JSON):
```json
{ "output":{"text": "<final_text>"}, "error": null }
```

示例(curl):
```bash
# 若服务端启用了鉴权,请在命令中加入:-H 'Authorization: Bearer <API_KEY>'
curl -s -X POST http://127.0.0.1:8844/api/call \
  -H 'Content-Type: application/json' \
  -d '{"text":"请把如下文本翻译为中文: My email address is test@example.com, and my phone number is 18744325579."}'
```
响应:
```json
{"output":{"text":"我的电子邮件地址是 test@example.com,我的电话号码是 18744325579。"},"error":null}
```

## 掩码配置(运行时 config 接口)

- 方法/路径:POST `/api/config`
- Content-Type:`application/json`
- 请求头:`Authorization: <API_KEY>`(若服务端启用鉴权)
- 用途:**在不重启服务的情况下,动态更新当前会话的敏感信息掩码策略**。
- 请求体字段:
  - `maskConfig` (object, 必填):各类敏感信息的掩码开关,支持的字段包括:
    - `maskAddress` (bool):物理地址,缺省值是false
    - `maskEmail` (bool):邮箱地址,缺省值是true
    - `maskOrganization` (bool):组织 / 公司名称,缺省值是true
    - `maskUserName` (bool):人名 / 用户名,缺省值是true
    - `maskPhoneNumber` (bool):电话号码,缺省值是true
    - `maskBankNumber` (bool):银行卡号 / 银行账号,缺省值是true
    - `maskPayment` (bool):支付相关标识,缺省值是true
    - `maskVerificationCode` (bool):验证码 / 一次性代码,缺省值是true
    - `maskPassword` (bool):密码,缺省值是true
    - `maskRandomSeed` (bool):随机种子 / 初始化向量,缺省值是true
    - `maskPrivateKey` (bool):私钥 / 机密密钥,缺省值是true
    - `maskUrl` (bool):URL 地址,缺省值是true
    - `maskAll` (bool):是否匿名化所有的实体类型,全开或者全关,覆盖上面所有设置,无缺省值。
- 响应(JSON):
```json
{ "output": { "status": "ok" }, "error": null }
```

示例(curl):
```bash
# 若服务端启用了鉴权,请在命令中加入:-H 'Authorization: Bearer <API_KEY>'
curl -s -X POST http://127.0.0.1:8844/api/config \
  -H 'Content-Type: application/json' \
  -d '{
    "maskConfig": {
      "maskEmail": true,
      "maskPhoneNumber": true,
      "maskUserName": true,
      "maskAddress": false
    }
  }'
```

## 匿名化与反匿名化

这两个接口一起用于匿名化一段文本,处理匿名化的文本(比如翻译),再反匿名化处理后的文本。必须配对使用:每次匿名化都需要对应一次反匿名化,否则可能造成内存泄漏。可以先批量匿名化、处理完成后再批量反匿名化。

重要:配对的匿名化和反匿名化接口调用需要使用相同的 `maskMeta`。

`maskMeta` 是将 `placeholdersMap`(UTF-8 编码的 JSON 字节)整体 base64 编码得到的字符串;调用方将其视为不透明字符串,按原样传回 `/api/restore_text` 即可。

### 匿名化接口(生成 masked text 与 maskMeta)
- 方法/路径:POST `/api/mask_text`
- 请求 Content-Type:`application/json`
- 请求头:`Authorization: <API_KEY>`(若服务端启用鉴权)
- 请求体字段:
  - `text` (string, 必填):原始输入文本
  - `language` (string, 可选):语言提示(如 `en`、`zh`);若省略,服务端自动检测
- 响应 Content-Type:`application/json`
- 响应体:
```json
{
  "output":{
    "text": "<masked_text>",
    "maskMeta": "<base64(maskMeta)>"
  },
  "error": null
}
```

示例(curl):
```bash
# 若服务端启用了鉴权,请在命令中加入:-H 'Authorization: Bearer <API_KEY>'
curl -s -X POST http://127.0.0.1:8844/api/mask_text \
  -H 'Content-Type: application/json' \
  -d '{"text":"My email address is test@example.com, and my phone number is 18744325579.","language":"en"}'
```
响应:
```json
{
  "output":{
    "text":"My email address is __PII_EMAIL_ADDRESS_00000001__, and my phone number is __PII_PHONE_NUMBER_00000002__.",
    "maskMeta":"eyJfX1BJSV9QSE9ORV9OVU1CRVJfMDAwMDAwMDJfXyI6ICIxODc0NDMyNTU3OSIsICJfX1BJSV9FTUFJTF9BRERSRVNTXzAwMDAwMDAxX18iOiAidGVzdEBleGFtcGxlLmNvbSJ9"
  },
  "error": null
}
```

### 反匿名化接口(输入 masked text 与 maskMeta 得到反匿名化后的文本)
- 方法/路径:POST `/api/restore_text`
- 请求 Content-Type:`application/json`
- 请求头:`Authorization: <API_KEY>`(若服务端启用鉴权)
- 请求体:
```json
{
  "text": "<上一阶段返回的或翻译处理后的 masked_text>",
  "maskMeta": "<上一阶段返回的 base64(maskMeta)>"
}
```
- 响应 Content-Type:`application/json`
- 响应体:
```json
{
  "output":{"text": "<restored_text>"},
  "error": null
}
```

示例(curl):
```bash
# 若服务端启用了鉴权,请在命令中加入:-H 'Authorization: Bearer <API_KEY>'
curl -s -X POST http://127.0.0.1:8844/api/restore_text \
  -H 'Content-Type: application/json' \
  -d '{"text":"My email address is __PII_EMAIL_ADDRESS_00000001__, and my phone number is __PII_PHONE_NUMBER_00000002__.", "maskMeta":"eyJfX1BJSV9QSE9ORV9OVU1CRVJfMDAwMDAwMDJfXyI6ICIxODc0NDMyNTU3OSIsICJfX1BJSV9FTUFJTF9BRERSRVNTXzAwMDAwMDAxX18iOiAidGVzdEBleGFtcGxlLmNvbSJ9"}'
```
响应:
```json
{
  "output":{"text":"My email address is test@example.com, and my phone number is 18744325579."},
  "error":null
}
```

### Python 使用示例
```python
import requests

base = "http://127.0.0.1:8844"

# 1) 调用例子 mask_text(JSON → JSON)
r = requests.post(f"{base}/api/mask_text", json={"text": "张三电话13812345678", "language": "zh"})
r.raise_for_status()
obj = r.json()
output = obj["output"]
masked_text = output["text"]
mask_meta_b64 = output["maskMeta"]
print("masked:", masked_text)

# 2) 调用例子 restore_text(JSON → JSON)
r2 = requests.post(f"{base}/api/restore_text", json={"text": masked_text, "maskMeta": mask_meta_b64})
r2.raise_for_status()
print("restored:", r2.json()["output"]["text"])
```

### Node.js(fetch)示例
```js
// 需要 Node 18+ 或自行引入 fetch polyfill
const base = 'http://127.0.0.1:8844';

// 1) 调用例子 mask_text(JSON → JSON)
const jr = await fetch(`${base}/api/mask_text`, {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({ text: 'My email is test@example.com' })
});
if (!jr.ok) throw new Error(`mask_text http ${jr.status}`);
const obj = await jr.json();
const maskedText = (obj.output || {}).text;
const maskMetaB64 = (obj.output || {}).maskMeta;
console.log('masked:', maskedText);

// 2) 调用例子 restore_text(JSON → JSON)
const rr = await fetch(`${base}/api/restore_text`, {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({ text: maskedText, maskMeta: maskMetaB64 })
});
if (!rr.ok) throw new Error(`restore_text http ${rr.status}`);
const restoredObj = await rr.json();
console.log('restored:', (restoredObj.output || {}).text);
```

## 批量接口

### 匿名化批量接口:mask_text_batch
- 方法/路径:POST `/api/mask_text_batch`
- 请求 Content-Type:`application/json`
- 请求头:`Authorization: <API_KEY>`(若服务端启用鉴权)
- 请求体:对象数组,每项 `{ text, language? }`
- 响应 Content-Type:`application/json`
- 响应体:
```json
{
  "output": [
    { "text": "<masked_text_1>", "maskMeta": "<base64(maskMeta_1)>" },
    { "text": "<masked_text_2>", "maskMeta": "<base64(maskMeta_2)>" }
  ],
  "error": null
}
```

示例(curl):
```bash
# 若服务端启用了鉴权,请在命令中加入:-H 'Authorization: Bearer <API_KEY>'
curl -s -X POST http://127.0.0.1:8844/api/mask_text_batch \
  -H 'Content-Type: application/json' \
  -d '[{"text":"My email address is test@example.com"},{"text":"and my phone number is 18744325579.","language":"en"}]'
```
响应:
```json
{
  "output":[
    {"text":"My email address is __PII_EMAIL_ADDRESS_00000001__",
     "maskMeta":"eyJfX1BJSV9FTUFJTF9BRERSRVNTXzAwMDAwMDAxX18iOiAidGVzdEBleGFtcGxlLmNvbSJ9"},
    {"text":"and my phone number is __PII_PHONE_NUMBER_00000001__.",
     "maskMeta":"eyJfX1BJSV9QSE9ORV9OVU1CRVJfMDAwMDAwMDFfXyI6ICIxODc0NDMyNTU3OSJ9"}
  ],
  "error": null
}
```

### 反匿名化批量接口:restore_text_batch
- 方法/路径:POST `/api/restore_text_batch`
- 请求 Content-Type:`application/json`
- 请求头:`Authorization: <API_KEY>`(若服务端启用鉴权)
- 请求体:对象数组,每项 `{ text, maskMeta }`(`maskMeta` 为 base64 字符串)
- 响应 Content-Type:`application/json`
- 响应体:
```json
{
  "output": [
    {"text":"<restored_text_1>"},
    {"text":"<restored_text_2>"}
  ],
  "error": null
}
```

示例(curl):
```bash
# 若服务端启用了鉴权,请在命令中加入:-H 'Authorization: Bearer <API_KEY>'
curl -s -X POST http://127.0.0.1:8844/api/restore_text_batch \
  -H 'Content-Type: application/json' \
  -d '[{"text":"<masked_text_1>","maskMeta":"<base64(maskMeta_1)>"},{"text":"<masked_text_2>","maskMeta":"<base64(maskMeta_2)>"}]'
```
响应:
```json
{
  "output":[
    {"text":"My email address is test@example.com"},
    {"text":"and my phone number is 18744325579."}
  ],
  "error":null
}
```

## 附注
- `maskMeta` 的内容在服务端是 `placeholdersMap` 的 UTF-8 JSON 字节整体 base64 编码;客户端无需理解其结构,按原样传回 `restore_text` 即可。
- 若启用鉴权,请在请求头携带 `Authorization`(值可以是 `<API_KEY>` 或 `Bearer <API_KEY>`)。

--------------------------------------------------------------------------------