├── datasets ├── test_sites │ ├── accounts.g.cdcde.com │ │ ├── html.txt │ │ ├── info.txt │ │ └── shot.png │ └── .DS_Store ├── .DS_Store └── overview.png ├── .gitignore ├── WEBtool ├── mainpage.png ├── sidebar.png ├── static │ ├── icon │ │ ├── fish.png │ │ ├── succ.png │ │ ├── file1.png │ │ └── noresult1.png │ ├── js │ │ ├── main.js │ │ └── sidebar.js │ └── css │ │ ├── sidebar.css │ │ └── style.css ├── readme.md ├── utils_web.py ├── app.py ├── templates │ └── index.html └── phishpedia_web.py ├── .github └── workflows │ ├── lint.yml │ ├── pytest.yml │ └── codeql.yml ├── configs.yaml ├── Plugin_for_Chrome ├── client │ ├── manifest.json │ ├── popup │ │ ├── popup.html │ │ ├── popup.css │ │ └── popup.js │ └── background.js ├── README.md └── server │ └── app.py ├── pixi.toml ├── setup.sh ├── logo_recog.py ├── configs.py ├── setup.bat ├── utils.py ├── README.md ├── phishpedia.py ├── LICENSE ├── models.py └── logo_matching.py /datasets/test_sites/accounts.g.cdcde.com/html.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/test_sites/accounts.g.cdcde.com/info.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/test_sites/accounts.g.cdcde.com/shot.png: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.zip 2 | *.pkl 3 | *.pth* 4 | venv/ 5 | __pycache__/ -------------------------------------------------------------------------------- /datasets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindsey98/Phishpedia/HEAD/datasets/.DS_Store 
-------------------------------------------------------------------------------- /WEBtool/mainpage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindsey98/Phishpedia/HEAD/WEBtool/mainpage.png -------------------------------------------------------------------------------- /WEBtool/sidebar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindsey98/Phishpedia/HEAD/WEBtool/sidebar.png -------------------------------------------------------------------------------- /datasets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindsey98/Phishpedia/HEAD/datasets/overview.png -------------------------------------------------------------------------------- /WEBtool/static/icon/fish.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindsey98/Phishpedia/HEAD/WEBtool/static/icon/fish.png -------------------------------------------------------------------------------- /WEBtool/static/icon/succ.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindsey98/Phishpedia/HEAD/WEBtool/static/icon/succ.png -------------------------------------------------------------------------------- /WEBtool/static/icon/file1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindsey98/Phishpedia/HEAD/WEBtool/static/icon/file1.png -------------------------------------------------------------------------------- /datasets/test_sites/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindsey98/Phishpedia/HEAD/datasets/test_sites/.DS_Store 
-------------------------------------------------------------------------------- /WEBtool/static/icon/noresult1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lindsey98/Phishpedia/HEAD/WEBtool/static/icon/noresult1.png -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: flake8 Lint 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | flake8-lint: 7 | runs-on: ubuntu-latest 8 | name: Lint 9 | steps: 10 | - name: Check out source repository 11 | uses: actions/checkout@v3 12 | - name: Set up Python environment 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: "3.11" 16 | - name: flake8 Lint 17 | uses: py-actions/flake8@v2 18 | with: 19 | ignore: "E266,W293,W504,E501" 20 | -------------------------------------------------------------------------------- /configs.yaml: -------------------------------------------------------------------------------- 1 | ELE_MODEL: # element recognition model -- logo only 2 | CFG_PATH: models/faster_rcnn.yaml # os.path.join(os.path.dirname(__file__), xxx) 3 | WEIGHTS_PATH: models/rcnn_bet365.pth 4 | DETECT_THRE: 0.05 5 | 6 | SIAMESE_MODEL: 7 | NUM_CLASSES: 277 # number of brands, users don't need to modify this even the targetlist is expanded 8 | MATCH_THRE: 0.87 # FIXME: threshold is 0.87 in phish-discovery? 
9 | WEIGHTS_PATH: models/resnetv2_rgb_new.pth.tar 10 | TARGETLIST_PATH: models/expand_targetlist.zip 11 | DOMAIN_MAP_PATH: models/domain_map.pkl -------------------------------------------------------------------------------- /Plugin_for_Chrome/client/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest_version": 3, 3 | "name": "Phishing Detector", 4 | "version": "1.0", 5 | "description": "Detect phishing websites using screenshot and URL analysis", 6 | "permissions": [ 7 | "activeTab", 8 | "scripting", 9 | "storage", 10 | "tabs" 11 | ], 12 | "host_permissions": [ 13 | "http://localhost:5000/*" 14 | ], 15 | "action": { 16 | "default_popup": "popup/popup.html" 17 | }, 18 | "background": { 19 | "service_worker": "background.js" 20 | }, 21 | "commands": { 22 | "_execute_action": { 23 | "suggested_key": { 24 | "default": "Ctrl+Shift+H", 25 | "mac": "Command+Shift+H" 26 | }, 27 | "description": "Analyze current page for phishing" 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /Plugin_for_Chrome/client/popup/popup.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
9 |

Phishing Detector

10 | 11 | 14 | 22 | 25 |
26 | 27 | 28 | -------------------------------------------------------------------------------- /Plugin_for_Chrome/client/popup/popup.css: -------------------------------------------------------------------------------- 1 | .container { 2 | width: 300px; 3 | padding: 16px; 4 | } 5 | 6 | h1 { 7 | font-size: 18px; 8 | margin-bottom: 16px; 9 | } 10 | 11 | button { 12 | width: 100%; 13 | padding: 8px; 14 | background-color: #4CAF50; 15 | color: white; 16 | border: none; 17 | border-radius: 4px; 18 | cursor: pointer; 19 | margin-bottom: 16px; 20 | } 21 | 22 | button:hover { 23 | background-color: #45a049; 24 | } 25 | 26 | .hidden { 27 | display: none; 28 | } 29 | 30 | #loading { 31 | text-align: center; 32 | margin: 16px 0; 33 | } 34 | 35 | #result { 36 | margin-top: 16px; 37 | } 38 | 39 | .safe { 40 | color: #4CAF50; 41 | } 42 | 43 | .dangerous { 44 | color: #f44336; 45 | } 46 | 47 | .error-message { 48 | color: #f44336; 49 | } -------------------------------------------------------------------------------- /pixi.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "phishpedia" 3 | channels = ["conda-forge"] 4 | platforms = ["osx-arm64", "linux-64", "win-64"] 5 | 6 | [dependencies] 7 | python = ">=3.8" 8 | pip = "*" 9 | setuptools = "*" 10 | wheel = "*" 11 | numpy = "1.23.0" 12 | requests = "*" 13 | scikit-learn = "*" 14 | spacy = "*" 15 | beautifulsoup4 = "*" 16 | matplotlib = "*" 17 | pandas = "*" 18 | nltk = "*" 19 | tqdm = "*" 20 | unidecode = "*" 21 | gdown = "*" 22 | tldextract = "*" 23 | scipy = "*" 24 | pathlib = "*" 25 | fvcore = "*" 26 | lxml = "*" 27 | psutil = "*" 28 | Pillow = "8.4.0" 29 | 30 | 31 | [pypi-dependencies] 32 | "flask" = "*" 33 | "flask-cors" = "*" 34 | "pycocotools" = "*" 35 | "opencv-python"= "*" 36 | "opencv-contrib-python"= "*" 37 | torch = { version = ">=1.9.0", index = "https://download.pytorch.org/whl/cpu" } 38 | torchvision = { version = ">=0.10.0", index = 
"https://download.pytorch.org/whl/cpu" } 39 | 40 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euo pipefail # Safer bash behavior 4 | IFS=$'\n\t' 5 | 6 | # Install Detectron2 7 | pixi run pip install --no-build-isolation git+https://github.com/facebookresearch/detectron2.git 8 | 9 | # Set up model directory 10 | FILEDIR="$(pwd)" 11 | MODELS_DIR="$FILEDIR/models" 12 | mkdir -p "$MODELS_DIR" 13 | cd "$MODELS_DIR" 14 | 15 | # Download model files 16 | pixi run gdown --id "1tE2Mu5WC8uqCxei3XqAd7AWaP5JTmVWH" -O "rcnn_bet365.pth" 17 | pixi run gdown --id "1Q6lqjpl4exW7q_dPbComcj0udBMDl8CW" -O "faster_rcnn.yaml" 18 | pixi run gdown --id "1H0Q_DbdKPLFcZee8I14K62qV7TTy7xvS" -O "resnetv2_rgb_new.pth.tar" 19 | pixi run gdown --id "1fr5ZxBKyDiNZ_1B6rRAfZbAHBBoUjZ7I" -O "expand_targetlist.zip" 20 | pixi run gdown --id "1qSdkSSoCYUkZMKs44Rup_1DPBxHnEKl1" -O "domain_map.pkl" 21 | 22 | # Extract and flatten expand_targetlist 23 | echo "Extracting expand_targetlist.zip..." 24 | unzip -o expand_targetlist.zip -d expand_targetlist 25 | 26 | cd expand_targetlist || error_exit "Extraction directory missing." 27 | 28 | if [ -d "expand_targetlist" ]; then 29 | echo "Flattening nested expand_targetlist/ directory..." 30 | mv expand_targetlist/* . 31 | rm -r expand_targetlist 32 | fi 33 | 34 | echo "Model setup and extraction complete." 
35 | -------------------------------------------------------------------------------- /Plugin_for_Chrome/client/background.js: -------------------------------------------------------------------------------- 1 | // 处理截图和URL获取 2 | async function captureTabInfo(tab) { 3 | try { 4 | // 获取截图 5 | const screenshot = await chrome.tabs.captureVisibleTab(null, { 6 | format: 'png' 7 | }); 8 | 9 | // 获取当前URL 10 | const url = tab.url; 11 | 12 | // 发送到服务器进行分析 13 | const response = await fetch('http://localhost:5000/analyze', { 14 | method: 'POST', 15 | headers: { 16 | 'Content-Type': 'application/json', 17 | }, 18 | body: JSON.stringify({ 19 | url: url, 20 | screenshot: screenshot 21 | }) 22 | }); 23 | 24 | const result = await response.json(); 25 | 26 | // 将结果发送到popup 27 | chrome.runtime.sendMessage({ 28 | type: 'analysisResult', 29 | data: result 30 | }); 31 | 32 | } catch (error) { 33 | console.error('Error capturing tab info:', error); 34 | chrome.runtime.sendMessage({ 35 | type: 'error', 36 | data: error.message 37 | }); 38 | } 39 | } 40 | 41 | // 监听快捷键命令 42 | chrome.commands.onCommand.addListener(async (command) => { 43 | if (command === '_execute_action') { 44 | const [tab] = await chrome.tabs.query({ active: true, currentWindow: true }); 45 | if (tab) { 46 | await captureTabInfo(tab); 47 | } 48 | } 49 | }); 50 | 51 | // 监听来自popup的消息 52 | chrome.runtime.onMessage.addListener((request, sender, sendResponse) => { 53 | if (request.type === 'analyze') { 54 | chrome.tabs.query({ active: true, currentWindow: true }, async (tabs) => { 55 | if (tabs[0]) { 56 | await captureTabInfo(tabs[0]); 57 | } 58 | }); 59 | } 60 | return true; 61 | }); -------------------------------------------------------------------------------- /Plugin_for_Chrome/client/popup/popup.js: -------------------------------------------------------------------------------- 1 | document.addEventListener('DOMContentLoaded', () => { 2 | const analyzeBtn = document.getElementById('analyzeBtn'); 3 | const loading = 
document.getElementById('loading'); 4 | const result = document.getElementById('result'); 5 | const status = document.getElementById('status'); 6 | const legitUrl = document.getElementById('legitUrl'); 7 | const legitUrlLink = document.getElementById('legitUrlLink'); 8 | const error = document.getElementById('error'); 9 | 10 | // 点击分析按钮 11 | analyzeBtn.addEventListener('click', () => { 12 | // 显示加载状态 13 | loading.classList.remove('hidden'); 14 | result.classList.add('hidden'); 15 | error.classList.add('hidden'); 16 | 17 | // 发送消息给background script 18 | chrome.runtime.sendMessage({ 19 | type: 'analyze' 20 | }); 21 | }); 22 | 23 | // 监听来自background的消息 24 | chrome.runtime.onMessage.addListener((message) => { 25 | loading.classList.add('hidden'); 26 | 27 | if (message.type === 'analysisResult') { 28 | result.classList.remove('hidden'); 29 | 30 | if (message.data.isPhishing) { 31 | status.innerHTML = '⚠️ 警告:这可能是一个钓鱼网站!'; 32 | if (message.data.legitUrl) { 33 | legitUrl.classList.remove('hidden'); 34 | legitUrlLink.href = message.data.legitUrl; 35 | legitUrlLink.textContent = message.data.brand; 36 | } 37 | } else { 38 | status.innerHTML = '✓ 这是一个安全的网站'; 39 | legitUrl.classList.add('hidden'); 40 | } 41 | } else if (message.type === 'error') { 42 | error.classList.remove('hidden'); 43 | error.querySelector('.error-message').textContent = message.data; 44 | } 45 | }); 46 | }); -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | name: Pytest CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | # 第一步:检出代码 15 | - name: Checkout code 16 | uses: actions/checkout@v3 17 | 18 | # 第二步:设置 Miniconda 19 | - name: Set up Miniconda 20 | uses: conda-incubator/setup-miniconda@v2 21 | with: 22 | auto-update-conda: true # 自动更新 Conda 23 | 
python-version: '3.9' # 指定 Python 版 24 | activate-environment: phishpedia 25 | 26 | # 保存cache 27 | - name: Cache Conda packages and pip cache 28 | uses: actions/cache@v3 29 | with: 30 | path: | 31 | ~/.conda/pkgs # 缓存 Conda 包 32 | ~/.cache/pip # 缓存 pip 包 33 | phishpedia/lib/python3.9/site-packages # 可选:缓存虚拟环境的 site-packages 34 | key: ${{ runner.os }}-conda-${{ hashFiles('**/environment.yml', '**/requirements.txt') }} 35 | restore-keys: | 36 | ${{ runner.os }}-conda- 37 | 38 | # 第三步:升级 pip 39 | - name: Upgrade pip 40 | run: | 41 | python -m pip install --upgrade pip 42 | 43 | 44 | # 第四步:克隆 Phishpedia 仓库并运行 setup.sh 45 | - name: Clone Phishpedia repo and run setup.sh 46 | run: | 47 | git clone https://github.com/lindsey98/Phishpedia.git 48 | cd Phishpedia 49 | chmod +x ./setup.sh 50 | ./setup.sh 51 | 52 | 53 | # 第五步:安装项目依赖和 pytest 54 | - name: Install dependencies and pytest 55 | run: | 56 | 57 | conda run -n phishpedia pip install pytest 58 | conda run -n phishpedia pip install validators 59 | 60 | 61 | # 步骤 6:运行 Pytest 测试 62 | - name: Run Pytest 63 | run: | 64 | 65 | conda run -n phishpedia pytest tests/test_logo_matching.py 66 | conda run -n phishpedia pytest tests/test_logo_recog.py 67 | conda run -n phishpedia pytest tests/test_phishpedia.py 68 | -------------------------------------------------------------------------------- /Plugin_for_Chrome/README.md: -------------------------------------------------------------------------------- 1 | # Plugin_for_Chrome 2 | 3 | ## Project Overview 4 | 5 | `Plugin_for_Chrome` is a Chrome extension project designed to detect phishing websites. 6 | The extension automatically retrieves the current webpage's URL and a screenshot when the user presses a predefined hotkey or clicks the extension button, then sends this information to the server for phishing detection. The server utilizes the Flask framework, loads the Phishpedia model for identification, and returns the detection results. 
7 | 8 | ## Directory Structure 9 | 10 | ``` 11 | Plugin_for_Chrome/ 12 | ├── client/ 13 | │ ├── background.js # Handles the extension's background logic, including hotkeys and button click events. 14 | │ ├── manifest.json # Configuration file for the Chrome extension. 15 | │ └── popup/ 16 | │ ├── popup.html # HTML file for the extension's popup page. 17 | │ ├── popup.js # JavaScript file for the extension's popup page. 18 | │ └── popup.css # CSS file for the extension's popup page. 19 | └── server/ 20 | └── app.py # Main program for the Flask server, handling client requests and invoking the Phishpedia model for detection. 21 | ``` 22 | 23 | ## Installation and Usage 24 | 25 | ### Frontend 26 | 27 | 1. Open the Chrome browser and navigate to `chrome://extensions/`. 28 | 2. Enable Developer Mode. 29 | 3. Click on "Load unpacked" and select the `Plugin_for_Chrome/client` directory (the folder that contains `manifest.json`). 30 | 31 | ### Backend 32 | 33 | 1. Run the Flask server: 34 | ```bash 35 | pixi run python -m Plugin_for_Chrome.server.app 36 | ``` 37 | ## Using the Extension 38 | 39 | In the Chrome browser, press the hotkey `Ctrl+Shift+H` (`Command+Shift+H` on macOS) or click the extension button. 40 | The extension will automatically capture the current webpage's URL and a screenshot, then send them to the server for analysis. 41 | The server will return the detection results, and the extension will display whether the webpage is a phishing site along with the corresponding legitimate website. 42 | 43 | ## Notes 44 | 45 | Ensure that the server is running locally and listening on the default port 5000. 46 | The extension sends its requests to `http://localhost:5000`, so the server must run on the same machine as the browser. 47 | 48 | ## Contributing 49 | 50 | Feel free to submit issues and contribute code! 
# logo_recog.py: logo detection helpers built on a Detectron2 predictor.
from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor
import cv2
import numpy as np
import torch


def pred_rcnn(im, predictor):
    '''
    Perform inference for RCNN and return the boxes predicted as class 1.
    :param im: path of the image file, read with cv2.imread
    :param predictor: callable that receives the decoded image and returns a
        dict with an 'instances' entry (config_rcnn builds such a predictor)
    :return: tensor holding the boxes whose predicted class is 1, or None when
        the image cannot be read
    '''
    # Decode into a separate name. Reusing `im` for the decoded image made the
    # failure message below print "None" instead of the offending path.
    image = cv2.imread(im)
    if image is None:
        print(f"Image at path {im} is None")
        return None

    # Drop the alpha channel of 4-channel images before inference.
    if image.shape[-1] == 4:
        image = cv2.cvtColor(image, cv2.COLOR_BGRA2BGR)

    outputs = predictor(image)

    instances = outputs['instances']
    pred_classes = instances.pred_classes  # tensor
    pred_boxes = instances.pred_boxes  # Boxes object

    # NOTE(review): class id 1 is selected as "logo" here, while COLORS below
    # labels id 0 as logo -- confirm against the model's label map.
    logo_boxes = pred_boxes[pred_classes == 1].tensor

    return logo_boxes


def config_rcnn(cfg_path, weights_path, conf_threshold):
    '''
    Configure weights and confidence threshold, then build the predictor.
    :param cfg_path: path of the Detectron2 config file to merge
    :param weights_path: value stored in cfg.MODEL.WEIGHTS
    :param conf_threshold: value stored in cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
    :return: a DefaultPredictor built from the resulting config
    '''
    cfg = get_cfg()
    cfg.merge_from_file(cfg_path)
    cfg.MODEL.WEIGHTS = weights_path
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = conf_threshold
    # Fall back to CPU inference when CUDA is not available.
    if not torch.cuda.is_available():
        cfg.MODEL.DEVICE = 'cpu'

    # Initialize model
    predictor = DefaultPredictor(cfg)
    return predictor


COLORS = {
    0: (255, 255, 0),  # logo
    1: (36, 255, 12),  # input
    2: (0, 255, 255),  # button
    3: (0, 0, 255),  # label
    4: (255, 0, 0)  # block
}


def vis(img_path, pred_boxes):
    '''
    Visualize rcnn predictions by drawing the boxes on the image.
    :param img_path: str, path of the image to draw on
    :param pred_boxes: torch.Tensor or np.ndarray of shape Nx4, bounding box
        coordinates in (x1, y1, x2, y2); may be None or empty
    :return: the image returned by cv2.imread with the boxes drawn on it; the
        image is returned without drawing when there are no boxes
    '''

    check = cv2.imread(img_path)
    if pred_boxes is None or len(pred_boxes) == 0:
        print("Pred_boxes is None or the length of pred_boxes is 0")
        return check

    if isinstance(pred_boxes, torch.Tensor):
        # detach()/cpu() make the conversion work for CUDA tensors and tensors
        # that track gradients; a bare .numpy() raises for both.
        pred_boxes = pred_boxes.detach().cpu().numpy()
    elif not isinstance(pred_boxes, np.ndarray):
        pred_boxes = np.asarray(pred_boxes)

    # draw rectangle: the first box uses COLORS[0], every other box COLORS[1]
    for j, box in enumerate(pred_boxes):
        color = COLORS[0] if j == 0 else COLORS[1]
        cv2.rectangle(check, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])), color, 2)

    return check
**Result Display** 27 | - The original image with the extracted logo will be displayed in the "Logo Extraction" box 28 | - Detection results will be displayed in the "Detection Result" box, together with a synthetic explanation 29 | - You can clearly see the detected brand identifiers and related information 30 | 31 | ### 2. Sidebar (For database management) 32 | 33 | Click the sidebar button "☰" at the top right corner to open a sidebar showing the backend database. 34 | 35 | ![image-20241228141419609](./sidebar.png) 36 | 37 | 1. **Brand Management** 38 | - Click "Add Brand" to add a new brand 39 | - Enter the brand name and its corresponding domains in the form 40 | - Click a brand to select it, then click "Delete Brand" to remove the selected brand 41 | - Double-click a brand to see the logos under this brand 42 | 2. **Logo Management** 43 | - Click a brand to select it, then click "Add Logo" to add brand logos 44 | - Click a logo to select it, then click "Delete Logo" to remove the selected logo 45 | 3. **Data Update** 46 | - After making changes, click the "Reload Model" button 47 | - The system will reload the updated dataset 48 | 49 | ## Main Features 50 | 51 | 1. **Phishing Detection** 52 | 53 | - URL input and detection 54 | - Screenshot upload and analysis 55 | - Detection result visualization 56 | 57 | 2. 
# WEBtool/utils_web.py: helper functions for the Phishpedia web app
import os
import pickle
import shutil
import socket
import base64
import io


def check_port_inuse(port, host):
    """Return True when a TCP connection to (host, port) succeeds within 1 second.

    Any socket error (refused, timed out, socket creation failure) yields False.
    """
    try:
        # The context manager closes the socket on every path. The previous
        # try/finally read `s` even when socket.socket() itself had raised,
        # which turned the failure into an UnboundLocalError.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(1)
            s.connect((host, port))
        return True
    except socket.error:
        return False


def allowed_file(filename):
    """Return True when filename has a png, jpg or jpeg extension (case-insensitive)."""
    return '.' in filename and \
        filename.rsplit('.', 1)[1].lower() in {'png', 'jpg', 'jpeg'}


def initial_upload_folder(upload_folder):
    """Delete upload_folder if it exists, then recreate it empty."""
    try:
        shutil.rmtree(upload_folder)
    except FileNotFoundError:
        pass
    os.makedirs(upload_folder, exist_ok=True)


def convert_to_base64(image_array):
    """Encode a BGR image array as a base64 PNG string; return None for None."""
    if image_array is None:
        return None

    # Imported lazily so the socket/pickle helpers in this module stay usable
    # when OpenCV or Pillow is not installed.
    import cv2
    from PIL import Image

    image_array_rgb = cv2.cvtColor(image_array, cv2.COLOR_BGR2RGB)
    img = Image.fromarray(image_array_rgb)
    buffered = io.BytesIO()
    img.save(buffered, format="PNG")
    plotvis_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
    return plotvis_base64


def domain_map_add(brand_name, domains_str, domain_map_path):
    """Add domains for brand_name to the pickled domain map and save it.

    :param brand_name: key in the domain map
    :param domains_str: comma-separated domains; surrounding whitespace is
        stripped and empty items are ignored
    :param domain_map_path: path of the pickle file, read and rewritten in place

    Existing domains keep their order; new ones are appended once each, even
    when domains_str repeats a domain.
    """
    # De-duplicate the input while preserving the order given by the caller.
    domains = list(dict.fromkeys(
        domain.strip() for domain in domains_str.split(',') if domain.strip()
    ))

    # Load existing domain mapping.
    # NOTE: pickle.load executes arbitrary code from the file; domain_map_path
    # must point to a trusted file.
    with open(domain_map_path, 'rb') as f:
        domain_map = pickle.load(f)

    if brand_name in domain_map:
        current = domain_map[brand_name]
        if isinstance(current, list):
            existing_domains = set(current)
            current.extend(d for d in domains if d not in existing_domains)
        else:
            # If current value is not a list, convert to list
            domain_map[brand_name] = [current] + [d for d in domains if d != current]
    else:
        domain_map[brand_name] = domains

    # Save updated mapping
    with open(domain_map_path, 'wb') as f:
        pickle.dump(domain_map, f)


def domain_map_delete(brand_name, domain_map_path):
    """Remove brand_name (if present) from the pickled domain map and save it.

    Prints the number of entries before and after the deletion.
    """
    # Load existing domain mapping (see the trust note in domain_map_add).
    with open(domain_map_path, 'rb') as f:
        domain_map = pickle.load(f)

    print("before deleting", len(domain_map))

    # Delete brand and its domains; a missing brand is not an error.
    domain_map.pop(brand_name, None)

    print("after deleting", len(domain_map))

    # Save updated mapping
    with open(domain_map_path, 'wb') as f:
        pickle.dump(domain_map, f)
# configs.py: global configuration
import yaml
from logo_matching import cache_reference_list, load_model_weights
from logo_recog import config_rcnn
import os
import numpy as np


def get_absolute_path(relative_path):
    """Resolve relative_path against the directory that contains this file."""
    base_path = os.path.dirname(__file__)
    return os.path.abspath(os.path.join(base_path, relative_path))


def load_config(reload_targetlist=False):
    """Load configs.yaml and build the models and reference data it describes.

    :param reload_targetlist: when True, recompute the reference logo features
        even if the cached .npy files exist
    :return: tuple (ELE_MODEL, SIAMESE_THRE, SIAMESE_MODEL, LOGO_FEATS,
        LOGO_FILES, DOMAIN_MAP_PATH)
    """
    base_dir = os.path.dirname(__file__)
    # safe_load is sufficient here: the config only holds plain scalars.
    with open(os.path.join(base_dir, 'configs.yaml')) as file:
        configs = yaml.safe_load(file)

    # Turn every string setting whose key contains 'PATH' into an absolute
    # path relative to this file.
    for section, settings in configs.items():
        for key, value in settings.items():
            if 'PATH' in key and isinstance(value, str):
                configs[section][key] = get_absolute_path(value)

    # element recognition model
    ELE_CFG_PATH = configs['ELE_MODEL']['CFG_PATH']
    ELE_WEIGHTS_PATH = configs['ELE_MODEL']['WEIGHTS_PATH']
    ELE_CONFIG_THRE = configs['ELE_MODEL']['DETECT_THRE']
    ELE_MODEL = config_rcnn(ELE_CFG_PATH,
                            ELE_WEIGHTS_PATH,
                            conf_threshold=ELE_CONFIG_THRE)

    # siamese model
    SIAMESE_THRE = configs['SIAMESE_MODEL']['MATCH_THRE']

    print('Load protected logo list')
    # The reference logos are read from the folder named after the zip file
    # (models/expand_targetlist.zip -> models/expand_targetlist). This
    # function does not extract the archive itself.
    targetlist_zip_path = configs['SIAMESE_MODEL']['TARGETLIST_PATH']
    targetlist_dir = os.path.dirname(targetlist_zip_path)
    zip_file_name = os.path.basename(targetlist_zip_path)
    targetlist_folder = zip_file_name.split('.zip')[0]
    full_targetlist_folder_dir = os.path.join(targetlist_dir, targetlist_folder)

    SIAMESE_MODEL = load_model_weights(num_classes=configs['SIAMESE_MODEL']['NUM_CLASSES'],
                                       weights_path=configs['SIAMESE_MODEL']['WEIGHTS_PATH'])

    logo_feats_path = os.path.join(base_dir, 'LOGO_FEATS.npy')
    logo_files_path = os.path.join(base_dir, 'LOGO_FILES.npy')

    # Both cache files must exist. Checking only LOGO_FEATS.npy made np.load
    # crash when LOGO_FILES.npy was missing.
    cache_ready = os.path.exists(logo_feats_path) and os.path.exists(logo_files_path)

    if reload_targetlist or not cache_ready:
        LOGO_FEATS, LOGO_FILES = cache_reference_list(model=SIAMESE_MODEL,
                                                      targetlist_path=full_targetlist_folder_dir)
        print('Finish loading protected logo list')
        np.save(logo_feats_path, LOGO_FEATS)
        np.save(logo_files_path, LOGO_FILES)
    else:
        LOGO_FEATS = np.load(logo_feats_path)
        LOGO_FILES = np.load(logo_files_path)

    DOMAIN_MAP_PATH = configs['SIAMESE_MODEL']['DOMAIN_MAP_PATH']

    return ELE_MODEL, SIAMESE_THRE, SIAMESE_MODEL, LOGO_FEATS, LOGO_FILES, DOMAIN_MAP_PATH
# WEBtool/app.py: Flask endpoint that runs Phishpedia on a URL + screenshot
from flask import Flask, request, jsonify
from flask_cors import CORS
import base64
from io import BytesIO
from PIL import Image
from datetime import datetime
import os
import tempfile
from phishpedia import PhishpediaWrapper, result_file_write

app = Flask(__name__)
CORS(app)

# Initialise the model once, when the application is created
with app.app_context():
    current_dir = os.path.dirname(os.path.realpath(__file__))
    log_dir = os.path.join(current_dir, 'plugin_logs')
    os.makedirs(log_dir, exist_ok=True)
    phishpedia_cls = PhishpediaWrapper()


def _write_result_log(log_file_path, url, phish_category, pred_target,
                      matched_domain, siamese_conf, logo_recog_time, logo_match_time):
    """Append one result record to the log, trying ISO-8859-1 first, then UTF-8."""
    for encoding in ('ISO-8859-1', 'utf-8'):
        try:
            with open(log_file_path, "a+", encoding=encoding) as f:
                result_file_write(f, current_dir, url, phish_category, pred_target,
                                  matched_domain if matched_domain else ["unknown"],
                                  siamese_conf if siamese_conf is not None else 0.0,
                                  logo_recog_time, logo_match_time)
            return
        except UnicodeError:
            # Only the first encoding may fail silently; a UTF-8 failure is
            # propagated to the caller, as before.
            if encoding == 'utf-8':
                raise


@app.route('/analyze', methods=['POST'])
def analyze():
    """Run Phishpedia on the posted URL and screenshot.

    Expects a JSON body with 'url' and 'screenshot' (base64 image data, with
    or without a "data:...;base64," prefix). Responds with a JSON object
    holding isPhishing, brand, legitUrl and confidence, or with "ERROR" and
    status 500 when anything fails.
    """
    screenshot_path = None
    try:
        print('Request received')
        data = request.get_json()
        url = data.get('url')
        screenshot_data = data.get('screenshot')

        # Decode the base64 image data. Taking the part after the first comma
        # handles data URLs; a bare base64 payload is used as-is.
        image_data = base64.b64decode(screenshot_data.split(',', 1)[-1])
        image = Image.open(BytesIO(image_data))

        # Use a unique temporary file per request. A fixed name in the
        # working directory is overwritten by concurrent requests.
        fd, screenshot_path = tempfile.mkstemp(prefix='screenshot_', suffix='.png')
        os.close(fd)
        image.save(screenshot_path, format='PNG')

        # Run the Phishpedia model
        phish_category, pred_target, matched_domain, \
            plotvis, siamese_conf, pred_boxes, \
            logo_recog_time, logo_match_time = phishpedia_cls.test_orig_phishpedia(url, screenshot_path, None)

        # Build the response payload
        result = {
            "isPhishing": bool(phish_category),
            "brand": pred_target if pred_target else "unknown",
            "legitUrl": f"https://{matched_domain[0]}" if matched_domain else "unknown",
            "confidence": float(siamese_conf) if siamese_conf is not None else 0.0
        }

        # Write the log record
        today = datetime.now().strftime('%Y%m%d')
        log_file_path = os.path.join(log_dir, f'{today}_results.txt')
        _write_result_log(log_file_path, url, phish_category, pred_target,
                          matched_domain, siamese_conf, logo_recog_time, logo_match_time)

        return jsonify(result)

    except Exception as e:
        print(f"Error in analyze: {str(e)}")
        log_error_path = os.path.join(log_dir, 'log_error.txt')
        with open(log_error_path, "a+", encoding='utf-8') as f:
            f.write(f'{datetime.now().strftime("%Y-%m-%d %H:%M:%S")} - {str(e)}\n')
        return jsonify("ERROR"), 500

    finally:
        # Remove the temporary screenshot on success and on failure alike.
        if screenshot_path and os.path.exists(screenshot_path):
            os.remove(screenshot_path)


if __name__ == '__main__':
    # NOTE(review): 0.0.0.0 exposes the endpoint on every network interface;
    # confirm this is intended.
    app.run(host='0.0.0.0', port=5000, debug=False)
@echo off
setlocal enabledelayedexpansion

:: ------------------------------------------------------------------------------
:: Phishpedia model setup for Windows.
:: Installs detectron2 into the Pixi environment, downloads the model files
:: from Google Drive into .\models and unpacks the logo target list.
:: Run from the repository root after "pixi install".
:: ------------------------------------------------------------------------------
echo [%DATE% %TIME%] Starting setup...

:: ------------------------------------------------------------------------------
:: Tool Checks
:: NOTE: a literal ")" inside a parenthesised block must be escaped as "^)",
:: otherwise cmd treats it as the end of the block.
:: ------------------------------------------------------------------------------
where pixi >nul 2>nul || (
    echo [ERROR] pixi not found. Please install Pixi.
    exit /b 1
)
:: gdown is invoked through "pixi run", so look for it there rather than on PATH
pixi run gdown --version >nul 2>nul || (
    echo [ERROR] gdown not found. Please install gdown ^(via pixi^).
    exit /b 1
)
where unzip >nul 2>nul || (
    echo [ERROR] unzip not found. Please install unzip utility.
    exit /b 1
)

:: ------------------------------------------------------------------------------
:: Setup Directories
:: ------------------------------------------------------------------------------
set "FILEDIR=%cd%"
set "MODELS_DIR=%FILEDIR%\models"
if not exist "%MODELS_DIR%" mkdir "%MODELS_DIR%"
cd /d "%MODELS_DIR%"

:: ------------------------------------------------------------------------------
:: Install Detectron2
:: ------------------------------------------------------------------------------
echo [%DATE% %TIME%] Installing detectron2...
pixi run pip install --no-build-isolation git+https://github.com/facebookresearch/detectron2.git || (
    echo [ERROR] Failed to install detectron2.
    exit /b 1
)

:: ------------------------------------------------------------------------------
:: File Metadata
:: ------------------------------------------------------------------------------
set RETRY_COUNT=3

:: Model files and Google Drive IDs
set file1=rcnn_bet365.pth
set id1=1tE2Mu5WC8uqCxei3XqAd7AWaP5JTmVWH

set file2=faster_rcnn.yaml
set id2=1Q6lqjpl4exW7q_dPbComcj0udBMDl8CW

set file3=resnetv2_rgb_new.pth.tar
set id3=1H0Q_DbdKPLFcZee8I14K62qV7TTy7xvS

set file4=expand_targetlist.zip
set id4=1fr5ZxBKyDiNZ_1B6rRAfZbAHBBoUjZ7I

set file5=domain_map.pkl
set id5=1qSdkSSoCYUkZMKs44Rup_1DPBxHnEKl1

:: ------------------------------------------------------------------------------
:: Download Loop
:: The retry logic lives in a subroutine: "goto" and labels do not work inside
:: a parenthesised "for" body, and label names cannot contain %%i.
:: ------------------------------------------------------------------------------
for /L %%i in (1,1,5) do (
    call :download_with_retry "!file%%i!" "!id%%i!" || exit /b 1
)

:: ------------------------------------------------------------------------------
:: Extraction
:: ------------------------------------------------------------------------------
echo [%DATE% %TIME%] Extracting expand_targetlist.zip...
unzip -o expand_targetlist.zip -d expand_targetlist || (
    echo [ERROR] Failed to unzip file.
    exit /b 1
)

:: Flatten nested folder if necessary.
:: "move dir\*.*" only moves files, so sub-directories are moved explicitly first.
cd expand_targetlist
if exist "expand_targetlist\" (
    echo [INFO] Flattening nested expand_targetlist directory...
    for /d %%D in ("expand_targetlist\*") do move "%%D" . >nul
    for %%F in ("expand_targetlist\*") do move "%%F" . >nul
    rmdir "expand_targetlist"
)

:: ------------------------------------------------------------------------------
:: Done
:: ------------------------------------------------------------------------------
echo [%DATE% %TIME%] [SUCCESS] Model setup and extraction complete.
endlocal
exit /b 0

:: ------------------------------------------------------------------------------
:: :download_with_retry <filename> <google-drive-id>
:: Downloads the file unless it already exists; retries up to RETRY_COUNT times.
:: Returns 0 on success (or when skipped), 1 when every attempt failed.
:: ------------------------------------------------------------------------------
:download_with_retry
set "FILENAME=%~1"
set "FILEID=%~2"
if exist "%FILENAME%" (
    echo [INFO] %FILENAME% already exists. Skipping.
    exit /b 0
)
set /A count=1

:download_retry
echo [%DATE% %TIME%] Downloading %FILENAME% ^(Attempt %count%/%RETRY_COUNT%^)...
pixi run gdown --id %FILEID% -O "%FILENAME%" && exit /b 0
set /A count+=1
if %count% LEQ %RETRY_COUNT% (
    timeout /t 2 >nul
    goto download_retry
)
echo [ERROR] Failed to download %FILENAME% after %RETRY_COUNT% attempts.
exit /b 1
30 | runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} 31 | permissions: 32 | # required for all workflows 33 | security-events: write 34 | 35 | # required to fetch internal or private CodeQL packs 36 | packages: read 37 | 38 | # only required for workflows in private repositories 39 | actions: read 40 | contents: read 41 | 42 | strategy: 43 | fail-fast: false 44 | matrix: 45 | include: 46 | - language: python 47 | build-mode: none 48 | # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' 49 | # Use `c-cpp` to analyze code written in C, C++ or both 50 | # Use 'java-kotlin' to analyze code written in Java, Kotlin or both 51 | # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both 52 | # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, 53 | # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. 54 | # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how 55 | # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages 56 | steps: 57 | - name: Checkout repository 58 | uses: actions/checkout@v4 59 | 60 | # Initializes the CodeQL tools for scanning. 61 | - name: Initialize CodeQL 62 | uses: github/codeql-action/init@v3 63 | with: 64 | languages: ${{ matrix.language }} 65 | build-mode: ${{ matrix.build-mode }} 66 | # If you wish to specify custom queries, you can do so here or in a config file. 67 | # By default, queries listed here will override any specified in a config file. 68 | # Prefix the list here with "+" to use these queries and those in the config file. 
import math

import torch.nn.functional as F

# Maps inconsistent brand spellings to the canonical brand name.
# Built once at import time instead of on every brand_converter() call.
_BRAND_ALIASES = {
    'Adobe Inc.': 'Adobe', 'Adobe Inc': 'Adobe',
    'ADP, LLC': 'ADP', 'ADP, LLC.': 'ADP',
    'Amazon.com Inc.': 'Amazon', 'Amazon.com Inc': 'Amazon',
    'Americanas.com S,A Comercio Electrnico': 'Americanas.com S',
    'AOL Inc.': 'AOL', 'AOL Inc': 'AOL',
    'Apple Inc.': 'Apple', 'Apple Inc': 'Apple',
    'AT&T Inc.': 'AT&T', 'AT&T Inc': 'AT&T',
    'Banco do Brasil S.A.': 'Banco do Brasil S.A',
    'Credit Agricole S.A.': 'Credit Agricole S.A',
    'DGI (French Tax Authority)': 'DGI French Tax Authority',
    'DHL Airways, Inc.': 'DHL Airways', 'DHL Airways, Inc': 'DHL Airways', 'DHL': 'DHL Airways',
    'Dropbox, Inc.': 'Dropbox', 'Dropbox, Inc': 'Dropbox',
    'eBay Inc.': 'eBay', 'eBay Inc': 'eBay',
    'Facebook, Inc.': 'Facebook', 'Facebook, Inc': 'Facebook',
    'Free (ISP)': 'Free ISP',
    'Google Inc.': 'Google', 'Google Inc': 'Google',
    'Mastercard International Incorporated': 'Mastercard International',
    'Netflix Inc.': 'Netflix', 'Netflix Inc': 'Netflix',
    'PayPal Inc.': 'PayPal', 'PayPal Inc': 'PayPal',
    'Royal KPN N.V.': 'Royal KPN N.V',
    'SF Express Co.': 'SF Express Co',
    'SNS Bank N.V.': 'SNS Bank N.V',
    'Square, Inc.': 'Square', 'Square, Inc': 'Square',
    'Webmail Providers': 'Webmail Provider',
    'Yahoo! Inc': 'Yahoo!', 'Yahoo! Inc.': 'Yahoo!',
    'Microsoft OneDrive': 'Microsoft', 'Office365': 'Microsoft', 'Outlook': 'Microsoft',
    'Global Sources (HK)': 'Global Sources HK',
    'T-Online': 'Deutsche Telekom',
    'Airbnb, Inc': 'Airbnb, Inc.',
    'azul': 'Azul',
    'Raiffeisen Bank S.A': 'Raiffeisen Bank S.A.',
    'Twitter, Inc': 'Twitter, Inc.', 'Twitter': 'Twitter, Inc.',
    'capital_one': 'Capital One Financial Corporation',
    'la_banque_postale': 'La Banque postale',
    'db': 'Deutsche Bank AG',
    'Swiss Post': 'PostFinance', 'PostFinance': 'PostFinance',
    'grupo_bancolombia': 'Bancolombia',
    'barclays': 'Barclays Bank Plc',
    'gov_uk': 'Government of the United Kingdom',
    'Aruba S.p.A': 'Aruba S.p.A.',
    'TSB Bank Plc': 'TSB Bank Limited',
    'strato': 'Strato AG',
    'cogeco': 'Cogeco',
    'Canada Revenue Agency': 'Government of Canada',
    'UniCredit Bulbank': 'UniCredit Bank Aktiengesellschaft',
    'ameli_fr': 'French Health Insurance',
    'Banco de Credito del Peru': 'bcp',
}


def resolution_alignment(img1, img2):
    '''
    Resize two images according to the minimum resolution between the two.

    The smaller of (minimum width, minimum height) becomes the common side:
    both images are scaled to that side length and the other side is scaled
    proportionally for each image.

    :param img1: first image in PIL.Image
    :param img2: second image in PIL.Image
    :return: resized img1 in PIL.Image, resized img2 in PIL.Image
             (the inputs unchanged if either minimum dimension is 0)
    '''
    w1, h1 = img1.size
    w2, h2 = img2.size
    w_min, h_min = min(w1, w2), min(h1, h2)
    if w_min == 0 or h_min == 0:  # something wrong, stop resizing
        return img1, img2

    if w_min < h_min:
        # Align on width; ceiling prevents the scaled height from rounding to 0
        size1 = (int(w_min), math.ceil(h1 * (w_min / w1)))
        size2 = (int(w_min), math.ceil(h2 * (w_min / w2)))
    else:
        # Align on height; ceiling prevents the scaled width from rounding to 0
        size1 = (math.ceil(w1 * (h_min / h1)), int(h_min))
        size2 = (math.ceil(w2 * (h_min / h2)), int(h_min))
    return img1.resize(size1), img2.resize(size2)


def brand_converter(brand_name):
    '''
    Helper function to deal with inconsistency in brand naming.

    :param brand_name: brand name as it appears in the data
    :return: the canonical name if an alias is known, else brand_name unchanged
    '''
    return _BRAND_ALIASES.get(brand_name, brand_name)


def l2_norm(x):
    """
    L2-normalise each row of x.

    :param x: tensor; every dimension after the first is flattened, so the
              result is 2-D with shape (x.shape[0], -1)
    :return: tensor whose rows have unit L2 norm
    """
    if len(x.shape):
        x = x.reshape((x.shape[0], -1))
    return F.normalize(x, p=2, dim=1)
4 | 5 | ![Dialogues](https://img.shields.io/badge/Protected_Brands_Size-277-green?style=flat-square) 6 | ![Dialogues](https://img.shields.io/badge/Phishing_Benchmark_Size-30k-green?style=flat-square) 7 | 8 |
9 |

10 | Paper • 11 | Website • 12 | Video • 13 | Dataset • 14 | Citation 15 |

16 | 17 | - This is the official implementation of "Phishpedia: A Hybrid Deep Learning Based Approach to Visually Identify Phishing Webpages" USENIX'21 [link to paper](https://www.usenix.org/conference/usenixsecurity21/presentation/lin), [link to our website](https://sites.google.com/view/phishpedia-site/), [link to our dataset](https://drive.google.com/file/d/12ypEMPRQ43zGRqHGut0Esq2z5en0DH4g/view?usp=drive_link). 18 | 19 | - Existing reference-based phishing detectors: 20 | - :x: Lack **interpretability**: they only give a binary decision (legit or phish) 21 | - :x: Are **not robust against distribution shift**, because the classifier is biased towards the phishing training set 22 | - :x: **Lack a large-scale phishing benchmark** dataset 23 | - The contributions of our paper: 24 | - :white_check_mark: We propose a phishing identification system, Phishpedia, which has high identification accuracy and low runtime overhead, outperforming the relevant state-of-the-art identification approaches. 25 | - :white_check_mark: We are the first to propose a **consistency-based method** for phishing detection, in place of the traditional classification-based method. We investigate the consistency between the webpage domain and its brand intention. The detected brand intention provides a **visual explanation** for the phishing decision. 26 | - :white_check_mark: Phishpedia is **NOT trained on any phishing dataset**, addressing the potential test-time distribution shift problem. 27 | - :white_check_mark: We release a **30k phishing benchmark dataset**, in which each website is annotated with its URL, HTML, screenshot, and target brand: https://drive.google.com/file/d/12ypEMPRQ43zGRqHGut0Esq2z5en0DH4g/view?usp=drive_link. 28 | - :white_check_mark: We set up a **phishing monitoring system** that investigates emerging domains fed from CertStream, and we have discovered 1,704 real phishing webpages, out of which 1,133 are zero-days not reported by industrial antivirus engines (VirusTotal).
29 | 30 | 31 | ## Framework 32 | 33 | 34 | 35 | `Input`: A URL and its screenshot `Output`: Phish/Benign, Phishing target 36 | 37 | - Step 1: Run the Deep Object Detection Model to get the predicted logos and inputs (inputs are not used for later prediction, just for explanation) 38 | 39 | - Step 2: Run the Deep Siamese Model 40 | - If the Siamese model reports no target, `Return Benign, None` 41 | - Otherwise, the Siamese model reports a target, `Return Phish, Phishing target` 42 | 43 | 44 | ## Setup 45 | 46 | Prerequisite: [Pixi installed](https://pixi.sh/latest/) 47 | 48 | For Linux/Mac, 49 | 50 | ```bash 51 | export KMP_DUPLICATE_LIB_OK=TRUE 52 | git clone https://github.com/lindsey98/Phishpedia.git 53 | cd Phishpedia 54 | pixi install 55 | chmod +x setup.sh 56 | ./setup.sh 57 | ``` 58 | 59 | For Windows, in PowerShell, 60 | 61 | ```bash 62 | git clone https://github.com/lindsey98/Phishpedia.git 63 | cd Phishpedia 64 | pixi install 65 | .\setup.bat 66 | ``` 67 | 68 | ## Running Phishpedia from Command Line 69 | 70 | ```bash 71 | pixi run python phishpedia.py --folder <path to the testing folder> 72 | ``` 73 | 74 | The testing folder should have the following structure: 75 | 76 | ``` 77 | test_site_1 78 | |__ info.txt (Write the URL) 79 | |__ shot.png (Save the screenshot) 80 | test_site_2 81 | |__ info.txt (Write the URL) 82 | |__ shot.png (Save the screenshot) 83 | ...... 84 | ``` 85 | 86 | ## Running Phishpedia as a GUI tool (web-browser-based) 87 | 88 | See [WEBtool/](WEBtool/) 89 | 90 | ## Install Phishpedia as a Chrome plugin 91 | 92 | See [Plugin_for_Chrome/](Plugin_for_Chrome/) 93 | 94 | 95 | ## Project structure 96 | 97 | ``` 98 | - models/ 99 | |___ rcnn_bet365.pth 100 | |___ faster_rcnn.yaml 101 | |___ resnetv2_rgb_new.pth.tar 102 | |___ expand_targetlist/ 103 | |___ Adobe/ 104 | |___ Amazon/ 105 | |___ ......
106 | |___ domain_map.pkl 107 | - logo_recog.py: Deep Object Detection Model 108 | - logo_matching.py: Deep Siamese Model 109 | - configs.yaml: Configuration file 110 | - phishpedia.py: Main script 111 | ``` 112 | 113 | ## Miscellaneous 114 | - In our paper, we also implement several phishing detection and identification baselines, see [here](https://github.com/lindsey98/PhishingBaseline) 115 | - The logo targetlist described in our paper includes 181 brands, we have further expanded the targetlist to include 277 brands in this code repository 116 | - For the phish discovery experiment, we obtain feed from [Certstream phish_catcher](https://github.com/x0rz/phishing_catcher), we lower the score threshold to be 40 to process more suspicious websites, readers can refer to their repo for details 117 | - We use Scrapy for website crawling 118 | 119 | ## Citation 120 | 121 | If you find our work useful in your research, please consider citing our paper by: 122 | 123 | ```bibtex 124 | @inproceedings{lin2021phishpedia, 125 | title={Phishpedia: A Hybrid Deep Learning Based Approach to Visually Identify Phishing Webpages}, 126 | author={Lin, Yun and Liu, Ruofan and Divakaran, Dinil Mon and Ng, Jun Yang and Chan, Qing Zhou and Lu, Yiwen and Si, Yuxuan and Zhang, Fan and Dong, Jin Song}, 127 | booktitle={30th $\{$USENIX$\}$ Security Symposium ($\{$USENIX$\}$ Security 21)}, 128 | year={2021} 129 | } 130 | ``` 131 | 132 | ## Contacts 133 | 134 | If you have any issues running our code, you can raise an issue or send an email to liu.ruofan16@u.nus.edu, lin_yun@sjtu.edu.cn, and dcsdjs@nus.edu.sg 135 | -------------------------------------------------------------------------------- /phishpedia.py: -------------------------------------------------------------------------------- 1 | import time 2 | from datetime import datetime 3 | import argparse 4 | import os 5 | import torch 6 | import cv2 7 | from configs import load_config 8 | from logo_recog import pred_rcnn, vis 9 | from 
import time
from datetime import datetime
import argparse
import os
import torch
import cv2
from configs import load_config
from logo_recog import pred_rcnn, vis
from logo_matching import check_domain_brand_inconsistency
from tqdm import tqdm

import re

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# URLs that point at a file download rather than a webpage are skipped.
# Compiled once instead of on every loop iteration.
_FORBIDDEN_SUFFIXES = re.compile(
    r"\.(mp3|wav|wma|ogg|mkv|zip|tar|xz|rar|z|deb|bin|iso|csv|tsv|dat|txt|css|log|xml|sql|mdb|apk|bat|exe|jar|wsf|fnt|fon|otf|ttf|ai|bmp|gif|ico|jp(e)?g|png|ps|psd|svg|tif|tiff|cer|rss|key|odp|pps|ppt|pptx|c|class|cpp|cs|h|java|sh|swift|vb|odf|xlr|xls|xlsx|bak|cab|cfg|cpl|cur|dll|dmp|drv|icns|ini|lnk|msi|sys|tmp|3g2|3gp|avi|flv|h264|m4v|mov|mp4|mp(e)?g|rm|swf|vob|wmv|doc(x)?|odt|rtf|tex|wks|wps|wpd)$",
    re.IGNORECASE)


def result_file_write(f, folder, url, phish_category, pred_target, matched_domain, siamese_conf, logo_recog_time,
                      logo_match_time):
    """Write one tab-separated result row, terminated by a newline, to *f*.

    The row is emitted with a single ``write`` call so that an encoding error
    cannot leave a partially written row behind.

    :param f: writable text file object
    :param folder: site folder name (str)
    :param url: URL of the site (str)
    :param pred_target: top-1 predicted brand (only the top-1 prediction is written)
    :param logo_recog_time: logo detection time, rounded to 4 decimals
    :param logo_match_time: logo matching time, rounded to 4 decimals
    """
    fields = [
        folder,
        url,
        str(phish_category),
        str(pred_target),  # write top1 prediction only
        str(matched_domain),
        str(siamese_conf),
        str(round(logo_recog_time, 4)),
        str(round(logo_match_time, 4)),
    ]
    f.write("\t".join(fields) + "\n")


class PhishpediaWrapper:
    """Loads the Phishpedia models once and runs the two-step detection."""

    _caller_prefix = "PhishpediaWrapper"
    _DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

    def __init__(self):
        self._load_config()

    def _load_config(self):
        """Populate the model and reference-list attributes from load_config()."""
        self.ELE_MODEL, self.SIAMESE_THRE, self.SIAMESE_MODEL, \
            self.LOGO_FEATS, self.LOGO_FILES, \
            self.DOMAIN_MAP_PATH = load_config()
        print(f'Length of reference list = {len(self.LOGO_FEATS)}')

    def test_orig_phishpedia(self, url, screenshot_path, html_path):
        """Classify one site from its URL and screenshot.

        :param url: URL of the site
        :param screenshot_path: path of the screenshot image
        :param html_path: accepted for interface compatibility; not used here
        :return: tuple (phish_category, pred_target, matched_domain, plotvis,
                 siamese_conf, pred_boxes, logo_recog_time, logo_match_time);
                 phish_category is 0 for benign and 1 for phish
        """
        # 0 for benign, 1 for phish, default is benign
        phish_category = 0
        pred_target = None
        matched_domain = None
        siamese_conf = None
        plotvis = None
        logo_match_time = 0
        print("Entering phishpedia")

        ####################### Step1: Logo detector ##############################################
        start_time = time.time()
        pred_boxes = pred_rcnn(im=screenshot_path, predictor=self.ELE_MODEL)
        logo_recog_time = time.time() - start_time

        if pred_boxes is not None:
            pred_boxes = pred_boxes.detach().cpu().numpy()
        plotvis = vis(screenshot_path, pred_boxes)

        # If no element is reported
        if pred_boxes is None or len(pred_boxes) == 0:
            print('No logo is detected')
            return phish_category, pred_target, matched_domain, plotvis, siamese_conf, pred_boxes, logo_recog_time, logo_match_time

        ######################## Step2: Siamese (Logo matcher) ########################################
        start_time = time.time()
        pred_target, matched_domain, matched_coord, siamese_conf = check_domain_brand_inconsistency(
            logo_boxes=pred_boxes,
            domain_map_path=self.DOMAIN_MAP_PATH,
            model=self.SIAMESE_MODEL,
            logo_feat_list=self.LOGO_FEATS,
            file_name_list=self.LOGO_FILES,
            url=url,
            shot_path=screenshot_path,
            similarity_threshold=self.SIAMESE_THRE,
            topk=1)
        logo_match_time = time.time() - start_time

        if pred_target is None:
            print('Did not match to any brand, report as benign')
            return phish_category, pred_target, matched_domain, plotvis, siamese_conf, pred_boxes, logo_recog_time, logo_match_time

        print('Match to Target: {} with confidence {:.4f}'.format(pred_target, siamese_conf))
        phish_category = 1
        # Visualize, add annotations
        cv2.putText(plotvis, "Target: {} with confidence {:.4f}".format(pred_target, siamese_conf),
                    (int(matched_coord[0] + 20), int(matched_coord[1] + 20)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 0), 2)

        return phish_category, pred_target, matched_domain, plotvis, siamese_conf, pred_boxes, logo_recog_time, logo_match_time


def _load_processed_urls(result_txt):
    """Return the set of URLs found in the second column of an existing result file."""
    processed = set()
    if not os.path.exists(result_txt):
        return processed
    with open(result_txt, 'r', encoding='ISO-8859-1') as file:
        for line in file:
            fields = line.rstrip('\n').split('\t')
            if len(fields) >= 2:
                processed.add(fields[1].strip())
    return processed


if __name__ == '__main__':

    '''run'''
    today = datetime.now().strftime('%Y%m%d')

    parser = argparse.ArgumentParser()
    parser.add_argument("--folder", required=True, type=str)
    parser.add_argument("--output_txt", default=f'{today}_results.txt', help="Output txt path")
    args = parser.parse_args()

    request_dir = args.folder
    phishpedia_cls = PhishpediaWrapper()
    result_txt = args.output_txt

    os.makedirs(request_dir, exist_ok=True)

    # Read the result file once; re-reading it for every folder is quadratic,
    # and a substring test would also skip URLs that merely prefix a processed one.
    processed_urls = _load_processed_urls(result_txt)

    for folder in tqdm(os.listdir(request_dir)):
        html_path = os.path.join(request_dir, folder, "html.txt")
        screenshot_path = os.path.join(request_dir, folder, "shot.png")
        info_path = os.path.join(request_dir, folder, 'info.txt')

        if not os.path.exists(screenshot_path) or not os.path.exists(info_path):
            continue
        if not os.path.exists(html_path):
            html_path = os.path.join(request_dir, folder, "index.html")

        with open(info_path, 'r') as file:
            # Strip the trailing newline so it cannot split the result row
            url = file.read().strip()

        if not url or url in processed_urls:
            continue

        if _FORBIDDEN_SUFFIXES.search(url):
            continue

        phish_category, pred_target, matched_domain, \
            plotvis, siamese_conf, pred_boxes, \
            logo_recog_time, logo_match_time = phishpedia_cls.test_orig_phishpedia(url, screenshot_path, html_path)

        try:
            with open(result_txt, "a+", encoding='ISO-8859-1') as f:
                result_file_write(f, folder, url, phish_category, pred_target, matched_domain, siamese_conf,
                                  logo_recog_time, logo_match_time)
        except UnicodeError:
            with open(result_txt, "a+", encoding='utf-8') as f:
                result_file_write(f, folder, url, phish_category, pred_target, matched_domain, siamese_conf,
                                  logo_recog_time, logo_match_time)
        processed_urls.add(url)

        if phish_category:
            os.makedirs(os.path.join(request_dir, folder), exist_ok=True)
            cv2.imwrite(os.path.join(request_dir, folder, "predict.png"), plotvis)
os.path.join(request_dir, folder, "shot.png") 113 | info_path = os.path.join(request_dir, folder, 'info.txt') 114 | 115 | if not os.path.exists(screenshot_path): 116 | continue 117 | if not os.path.exists(html_path): 118 | html_path = os.path.join(request_dir, folder, "index.html") 119 | 120 | with open(info_path, 'r') as file: 121 | url = file.read() 122 | 123 | if os.path.exists(result_txt): 124 | with open(result_txt, 'r', encoding='ISO-8859-1') as file: 125 | if url in file.read(): 126 | continue 127 | 128 | _forbidden_suffixes = r"\.(mp3|wav|wma|ogg|mkv|zip|tar|xz|rar|z|deb|bin|iso|csv|tsv|dat|txt|css|log|xml|sql|mdb|apk|bat|exe|jar|wsf|fnt|fon|otf|ttf|ai|bmp|gif|ico|jp(e)?g|png|ps|psd|svg|tif|tiff|cer|rss|key|odp|pps|ppt|pptx|c|class|cpp|cs|h|java|sh|swift|vb|odf|xlr|xls|xlsx|bak|cab|cfg|cpl|cur|dll|dmp|drv|icns|ini|lnk|msi|sys|tmp|3g2|3gp|avi|flv|h264|m4v|mov|mp4|mp(e)?g|rm|swf|vob|wmv|doc(x)?|odt|rtf|tex|wks|wps|wpd)$" 129 | if re.search(_forbidden_suffixes, url, re.IGNORECASE): 130 | continue 131 | 132 | phish_category, pred_target, matched_domain, \ 133 | plotvis, siamese_conf, pred_boxes, \ 134 | logo_recog_time, logo_match_time = phishpedia_cls.test_orig_phishpedia(url, screenshot_path, html_path) 135 | 136 | try: 137 | with open(result_txt, "a+", encoding='ISO-8859-1') as f: 138 | result_file_write(f, folder, url, phish_category, pred_target, matched_domain, siamese_conf, 139 | logo_recog_time, logo_match_time) 140 | except UnicodeError: 141 | with open(result_txt, "a+", encoding='utf-8') as f: 142 | result_file_write(f, folder, url, phish_category, pred_target, matched_domain, siamese_conf, 143 | logo_recog_time, logo_match_time) 144 | if phish_category: 145 | os.makedirs(os.path.join(request_dir, folder), exist_ok=True) 146 | cv2.imwrite(os.path.join(request_dir, folder, "predict.png"), plotvis) 147 | 148 | -------------------------------------------------------------------------------- /WEBtool/static/js/main.js: 
-------------------------------------------------------------------------------- 1 | new Vue({ 2 | el: '#main-container', 3 | data() { 4 | return { 5 | url: '', 6 | result: null, 7 | uploadedImage: null, 8 | imageUrl: '', 9 | uploadSuccess: false, 10 | } 11 | }, 12 | methods: { 13 | startDetection() { 14 | if (!this.url) { 15 | alert('Please enter a valid URL.'); 16 | return; 17 | } 18 | 19 | // 发送 POST 请求到 /detect 路由 20 | fetch('/detect', { 21 | method: 'POST', 22 | headers: { 23 | 'Content-Type': 'application/json' 24 | }, 25 | body: JSON.stringify({ 26 | url: this.url, 27 | imageUrl: this.imageUrl 28 | }) 29 | }) 30 | .then(response => response.json()) 31 | .then(data => { 32 | this.result = data; // Update all data 33 | 34 | if (data.logo_extraction) { // Logo Extraction Result 35 | document.getElementById('original-image').src = `data:image/png;base64,${data.logo_extraction}`; 36 | } 37 | 38 | // Detectoin Result 39 | const labelElement = document.getElementById('detection-label'); 40 | const explanationElement = document.getElementById('detection-explanation'); 41 | const matched_brand_element = document.getElementById('matched-brand'); 42 | const siamese_conf_element = document.getElementById('siamese-conf'); 43 | const correct_domain_element = document.getElementById('correct-domain'); 44 | const detection_time_element = document.getElementById('detection-time'); 45 | 46 | detection_time_element.textContent = data.detection_time + ' s'; 47 | if (data.result === 'Benign') { 48 | labelElement.className = 'benign'; 49 | labelElement.textContent = 'Benign'; 50 | matched_brand_element.textContent = data.matched_brand; 51 | siamese_conf_element.textContent = data.confidence; 52 | correct_domain_element.textContent = data.correct_domain; 53 | explanationElement.innerHTML = ` 54 |

This website has been analyzed and determined to be ${labelElement.textContent.toLowerCase()}. 55 | Because we have matched a brand ${data.matched_brand} with confidence ${Math.round(data.confidence * 100, 3)}, 56 | and the domain extracted from url is within the domain list under the brand (which is [${data.correct_domain}]). 57 | Enjoy your surfing!

58 | `; 59 | } else if (data.result === 'Phishing') { 60 | labelElement.className = 'phishing'; 61 | labelElement.textContent = 'Phishing'; 62 | matched_brand_element.textContent = data.matched_brand; 63 | siamese_conf_element.textContent = data.confidence; 64 | correct_domain_element.textContent = data.correct_domain; 65 | explanationElement.innerHTML = ` 66 |

This website has been analyzed and determined to be ${labelElement.textContent.toLowerCase()}. 67 | Because we have matched a brand ${data.matched_brand} with confidence ${Math.round(data.confidence * 100, 3)}%, 68 | but the domain extracted from url is NOT within the domain list under the brand (which is [${data.correct_domain}]). 69 | Please proceed with caution!

70 | `; 71 | } else { 72 | labelElement.className = 'unknown'; 73 | labelElement.textContent = 'Unknown'; 74 | matched_brand_element.textContent = "unknown"; 75 | siamese_conf_element.textContent = "0.00"; 76 | correct_domain_element.textContent = "unknown"; 77 | explanationElement.innerHTML = ` 78 |

Sorry, we don't find any matched brand in database so this website is determined to be ${labelElement.textContent.toLowerCase()}.

79 |

It is still possible that this is a phishing site. Please proceed with caution!

80 | `; 81 | } 82 | }) 83 | .catch(error => { 84 | console.error('Error:', error); 85 | alert('检测失败,请稍后重试。'); 86 | }); 87 | }, 88 | handleImageUpload(event) { // 处理图片上传事件 89 | const file = event.target.files[0]; 90 | if (file) { 91 | this.uploadedImage = file; 92 | this.uploadImage(); 93 | } 94 | }, 95 | uploadImage() { // 上传图片到服务器 96 | const formData = new FormData(); 97 | formData.append('image', this.uploadedImage); 98 | 99 | fetch('/upload', { // 假设上传图片的路由是 /upload 100 | method: 'POST', 101 | body: formData 102 | }) 103 | .then(response => response.json()) 104 | .then(data => { 105 | if (data.success) { 106 | this.imageUrl = data.imageUrl; // 更新图片URL 107 | this.uploadSuccess = true; // 标记上传成功 108 | } else { 109 | alert('上传图片失败: ' + data.error); 110 | } 111 | }) 112 | .catch(error => { 113 | console.error('Error:', error); 114 | alert('上传图片失败,请稍后重试。'); 115 | }); 116 | }, 117 | clearUpload() { // 清除上传的图像 118 | fetch('/clear_upload', { // 假设删除图片的路由是 /delete-image 119 | method: 'POST', 120 | headers: { 121 | 'Content-Type': 'application/json' 122 | }, 123 | body: JSON.stringify({ imageUrl: this.imageUrl }) 124 | }) 125 | .then(response => response.json()) 126 | .then(data => { 127 | if (data.success) { 128 | this.imageUrl = ''; 129 | this.uploadSuccess = false; // 重置上传状态 130 | } else { 131 | alert('删除图片失败: ' + data.error); 132 | } 133 | }) 134 | .catch(error => { 135 | console.error('Error:', error); 136 | alert('删除图片失败,请稍后重试。'); 137 | }); 138 | } 139 | } 140 | }); 141 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. 
CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. 
A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. 
To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. 
In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. 
Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /WEBtool/static/css/sidebar.css: -------------------------------------------------------------------------------- 1 | /* 侧边栏样式 */ 2 | .sidebar { 3 | position: fixed; 4 | top: 0; 5 | right: -400px; 6 | width: 300px; 7 | height: 100%; 8 | background-color: #ffffff; 9 | box-shadow: -2px 0 5px rgba(0, 0, 0, 0.1); 10 | transition: right 0.3s ease; 11 | z-index: 1000; 12 | display: flex; 13 | flex-direction: column; 14 | padding: 20px; 15 | } 16 | 17 | /* 侧边栏打开时显示 */ 18 | .sidebar.open { 19 | right: 0; 20 | } 21 | 22 | /* 侧边栏标题 */ 23 | .sidebar-header { 24 | display: flex; 25 | justify-content: space-between; 26 | align-items: center; 27 | font-size: 18px; 28 | font-weight: bold; 29 | margin-bottom: 20px; 30 | } 31 | 32 | /* 关闭按钮 */ 33 | .close-sidebar { 34 | background: none; 35 | border: none; 36 | font-size: 18px; 37 | cursor: pointer; 38 | color: #333; 39 | } 40 | 41 | /* 右上角按钮样式 */ 42 | .sidebar-toggle { 43 | position: absolute; 44 | top: 15px; 45 | right: 15px; 46 | background: #87CEFA; 47 | color: white; 48 | border: none; 49 | border-radius: 5px; 50 | padding: 10px 15px; 51 | font-size: 18px; 52 | font-weight: bold; 53 | cursor: pointer; 54 | box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); 55 | transition: background-color 0.3s ease; 56 | } 57 | 58 | .sidebar-toggle:hover { 59 | background-color: #0056b3; 60 | } 61 | 62 | /* 按钮容器样式 
*/ 63 | .sidebar-buttons { 64 | display: flex; 65 | flex-wrap: wrap; 66 | gap: 10px; 67 | margin-bottom: 20px; 68 | justify-content: space-between; 69 | } 70 | 71 | /* 按钮基础样式 */ 72 | .sidebar-button { 73 | flex: 1 1 calc(50% - 10px); 74 | display: flex; 75 | justify-content: center; 76 | align-items: center; 77 | background-color: #87CEFA; 78 | color: white; 79 | font-size: 14px; 80 | font-weight: bold; 81 | border: none; 82 | border-radius: 3px; 83 | padding: 5px 10px; 84 | cursor: pointer; 85 | box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); 86 | transition: background-color 0.3s ease, transform 0.2s ease; 87 | } 88 | 89 | /* 按钮悬停效果 */ 90 | .sidebar-button:hover { 91 | background-color: #0056b3; 92 | transform: translateY(-2px); 93 | } 94 | 95 | /* 按钮点击效果 */ 96 | .sidebar-button:active { 97 | background-color: #003d80; 98 | transform: translateY(0); 99 | } 100 | 101 | /* ============ 文件树 ============ */ 102 | /* 文件树样式 */ 103 | #file-tree-root { 104 | list-style-type: none; 105 | padding-left: 20px; 106 | height: 580px; 107 | max-height: 580px; 108 | overflow-y: auto; 109 | border: 1px solid #ccc; 110 | background-color: white; 111 | box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); 112 | } 113 | 114 | .file-item { 115 | margin-bottom: 5px; 116 | } 117 | 118 | .file-folder { 119 | cursor: pointer; 120 | } 121 | 122 | .folder-name { 123 | display: flex; 124 | align-items: center; 125 | } 126 | 127 | .folder-icon { 128 | margin-right: 5px; 129 | } 130 | 131 | .file-file { 132 | cursor: pointer; 133 | } 134 | 135 | .file-icon { 136 | margin-right: 5px; 137 | } 138 | 139 | .hidden { 140 | display: none; 141 | } 142 | 143 | 144 | .file-folder>ul { 145 | padding-left: 20px; 146 | } 147 | 148 | /* 预览框样式 */ 149 | #image-preview-box { 150 | position: absolute; 151 | background-color: white; 152 | border: 1px solid #ccc; 153 | padding: 10px; 154 | box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); 155 | max-width: 400px; 156 | max-height: 300px; 157 | overflow: hidden; 158 | } 159 | 160 | /* 选中样式 
*/ 161 | .selected { 162 | border: 2px solid #007bff; 163 | padding: 2px; 164 | box-sizing: border-box; 165 | } 166 | 167 | 168 | /* ============== 表单 ============= */ 169 | .form-container { 170 | position: fixed; 171 | top: 50%; 172 | left: 50%; 173 | transform: translate(-50%, -50%); 174 | background-color: #ffffff; 175 | padding: 20px 30px; 176 | border-radius: 10px; 177 | box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1); 178 | width: 300px; 179 | max-width: 90%; 180 | z-index: 1001; 181 | } 182 | 183 | /* 表单标题 */ 184 | .form-container h3 { 185 | font-size: 22px; 186 | font-weight: bold; 187 | color: #333; 188 | margin-bottom: 20px; 189 | text-align: center; 190 | font-family: 'Arial', sans-serif; 191 | } 192 | 193 | input[type="label"] { 194 | width: 20%; 195 | } 196 | 197 | /* 输入框样式 */ 198 | input[type="text"] { 199 | width: 90%; 200 | padding: 12px; 201 | margin: 12px 0; 202 | border: 1px solid #ddd; 203 | border-radius: 8px; 204 | background-color: #f9f9f9; 205 | font-size: 16px; 206 | color: #333; 207 | box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.1); 208 | transition: border-color 0.3s ease, background-color 0.3s ease; 209 | text-align: center; 210 | } 211 | 212 | /* 输入框聚焦效果 */ 213 | input[type="text"]:focus { 214 | border-color: #3498db; 215 | background-color: #fff; 216 | outline: none; 217 | } 218 | 219 | /* 提交按钮样式 */ 220 | button[type="submit"] { 221 | background-color: #3498db; 222 | color: white; 223 | } 224 | 225 | /* 取消按钮样式 */ 226 | button[type="button"] { 227 | background-color: #7c7c7c; 228 | color: white; 229 | } 230 | 231 | /* 表单按钮容器 */ 232 | .form-actions { 233 | width: 100%; 234 | display: flex; 235 | justify-content: space-between; 236 | gap: 12px; 237 | margin-top: 20px; 238 | } 239 | 240 | /* 提交按钮样式 */ 241 | button[type="submit"] { 242 | background-color: #3498db; 243 | color: white; 244 | padding: 10px 20px; 245 | border: none; 246 | border-radius: 5px; 247 | font-size: 14px; 248 | cursor: pointer; 249 | transition: background-color 0.3s ease, 
transform 0.2s ease; 250 | } 251 | 252 | /* 提交按钮悬停效果 */ 253 | button[type="submit"]:hover { 254 | background-color: #2980b9; 255 | transform: translateY(-2px); 256 | } 257 | 258 | /* 提交按钮点击效果 */ 259 | button[type="submit"]:active { 260 | background-color: #1abc9c; 261 | transform: translateY(0); 262 | } 263 | 264 | /* 取消按钮样式 */ 265 | button[type="button"] { 266 | background-color: #7c7c7c; 267 | color: white; 268 | padding: 10px 20px; 269 | border: none; 270 | border-radius: 5px; 271 | font-size: 14px; 272 | cursor: pointer; 273 | transition: background-color 0.3s ease, transform 0.2s ease; 274 | } 275 | 276 | /* 取消按钮悬停效果 */ 277 | button[type="button"]:hover { 278 | background-color: #555; 279 | transform: translateY(-2px); 280 | } 281 | 282 | /* 取消按钮点击效果 */ 283 | button[type="button"]:active { 284 | background-color: #333; 285 | transform: translateY(0); 286 | } 287 | 288 | /* 浮层样式 */ 289 | #overlay { 290 | position: fixed; 291 | top: 0; 292 | left: 0; 293 | width: 100%; 294 | height: 100%; 295 | background-color: rgba(0, 0, 0, 0.5); 296 | display: flex; 297 | justify-content: center; 298 | align-items: center; 299 | z-index: 1002; 300 | } 301 | 302 | /* 转圈动画样式 */ 303 | #spinner { 304 | border: 2px solid #f3f3f3; 305 | border-top: 2px solid #3498db; 306 | border-radius: 50%; 307 | width: 16px; 308 | height: 16px; 309 | animation: spin 2s linear infinite; 310 | margin-right: 10px; 311 | } 312 | 313 | /* 转圈动画 */ 314 | @keyframes spin { 315 | 0% { 316 | transform: rotate(0deg); 317 | } 318 | 319 | 100% { 320 | transform: rotate(360deg); 321 | } 322 | } 323 | 324 | /* 浮层中的文本样式 */ 325 | #overlay p { 326 | color: white; 327 | font-size: 16px; 328 | font-weight: bold; 329 | text-align: center; 330 | line-height: 16px; 331 | margin: 0; 332 | } 333 | 334 | #overlay .spinner-container { 335 | display: flex; 336 | align-items: center; 337 | } -------------------------------------------------------------------------------- /WEBtool/templates/index.html: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | PhishPedia 8 | 9 | 10 | 11 | 12 | 13 | 14 | 19 | 20 | 26 | 27 | 28 | 71 | 72 | 73 |
74 |
75 |
76 | 77 |
78 | 79 | 80 |
81 | 82 |
83 |
84 | Upload Icon 86 |

87 | 88 |

Or ctrl+v here

89 | 91 |
92 |
93 |
94 | Success Icon 96 | Uploaded Successfully! 97 |
98 | Uploaded Image 99 | 100 |
101 |
102 | 103 | 104 |
105 |
106 |
107 |
108 |
109 | Logo Extraction 110 |
111 | Original Webpage Screenshot 113 |
114 |
115 |
116 | Detection Result 117 |
118 |
119 | 📊 120 | Result 121 |
122 |
123 |
124 |
125 |
    126 |
  • 127 | 🏷️ 128 | Matched Brand 129 | 130 |
  • 131 |
  • 132 | 💬 133 | Siamese Confidence 134 | 135 |
  • 136 |
  • 137 | 🌐 138 | Correct Domain 139 | 140 |
  • 141 |
  • 142 | ⏱️ 143 | Detection Time 144 | 145 |
  • 146 |
  • 147 |
    148 |
  • 149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Lint as: python3 16 | """Bottleneck ResNet v2 with GroupNorm and Weight Standardization.""" 17 | 18 | from collections import OrderedDict # pylint: disable=g-importing-member 19 | 20 | import torch 21 | import torch.nn as nn 22 | import torch.nn.functional as F 23 | 24 | 25 | class StdConv2d(nn.Conv2d): 26 | 27 | def forward(self, x): 28 | w = self.weight 29 | v, m = torch.var_mean(w, dim=[1, 2, 3], keepdim=True, unbiased=False) 30 | w = (w - m) / torch.sqrt(v + 1e-10) 31 | return F.conv2d(x, w, self.bias, self.stride, self.padding, 32 | self.dilation, self.groups) 33 | 34 | 35 | def conv3x3(cin, cout, stride=1, groups=1, bias=False): 36 | return StdConv2d(cin, cout, kernel_size=3, stride=stride, 37 | padding=1, bias=bias, groups=groups) 38 | 39 | 40 | def conv1x1(cin, cout, stride=1, bias=False): 41 | return StdConv2d(cin, cout, kernel_size=1, stride=stride, 42 | padding=0, bias=bias) 43 | 44 | 45 | def tf2th(conv_weights): 46 | """Possibly convert HWIO to OIHW.""" 47 | if conv_weights.ndim == 4: 48 | conv_weights = conv_weights.transpose([3, 2, 0, 1]) 49 | return torch.from_numpy(conv_weights) 50 | 51 | 52 | class 
PreActBottleneck(nn.Module): 53 | """Pre-activation (v2) bottleneck block. 54 | 55 | Follows the implementation of "Identity Mappings in Deep Residual Networks": 56 | https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua 57 | 58 | Except it puts the stride on 3x3 conv when available. 59 | """ 60 | 61 | def __init__(self, cin, cout=None, cmid=None, stride=1): 62 | super().__init__() 63 | cout = cout or cin 64 | cmid = cmid or cout // 4 65 | 66 | self.gn1 = nn.GroupNorm(32, cin) 67 | self.conv1 = conv1x1(cin, cmid) 68 | self.gn2 = nn.GroupNorm(32, cmid) 69 | self.conv2 = conv3x3(cmid, cmid, stride) # Original code has it on conv1!! 70 | self.gn3 = nn.GroupNorm(32, cmid) 71 | self.conv3 = conv1x1(cmid, cout) 72 | self.relu = nn.ReLU(inplace=True) 73 | 74 | if (stride != 1 or cin != cout): 75 | # Projection also with pre-activation according to paper. 76 | self.downsample = conv1x1(cin, cout, stride) 77 | 78 | def forward(self, x): 79 | out = self.relu(self.gn1(x)) 80 | 81 | # Residual branch 82 | residual = x 83 | if hasattr(self, 'downsample'): 84 | residual = self.downsample(out) 85 | 86 | # Unit's branch 87 | out = self.conv1(out) 88 | out = self.conv2(self.relu(self.gn2(out))) 89 | out = self.conv3(self.relu(self.gn3(out))) 90 | 91 | return out + residual 92 | 93 | def load_from(self, weights, prefix=''): 94 | convname = 'standardized_conv2d' 95 | with torch.no_grad(): 96 | self.conv1.weight.copy_(tf2th(weights[f'{prefix}a/{convname}/kernel'])) 97 | self.conv2.weight.copy_(tf2th(weights[f'{prefix}b/{convname}/kernel'])) 98 | self.conv3.weight.copy_(tf2th(weights[f'{prefix}c/{convname}/kernel'])) 99 | self.gn1.weight.copy_(tf2th(weights[f'{prefix}a/group_norm/gamma'])) 100 | self.gn2.weight.copy_(tf2th(weights[f'{prefix}b/group_norm/gamma'])) 101 | self.gn3.weight.copy_(tf2th(weights[f'{prefix}c/group_norm/gamma'])) 102 | self.gn1.bias.copy_(tf2th(weights[f'{prefix}a/group_norm/beta'])) 103 | 
self.gn2.bias.copy_(tf2th(weights[f'{prefix}b/group_norm/beta'])) 104 | self.gn3.bias.copy_(tf2th(weights[f'{prefix}c/group_norm/beta'])) 105 | if hasattr(self, 'downsample'): 106 | w = weights[f'{prefix}a/proj/{convname}/kernel'] 107 | self.downsample.weight.copy_(tf2th(w)) 108 | 109 | 110 | class ResNetV2(nn.Module): 111 | """Implementation of Pre-activation (v2) ResNet model.""" 112 | 113 | def __init__(self, block_units, width_factor, head_size=21843, zero_head=False): 114 | super().__init__() 115 | wf = width_factor # shortcut 'cause we'll use it a lot. 116 | 117 | # The following will be unreadable if we split lines. 118 | # pylint: disable=line-too-long 119 | self.root = nn.Sequential(OrderedDict([ 120 | ('conv', StdConv2d(3, 64 * wf, kernel_size=7, stride=2, padding=3, bias=False)), 121 | ('pad', nn.ConstantPad2d(1, 0)), 122 | ('pool', nn.MaxPool2d(kernel_size=3, stride=2, padding=0)), 123 | # The following is subtly not the same! 124 | # ('pool', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)), 125 | ])) 126 | 127 | self.body = nn.Sequential(OrderedDict([ 128 | ('block1', nn.Sequential(OrderedDict( 129 | [('unit01', PreActBottleneck(cin=64 * wf, cout=256 * wf, cmid=64 * wf))] + 130 | [(f'unit{i:02d}', PreActBottleneck(cin=256 * wf, cout=256 * wf, cmid=64 * wf)) for i in 131 | range(2, block_units[0] + 1)], 132 | ))), 133 | ('block2', nn.Sequential(OrderedDict( 134 | [('unit01', PreActBottleneck(cin=256 * wf, cout=512 * wf, cmid=128 * wf, stride=2))] + 135 | [(f'unit{i:02d}', PreActBottleneck(cin=512 * wf, cout=512 * wf, cmid=128 * wf)) for i in 136 | range(2, block_units[1] + 1)], 137 | ))), 138 | ('block3', nn.Sequential(OrderedDict( 139 | [('unit01', PreActBottleneck(cin=512 * wf, cout=1024 * wf, cmid=256 * wf, stride=2))] + 140 | [(f'unit{i:02d}', PreActBottleneck(cin=1024 * wf, cout=1024 * wf, cmid=256 * wf)) for i in 141 | range(2, block_units[2] + 1)], 142 | ))), 143 | ('block4', nn.Sequential(OrderedDict( 144 | [('unit01', PreActBottleneck(cin=1024 
* wf, cout=2048 * wf, cmid=512 * wf, stride=2))] + 145 | [(f'unit{i:02d}', PreActBottleneck(cin=2048 * wf, cout=2048 * wf, cmid=512 * wf)) for i in 146 | range(2, block_units[3] + 1)], 147 | ))), 148 | ])) 149 | # pylint: enable=line-too-long 150 | 151 | self.zero_head = zero_head 152 | self.head = nn.Sequential(OrderedDict([ 153 | ('gn', nn.GroupNorm(32, 2048 * wf)), 154 | ('relu', nn.ReLU(inplace=True)), 155 | ('avg', nn.AdaptiveAvgPool2d(output_size=1)), 156 | ('conv', nn.Conv2d(2048 * wf, head_size, kernel_size=1, bias=True)), 157 | ])) 158 | 159 | def features(self, x): 160 | x = self.head[:-1](self.body(self.root(x))) 161 | 162 | return x.squeeze(-1).squeeze(-1) 163 | 164 | def forward(self, x): 165 | x = self.head(self.body(self.root(x))) 166 | assert x.shape[-2:] == (1, 1) # We should have no spatial shape left. 167 | return x[..., 0, 0] 168 | 169 | def load_from(self, weights, prefix='resnet/'): 170 | with torch.no_grad(): 171 | self.root.conv.weight.copy_( 172 | tf2th(weights[f'{prefix}root_block/standardized_conv2d/kernel'])) # pylint: disable=line-too-long 173 | self.head.gn.weight.copy_(tf2th(weights[f'{prefix}group_norm/gamma'])) 174 | self.head.gn.bias.copy_(tf2th(weights[f'{prefix}group_norm/beta'])) 175 | if self.zero_head: 176 | nn.init.zeros_(self.head.conv.weight) 177 | nn.init.zeros_(self.head.conv.bias) 178 | else: 179 | self.head.conv.weight.copy_( 180 | tf2th(weights[f'{prefix}head/conv2d/kernel'])) # pylint: disable=line-too-long 181 | self.head.conv.bias.copy_(tf2th(weights[f'{prefix}head/conv2d/bias'])) 182 | 183 | for bname, block in self.body.named_children(): 184 | for uname, unit in block.named_children(): 185 | unit.load_from(weights, prefix=f'{prefix}{bname}/{uname}/') 186 | 187 | 188 | KNOWN_MODELS = OrderedDict([ 189 | ('BiT-M-R50x1', lambda *a, **kw: ResNetV2([3, 4, 6, 3], 1, *a, **kw)), 190 | ('BiT-M-R50x3', lambda *a, **kw: ResNetV2([3, 4, 6, 3], 3, *a, **kw)), 191 | ('BiT-M-R101x1', lambda *a, **kw: ResNetV2([3, 4, 23, 3], 1, 
*a, **kw)), 192 | ('BiT-M-R101x3', lambda *a, **kw: ResNetV2([3, 4, 23, 3], 3, *a, **kw)), 193 | ('BiT-M-R152x2', lambda *a, **kw: ResNetV2([3, 8, 36, 3], 2, *a, **kw)), 194 | ('BiT-M-R152x4', lambda *a, **kw: ResNetV2([3, 8, 36, 3], 4, *a, **kw)), 195 | ('BiT-S-R50x1', lambda *a, **kw: ResNetV2([3, 4, 6, 3], 1, *a, **kw)), 196 | ('BiT-S-R50x3', lambda *a, **kw: ResNetV2([3, 4, 6, 3], 3, *a, **kw)), 197 | ('BiT-S-R101x1', lambda *a, **kw: ResNetV2([3, 4, 23, 3], 1, *a, **kw)), 198 | ('BiT-S-R101x3', lambda *a, **kw: ResNetV2([3, 4, 23, 3], 3, *a, **kw)), 199 | ('BiT-S-R152x2', lambda *a, **kw: ResNetV2([3, 8, 36, 3], 2, *a, **kw)), 200 | ('BiT-S-R152x4', lambda *a, **kw: ResNetV2([3, 8, 36, 3], 4, *a, **kw)), 201 | ]) 202 | -------------------------------------------------------------------------------- /WEBtool/static/css/style.css: -------------------------------------------------------------------------------- 1 | body, 2 | html { 3 | margin: 0; 4 | padding: 0; 5 | font-family: Arial, sans-serif; 6 | background-color: #faf4f2; 7 | } 8 | 9 | ul { 10 | list-style-type: none; 11 | padding: 0; 12 | } 13 | 14 | li { 15 | margin: 5px 0; 16 | } 17 | 18 | #header { 19 | display: flex; 20 | align-items: center; 21 | justify-content: flex-start; 22 | position: absolute; 23 | top: 0px; 24 | left: 0px; 25 | background-color: rgba(255, 255, 255, 0.8); 26 | padding: 10px 10px; 27 | border-radius: 5px; 28 | box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); 29 | width: 100%; 30 | margin-bottom: 10px; 31 | } 32 | 33 | #logo-icon { 34 | height: 60px; 35 | width: auto; 36 | margin-right: 20px; 37 | } 38 | 39 | #logo-text { 40 | display: flex; 41 | align-items: center; 42 | height: 80px; 43 | line-height: 80px; 44 | letter-spacing: 2px; 45 | background: linear-gradient(90deg, #3498db, #f9f388); 46 | -webkit-background-clip: text; 47 | background-clip: text; 48 | -webkit-text-fill-color: transparent; 49 | text-shadow: 1px 1px 3px rgba(0, 0, 0, 0.2); 50 | font-size: 35px; 51 | font-weight: 
bold; 52 | } 53 | 54 | 55 | #main-container { 56 | display: flex; 57 | flex-direction: column; 58 | align-items: center; 59 | width: 100%; 60 | margin-top: 130px; 61 | } 62 | 63 | #input-container { 64 | display: flex; 65 | flex-direction: column; 66 | align-items: center; 67 | width: 1200px; 68 | padding: 20px; 69 | border-radius: 8px; 70 | border: 1px solid #ddd; 71 | background-color: #dff0fb; 72 | } 73 | 74 | .inner-container { 75 | width: 100%; 76 | height: 100%; 77 | display: flex; 78 | flex-direction: column; 79 | align-items: center; 80 | border-radius: 5px; 81 | border: 3px dashed white; 82 | background-color: #eaf4fb; 83 | padding-top: 20px; 84 | padding-bottom: 20px; 85 | } 86 | 87 | #output-container { 88 | display: flex; 89 | flex-direction: column; 90 | align-items: center; 91 | width: 1240px; 92 | margin-top: 10px; 93 | } 94 | 95 | /* ============================= URL输入区域 =============================*/ 96 | #url-input-container { 97 | display: flex; 98 | justify-content: center; 99 | align-items: center; 100 | gap: 10px; 101 | width: 500px; 102 | } 103 | 104 | .custom-label { 105 | background-color: #87CEFA; 106 | color: white; 107 | border-radius: 25px; 108 | padding: 10px 20px; 109 | font-size: 16px; 110 | font-weight: bold; 111 | border: none; 112 | text-align: center; 113 | white-space: nowrap; 114 | } 115 | 116 | #url-input { 117 | background-color: #dcdcdc; 118 | color: #333; 119 | border: none; 120 | border-radius: 15px; 121 | padding: 10px 20px; 122 | font-size: 16px; 123 | outline: none; 124 | width: 300px; 125 | box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.1); 126 | } 127 | 128 | #url-input::placeholder { 129 | color: #888; 130 | font-style: italic; 131 | } 132 | 133 | /* ============================= 图片上传区域 =============================*/ 134 | #image-upload-container { 135 | display: flex; 136 | justify-content: center; 137 | align-items: center; 138 | width: 410px; 139 | } 140 | 141 | .drop-area { 142 | border: 2px dashed #007BFF; 143 | 
border-radius: 8px; 144 | background-color: #ffffff; 145 | padding: 20px; 146 | text-align: center; 147 | font-size: 1.2em; 148 | color: #004085; 149 | margin-top: 10px; 150 | width: 100%; 151 | height: 20vh; 152 | margin: 20px auto; 153 | transition: background-color 0.3s ease; 154 | } 155 | 156 | 157 | .upload-icon { 158 | width: 50px; 159 | height: 50px; 160 | margin-bottom: 10px; 161 | } 162 | 163 | .upload-label { 164 | cursor: pointer; 165 | margin-bottom: -10px; 166 | background-color: white; 167 | color: black; 168 | padding: 10px 20px; 169 | border: 2px solid #ccc; 170 | border-radius: 50%; 171 | border-radius: 6px; 172 | text-align: center; 173 | font-size: small; 174 | display: inline-block; 175 | line-height: 1; 176 | font-family: Arial, 177 | sans-serif; 178 | } 179 | 180 | .upload-label:hover { 181 | background-color: #f0f0f0; 182 | } 183 | 184 | .upload-success-area { 185 | display: flex; 186 | flex-direction: column; 187 | align-items: center; 188 | justify-content: center; 189 | padding: 20px; 190 | border: 2px dashed #007BFF; 191 | border-radius: 8px; 192 | background-color: #ffffff; 193 | margin-top: 10px; 194 | margin-bottom: 10px; 195 | } 196 | 197 | .success-message { 198 | display: flex; 199 | align-items: center; 200 | margin-bottom: 10px; 201 | font-size: larger; 202 | } 203 | 204 | .success-icon { 205 | width: 30px; 206 | height: 30px; 207 | margin-right: 5px; 208 | } 209 | 210 | .success-text { 211 | font-size: 16px; 212 | } 213 | 214 | .uploaded-thumbnail { 215 | width: 400px; 216 | height: auto; 217 | margin-top: 10px; 218 | margin-bottom: 10px; 219 | } 220 | 221 | .clear-button { 222 | padding: 10px 20px; 223 | background-color: #888888; 224 | color: white; 225 | border: none; 226 | border-radius: 8px; 227 | font-size: 16px; 228 | font-weight: bold; 229 | cursor: pointer; 230 | transition: background-color 0.3s ease; 231 | box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); 232 | } 233 | 234 | .clear-button:hover { 235 | background-color: 
#555555; 236 | } 237 | 238 | #start-detection-button { 239 | background-color: #007BFF; 240 | color: white; 241 | border: none; 242 | border-radius: 25px; 243 | padding: 10px 20px; 244 | font-size: 16px; 245 | font-weight: bold; 246 | cursor: pointer; 247 | margin-top: 0px; 248 | width: 410px; 249 | transition: background-color 0.3s ease; 250 | } 251 | 252 | #start-detection-button:hover { 253 | background-color: #0056b3; 254 | } 255 | 256 | /* ============================= 结果容器样式 =============================*/ 257 | #result-container { 258 | display: flex; 259 | flex-direction: row; 260 | justify-content: space-between; 261 | align-items: flex-start; 262 | width: 100%; 263 | max-width: 1500px; 264 | gap: 20px; 265 | } 266 | 267 | #original-image-container, 268 | #detection-result-container { 269 | display: flex; 270 | flex-direction: column; 271 | align-items: center; 272 | width: 50%; 273 | height: 450px; 274 | border: 1px solid #ddd; 275 | border-radius: 10px; 276 | padding-top: 10px; 277 | padding-left: 20px; 278 | padding-right: 20px; 279 | padding-bottom: 20px; 280 | background-color: #ffffff; 281 | box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); 282 | transition: transform 0.3s ease; 283 | } 284 | 285 | #original-image-container:hover, 286 | #detection-result-container:hover { 287 | transform: scale(1.02); 288 | transition: transform 0.3s ease; 289 | } 290 | 291 | .result_title { 292 | width: 100%; 293 | height: 20px; 294 | margin-top: 0px; 295 | text-align: center; 296 | padding: 10px; 297 | border-radius: 8px; 298 | font-family: Arial, 299 | sans-serif; 300 | font-weight: bold; 301 | font-size: 18px; 302 | } 303 | 304 | #logo-extraction-result { 305 | width: 100%; 306 | height: 100%; 307 | display: flex; 308 | justify-content: center; 309 | align-items: center; 310 | overflow: hidden; 311 | margin-top: 10px; 312 | background-color: #f9f9f9; 313 | border: 1px solid #ddd; 314 | border-radius: 8px; 315 | } 316 | 317 | #original-image { 318 | max-height: 100%; 319 
| max-width: 100%; 320 | object-fit: contain; 321 | } 322 | 323 | #detection-result { 324 | width: 100%; 325 | height: 100%; 326 | margin-top: 10px; 327 | text-align: left; 328 | padding: 10px; 329 | background-color: #f9f9f9; 330 | border: 1px solid #ddd; 331 | border-radius: 8px; 332 | } 333 | 334 | #detection-label { 335 | display: inline-block; 336 | font-family: Arial, sans-serif; 337 | font-size: 14px; 338 | font-weight: bold; 339 | color: white; 340 | padding: 3px 6px; 341 | border-radius: 16px; 342 | text-align: center; 343 | transition: transform 0.2s, box-shadow 0.2s; 344 | } 345 | 346 | #detection-label.benign { 347 | background: linear-gradient(90deg, #4CAF50, #4CAF50); 348 | } 349 | 350 | #detection-label.phishing { 351 | background: linear-gradient(90deg, #F44336, #F44336); 352 | } 353 | 354 | #detection-label.unknown { 355 | background: linear-gradient(90deg, #9E9E9E, #9E9E9E); 356 | } 357 | 358 | #detection-explanation { 359 | font-size: 14px; 360 | color: #333; 361 | } 362 | 363 | .separator { 364 | width: 100%; 365 | height: 2px; 366 | background-color: #ddd; 367 | margin: 10px 0; 368 | } 369 | 370 | 371 | .tasks-list { 372 | list-style: none; 373 | padding: 0; 374 | margin: 0; 375 | } 376 | 377 | .tasks-list li { 378 | display: flex; 379 | align-items: center; 380 | justify-content: flex-start; 381 | padding: 8px 0; 382 | border-bottom: 1px solid #eee; 383 | } 384 | 385 | .tasks-list li:last-child { 386 | border-bottom: none; 387 | } 388 | 389 | .icon { 390 | margin-right: 8px; 391 | font-size: 16px; 392 | } 393 | 394 | .task { 395 | font-size: 14px; 396 | color: #555; 397 | margin-right: 12px; 398 | } 399 | 400 | .result { 401 | font-size: 14px; 402 | color: #5b5b5b; 403 | background-color: #cdcdcd; 404 | padding: 3px 6px; 405 | border-radius: 10px; 406 | } 407 | 408 | #detection-explanation { 409 | font-family: Arial, sans-serif; 410 | font-size: 14px; 411 | line-height: 1.8; 412 | color: #333; 413 | background-color: #f9f9f9; 414 | padding: 
@app.route('/upload', methods=['POST'])
def upload_file():
    """Handle a screenshot upload and store it in the upload folder.

    Expects a multipart form with the file under the ``image`` field.

    Returns:
        A ``(json, status)`` pair: ``{'success': True, 'imageUrl': ...}`` with
        200 on success, or ``{'error': ...}`` with 400 when the request has no
        file, the type is not allowed, or the file name is unsafe.
    """
    if 'image' not in request.files:
        return jsonify({'error': 'No file part'}), 400
    file = request.files['image']

    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    if not (file and allowed_file(file.filename)):
        return jsonify({'error': 'Invalid file type'}), 400

    filename = file.filename
    # os.altsep is None on POSIX; testing ``None in filename`` raises
    # TypeError, so drop empty separators. Both slash styles are rejected on
    # every platform because the name comes from the client.
    separators = {sep for sep in (os.sep, os.altsep, '/', '\\') if sep}
    if (filename.count('.') > 1
            or any(sep in filename for sep in separators)
            or '..' in filename):
        return jsonify({'error': 'Invalid file name'}), 400

    # Compare normalized directories instead of a raw string prefix: normpath
    # rewrites '/' to '\\' on Windows, which made the prefix test always fail.
    upload_root = os.path.normpath(app.config['UPLOAD_FOLDER'])
    file_path = os.path.normpath(os.path.join(upload_root, filename))
    if os.path.dirname(file_path) != upload_root:
        return jsonify({'error': 'Invalid file path'}), 400

    file.save(file_path)
    return jsonify({'success': True, 'imageUrl': f'/uploads/{filename}'}), 200
@app.route('/view-file', methods=['GET'])
def view_file():
    """Serve one image from the target-list directory.

    Reads the relative path from the ``file`` query parameter.

    Returns:
        The image response for ``.png``/``.jpeg``/``.jpg`` files, or a
        ``(json, status)`` error: 400 for a missing parameter, a path outside
        the root, or an unsupported extension; 404 when the file is absent.
    """
    file_name = request.args.get('file')
    if not file_name:
        # Without this guard os.path.join(root, None) raises TypeError and the
        # client receives an opaque HTTP 500.
        return jsonify({'error': 'No file specified'}), 400

    root_path = os.path.abspath(app.config['FILE_TREE_ROOT'])
    file_path = os.path.abspath(os.path.join(root_path, file_name))
    # A plain startswith() test also accepts sibling directories that merely
    # share the prefix (e.g. "<root>_backup"); compare path components instead.
    try:
        inside_root = os.path.commonpath([root_path, file_path]) == root_path
    except ValueError:  # raised e.g. for paths on different Windows drives
        inside_root = False
    if not inside_root:
        return jsonify({'error': 'Invalid file path'}), 400

    if not os.path.exists(file_path):
        return jsonify({'error': 'File not found'}), 404

    if file_name.lower().endswith(('.png', '.jpeg', '.jpg')):
        return send_from_directory(app.config['FILE_TREE_ROOT'], file_name)

    return jsonify({'error': 'Unsupported file type'}), 400
| def del_logo(): 208 | directory = request.form.get('directory') 209 | filename = request.form.get('filename') 210 | 211 | if not directory or not filename: 212 | return jsonify({'success': False, 'error': 'Directory and filename must be specified'}), 400 213 | 214 | directory_path = os.path.join(app.config['FILE_TREE_ROOT'], directory) 215 | directory_path = os.path.normpath(directory_path) 216 | if not directory_path.startswith(app.config['FILE_TREE_ROOT']): 217 | return jsonify({'success': False, 'error': 'Invalid directory path'}), 400 218 | file_path = os.path.join(directory_path, filename) 219 | file_path = os.path.normpath(file_path) 220 | if not file_path.startswith(directory_path): 221 | return jsonify({'success': False, 'error': 'Invalid file path'}), 400 222 | 223 | if not os.path.exists(file_path): 224 | return jsonify({'success': False, 'error': 'File does not exist'}), 400 225 | 226 | try: 227 | os.remove(file_path) 228 | return jsonify({'success': True, 'message': 'Logo deleted successfully'}), 200 229 | except Exception: 230 | return jsonify({'success': False}), 500 231 | 232 | 233 | @app.route('/add-brand', methods=['POST']) 234 | def add_brand(): 235 | brand_name = request.form.get('brandName') 236 | brand_domain = request.form.get('brandDomain') 237 | 238 | if not brand_name or not brand_domain: 239 | return jsonify({'success': False, 'error': 'Brand name and domain must be specified'}), 400 240 | 241 | # 创建品牌目录 242 | brand_directory_path = os.path.join(app.config['FILE_TREE_ROOT'], brand_name) 243 | brand_directory_path = os.path.normpath(brand_directory_path) 244 | if not brand_directory_path.startswith(app.config['FILE_TREE_ROOT']): 245 | return jsonify({'success': False, 'error': 'Invalid brand directory path'}), 400 246 | 247 | if os.path.exists(brand_directory_path): 248 | return jsonify({'success': False, 'error': 'Brand already exists'}), 400 249 | 250 | try: 251 | os.makedirs(brand_directory_path) 252 | domain_map_add(brand_name, 
@app.route('/reload-model', methods=['POST'])
def reload_model():
    """Rebuild the target-list cache and re-create the Phishpedia wrapper.

    Replaces the module-level ``phishpedia_cls`` so later ``/detect`` calls
    use the refreshed reference list.

    Returns:
        ``({'success': True, 'message': ...}, 200)`` on success, or
        ``({'success': False, 'error': ...}, 500)`` if reloading fails.
    """
    global phishpedia_cls
    try:
        load_config(reload_targetlist=True)
        # Reinitialize Phishpedia with the freshly loaded configuration.
        phishpedia_cls = PhishpediaWrapper()
    except Exception:
        # Top-level request boundary: record the traceback instead of
        # silently discarding it, then report the failure to the client.
        app.logger.exception('Failed to reload model')
        return jsonify({'success': False, 'error': 'Failed to reload model'}), 500
    # The original message was copy-pasted from the brand-deletion endpoint.
    return jsonify({'success': True, 'message': 'Model reloaded successfully'}), 200
// sidebar.js
// Vue instance driving the brand/logo management sidebar: it renders the
// target-list file tree and talks to the Flask endpoints that add or delete
// brands and logos and reload the model.
new Vue({
    el: '#sidebar',
    data() {
        return {
            fileTree: [],               // last tree returned by /get-directory
            selectedDirectory: null,    // DOM element of the selected directory
            selectedFile: null,         // DOM element of the selected file
            selectedDirectoryName: '',
            selectedFileName: '',
            showAddBrandForm: false,    // controls visibility of the add-brand form
            brandName: '',              // brand name entered in the form
            brandDomain: '',            // brand domain entered in the form
        }
    },
    mounted() {
        // Load the file tree as soon as the page is ready.
        this.fetchFileTree();
        document.getElementById('logo-file-input').addEventListener('change', this.handleLogoFileSelect);

        const sidebar = document.getElementById("sidebar");
        const sidebarToggle = document.getElementById("sidebar-toggle");
        const closeSidebar = document.getElementById("close-sidebar");

        // Open the sidebar.
        sidebarToggle.addEventListener("click", () => {
            sidebar.classList.add("open");
        });

        // Close the sidebar via its close button.
        closeSidebar.addEventListener("click", () => {
            sidebar.classList.remove("open");
            this.clearSelected();
        });

        // Close the sidebar when clicking anywhere outside of it.
        document.addEventListener("click", (event) => {
            if (!sidebar.contains(event.target) && !sidebarToggle.contains(event.target)) {
                sidebar.classList.remove("open");
                this.clearSelected();
            }
        });
    },
    methods: {
        // Render the file tree (recursively) into #file-tree-root.
        renderFileTree(directory, parentPath = '') {
            const fileTreeRoot = document.getElementById('file-tree-root');
            fileTreeRoot.innerHTML = ''; // drop the previously rendered tree

            // Build the <li> node for one entry, recursing into directories.
            const createFileTreeNode = (item, parentPath) => {
                const li = document.createElement('li');
                li.classList.add('file-item');

                const currentPath = parentPath ? `${parentPath}/${item.name}` : item.name;

                if (item.type === 'directory') {
                    li.classList.add('file-folder');

                    const folderNameContainer = document.createElement('div');
                    folderNameContainer.classList.add('folder-name');
                    // textContent, not innerHTML: names are user-supplied and
                    // must never be interpreted as markup.
                    folderNameContainer.textContent = `📁${item.name}`;
                    li.appendChild(folderNameContainer);

                    if (item.children) {
                        const ul = document.createElement('ul');
                        ul.classList.add('hidden'); // children are collapsed by default
                        item.children.forEach((child) => {
                            ul.appendChild(createFileTreeNode(child, currentPath)); // pass this directory's path down
                        });
                        li.appendChild(ul);

                        // Single click selects the directory.
                        folderNameContainer.addEventListener('click', (e) => {
                            e.stopPropagation();
                            this.selectDirectory(e, item.name);
                        });

                        // Double click expands/collapses the directory.
                        folderNameContainer.addEventListener('dblclick', (e) => {
                            e.stopPropagation();
                            ul.classList.toggle('hidden');
                        });
                    }
                } else {
                    li.classList.add('file-file');
                    li.textContent = `📄${item.name}`;

                    // Single click selects the file.
                    li.addEventListener('click', (event) => {
                        this.selectFile(event, item.name, parentPath);
                    });
                }

                return li;
            };

            // Render the top-level entries.
            directory.forEach((item) => {
                fileTreeRoot.appendChild(createFileTreeNode(item, parentPath));
            });
        },

        // Fetch the file tree from the backend and render it.
        fetchFileTree() {
            fetch('/get-directory')
                .then((response) => response.json())
                .then((data) => {
                    if (data.file_tree) {
                        this.fileTree = data.file_tree;
                        this.renderFileTree(this.fileTree);
                    } else {
                        console.error('Invalid file tree data');
                        alert('文件树加载失败');
                    }
                })
                .catch((error) => {
                    console.error('Error fetching file tree:', error);
                    alert('无法加载文件树,请稍后重试。');
                });
        },

        // Mark a directory as the current selection.
        selectDirectory(event, directoryName) {
            const folderNameContainer = event.currentTarget;

            if (this.selectedDirectory) {
                this.selectedDirectory.classList.remove('selected');
            }
            if (this.selectedFile) {
                this.selectedFile.classList.remove('selected');
            }

            this.selectedDirectory = folderNameContainer;
            this.selectedDirectoryName = directoryName;
            folderNameContainer.classList.add('selected');
            this.selectedFile = null;
            this.selectedFileName = '';
        },

        // Mark a file as the current selection; its parent path is remembered
        // as the directory name so that delLogo can address the file.
        selectFile(event, fileName, parentPath) {
            const fileElement = event.currentTarget;

            if (this.selectedDirectory) {
                this.selectedDirectory.classList.remove('selected');
            }
            if (this.selectedFile) {
                this.selectedFile.classList.remove('selected');
            }

            this.selectedFile = fileElement;
            this.selectedFileName = fileName;
            fileElement.classList.add('selected');
            this.selectedDirectory = null;
            this.selectedDirectoryName = parentPath;
        },

        // Show the add-brand form.
        addBrand() {
            this.showAddBrandForm = true;
        },

        // Hide the add-brand form and reset its fields.
        closeAddBrandForm() {
            this.showAddBrandForm = false;
            this.brandName = '';
            this.brandDomain = '';
        },

        // Submit the add-brand form to the backend.
        submitAddBrandForm() {
            if (!this.brandName || !this.brandDomain) {
                alert('Please fill in all fields.');
                // Must go through `this`: a bare closeAddBrandForm() is an
                // undefined identifier and throws ReferenceError.
                this.closeAddBrandForm();
                return;
            }

            const formData = new FormData();
            formData.append('brandName', this.brandName);
            formData.append('brandDomain', this.brandDomain);

            fetch('/add-brand', {
                method: 'POST',
                body: formData
            })
                .then(response => response.json())
                .then(data => {
                    if (data.success) {
                        alert('Brand added successfully.');
                        this.fetchFileTree();
                        this.closeAddBrandForm();
                    } else {
                        alert('Failed to add brand: ' + data.error);
                    }
                })
                .catch(error => {
                    console.error('Error:', error);
                    alert('Failed to add brand, please try again.');
                });
        },

        // Delete the selected brand directory.
        delBrand() {
            if (this.selectedDirectory == null) {
                alert('Please select a brand first.');
                return;
            }

            fetch('/del-brand', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json'
                },
                body: JSON.stringify({
                    directory: this.selectedDirectoryName
                })
            })
                .then(response => response.json())
                .then(data => {
                    if (data.success) {
                        alert('Brand deleted successfully.');
                        // The selected brand no longer exists.
                        this.clearSelected();
                        this.fetchFileTree();
                    } else {
                        alert('Failed to delete brand: ' + data.error);
                    }
                })
                .catch(error => {
                    console.error('Error:', error);
                    alert('Failed to delete brand, please try again.');
                });
        },

        // Open the file picker to add a logo to the selected brand.
        addLogo() {
            console.log('addLogo');
            if (this.selectedDirectory == null) {
                alert('Please select a brand first.');
                return;
            }
            document.getElementById('logo-file-input').click();
        },

        // Upload the logo chosen in the file picker.
        handleLogoFileSelect(event) {
            const file = event.target.files[0];
            if (file) {
                const formData = new FormData();
                formData.append('logo', file);
                formData.append('directory', this.selectedDirectoryName);
                // Reset the input so choosing the same file again still
                // fires a 'change' event.
                event.target.value = '';

                fetch('/add-logo', {
                    method: 'POST',
                    body: formData
                })
                    .then(response => response.json())
                    .then(data => {
                        if (data.success) {
                            this.fetchFileTree();
                        } else {
                            alert('Failed to add logo: ' + data.error);
                        }
                    })
                    .catch(error => {
                        console.error('Error:', error);
                        alert('Failed to add logo, please try again.');
                    });
            }
        },

        // Delete the selected logo file.
        delLogo() {
            if (this.selectedFile == null) {
                alert('Please select a logo first.');
                return;
            }

            const formData = new FormData();
            formData.append('directory', this.selectedDirectoryName);
            formData.append('filename', this.selectedFileName);

            fetch('/del-logo', {
                method: 'POST',
                body: formData
            })
                .then(response => response.json())
                .then(data => {
                    if (data.success) {
                        this.fetchFileTree();
                    } else {
                        alert('Failed to delete logo: ' + data.error);
                    }
                })
                .catch(error => {
                    console.error('Error:', error);
                    alert('Failed to delete logo, please try again.');
                });
        },

        // Ask the backend to reload the model; an overlay blocks the UI
        // while the request is in flight.
        async reloadModel() {
            const overlay = document.getElementById('overlay');

            overlay.style.display = 'flex';

            try {
                const response = await fetch('/reload-model', {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/json'
                    }
                });
                const data = await response.json();
                // The server reports failures in the JSON body, so a resolved
                // fetch alone does not mean the reload worked.
                if (!data.success) {
                    alert('Failed to reload model.');
                }
            } catch (error) {
                alert('Failed to reload model.');
            } finally {
                overlay.style.display = 'none';
            }
        },

        // Clear the current selection and its highlight.
        clearSelected() {
            if (this.selectedDirectory) {
                this.selectedDirectory.classList.remove('selected');
                // Reset the state field; assigning to `selectDirectory`
                // would overwrite the method of the same name.
                this.selectedDirectory = null;
            }
            if (this.selectedFile) {
                this.selectedFile.classList.remove('selected');
                this.selectedFile = null;
            }
            this.selectedDirectoryName = '';
            this.selectedFileName = '';
        },
    }
});
26 | ".aw", 27 | ".ac", 28 | ".au", 29 | ".at", 30 | ".az", 31 | ".bs", 32 | ".bh", 33 | ".bd", 34 | ".bb", 35 | ".eus", 36 | ".by", 37 | ".be", 38 | ".bz", 39 | ".bj", 40 | ".bm", 41 | ".bt", 42 | ".bo", 43 | ".bq",".an",".nl", 44 | ".ba", 45 | ".bw", 46 | ".bv", 47 | ".br", 48 | ".io", 49 | ".vg", 50 | ".bn", 51 | ".bg", 52 | ".bf", 53 | ".mm", 54 | ".bi", 55 | ".kh", 56 | ".cm", 57 | ".ca", 58 | ".cv", 59 | ".cat", 60 | ".ky", 61 | ".cf", 62 | ".td", 63 | ".cl", 64 | ".cn", 65 | ".cx", 66 | ".cc", 67 | ".co", 68 | ".km", 69 | ".cd", 70 | ".cg", 71 | ".ck", 72 | ".cr", 73 | ".ci", 74 | ".hr", 75 | ".cu", 76 | ".cw", 77 | ".cy", 78 | ".cz", 79 | ".dk", 80 | ".dj", 81 | ".dm", 82 | ".do", 83 | ".tl",".tp", 84 | ".ec", 85 | ".eg", 86 | ".sv", 87 | ".gq", 88 | ".er", 89 | ".ee", 90 | ".et", 91 | ".eu", 92 | ".fk", 93 | ".fo", 94 | ".fm", 95 | ".fj", 96 | ".fi", 97 | ".fr", 98 | ".gf", 99 | ".pf", 100 | ".tf", 101 | ".ga", 102 | ".gal", 103 | ".gm", 104 | ".ps", 105 | ".ge", 106 | ".de", 107 | ".gh", 108 | ".gi", 109 | ".gr", 110 | ".gl", 111 | ".gd", 112 | ".gp", 113 | ".gu", 114 | ".gt", 115 | ".gg", 116 | ".gn", 117 | ".gw", 118 | ".gy", 119 | ".ht", 120 | ".hm", 121 | ".hn", 122 | ".hk", 123 | ".hu", 124 | ".is", 125 | ".in", 126 | ".id", 127 | ".ir", 128 | ".iq", 129 | ".ie", 130 | ".im", 131 | ".il", 132 | ".it", 133 | ".jm", 134 | ".jp", 135 | ".je", 136 | ".jo", 137 | ".kz", 138 | ".ke", 139 | ".ki", 140 | ".kw", 141 | ".kg", 142 | ".la", 143 | ".lv", 144 | ".lb", 145 | ".ls", 146 | ".lr", 147 | ".ly", 148 | ".li", 149 | ".lt", 150 | ".lu", 151 | ".mo", 152 | ".mk", 153 | ".mg", 154 | ".mw", 155 | ".my", 156 | ".mv", 157 | ".ml", 158 | ".mt", 159 | ".mh", 160 | ".mq", 161 | ".mr", 162 | ".mu", 163 | ".yt", 164 | ".mx", 165 | ".md", 166 | ".mc", 167 | ".mn", 168 | ".me", 169 | ".ms", 170 | ".ma", 171 | ".mz", 172 | ".mm", 173 | ".na", 174 | ".nr", 175 | ".np", 176 | ".nl", 177 | ".nc", 178 | ".nz", 179 | ".ni", 180 | ".ne", 181 | ".ng", 182 | ".nu", 183 | 
".nf", 184 | ".nc",".tr", 185 | ".kp", 186 | ".mp", 187 | ".no", 188 | ".om", 189 | ".pk", 190 | ".pw", 191 | ".ps", 192 | ".pa", 193 | ".pg", 194 | ".py", 195 | ".pe", 196 | ".ph", 197 | ".pn", 198 | ".pl", 199 | ".pt", 200 | ".pr", 201 | ".qa", 202 | ".ro", 203 | ".ru", 204 | ".rw", 205 | ".re", 206 | ".bq",".an", 207 | ".bl",".gp",".fr", 208 | ".sh", 209 | ".kn", 210 | ".lc", 211 | ".mf",".gp",".fr", 212 | ".pm", 213 | ".vc", 214 | ".ws", 215 | ".sm", 216 | ".st", 217 | ".sa", 218 | ".sn", 219 | ".rs", 220 | ".sc", 221 | ".sl", 222 | ".sg", 223 | ".bq",".an",".nl", 224 | ".sx",".an", 225 | ".sk", 226 | ".si", 227 | ".sb", 228 | ".so", 229 | ".so", 230 | ".za", 231 | ".gs", 232 | ".kr", 233 | ".ss", 234 | ".es", 235 | ".lk", 236 | ".sd", 237 | ".sr", 238 | ".sj", 239 | ".sz", 240 | ".se", 241 | ".ch", 242 | ".sy", 243 | ".tw", 244 | ".tj", 245 | ".tz", 246 | ".th", 247 | ".tg", 248 | ".tk", 249 | ".to", 250 | ".tt", 251 | ".tn", 252 | ".tr", 253 | ".tm", 254 | ".tc", 255 | ".tv", 256 | ".ug", 257 | ".ua", 258 | ".ae", 259 | ".uk", 260 | ".us", 261 | ".vi", 262 | ".uy", 263 | ".uz", 264 | ".vu", 265 | ".va", 266 | ".ve", 267 | ".vn", 268 | ".wf", 269 | ".eh", 270 | ".ma", 271 | ".ye", 272 | ".zm", 273 | ".zw" 274 | ] 275 | 276 | def check_domain_brand_inconsistency(logo_boxes, 277 | domain_map_path: str, 278 | model, logo_feat_list, 279 | file_name_list, shot_path: str, 280 | url: str, similarity_threshold: float, 281 | topk: float = 3): 282 | # targetlist domain list 283 | with open(domain_map_path, 'rb') as handle: 284 | domain_map = pickle.load(handle) 285 | 286 | print('Number of logo boxes:', len(logo_boxes)) 287 | suffix_part = '.'+ tldextract.extract(url).suffix 288 | domain_part = tldextract.extract(url).domain 289 | extracted_domain = domain_part + suffix_part 290 | matched_target, matched_domain, matched_coord, this_conf = None, None, None, None 291 | 292 | if len(logo_boxes) > 0: 293 | # siamese prediction for logo box 294 | for i, coord in 
enumerate(logo_boxes): 295 | 296 | if i == topk: 297 | break 298 | 299 | min_x, min_y, max_x, max_y = coord 300 | bbox = [float(min_x), float(min_y), float(max_x), float(max_y)] 301 | matched_target, matched_domain, this_conf = pred_brand(model, domain_map, 302 | logo_feat_list, file_name_list, 303 | shot_path, bbox, 304 | similarity_threshold=similarity_threshold, 305 | grayscale=False, 306 | do_aspect_ratio_check=False, 307 | do_resolution_alignment=False) 308 | 309 | # print(target_this, domain_this, this_conf) 310 | # domain matcher to avoid FP 311 | if matched_target and matched_domain: 312 | matched_coord = coord 313 | matched_domain_parts = [tldextract.extract(x).domain for x in matched_domain] 314 | matched_suffix_parts = [tldextract.extract(x).suffix for x in matched_domain] 315 | 316 | # If the webpage domain exactly aligns with the target website's domain => Benign 317 | if extracted_domain in matched_domain: 318 | matched_target, matched_domain = None, None # Clear if domains are consistent 319 | elif domain_part in matched_domain_parts: # # elIf only the 2nd-level-domains align, and the tld is regional => Benign 320 | if "." 
+ suffix_part.split('.')[-1] in COUNTRY_TLDs: 321 | matched_target, matched_domain = None, None 322 | else: 323 | break # Inconsistent domain found, break the loop 324 | else: 325 | break # Inconsistent domain found, break the loop 326 | 327 | return brand_converter(matched_target), matched_domain, matched_coord, this_conf 328 | 329 | 330 | def load_model_weights(num_classes: int, weights_path: str): 331 | ''' 332 | :param num_classes: number of protected brands 333 | :param weights_path: siamese weights 334 | :return model: siamese model 335 | ''' 336 | # Initialize model 337 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 338 | model = KNOWN_MODELS["BiT-M-R50x1"](head_size=num_classes, zero_head=True) 339 | 340 | # Load weights 341 | weights = torch.load(weights_path, map_location='cpu') 342 | weights = weights['model'] if 'model' in weights.keys() else weights 343 | new_state_dict = OrderedDict() 344 | for k, v in weights.items(): 345 | if 'module.' in k: 346 | name = k.split('module.')[1] 347 | else: 348 | name = k 349 | new_state_dict[name] = v 350 | 351 | model.load_state_dict(new_state_dict) 352 | model.to(device) 353 | model.eval() 354 | return model 355 | 356 | 357 | def cache_reference_list(model, targetlist_path: str, grayscale=False): 358 | ''' 359 | cache the embeddings of the reference list 360 | :param targetlist_path: targetlist folder 361 | :param grayscale: convert logo to grayscale or not, default is RGB 362 | :return logo_feat_list: targetlist embeddings 363 | :return file_name_list: targetlist paths 364 | ''' 365 | 366 | # Prediction for targetlists 367 | logo_feat_list = [] 368 | file_name_list = [] 369 | 370 | target_list = os.listdir(targetlist_path) 371 | for target in tqdm(target_list): 372 | if target.startswith('.'): # skip hidden files 373 | continue 374 | logo_list = os.listdir(os.path.join(targetlist_path, target)) 375 | for logo_path in logo_list: 376 | # List of valid image extensions 377 | valid_extensions = ['.png', 
def cache_reference_list(model, targetlist_path: str, grayscale=False):
    '''
    cache the embeddings of the reference list
    :param model: model used to compute the embeddings
    :param targetlist_path: targetlist folder, one sub-folder per target containing its logo images
    :param grayscale: convert logo to grayscale or not, default is RGB
    :return logo_feat_list: targetlist embeddings
    :return file_name_list: targetlist paths, aligned index by index with logo_feat_list
    '''
    # Extensions are matched case-insensitively against the lower-cased file name
    valid_extensions = ('.png', '.jpeg', '.jpg')
    # Files with these prefixes are skipped (homepage/loginpage)
    skip_prefixes = ('loginpage', 'homepage')

    # Prediction for targetlists
    logo_feat_list = []
    file_name_list = []

    for target in tqdm(os.listdir(targetlist_path)):
        if target.startswith('.'):  # skip hidden files
            continue
        target_dir = os.path.join(targetlist_path, target)
        if not os.path.isdir(target_dir):  # a stray regular file cannot be listed
            continue
        for logo_name in os.listdir(target_dir):
            if not logo_name.lower().endswith(valid_extensions):
                continue
            if logo_name.startswith(skip_prefixes):  # skip homepage/loginpage
                continue
            logo_file = os.path.join(target_dir, logo_name)
            try:
                logo_feat = get_embedding(img=logo_file, model=model, grayscale=grayscale)
            except OSError:
                print(f"Error opening image: {logo_file}")
                continue
            # Append to both lists only after the embedding succeeded so they stay aligned
            logo_feat_list.append(logo_feat)
            file_name_list.append(str(logo_file))

    return logo_feat_list, file_name_list


@torch.no_grad()
def get_embedding(img, model, grayscale=False):
    '''
    Inference for a single image
    :param img: image path in str or image in PIL.Image
    :param model: model to make inference
    :param grayscale: convert image to grayscale or not
    :return feature embedding of shape (2048,)
    '''
    # img_size = 224
    img_size = 128
    mean = [0.5, 0.5, 0.5]
    std = [0.5, 0.5, 0.5]
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    img_transforms = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize(mean=mean, std=std),
         ])

    if isinstance(img, str):
        # Close the file handle as soon as the pixels have been converted into a new image
        with Image.open(img) as opened:
            img = opened.convert("L").convert("RGB") if grayscale else opened.convert("RGB")
    else:
        img = img.convert("L").convert("RGB") if grayscale else img.convert("RGB")

    ## Pad the image to a square so the resize below keeps the original aspect ratio
    # The image is RGB in both modes at this point, so the white fill has to be an RGB
    # triple; a bare integer on a multi-band image is not read as "white" by Pillow.
    pad_color = (255, 255, 255)
    longest_side = max(img.size)
    pad_x = (longest_side - img.size[0]) // 2
    pad_y = (longest_side - img.size[1]) // 2
    img = ImageOps.expand(img, (pad_x, pad_y, pad_x, pad_y), fill=pad_color)

    img = img.resize((img_size, img_size))

    # Predict the embedding
    img = img_transforms(img)
    img = img[None, ...].to(device)  # add the batch dimension
    logo_feat = model.features(img)
    logo_feat = l2_norm(logo_feat).squeeze(0).cpu().numpy()  # L2-normalization final shape is (2048,)

    return logo_feat
def chunked_dot(logo_feat_list, img_feat, chunk_size=128):
    '''
    Dot product between the reference embeddings and a query embedding, computed one slice at a time
    :param logo_feat_list: array of reference embeddings; must expose ``.shape`` and be sliceable along its first axis
    :param img_feat: query embedding(s); its transpose is the right-hand operand of every dot product
    :param chunk_size: number of reference rows multiplied per step
    :return: Python list holding the first-axis entries of every per-slice product, in the original row order
    '''
    n_refs = logo_feat_list.shape[0]
    return [
        row_sim
        for lo in range(0, n_refs, chunk_size)
        for row_sim in np.dot(logo_feat_list[lo:lo + chunk_size], img_feat.T)
    ]
def pred_brand(model, domain_map, logo_feat_list, file_name_list, shot_path: str, gt_bbox, similarity_threshold,
               grayscale=False,
               do_resolution_alignment=True,
               do_aspect_ratio_check=True):
    '''
    Return predicted brand for one cropped image
    :param model: model to use
    :param domain_map: brand-domain dictionary
    :param logo_feat_list: reference logo feature embeddings
    :param file_name_list: reference logo paths
    :param shot_path: path to the screenshot
    :param gt_bbox: 1x4 np.ndarray/list/tensor bounding box coords
    :param similarity_threshold: similarity threshold for siamese
    :param do_resolution_alignment: if the similarity does not exceed the threshold, do we align their resolutions to have a retry
    :param do_aspect_ratio_check: once two logos are similar, whether we want to a further check on their aspect ratios
    :param grayscale: convert image(cropped) to grayscale or not
    :return: predicted target, predicted target's domain, similarity of the accepted match.
             (None, None, top-1 similarity) when no candidate is accepted;
             (None, None, None) when the screenshot cannot be opened or there is no reference logo
    '''

    try:
        img = Image.open(shot_path)
    except OSError:  # if the image cannot be identified, return nothing
        print('Screenshot cannot be open')
        return None, None, None

    # get predicted box --> crop from screenshot, then release the screenshot file handle
    with img:
        cropped = img.crop((gt_bbox[0], gt_bbox[1], gt_bbox[2], gt_bbox[3]))
    img_feat = get_embedding(cropped, model, grayscale=grayscale)

    # get cosine similarity with every protected logo
    sim_list = chunked_dot(logo_feat_list, img_feat)  # take dot product for every pair of embeddings (Cosine Similarity)
    pred_brand_list = file_name_list

    assert len(sim_list) == len(pred_brand_list)

    if len(sim_list) == 0:  # nothing to match against
        return None, None, None

    # get top 3 brands (fewer when the reference list holds fewer than 3 logos)
    idx = np.argsort(sim_list)[::-1][:3]
    pred_brand_list = np.array(pred_brand_list)[idx]
    sim_list = np.array(sim_list)[idx]

    # top1,2,3 candidate logos; the brand name is the folder that contains the logo file
    top3_brandlist = [brand_converter(os.path.basename(os.path.dirname(x))) for x in pred_brand_list]
    top3_domainlist = [domain_map[x] for x in top3_brandlist]
    top3_simlist = sim_list

    for j in range(len(top3_brandlist)):
        predicted_brand, predicted_domain = None, None

        # If we are trying those lower rank logo, the predicted brand of them should be the same as top1 logo, otherwise might be false positive
        if top3_brandlist[j] != top3_brandlist[0]:
            continue

        # If the largest similarity exceeds threshold
        if top3_simlist[j] >= similarity_threshold:
            predicted_brand = top3_brandlist[j]
            predicted_domain = top3_domainlist[j]
            final_sim = top3_simlist[j]

        # Else if not exceed, try resolution alignment, see if can improve
        elif do_resolution_alignment:
            # Keep the logo file open until both embeddings are computed: the aligned
            # logo may still be backed by the opened file.
            with Image.open(pred_brand_list[j]) as orig_candidate_logo:
                # NOTE(review): `cropped` is replaced by its aligned version, which is also what
                # later iterations and the aspect ratio check below see -- confirm this is intended.
                cropped, candidate_logo = resolution_alignment(cropped, orig_candidate_logo)
                img_feat = get_embedding(cropped, model, grayscale=grayscale)
                logo_feat = get_embedding(candidate_logo, model, grayscale=grayscale)
            final_sim = logo_feat.dot(img_feat)
            if final_sim >= similarity_threshold:
                predicted_brand = top3_brandlist[j]
                predicted_domain = top3_domainlist[j]
            else:
                break  # no hope, do not try other lower rank logos

        ## If there is a prediction, do aspect ratio check
        if predicted_brand is not None:
            if do_aspect_ratio_check:
                # Only the size is needed, so the file can be closed right away
                with Image.open(pred_brand_list[j]) as orig_candidate_logo:
                    logo_width, logo_height = orig_candidate_logo.size
                ratio_crop = cropped.size[0] / cropped.size[1]
                ratio_logo = logo_width / logo_height
                # aspect ratios of matched pair must not deviate by more than factor of 2.5
                if max(ratio_crop, ratio_logo) / min(ratio_crop, ratio_logo) > 2.5:
                    continue  # did not pass aspect ratio check, try other
            return predicted_brand, predicted_domain, final_sim

    return None, None, top3_simlist[0]