├── index.db ├── data ├── synth_db │ ├── base_1.jpg │ ├── base_2.jpg │ ├── base_3.jpg │ ├── base_4.jpg │ └── base_5.jpg ├── synth_new │ ├── new_1_copy.jpg │ ├── new_1_crop.jpg │ ├── new_1_flip.jpg │ ├── new_1_ps.jpg │ ├── new_1_rot.jpg │ ├── new_2_copy.jpg │ ├── new_2_crop.jpg │ ├── new_2_flip.jpg │ ├── new_2_ps.jpg │ ├── new_2_rot.jpg │ ├── new_3_copy.jpg │ ├── new_3_crop.jpg │ ├── new_3_flip.jpg │ ├── new_3_ps.jpg │ ├── new_3_rot.jpg │ ├── new_4_copy.jpg │ ├── new_4_crop.jpg │ ├── new_4_flip.jpg │ ├── new_4_ps.jpg │ ├── new_4_rot.jpg │ ├── new_5_copy.jpg │ ├── new_5_crop.jpg │ ├── new_5_flip.jpg │ ├── new_5_ps.jpg │ ├── new_5_rot.jpg │ ├── new_1_bright.jpg │ ├── new_1_jpeg30.jpg │ ├── new_2_bright.jpg │ ├── new_2_jpeg30.jpg │ ├── new_3_bright.jpg │ ├── new_3_jpeg30.jpg │ ├── new_4_bright.jpg │ ├── new_4_jpeg30.jpg │ ├── new_5_bright.jpg │ ├── new_5_jpeg30.jpg │ ├── new_unique_1.jpg │ ├── new_unique_2.jpg │ ├── new_unique_3.jpg │ ├── new_unique_4.jpg │ └── new_unique_5.jpg └── synth_labels.csv ├── reports ├── new_1_copy__VS__base_1.jpg ├── new_1_crop__VS__base_1.jpg ├── new_1_flip__VS__base_1.jpg ├── new_1_ps__VS__base_1.jpg ├── new_2_copy__VS__base_2.jpg ├── new_2_copy__VS__base_3.jpg ├── new_2_crop__VS__base_2.jpg ├── new_2_flip__VS__base_2.jpg ├── new_2_flip__VS__base_3.jpg ├── new_2_ps__VS__base_2.jpg ├── new_3_copy__VS__base_2.jpg ├── new_3_copy__VS__base_3.jpg ├── new_3_crop__VS__base_3.jpg ├── new_3_flip__VS__base_2.jpg ├── new_3_flip__VS__base_3.jpg ├── new_3_ps__VS__base_1.jpg ├── new_3_ps__VS__base_2.jpg ├── new_3_ps__VS__base_3.jpg ├── new_4_copy__VS__base_4.jpg ├── new_4_crop__VS__base_4.jpg ├── new_4_flip__VS__base_4.jpg ├── new_4_ps__VS__base_4.jpg ├── new_5_copy__VS__base_5.jpg ├── new_5_crop__VS__base_5.jpg ├── new_5_flip__VS__base_5.jpg ├── new_5_ps__VS__base_5.jpg ├── new_5_rot__VS__base_5.jpg ├── new_1_bright__VS__base_1.jpg ├── new_2_bright__VS__base_2.jpg ├── new_2_jpeg30__VS__base_2.jpg ├── new_3_bright__VS__base_3.jpg ├── new_3_jpeg30__VS__base_3.jpg ├── new_4_bright__VS__base_4.jpg ├── new_4_jpeg30__VS__base_4.jpg ├── new_5_bright__VS__base_5.jpg ├── new_5_jpeg30__VS__base_5.jpg ├── tune_out │ └── tune_results.csv └── dup_report.csv ├── __pycache__ └── duplicate_check.cpython-313.pyc ├── tools ├── __pycache__ │ └── tune_thresholds.cpython-312.pyc ├── generate_synthetic.py ├── tune_thresholds.py └── verify_synthetic.py ├── duplicate_check ├── __pycache__ │ ├── indexer.cpython-312.pyc │ ├── matcher.cpython-312.pyc │ ├── report.cpython-312.pyc │ ├── __init__.cpython-312.pyc │ └── features.cpython-312.pyc ├── __init__.py ├── report.py ├── indexer.py ├── features.py └── matcher.py ├── requirements.txt ├── config.yaml ├── tests ├── test_matcher.py └── test_features.py ├── LICENSE ├── duplicate_check.py ├── run_smoke.py ├── dupcheck_cli.py ├── README_zh.md ├── README_en.md └── README.md /index.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/index.db -------------------------------------------------------------------------------- /data/synth_db/base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_db/base_1.jpg -------------------------------------------------------------------------------- /data/synth_db/base_2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_db/base_2.jpg -------------------------------------------------------------------------------- /data/synth_db/base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_db/base_3.jpg -------------------------------------------------------------------------------- /data/synth_db/base_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_db/base_4.jpg -------------------------------------------------------------------------------- /data/synth_db/base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_db/base_5.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_copy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_copy.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_crop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_crop.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_flip.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_flip.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_ps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_ps.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_rot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_rot.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_copy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_copy.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_crop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_crop.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_flip.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_flip.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_ps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_ps.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_rot.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_rot.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_copy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_copy.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_crop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_crop.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_flip.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_flip.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_ps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_ps.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_rot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_rot.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_copy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_copy.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_crop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_crop.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_flip.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_flip.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_ps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_ps.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_rot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_rot.jpg -------------------------------------------------------------------------------- /data/synth_new/new_5_copy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_copy.jpg -------------------------------------------------------------------------------- /data/synth_new/new_5_crop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_crop.jpg 
-------------------------------------------------------------------------------- /data/synth_new/new_5_flip.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_flip.jpg -------------------------------------------------------------------------------- /data/synth_new/new_5_ps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_ps.jpg -------------------------------------------------------------------------------- /data/synth_new/new_5_rot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_rot.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_bright.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_bright.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_jpeg30.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_jpeg30.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_bright.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_bright.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_jpeg30.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_jpeg30.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_bright.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_bright.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_jpeg30.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_jpeg30.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_bright.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_bright.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_jpeg30.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_jpeg30.jpg -------------------------------------------------------------------------------- /data/synth_new/new_5_bright.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_bright.jpg -------------------------------------------------------------------------------- /data/synth_new/new_5_jpeg30.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_jpeg30.jpg -------------------------------------------------------------------------------- /data/synth_new/new_unique_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_unique_1.jpg -------------------------------------------------------------------------------- /data/synth_new/new_unique_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_unique_2.jpg -------------------------------------------------------------------------------- /data/synth_new/new_unique_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_unique_3.jpg -------------------------------------------------------------------------------- /data/synth_new/new_unique_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_unique_4.jpg -------------------------------------------------------------------------------- /data/synth_new/new_unique_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_unique_5.jpg -------------------------------------------------------------------------------- /reports/new_1_copy__VS__base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_1_copy__VS__base_1.jpg -------------------------------------------------------------------------------- /reports/new_1_crop__VS__base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_1_crop__VS__base_1.jpg -------------------------------------------------------------------------------- /reports/new_1_flip__VS__base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_1_flip__VS__base_1.jpg -------------------------------------------------------------------------------- /reports/new_1_ps__VS__base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_1_ps__VS__base_1.jpg -------------------------------------------------------------------------------- /reports/new_2_copy__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_copy__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_2_copy__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_copy__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_2_crop__VS__base_2.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_crop__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_2_flip__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_flip__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_2_flip__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_flip__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_2_ps__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_ps__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_3_copy__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_copy__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_3_copy__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_copy__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_3_crop__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_crop__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_3_flip__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_flip__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_3_flip__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_flip__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_3_ps__VS__base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_ps__VS__base_1.jpg -------------------------------------------------------------------------------- /reports/new_3_ps__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_ps__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_3_ps__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_ps__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_4_copy__VS__base_4.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_4_copy__VS__base_4.jpg -------------------------------------------------------------------------------- /reports/new_4_crop__VS__base_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_4_crop__VS__base_4.jpg -------------------------------------------------------------------------------- /reports/new_4_flip__VS__base_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_4_flip__VS__base_4.jpg -------------------------------------------------------------------------------- /reports/new_4_ps__VS__base_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_4_ps__VS__base_4.jpg -------------------------------------------------------------------------------- /reports/new_5_copy__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_copy__VS__base_5.jpg -------------------------------------------------------------------------------- /reports/new_5_crop__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_crop__VS__base_5.jpg -------------------------------------------------------------------------------- /reports/new_5_flip__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_flip__VS__base_5.jpg -------------------------------------------------------------------------------- /reports/new_5_ps__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_ps__VS__base_5.jpg -------------------------------------------------------------------------------- /reports/new_5_rot__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_rot__VS__base_5.jpg -------------------------------------------------------------------------------- /reports/new_1_bright__VS__base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_1_bright__VS__base_1.jpg -------------------------------------------------------------------------------- /reports/new_2_bright__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_bright__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_2_jpeg30__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_jpeg30__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_3_bright__VS__base_3.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_bright__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_3_jpeg30__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_jpeg30__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_4_bright__VS__base_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_4_bright__VS__base_4.jpg -------------------------------------------------------------------------------- /reports/new_4_jpeg30__VS__base_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_4_jpeg30__VS__base_4.jpg -------------------------------------------------------------------------------- /reports/new_5_bright__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_bright__VS__base_5.jpg -------------------------------------------------------------------------------- /reports/new_5_jpeg30__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_jpeg30__VS__base_5.jpg -------------------------------------------------------------------------------- /__pycache__/duplicate_check.cpython-313.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/__pycache__/duplicate_check.cpython-313.pyc -------------------------------------------------------------------------------- /tools/__pycache__/tune_thresholds.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/tools/__pycache__/tune_thresholds.cpython-312.pyc -------------------------------------------------------------------------------- /duplicate_check/__pycache__/indexer.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/duplicate_check/__pycache__/indexer.cpython-312.pyc -------------------------------------------------------------------------------- /duplicate_check/__pycache__/matcher.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/duplicate_check/__pycache__/matcher.cpython-312.pyc -------------------------------------------------------------------------------- /duplicate_check/__pycache__/report.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/duplicate_check/__pycache__/report.cpython-312.pyc -------------------------------------------------------------------------------- /duplicate_check/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/summerness/dupcheck/HEAD/duplicate_check/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /duplicate_check/__pycache__/features.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/duplicate_check/__pycache__/features.cpython-312.pyc -------------------------------------------------------------------------------- /duplicate_check/__init__.py: -------------------------------------------------------------------------------- 1 | """duplicate_check package init for the skeleton project.""" 2 | from . import features, indexer, matcher, report 3 | 4 | __all__ = ["features", "indexer", "matcher", "report"] 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python 2 | numpy 3 | Pillow 4 | imagehash 5 | faiss-cpu 6 | torch 7 | torchvision 8 | # Optional CLIP support (install from source if wheel unavailable) 9 | clip-anytorch 10 | tqdm 11 | matplotlib 12 | piexif 13 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | phash_bits: 64 2 | phash_thresh: 10 3 | tile_grid: 8 4 | tile_hamming_thresh: 6 5 | orb_max_features: 2000 6 | orb_inliers_thresh: 25 7 | orb_inlier_ratio: 0.25 8 | ncc_thresh: 0.92 9 | roi_margin_ratio: 0.12 10 | max_roi_matches: 60 11 | topk_recall: 50 12 | -------------------------------------------------------------------------------- /tests/test_matcher.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from duplicate_check import matcher 5 | 6 | 7 | @pytest.mark.parametrize("dtype", [np.uint8, np.float32], ids=["uint8", "float32"]) 8 | def test_count_good_matches_dtype_handling(dtype): 9 | cv2 = pytest.importorskip("cv2") 10 | rng = np.random.default_rng(42) 11 | desc1 = (rng.random((32, 64)) * (255 if dtype == np.uint8 else 1)).astype(dtype) 12 | desc2 = desc1.copy().astype(dtype) 13 | result = matcher._count_good_matches(desc1, desc2) 14 | assert isinstance(result, int) 15 | assert result >= 0 16 | 17 | 18 | def test_count_good_matches_mixed_dtype(): 19 | pytest.importorskip("cv2") 20 | rng = np.random.default_rng(7) 21 | desc1 = (rng.random((16, 32)) * 255).astype(np.uint8) 22 | desc2 = desc1.astype(np.float32) / 255.0 23 | result = matcher._count_good_matches(desc1, desc2) 24 | assert isinstance(result, int) 25 | assert result >= 0 26 | -------------------------------------------------------------------------------- /reports/tune_out/tune_results.csv: -------------------------------------------------------------------------------- 1 | phash,orb,ncc,tp,fp,fn 2 | 6,10,0.85,5,0,30 3 | 6,10,0.9,5,0,30 4 | 6,10,0.92,5,0,30 5 | 6,10,0.95,5,0,30 6 | 6,25,0.85,5,0,30 7 | 6,25,0.9,5,0,30 8 | 6,25,0.92,5,0,30 9 | 6,25,0.95,5,0,30 10 | 6,50,0.85,5,0,30 11 | 6,50,0.9,5,0,30 12 | 6,50,0.92,5,0,30 13 | 6,50,0.95,5,0,30 14 | 8,10,0.85,5,0,30 15 | 8,10,0.9,5,0,30 16 | 8,10,0.92,5,0,30 17 | 8,10,0.95,5,0,30 18 | 8,25,0.85,5,0,30 19 | 8,25,0.9,5,0,30 20 | 8,25,0.92,5,0,30 21 | 8,25,0.95,5,0,30 22 | 8,50,0.85,5,0,30 23 | 8,50,0.9,5,0,30 24 | 8,50,0.92,5,0,30 25 | 8,50,0.95,5,0,30 26 | 10,10,0.85,8,0,27 27 | 10,10,0.9,8,0,27 28 | 
10,10,0.92,8,0,27 29 | 10,10,0.95,8,0,27 30 | 10,25,0.85,8,0,27 31 | 10,25,0.9,8,0,27 32 | 10,25,0.92,8,0,27 33 | 10,25,0.95,8,0,27 34 | 10,50,0.85,8,0,27 35 | 10,50,0.9,8,0,27 36 | 10,50,0.92,8,0,27 37 | 10,50,0.95,8,0,27 38 | 12,10,0.85,8,0,27 39 | 12,10,0.9,8,0,27 40 | 12,10,0.92,8,0,27 41 | 12,10,0.95,8,0,27 42 | 12,25,0.85,8,0,27 43 | 12,25,0.9,8,0,27 44 | 12,25,0.92,8,0,27 45 | 12,25,0.95,8,0,27 46 | 12,50,0.85,8,0,27 47 | 12,50,0.9,8,0,27 48 | 12,50,0.92,8,0,27 49 | 12,50,0.95,8,0,27 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 DupCheck contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /duplicate_check.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Entrypoint for the duplicate image checking skeleton. 3 | 4 | This script wires the components together and provides a simple CLI. 5 | """ 6 | import argparse 7 | from pathlib import Path 8 | 9 | # ...existing code... 
10 | def parse_args(): 11 | p = argparse.ArgumentParser(description="Duplicate image check skeleton") 12 | p.add_argument("--db_dir", required=True, help="Path to image database directory") 13 | p.add_argument("--input_dir", required=True, help="Path to new images to check") 14 | p.add_argument("--out_dir", required=True, help="Output reports directory") 15 | p.add_argument("--topk", type=int, default=50) 16 | return p.parse_args() 17 | 18 | 19 | def main(): 20 | args = parse_args() 21 | db_dir = Path(args.db_dir) 22 | input_dir = Path(args.input_dir) 23 | out_dir = Path(args.out_dir) 24 | out_dir.mkdir(parents=True, exist_ok=True) 25 | 26 | # Lazy imports to keep CLI responsive if modules missing 27 | from duplicate_check import indexer, features, matcher, report 28 | 29 | print(f"Indexing DB: {db_dir}") 30 | idx = indexer.build_index(db_dir) 31 | 32 | print(f"Processing inputs from: {input_dir}") 33 | results = [] 34 | for img_path in sorted(input_dir.iterdir()): 35 | if not img_path.is_file(): 36 | continue 37 | print(f"Checking {img_path.name}...") 38 | feats = features.compute_features(img_path) 39 | cand = matcher.recall_candidates(feats, idx, topk=args.topk) 40 | detailed = matcher.rerank_and_verify(img_path, cand, idx) 41 | results.extend(detailed) 42 | 43 | csv_path = out_dir / "dup_report.csv" 44 | report.write_csv(results, csv_path) 45 | print(f"Done. Report: {csv_path}") 46 | 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /data/synth_labels.csv: -------------------------------------------------------------------------------- 1 | new_image,matched_image,label 2 | new_1_copy.jpg,base_1.jpg,partial_duplicate 3 | new_1_crop.jpg,base_1.jpg,partial_duplicate 4 | new_1_rot.jpg,base_1.jpg,partial_duplicate 5 | new_1_bright.jpg,base_1.jpg,partial_duplicate 6 | new_1_jpeg30.jpg,base_1.jpg,partial_duplicate 7 | new_1_ps.jpg,base_1.jpg,partial_duplicate 8 | new_1_flip.jpg,base_1.jpg,partial_duplicate 9 | new_2_copy.jpg,base_2.jpg,partial_duplicate 10 | new_2_crop.jpg,base_2.jpg,partial_duplicate 11 | new_2_rot.jpg,base_2.jpg,partial_duplicate 12 | new_2_bright.jpg,base_2.jpg,partial_duplicate 13 | new_2_jpeg30.jpg,base_2.jpg,partial_duplicate 14 | new_2_ps.jpg,base_2.jpg,partial_duplicate 15 | new_2_flip.jpg,base_2.jpg,partial_duplicate 16 | new_3_copy.jpg,base_3.jpg,partial_duplicate 17 | new_3_crop.jpg,base_3.jpg,partial_duplicate 18 | new_3_rot.jpg,base_3.jpg,partial_duplicate 19 | new_3_bright.jpg,base_3.jpg,partial_duplicate 20 | new_3_jpeg30.jpg,base_3.jpg,partial_duplicate 21 | new_3_ps.jpg,base_3.jpg,partial_duplicate 22 | new_3_flip.jpg,base_3.jpg,partial_duplicate 23 | new_4_copy.jpg,base_4.jpg,partial_duplicate 24 | new_4_crop.jpg,base_4.jpg,partial_duplicate 25 | new_4_rot.jpg,base_4.jpg,partial_duplicate 26 | new_4_bright.jpg,base_4.jpg,partial_duplicate 27 | new_4_jpeg30.jpg,base_4.jpg,partial_duplicate 28 | new_4_ps.jpg,base_4.jpg,partial_duplicate 29 | new_4_flip.jpg,base_4.jpg,partial_duplicate 30 | new_5_copy.jpg,base_5.jpg,partial_duplicate 31 | new_5_crop.jpg,base_5.jpg,partial_duplicate 32 | new_5_rot.jpg,base_5.jpg,partial_duplicate 33 | new_5_bright.jpg,base_5.jpg,partial_duplicate 34 | new_5_jpeg30.jpg,base_5.jpg,partial_duplicate 35 | new_5_ps.jpg,base_5.jpg,partial_duplicate 36 | new_5_flip.jpg,base_5.jpg,partial_duplicate 37 | new_unique_1.jpg,,unique 38 | new_unique_2.jpg,,unique 39 | new_unique_3.jpg,,unique 40 | new_unique_4.jpg,,unique 41 | new_unique_5.jpg,,unique 42 | 
-------------------------------------------------------------------------------- /tests/test_features.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from pathlib import Path 4 | from PIL import Image 5 | 6 | from duplicate_check import features 7 | 8 | 9 | @pytest.fixture() 10 | def sample_image(tmp_path: Path) -> Path: 11 | path = tmp_path / "sample.png" 12 | img = Image.new("RGB", (96, 80), color=(128, 128, 128)) 13 | for x in range(96): 14 | for y in range(80): 15 | img.putpixel((x, y), (x % 256, y % 256, (x + y) % 256)) 16 | img.save(path) 17 | return path 18 | 19 | 20 | def test_compute_phash_variants_multiscale(sample_image: Path): 21 | variants = features.compute_phash_variants(sample_image) 22 | unique = {v for v in variants if v} 23 | assert len(variants) >= len(features.MULTISCALE_LEVELS), "expect multi-scale hashes" 24 | assert len(unique) >= len(features.MULTISCALE_LEVELS), "hashes should cover multiple scales/orientations" 25 | 26 | 27 | def test_compute_tile_hashes_structure(sample_image: Path): 28 | tiles = features.compute_tile_hashes(sample_image, grid=4) 29 | assert tiles, "tiles should not be empty" 30 | scales = {tile.get("scale") for tile in tiles} 31 | assert features.MULTISCALE_LEVELS[0] in scales 32 | w, h = Image.open(sample_image).size 33 | for tile in tiles: 34 | bbox = tile.get("bbox") 35 | assert isinstance(bbox, tuple) and len(bbox) == 4 36 | x0, y0, x1, y1 = bbox 37 | assert 0 <= x0 <= x1 <= w 38 | assert 0 <= y0 <= y1 <= h 39 | 40 | 41 | def test_compute_embedding_returns_vector(sample_image: Path): 42 | emb = features.compute_embedding(sample_image) 43 | assert emb is not None 44 | arr = np.asarray(emb) 45 | assert arr.ndim == 1 and arr.size > 0 46 | 47 | 48 | def test_compute_features_attaches_tiles(sample_image: Path): 49 | feats = features.compute_features(sample_image) 50 | assert feats.tiles is not None and len(feats.tiles) > 0 51 | assert isinstance(feats.tiles[0], dict) 52 | -------------------------------------------------------------------------------- /run_smoke.py: -------------------------------------------------------------------------------- 1 | """Run a simple smoke test of the duplicate check pipeline without pytest. 2 | 3 | Creates temporary directories with tiny JPEG fixtures and runs the main flow. 
4 | """ 5 | import base64 6 | import tempfile 7 | from pathlib import Path 8 | 9 | from duplicate_check import indexer, features, matcher, report 10 | 11 | 12 | _TINY_JPEG_B64 = ( 13 | "/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAkGBxISEBUQEBAVFRUVFRUVFRUVFRUVFRUXFhUVFRUYHSggGBolGxUVITEhJSkrLi4uFx8zODMsNygtLisBCgoKDg0OGhAQGy0lICYtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLf/AABEIAJ8BPgMBIgACEQEDEQH/xAAbAAABBQEBAAAAAAAAAAAAAAAAAQIEBQYDB//EADwQAAEDAgQDBgMHAwMFAAAAAAEAAgMEEQUSITEGE0FRMmFxgZGh8COhsUIjUmKyweHxFSNDU5LxJENT/8QAGQEAAwEBAQAAAAAAAAAAAAAAAAECAwQF/8QAJhEBAAICAgIBAwUAAAAAAAAAAAECAxESIQQxQVEiUYGh8GH/2gAMAwEAAhEDEQA/AO4gAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD//Z" 14 | ) 15 | 16 | 17 | def _write_tiny_jpeg(path: Path) -> None: 18 | """Write a minimal 1x1 JPEG so PIL/OpenCV can read it.""" 19 | path.write_bytes(base64.b64decode(_TINY_JPEG_B64)) 20 | 21 | 22 | def run(): 23 | with tempfile.TemporaryDirectory() as db_dir, tempfile.TemporaryDirectory() as in_dir, tempfile.TemporaryDirectory() as out_dir: 24 | dbp = Path(db_dir) 25 | inp = Path(in_dir) 26 | outp = Path(out_dir) 27 | # create tiny but valid JPEG fixtures 28 | _write_tiny_jpeg(dbp / "db_1.jpg") 29 | _write_tiny_jpeg(inp / "new_1.jpg") 30 | 31 | print("Building index...") 32 | idx = indexer.build_index(dbp) 33 | print("Computing features for input...") 34 | feats = features.compute_features(inp / "new_1.jpg") 35 | print("Recalling candidates...") 36 | cands = matcher.recall_candidates(feats, idx) 37 | print("Reranking/verifying...") 38 | rows = matcher.rerank_and_verify(inp / "new_1.jpg", cands, idx) 39 | csvp = outp / "dup_report.csv" 40 | report.write_csv(rows, csvp) 41 | print(f"Smoke run complete. Report: {csvp}") 42 | 43 | 44 | if __name__ == "__main__": 45 | run() 46 | -------------------------------------------------------------------------------- /dupcheck_cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Simple CLI for running duplicate detection pipeline. 
3 | 4 | Usage example: 5 | python dupcheck_cli.py --db_dir ./images_db --input_dir ./images_new --out_dir ./reports 6 | """ 7 | import argparse 8 | from pathlib import Path 9 | 10 | 11 | def parse_args(): 12 | p = argparse.ArgumentParser() 13 | p.add_argument("--db_dir", required=True) 14 | p.add_argument("--input_dir", required=True) 15 | p.add_argument("--out_dir", required=True) 16 | p.add_argument("--topk", type=int, default=50) 17 | p.add_argument("--index_db", default="./index.db", help="Path to sqlite index DB") 18 | p.add_argument("--rebuild_index", action="store_true", help="Rebuild sqlite index from db_dir") 19 | p.add_argument("--phash_thresh", type=int, default=10) 20 | p.add_argument("--orb_inliers_thresh", type=int, default=25) 21 | p.add_argument("--ncc_thresh", type=float, default=0.92) 22 | p.add_argument("--vector_score_thresh", type=float, default=0.0, help="Minimum FAISS similarity to accept a vector candidate") 23 | return p.parse_args() 24 | 25 | 26 | def main(): 27 | args = parse_args() 28 | db_dir = Path(args.db_dir) 29 | input_dir = Path(args.input_dir) 30 | out_dir = Path(args.out_dir) 31 | out_dir.mkdir(parents=True, exist_ok=True) 32 | 33 | from duplicate_check import indexer, features, matcher, report 34 | # import modules from package 35 | # 从包中导入模块 36 | 37 | # use sqlite-backed index if available 38 | # 优先使用 SQLite 索引以支持持久化和增量更新 39 | idx = None 40 | db_path = Path(args.index_db) 41 | if db_path.exists() and not args.rebuild_index: 42 | print(f"Loading index from {db_path}...") 43 | try: 44 | idx = indexer.load_index_from_db(db_path) 45 | except Exception: 46 | idx = None 47 | 48 | if idx is None: 49 | if args.rebuild_index or not db_path.exists(): 50 | print("Building sqlite index...") 51 | indexer.build_index_db(db_dir, db_path) 52 | else: 53 | print("Building in-memory index...") 54 | idx = indexer.load_index_from_db(db_path) if db_path.exists() else indexer.build_index(db_dir) 55 | 56 | results = [] 57 | for p in sorted(input_dir.iterdir()): 58 | if not p.is_file(): 59 | continue 60 | print(f"Checking {p.name}...") 61 | feats = features.compute_features(p) 62 | cands = matcher.recall_candidates( 63 | feats, 64 | idx, 65 | topk=args.topk, 66 | phash_thresh=args.phash_thresh, 67 | vector_score_thresh=args.vector_score_thresh, 68 | ) 69 | rows = matcher.rerank_and_verify(p, cands, idx, orb_inliers_thresh=args.orb_inliers_thresh, ncc_thresh=args.ncc_thresh) 70 | # generate evidence images for rows 71 | for r in rows: 72 | if r.get("matched_image"): 73 | dbp = Path(idx["by_id"][r["matched_image"]]["path"]) 74 | evid = out_dir / f"{p.stem}__VS__{dbp.stem}.jpg" 75 | report.make_evidence_image(p, dbp, evid, draw_matches=True, matches=r.get("match_pairs")) 76 | r["evidence_img_path"] = str(evid) 77 | results.extend(rows) 78 | 79 | csvp = out_dir / "dup_report.csv" 80 | report.write_csv(results, csvp) 81 | print(f"Done. Report: {csvp}") 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /tools/generate_synthetic.py: -------------------------------------------------------------------------------- 1 | """Generate synthetic dataset for duplicate detection experiments. 2 | 3 | Creates two folders under project `data/synth_db` and `data/synth_new` and a 4 | labels CSV `data/synth_labels.csv` listing ground-truth matches. 5 | 6 | Usage: 7 | python tools/generate_synthetic.py --out_dir ./data --count 5 8 | 9 | This reproduces the same patterns used in the interactive session. 
10 | """ 11 | import argparse 12 | from pathlib import Path 13 | from PIL import Image, ImageDraw, ImageEnhance 14 | import random 15 | import csv 16 | 17 | 18 | def generate(out_dir: Path, count: int = 5): 19 | db = out_dir / 'synth_db' 20 | new = out_dir / 'synth_new' 21 | db.mkdir(parents=True, exist_ok=True) 22 | new.mkdir(parents=True, exist_ok=True) 23 | 24 | labels = [] 25 | for i in range(1, count+1): 26 | img = Image.new('RGB',(400,300),(200+i*5,180+i*3,160+i*2)) 27 | draw = ImageDraw.Draw(img) 28 | for x in range(50,350,6): 29 | for y in range(60,240,6): 30 | if (x*y+i) % 13 < 4: 31 | draw.point((x,y),(0,0,0)) 32 | base = db / f'base_{i}.jpg' 33 | img.save(base) 34 | 35 | # exact copy 36 | img.save(new / f'new_{i}_copy.jpg') 37 | labels.append((f'new_{i}_copy.jpg', base.name)) 38 | 39 | # cropped 40 | crop = img.crop((80,70,320,230)) 41 | crop.save(new / f'new_{i}_crop.jpg') 42 | labels.append((f'new_{i}_crop.jpg', base.name)) 43 | 44 | # rotated 45 | rot = img.rotate(15, expand=True, fillcolor=(200,200,200)) 46 | rot.save(new / f'new_{i}_rot.jpg') 47 | labels.append((f'new_{i}_rot.jpg', base.name)) 48 | 49 | # brightness 50 | bright = ImageEnhance.Brightness(img).enhance(1.3) 51 | bright.save(new / f'new_{i}_bright.jpg') 52 | labels.append((f'new_{i}_bright.jpg', base.name)) 53 | 54 | # compressed 55 | img.save(new / f'new_{i}_jpeg30.jpg', quality=30) 56 | labels.append((f'new_{i}_jpeg30.jpg', base.name)) 57 | 58 | # ps overlay (draw rectangle) 59 | ps = img.copy() 60 | d = ImageDraw.Draw(ps) 61 | d.rectangle((120,90,220,160), fill=(255,255,255)) 62 | ps.save(new / f'new_{i}_ps.jpg') 63 | labels.append((f'new_{i}_ps.jpg', base.name)) 64 | 65 | # flipped 66 | flip = img.transpose(Image.FLIP_LEFT_RIGHT) 67 | flip.save(new / f'new_{i}_flip.jpg') 68 | labels.append((f'new_{i}_flip.jpg', base.name)) 69 | 70 | # add some unique images 71 | for j in range(1, count+1): 72 | u = Image.new('RGB',(300,200),(random.randint(0,255),random.randint(0,255),random.randint(0,255))) 73 | u.save(new / f'new_unique_{j}.jpg') 74 | labels.append((f'new_unique_{j}.jpg','')) 75 | 76 | # write labels.csv 77 | labp = out_dir / 'synth_labels.csv' 78 | with open(labp, 'w', newline='', encoding='utf-8') as f: 79 | w = csv.writer(f) 80 | w.writerow(['new_image','matched_image','label']) 81 | for newn, dbn in labels: 82 | lab = 'unique' if dbn=='' else 'partial_duplicate' 83 | w.writerow([newn, dbn, lab]) 84 | 85 | print('Synthetic dataset created:') 86 | print(' DB:', db) 87 | print(' NEW:', new) 88 | print(' Labels:', labp) 89 | 90 | 91 | if __name__ == '__main__': 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument('--out_dir', default='./data') 94 | parser.add_argument('--count', type=int, default=5) 95 | args = parser.parse_args() 96 | generate(Path(args.out_dir), count=args.count) 97 | -------------------------------------------------------------------------------- /README_zh.md: -------------------------------------------------------------------------------- 1 | # DupCheck — 图片重复与伪造检测 2 | 3 | ## 项目简介 4 | DupCheck 面向广义的“图库去重 / 篡改检测”需求:不仅适用于理赔审核,也可服务内容审核、电商验真、版权保护等场景;项目最初用于防止第三方维修工重复上传维修照片骗取维修资金,原是某项目的子模块,后来被我独立化、优化并扩展为通用工具。系统会把新上传图片与历史图库逐一比对,识别完全重复、局部重复以及轻度改动的图像,并输出可供人工复核的证据。 5 | 6 | 项目依赖常见的 Python 图像 / 深度学习库,便于集成到各类上传管线或后台审核系统。 7 | 8 | ## 检测流程 9 | 1. **构建索引**:对图库图片计算多姿态 pHash(原图、旋转、翻转)、块哈希、缓存 ORB 特征,并可生成 ResNet-18 嵌入,确保几何和粗语义变换仍可召回。 10 | 2. **召回候选**:新上传图片通过 pHash/块哈希匹配,并可结合基于 ResNet-18 的 FAISS 向量检索;如有需要再进行多姿态 ORB 比对,将旋转、翻转的嫌疑图拉入候选集。 11 | 3. 
**精排验证**：对最佳姿态组合执行 ORB + RANSAC，若单应关系可靠，则在对应区域做 NCC，判断是否为 `exact_patch`。 12 | 4. **结果输出**：检测结论写入 `dup_report.csv`，命令行可生成对照证据图，辅助人工审核。 13 | 5. **阈值调优**：可选运行 `tools/tune_thresholds.py` 做网格搜索，为不同业务场景选取合适的 pHash/ORB/NCC 阈值组合。 14 | 15 | > **扩展建议**：若图库规模巨大或需集群部署，可在 `duplicate_check/indexer.py` / `load_index_from_db` 中替换内置 FAISS 索引，改写为向 Milvus、Qdrant、Pinecone 等外部向量数据库写入，再在 `matcher.recall_candidates` 中改为查询该服务。 16 | > **性能提示**：可调整 `DUPC_TILE_SCALES`（如 `1.0,0.6`）与 `DUPC_TILE_GRID`，在多尺度鲁棒性与运行速度之间取得平衡。 17 | 18 | ## 目录结构 19 | - `duplicate_check/` —— 核心库模块（`features`、`indexer`、`matcher`、`report`）。 20 | - `dupcheck_cli.py` —— 主命令行工具，支持内存索引或 SQLite 索引。 21 | - `duplicate_check.py` —— 兼容性入口脚本。 22 | - `tools/` —— 合成数据生成、阈值调参等辅助脚本。 23 | - `tests/` —— 测试文件夹。 24 | - `data/` —— 文档示例使用的合成数据集。 25 | 26 | ## 环境依赖 27 | 建议在 Python 3.9 及以上版本下创建虚拟环境，并安装 `requirements.txt` 中的依赖。OpenCV、Pillow、imagehash、`torch`、`torchvision` 与可选的 `faiss-cpu` 能启用全部功能，缺失时流程会自动降级。 28 | 29 | ```bash 30 | python -m venv .venv 31 | source .venv/bin/activate 32 | pip install -r requirements.txt 33 | ``` 34 | 35 | ## 快速体验 36 | 1. 生成示例数据集： 37 | ```bash 38 | python tools/generate_synthetic.py --out_dir data --count 5 39 | ``` 40 | 2. 重建 SQLite 索引并执行检测： 41 | ```bash 42 | python dupcheck_cli.py \ 43 | --db_dir data/synth_db \ 44 | --input_dir data/synth_new \ 45 | --out_dir reports \ 46 | --index_db ./index.db \ 47 | --rebuild_index \ 48 | --vector_score_thresh 0.3 49 | ``` 50 | 3. 查看 `reports/dup_report.csv` 以及生成的证据图片。 51 | 4. （可选）对合成标注集进行评估，查看召回差异： 52 | ```bash 53 | python tools/verify_synthetic.py \ 54 | --db_dir data/synth_db \ 55 | --input_dir data/synth_new \ 56 | --labels data/synth_labels.csv \ 57 | --phash_thresh 16 \ 58 | --orb_inliers_thresh 6 \ 59 | --ncc_thresh 0.85 60 | ``` 61 | 5. （可选）执行阈值网格搜索，找到更优配置： 62 | ```bash 63 | python tools/tune_thresholds.py \ 64 | --labels data/synth_labels.csv \ 65 | --db_dir data/synth_db \ 66 | --input_dir data/synth_new \ 67 | --out_dir reports/tune_out 68 | ``` 69 | 70 | 若要复用已有索引，可省略 `--rebuild_index`。通过调整 `--phash_thresh`、`--orb_inliers_thresh`、`--ncc_thresh` 等参数探索查准率和召回率的平衡。 71 | 72 | ## 常用命令 73 | ```bash 74 | # 重建索引 75 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db --rebuild_index 76 | 77 | # 自定义阈值运行 78 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --phash_thresh 12 --orb_inliers_thresh 30 --ncc_thresh 0.94 79 | 80 | # 直接使用缓存索引 81 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db 82 | ``` 83 | 84 | ## 阈值调参 85 | 使用 `tools/tune_thresholds.py` 对多个阈值组合做网格搜索： 86 | 87 | ```bash 88 | python tools/tune_thresholds.py \ 89 | --labels data/synth_labels.csv \ 90 | --db_dir data/synth_db \ 91 | --input_dir data/synth_new \ 92 | --out_dir reports/tune_out 93 | ``` 94 | 95 | 脚本会输出 `tune_results.csv`，其中包含每组参数的 TP/FP/FN 统计，可据此锁定最适合的数据集配置。 96 | 97 | ## 许可协议 98 | 99 | 本项目以 [MIT License](LICENSE) 开源发布。 100 | -------------------------------------------------------------------------------- /duplicate_check/report.py: -------------------------------------------------------------------------------- 1 | """Reporting utilities: CSV output and evidence image generation (stub).
2 | 3 | 报告模块:生成 CSV 报表并创建证据图(并排显示、可绘制匹配连线)。 4 | """ 5 | import csv 6 | from pathlib import Path 7 | from typing import List, Dict 8 | from shutil import copyfile 9 | 10 | try: 11 | import cv2 12 | except Exception: 13 | cv2 = None 14 | 15 | 16 | CSV_FIELDS = [ 17 | "new_image", 18 | "matched_image", 19 | "final_label", 20 | "score", 21 | "inliers", 22 | "inlier_ratio", 23 | "ncc_peak", 24 | "evidence_img_path", 25 | ] 26 | 27 | 28 | def write_csv(rows: List[Dict], out_path: Path): 29 | with out_path.open("w", newline="", encoding="utf-8") as f: 30 | writer = csv.DictWriter(f, fieldnames=CSV_FIELDS) 31 | writer.writeheader() 32 | for r in rows: 33 | writer.writerow({k: r.get(k, "") for k in CSV_FIELDS}) 34 | 35 | 36 | def make_evidence_image(new_img_path: Path, db_img_path: Path, out_path: Path, draw_matches: bool = False, matches=None): 37 | """Create a side-by-side evidence image. If cv2 and matches provided, draw matches.""" 38 | if cv2 is None: 39 | # fallback: copy new image 40 | # 若未安装 OpenCV,则回退为直接复制新图作为证据图 41 | try: 42 | copyfile(str(new_img_path), str(out_path)) 43 | except Exception: 44 | pass 45 | return 46 | 47 | na = cv2.imread(str(new_img_path)) 48 | db = cv2.imread(str(db_img_path)) 49 | if na is None or db is None: 50 | try: 51 | copyfile(str(new_img_path), str(out_path)) 52 | except Exception: 53 | pass 54 | return 55 | 56 | # Resize to same height 57 | h = max(na.shape[0], db.shape[0]) 58 | def resize_keep(asrc, height): 59 | h0, w0 = asrc.shape[:2] 60 | scale = height / h0 61 | return cv2.resize(asrc, (int(w0 * scale), height)) 62 | 63 | na_r = resize_keep(na, h) 64 | db_r = resize_keep(db, h) 65 | 66 | if draw_matches and matches: 67 | # matches: list of ((xq,yq),(xd,yd)) pairs 68 | # build a canvas that is na_r + db_r side-by-side and draw lines 69 | concat = cv2.hconcat([na_r, db_r]) 70 | wq = na_r.shape[1] 71 | # compute scale factors from original images to resized ones 72 | hq_orig = na.shape[0] 73 | wq_orig = na.shape[1] 74 | hd_orig = db.shape[0] 75 | wd_orig = db.shape[1] 76 | h_res = h 77 | na_scale_x = na_r.shape[1] / max(1, wq_orig) 78 | na_scale_y = na_r.shape[0] / max(1, hq_orig) 79 | db_scale_x = db_r.shape[1] / max(1, wd_orig) 80 | db_scale_y = db_r.shape[0] / max(1, hd_orig) 81 | for (xq, yq), (xd, yd) in matches: 82 | pt1 = (int(xq * na_scale_x), int(yq * na_scale_y)) 83 | pt2 = (int(wq + xd * db_scale_x), int(yd * db_scale_y)) 84 | cv2.line(concat, pt1, pt2, (0, 255, 0), 1) 85 | cv2.circle(concat, pt1, 3, (0, 0, 255), -1) 86 | cv2.circle(concat, pt2, 3, (0, 0, 255), -1) 87 | try: 88 | cv2.imwrite(str(out_path), concat) 89 | except Exception: 90 | try: 91 | copyfile(str(new_img_path), str(out_path)) 92 | except Exception: 93 | pass 94 | return 95 | 96 | 97 | concat = cv2.hconcat([na_r, db_r]) 98 | try: 99 | cv2.imwrite(str(out_path), concat) 100 | except Exception: 101 | try: 102 | copyfile(str(new_img_path), str(out_path)) 103 | except Exception: 104 | pass 105 | -------------------------------------------------------------------------------- /tools/tune_thresholds.py: -------------------------------------------------------------------------------- 1 | """Threshold tuning helper. 2 | 3 | Usage: 4 | python tools/tune_thresholds.py --labels labels.csv --db_dir ./images_db --input_dir ./images_new --out_dir ./reports 5 | 6 | labels.csv should contain columns: new_image, matched_image, label (unique/partial_duplicate/exact_patch) 7 | 8 | This script sweeps phash_thresh, orb_inliers_thresh, ncc_thresh and reports simple match rate vs ground truth. 
NCC now operates on a warped ROI; use --roi_margin_ratio / --max_roi_matches to keep tuning aligned. 9 | 10 | 阈值调优脚本。 11 | 12 | 用法: 13 | python tools/tune_thresholds.py --labels labels.csv --db_dir ./images_db --input_dir ./images_new --out_dir ./reports 14 | 15 | labels.csv 应包含列:new_image, matched_image, label(unique/partial_duplicate/exact_patch) 16 | 17 | 本脚本对 phash_thresh、orb_inliers_thresh、ncc_thresh 做网格搜索,并报告与标注的 TP/FP/FN 统计。NCC 已改为基于单应 ROI 的对齐互相关,可通过 --roi_margin_ratio / --max_roi_matches 调整 ROI 设定。 18 | """ 19 | import sys 20 | import argparse 21 | import csv 22 | from pathlib import Path 23 | 24 | # Ensure repo root is on sys.path so `duplicate_check` package is importable 25 | _ROOT = Path(__file__).resolve().parents[1] 26 | if str(_ROOT) not in sys.path: 27 | sys.path.insert(0, str(_ROOT)) 28 | 29 | from duplicate_check import indexer, features, matcher 30 | 31 | 32 | def parse_args(): 33 | p = argparse.ArgumentParser() 34 | p.add_argument("--labels", required=True) 35 | p.add_argument("--db_dir", required=True) 36 | p.add_argument("--input_dir", required=True) 37 | p.add_argument("--out_dir", required=True) 38 | p.add_argument("--roi_margin_ratio", type=float, default=0.12) 39 | p.add_argument("--max_roi_matches", type=int, default=60) 40 | return p.parse_args() 41 | 42 | 43 | def load_labels(path): 44 | rows = {} 45 | with open(path, newline='', encoding='utf-8') as f: 46 | r = csv.DictReader(f) 47 | for row in r: 48 | rows[row['new_image']] = row 49 | return rows 50 | 51 | 52 | def main(): 53 | args = parse_args() 54 | labels = load_labels(args.labels) 55 | db_dir = Path(args.db_dir) 56 | input_dir = Path(args.input_dir) 57 | out_dir = Path(args.out_dir); out_dir.mkdir(parents=True, exist_ok=True) 58 | 59 | idx = indexer.build_index(db_dir) 60 | 61 | # simple sweep 62 | phash_range = [6,8,10,12] 63 | orb_range = [10,25,50] 64 | ncc_range = [0.85,0.9,0.92,0.95] 65 | 66 | results = [] 67 | for ph in phash_range: 68 | for orb_th in orb_range: 69 | for ncc in ncc_range: 70 | tp=0; fp=0; fn=0 71 | for p in input_dir.iterdir(): 72 | if not p.is_file(): 73 | continue 74 | feats = features.compute_features(p) 75 | cands = matcher.recall_candidates(feats, idx, phash_thresh=ph) 76 | rows = matcher.rerank_and_verify( 77 | p, 78 | cands, 79 | idx, 80 | orb_inliers_thresh=orb_th, 81 | ncc_thresh=ncc, 82 | roi_margin_ratio=args.roi_margin_ratio, 83 | max_roi_matches=args.max_roi_matches, 84 | ) 85 | predicted = rows[0]['matched_image'] if rows else None 86 | gt = labels.get(p.name, {}).get('matched_image') 87 | if gt and predicted == gt: 88 | tp+=1 89 | elif gt and predicted != gt: 90 | fn+=1 91 | elif not gt and predicted: 92 | fp+=1 93 | results.append((ph,orb_th,ncc,tp,fp,fn)) 94 | # write out 95 | outp = out_dir / 'tune_results.csv' 96 | with open(outp, 'w', newline='', encoding='utf-8') as f: 97 | w=csv.writer(f) 98 | w.writerow(['phash','orb','ncc','tp','fp','fn']) 99 | for r in results: 100 | w.writerow(r) 101 | print('Done. 
Results:', outp) 102 | 103 | if __name__=='__main__': 104 | main() 105 | -------------------------------------------------------------------------------- /reports/dup_report.csv: -------------------------------------------------------------------------------- 1 | new_image,matched_image,final_label,score,inliers,inlier_ratio,ncc_peak,evidence_img_path 2 | new_1_bright.jpg,base_1.jpg,partial_duplicate,15.648811834888072,166,0.35319148936170214,0.0,reports/new_1_bright__VS__base_1.jpg 3 | new_1_copy.jpg,base_1.jpg,partial_duplicate,18.364583452542625,1411,1.0,0.0038532966282218695,reports/new_1_copy__VS__base_1.jpg 4 | new_1_crop.jpg,base_1.jpg,partial_duplicate,1.157927119731903,90,0.4090909090909091,0.009402623400092125,reports/new_1_crop__VS__base_1.jpg 5 | new_1_flip.jpg,base_1.jpg,partial_duplicate,17.548451742953002,1405,0.9992887624466572,0.0,reports/new_1_flip__VS__base_1.jpg 6 | new_1_ps.jpg,base_1.jpg,partial_duplicate,17.587277013366506,990,0.9482758620689655,-0.0046783494763076305,reports/new_1_ps__VS__base_1.jpg 7 | new_2_bright.jpg,base_2.jpg,partial_duplicate,14.278943573362971,162,0.3894230769230769,0.0,reports/new_2_bright__VS__base_2.jpg 8 | new_2_copy.jpg,base_2.jpg,partial_duplicate,16.791666785875954,1414,1.0,0.0,reports/new_2_copy__VS__base_2.jpg 9 | new_2_copy.jpg,base_3.jpg,partial_duplicate,13.86416643242519,27,0.29347826086956524,0.0,reports/new_2_copy__VS__base_3.jpg 10 | new_2_crop.jpg,base_2.jpg,partial_duplicate,1.1774003977885679,75,0.39473684210526316,-0.002573883393779397,reports/new_2_crop__VS__base_2.jpg 11 | new_2_flip.jpg,base_2.jpg,partial_duplicate,16.010963896910347,1414,1.0,0.0,reports/new_2_flip__VS__base_2.jpg 12 | new_2_flip.jpg,base_3.jpg,partial_duplicate,13.815720035438547,27,0.29347826086956524,0.0,reports/new_2_flip__VS__base_3.jpg 13 | new_2_jpeg30.jpg,base_2.jpg,partial_duplicate,13.72376012705414,154,0.4425287356321839,0.0,reports/new_2_jpeg30__VS__base_2.jpg 14 | new_2_ps.jpg,base_2.jpg,partial_duplicate,15.997115687025545,1006,0.9599236641221374,0.0,reports/new_2_ps__VS__base_2.jpg 15 | new_3_bright.jpg,base_3.jpg,partial_duplicate,13.73724901047934,161,0.4086294416243655,0.0020003009121865034,reports/new_3_bright__VS__base_3.jpg 16 | new_3_copy.jpg,base_3.jpg,partial_duplicate,16.274227647299178,1414,1.0,0.0020335863810032606,reports/new_3_copy__VS__base_3.jpg 17 | new_3_copy.jpg,base_2.jpg,partial_duplicate,13.877052221405371,31,0.27927927927927926,0.0,reports/new_3_copy__VS__base_2.jpg 18 | new_3_crop.jpg,base_3.jpg,partial_duplicate,1.186078881467139,85,0.43147208121827413,0.0,reports/new_3_crop__VS__base_3.jpg 19 | new_3_flip.jpg,base_3.jpg,partial_duplicate,15.493055590306328,1414,1.0,0.0,reports/new_3_flip__VS__base_3.jpg 20 | new_3_jpeg30.jpg,base_3.jpg,partial_duplicate,13.432698663339925,144,0.4161849710982659,0.0016456048469990492,reports/new_3_jpeg30__VS__base_3.jpg 21 | new_3_ps.jpg,base_3.jpg,partial_duplicate,15.450083545037916,968,0.944390243902439,0.0,reports/new_3_ps__VS__base_3.jpg 22 | new_3_ps.jpg,base_1.jpg,partial_duplicate,15.28378456336357,25,0.25510204081632654,0.0,reports/new_3_ps__VS__base_1.jpg 23 | new_3_ps.jpg,base_2.jpg,partial_duplicate,14.509196431986936,28,0.27450980392156865,0.0,reports/new_3_ps__VS__base_2.jpg 24 | new_4_bright.jpg,base_4.jpg,partial_duplicate,13.11780427361023,123,0.36607142857142855,0.004998629447072744,reports/new_4_bright__VS__base_4.jpg 25 | new_4_copy.jpg,base_4.jpg,partial_duplicate,15.802083333333334,1442,1.0,0.0,reports/new_4_copy__VS__base_4.jpg 26 | 
new_4_crop.jpg,base_4.jpg,partial_duplicate,1.1693956007455526,77,0.39086294416243655,0.0,reports/new_4_crop__VS__base_4.jpg 27 | new_4_flip.jpg,base_4.jpg,partial_duplicate,14.595425144247953,1270,0.959214501510574,0.0,reports/new_4_flip__VS__base_4.jpg 28 | new_4_jpeg30.jpg,base_4.jpg,partial_duplicate,13.432213366064898,146,0.37823834196891193,0.0,reports/new_4_jpeg30__VS__base_4.jpg 29 | new_4_ps.jpg,base_4.jpg,partial_duplicate,15.009747378792026,1015,0.9424326833797586,0.0,reports/new_4_ps__VS__base_4.jpg 30 | new_5_bright.jpg,base_5.jpg,partial_duplicate,15.370038690126217,103,0.356401384083045,0.0,reports/new_5_bright__VS__base_5.jpg 31 | new_5_copy.jpg,base_5.jpg,partial_duplicate,17.821538426124057,1449,1.0,0.00033754599280655384,reports/new_5_copy__VS__base_5.jpg 32 | new_5_crop.jpg,base_5.jpg,partial_duplicate,1.1523947505389942,73,0.3989071038251366,-0.0007181827677413821,reports/new_5_crop__VS__base_5.jpg 33 | new_5_flip.jpg,base_5.jpg,partial_duplicate,17.01505248709818,1449,1.0,0.0,reports/new_5_flip__VS__base_5.jpg 34 | new_5_jpeg30.jpg,base_5.jpg,partial_duplicate,14.749811130209066,131,0.37110481586402266,0.0,reports/new_5_jpeg30__VS__base_5.jpg 35 | new_5_ps.jpg,base_5.jpg,partial_duplicate,17.09211623273774,1021,0.9453703703703704,0.0,reports/new_5_ps__VS__base_5.jpg 36 | new_5_rot.jpg,base_5.jpg,partial_duplicate,7.367151720660745,42,0.2781456953642384,0.0,reports/new_5_rot__VS__base_5.jpg 37 | -------------------------------------------------------------------------------- /README_en.md: -------------------------------------------------------------------------------- 1 | # DupCheck — Duplicate & Tamper Detection 2 | 3 | ## Overview 4 | DupCheck solves broad “duplicate / tamper detection” needs: It works in insurance claim review, content moderation, e-commerce authenticity checks, and copyright protection. It began as a submodule designed to stop third-party repair contractors from re-uploading maintenance photos to claim duplicate reimbursements; I later spun it out, optimised it, and expanded it into a general-purpose toolkit. Uploads are compared against a reference gallery to flag exact copies, crops, rotations, flips, and lightly edited variants, producing reviewer-friendly evidence. 5 | 6 | The pipeline is pure Python with minimal dependencies, making it easy to embed into intake pipelines or back-office review systems. 7 | 8 | ## Detection flow 9 | 1. **Index build** – each gallery image is converted to multiple perceptual hashes (original, rotations, flips), multi-scale tile hashes, cached ORB descriptors, and optional ResNet-18 / CLIP embeddings to support geometric and coarse semantic changes. 10 | 2. **Candidate recall** – a new upload is compared with the index via pHash buckets, tile voting, and optional FAISS (ResNet-18/CLIP) vector search; if needed, multi-orientation ORB matching pulls in additional suspects. 11 | 3. **Verification** – the best orientation pair runs ORB + RANSAC. When the homography is reliable, NCC on the corresponding patch upgrades matches to `exact_patch`. 12 | 4. **Reporting** – results are written to `dup_report.csv`, and the CLI can render side-by-side evidence images for manual review. 13 | 5. **Threshold tuning** – optionally run `tools/tune_thresholds.py` to grid-search `phash/ORB/NCC` thresholds and pick the best configuration for your data. 
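To make the recall stage above concrete, the sketch below compares one query image against one gallery image using the same primitives the pipeline relies on: `features.compute_phash` for the perceptual hash and `matcher.hamming_distance_hex` for the bit-difference count. The paths come from the synthetic demo dataset, and the threshold of 10 simply mirrors the default `phash_thresh` in `matcher.recall_candidates`; treat both as illustrative rather than recommended production settings.

```python
from pathlib import Path

from duplicate_check.features import compute_phash
from duplicate_check.matcher import hamming_distance_hex

# Demo images from the synthetic dataset; substitute your own gallery/query files.
query = Path("data/synth_new/new_1_copy.jpg")
gallery = Path("data/synth_db/base_1.jpg")

# compute_phash returns the hash as a hex string (it falls back to a truncated
# SHA1 of the file contents when Pillow/imagehash are unavailable).
q_hash = compute_phash(query)
g_hash = compute_phash(gallery)

# Hamming distance between the two hashes; small distances indicate the images
# are near-identical at a global level.
dist = hamming_distance_hex(q_hash, g_hash)
print(f"pHash Hamming distance: {dist}")

# Illustrative cut-off (the library default in matcher.recall_candidates is 10).
if dist <= 10:
    print("candidate duplicate -> hand off to ORB + RANSAC / NCC verification")
```

In the full pipeline this comparison runs against every pHash bucket in the index, and tile voting plus the optional vector search widen the candidate set before the expensive verification step.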
14 | 15 | > **Scaling tip:** Set `DUPC_VECTOR_INDEX=ivf_pq` or `hnsw` to switch the built-in FAISS index; for even larger deployments, replace the FAISS block in `duplicate_check/indexer.py` / `load_index_from_db` with writes to Milvus, Qdrant, Pinecone, etc., and query that service from `matcher.recall_candidates` before ORB reranking. 16 | > **Performance tip:** Tune `DUPC_TILE_SCALES` (e.g., `1.0,0.6`) and `DUPC_TILE_GRID` to trade multi-scale robustness for runtime when processing massive galleries. 17 | 18 | ## Project layout 19 | - `duplicate_check/` — core library modules (`features`, `indexer`, `matcher`, `report`). 20 | - `dupcheck_cli.py` — main CLI wrapper supporting in-memory or SQLite indices. 21 | - `duplicate_check.py` — minimal entry point kept for backwards compatibility. 22 | - `tools/` — utilities for synthetic data generation and threshold tuning. 23 | - `tests/` — quick test. 24 | - `data/` — sample synthetic dataset used by the documentation examples. 25 | 26 | ## Requirements 27 | Install dependencies listed in `requirements.txt` inside a Python 3.9+ environment. OpenCV, Pillow, imagehash, `torch`, `torchvision`, and (optionally) `faiss-cpu` enable the full feature set; the pipeline falls back gracefully if some extras are unavailable. 28 | 29 | ```bash 30 | python -m venv .venv 31 | source .venv/bin/activate 32 | pip install -r requirements.txt 33 | ``` 34 | 35 | Optional extras: install `faiss-cpu` (for ANN recall) and either `open-clip-torch` or `clip` if you want CLIP-ViT embeddings in addition to ResNet. 36 | 37 | ## Quick start 38 | 1. Generate the demo dataset: 39 | ```bash 40 | python tools/generate_synthetic.py --out_dir data --count 5 41 | ``` 42 | 2. Rebuild the SQLite index and run detection: 43 | ```bash 44 | python dupcheck_cli.py \ 45 | --db_dir data/synth_db \ 46 | --input_dir data/synth_new \ 47 | --out_dir reports \ 48 | --index_db ./index.db \ 49 | --rebuild_index \ 50 | --vector_score_thresh 0.3 51 | ``` 52 | 3. Inspect `reports/dup_report.csv` and the generated evidence JPEGs. 53 | 4. (Optional) Benchmark on the labelled synthetic set and review mismatches: 54 | ```bash 55 | python tools/verify_synthetic.py \ 56 | --db_dir data/synth_db \ 57 | --input_dir data/synth_new \ 58 | --labels data/synth_labels.csv \ 59 | --phash_thresh 16 \ 60 | --orb_inliers_thresh 6 \ 61 | --ncc_thresh 0.85 62 | ``` 63 | 5. (Optional) Launch a grid search over thresholds: 64 | ```bash 65 | python tools/tune_thresholds.py \ 66 | --labels data/synth_labels.csv \ 67 | --db_dir data/synth_db \ 68 | --input_dir data/synth_new \ 69 | --out_dir reports/tune_out 70 | ``` 71 | 72 | To reuse an existing index, drop the `--rebuild_index` flag. Tweak `--phash_thresh`, `--orb_inliers_thresh`, and `--ncc_thresh` to experiment with precision/recall. 
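## Python usage

The CLI wraps a small library API, so the same pipeline can be embedded in your own intake or review service. The sketch below mirrors what `tools/verify_synthetic.py` does: build an index over the gallery, then recall and verify each new upload. Paths and thresholds are illustrative (the numbers shown are the defaults used by `tools/verify_synthetic.py`), and `rerank_and_verify` accepts further knobs such as `roi_margin_ratio` and `max_roi_matches`.

```python
from pathlib import Path

from duplicate_check import features, indexer, matcher

db_dir = Path("data/synth_db")      # reference gallery
input_dir = Path("data/synth_new")  # new uploads to screen

# Build the in-memory index (multi-orientation pHash, tile hashes, optional vectors).
idx = indexer.build_index(db_dir)

for img_path in sorted(input_dir.iterdir()):
    if not img_path.is_file():
        continue
    feats = features.compute_features(img_path)
    # Cheap recall: pHash buckets, tile voting, optional vector search.
    cands = matcher.recall_candidates(feats, idx, topk=50, phash_thresh=10)
    # Expensive verification: ORB + RANSAC, then NCC on the aligned ROI.
    rows = matcher.rerank_and_verify(
        img_path,
        cands,
        idx,
        orb_inliers_thresh=25,
        ncc_thresh=0.92,
    )
    if rows:
        best = rows[0]
        print(img_path.name, "->", best["matched_image"], f"({best['final_label']})")
    else:
        print(img_path.name, "-> unique")
```

For batch jobs or a long-running service, prefer the SQLite index (`--index_db` in the CLI, or `indexer.build_index_db` / `indexer.load_index_from_db` in code) so gallery features are not recomputed on every run.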
73 | 74 | ## CLI examples 75 | ```bash 76 | # Rebuild index for fresh data 77 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db --rebuild_index 78 | 79 | # Run with custom thresholds 80 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --phash_thresh 12 --orb_inliers_thresh 30 --ncc_thresh 0.94 81 | 82 | # Quick scan using the cached index 83 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db 84 | ``` 85 | 86 | ## Threshold tuning 87 | Use `tools/tune_thresholds.py` with the synthetic labels to grid-search thresholds: 88 | 89 | ```bash 90 | python tools/tune_thresholds.py \ 91 | --labels data/synth_labels.csv \ 92 | --db_dir data/synth_db \ 93 | --input_dir data/synth_new \ 94 | --out_dir reports/tune_out 95 | ``` 96 | 97 | The script writes `tune_results.csv` with TP/FP/FN counts for each parameter combo, making it easy to lock in settings for your own data. 98 | 99 | ## License 100 | 101 | This project is released under the [MIT License](LICENSE). 102 | -------------------------------------------------------------------------------- /tools/verify_synthetic.py: -------------------------------------------------------------------------------- 1 | """Quick evaluator for the synthetic DupCheck dataset. 2 | 3 | Usage example: 4 | python tools/verify_synthetic.py \ 5 | --db_dir data/synth_db \ 6 | --input_dir data/synth_new \ 7 | --labels data/synth_labels.csv 8 | 9 | The script runs the duplicate detection pipeline against the synthetic 10 | dataset and reports how many annotated duplicates / uniques are detected 11 | correctly along with any mismatches it finds. 12 | """ 13 | import argparse 14 | import csv 15 | import sys 16 | from pathlib import Path 17 | from typing import Dict, List 18 | 19 | ROOT = Path(__file__).resolve().parents[1] 20 | if str(ROOT) not in sys.path: 21 | sys.path.insert(0, str(ROOT)) 22 | 23 | from duplicate_check import features, indexer, matcher 24 | 25 | 26 | def check_dependencies() -> List[str]: 27 | missing: List[str] = [] 28 | if not getattr(features, "PIL_AVAILABLE", False) or getattr(features, "imagehash", None) is None: 29 | missing.append("Pillow + imagehash (needed for perceptual hash and tile hashing)") 30 | if getattr(features, "cv2", None) is None: 31 | missing.append("opencv-python (needed for ORB matching and NCC verification)") 32 | return missing 33 | 34 | 35 | def load_labels(path: Path) -> Dict[str, Dict[str, str]]: 36 | rows: Dict[str, Dict[str, str]] = {} 37 | with path.open(newline="", encoding="utf-8") as f: 38 | reader = csv.DictReader(f) 39 | for row in reader: 40 | rows[row["new_image"]] = row 41 | return rows 42 | 43 | 44 | def evaluate( 45 | db_dir: Path, 46 | input_dir: Path, 47 | labels_path: Path, 48 | *, 49 | topk: int, 50 | phash_thresh: int, 51 | orb_inliers_thresh: int, 52 | ncc_thresh: float, 53 | vector_score_thresh: float, 54 | roi_margin_ratio: float, 55 | max_roi_matches: int, 56 | ) -> Dict[str, object]: 57 | labels = load_labels(labels_path) 58 | idx = indexer.build_index(db_dir) 59 | 60 | stats = { 61 | "duplicate_total": 0, 62 | "duplicate_hits": 0, 63 | "unique_total": 0, 64 | "unique_hits": 0, 65 | "mismatches": [], 66 | } 67 | 68 | for img_path in sorted(input_dir.iterdir()): 69 | if not img_path.is_file(): 70 | continue 71 | feats = features.compute_features(img_path) 72 | cands = matcher.recall_candidates( 73 | feats, 74 | idx, 75 | topk=topk, 76 | 
phash_thresh=phash_thresh, 77 | vector_score_thresh=vector_score_thresh, 78 | ) 79 | rows = matcher.rerank_and_verify( 80 | img_path, 81 | cands, 82 | idx, 83 | orb_inliers_thresh=orb_inliers_thresh, 84 | ncc_thresh=ncc_thresh, 85 | roi_margin_ratio=roi_margin_ratio, 86 | max_roi_matches=max_roi_matches, 87 | ) 88 | 89 | meta = labels.get(img_path.name, {"matched_image": "", "label": "unique"}) 90 | gt_match = meta.get("matched_image") or "" 91 | gt_label = meta.get("label", "unique") 92 | 93 | predicted_label = rows[0]["final_label"] if rows else "unique" 94 | predicted_match = rows[0]["matched_image"] if rows else "" 95 | if predicted_label == "unique": 96 | predicted_match = "" 97 | 98 | if gt_match: 99 | stats["duplicate_total"] += 1 100 | if predicted_match == gt_match: 101 | stats["duplicate_hits"] += 1 102 | else: 103 | stats["mismatches"].append( 104 | { 105 | "image": img_path.name, 106 | "expected_match": gt_match, 107 | "expected_label": gt_label, 108 | "predicted_match": rows[0]["matched_image"] if rows else "", 109 | "predicted_label": predicted_label, 110 | } 111 | ) 112 | else: 113 | stats["unique_total"] += 1 114 | if not predicted_match: 115 | stats["unique_hits"] += 1 116 | else: 117 | stats["mismatches"].append( 118 | { 119 | "image": img_path.name, 120 | "expected_match": "", 121 | "expected_label": gt_label, 122 | "predicted_match": rows[0]["matched_image"] if rows else "", 123 | "predicted_label": predicted_label, 124 | } 125 | ) 126 | 127 | return stats 128 | 129 | 130 | def format_summary(stats: Dict[str, object]) -> str: 131 | dup_total = stats["duplicate_total"] or 1 132 | uniq_total = stats["unique_total"] or 1 133 | lines: List[str] = [] 134 | lines.append( 135 | f"Duplicate accuracy: {stats['duplicate_hits']}/{stats['duplicate_total']}" 136 | f" ({stats['duplicate_hits']/dup_total:.1%})" 137 | ) 138 | lines.append( 139 | f"Unique accuracy: {stats['unique_hits']}/{stats['unique_total']}" 140 | f" ({stats['unique_hits']/uniq_total:.1%})" 141 | ) 142 | mismatches = stats["mismatches"] 143 | if mismatches: 144 | lines.append("\nMismatches:") 145 | for miss in mismatches: 146 | lines.append( 147 | f" - {miss['image']}: expected {miss['expected_match'] or 'unique'}" 148 | f" → predicted {miss['predicted_match'] or miss['predicted_label']}" 149 | ) 150 | else: 151 | lines.append("\nAll samples matched expected labels.") 152 | return "\n".join(lines) 153 | 154 | 155 | def parse_args() -> argparse.Namespace: 156 | p = argparse.ArgumentParser(description="Evaluate synthetic DupCheck dataset") 157 | p.add_argument("--db_dir", default="data/synth_db") 158 | p.add_argument("--input_dir", default="data/synth_new") 159 | p.add_argument("--labels", default="data/synth_labels.csv") 160 | p.add_argument("--topk", type=int, default=50) 161 | p.add_argument("--phash_thresh", type=int, default=10) 162 | p.add_argument("--orb_inliers_thresh", type=int, default=25) 163 | p.add_argument("--ncc_thresh", type=float, default=0.92) 164 | p.add_argument("--vector_score_thresh", type=float, default=0.0) 165 | p.add_argument("--roi_margin_ratio", type=float, default=0.12) 166 | p.add_argument("--max_roi_matches", type=int, default=60) 167 | return p.parse_args() 168 | 169 | 170 | def main() -> None: 171 | args = parse_args() 172 | missing = check_dependencies() 173 | if missing: 174 | print("Warning: required imaging dependencies missing; results will be unreliable.") 175 | for item in missing: 176 | print(f" - {item}") 177 | print("Install them via `pip install -r requirements.txt` and 
re-run this script.") 178 | return 179 | stats = evaluate( 180 | Path(args.db_dir), 181 | Path(args.input_dir), 182 | Path(args.labels), 183 | topk=args.topk, 184 | phash_thresh=args.phash_thresh, 185 | orb_inliers_thresh=args.orb_inliers_thresh, 186 | ncc_thresh=args.ncc_thresh, 187 | vector_score_thresh=args.vector_score_thresh, 188 | roi_margin_ratio=args.roi_margin_ratio, 189 | max_roi_matches=args.max_roi_matches, 190 | ) 191 | print(format_summary(stats)) 192 | 193 | 194 | if __name__ == "__main__": 195 | main() 196 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DupCheck — Duplicate & Tamper Detection / 图片重复与伪造检测 2 | 3 |
4 | English | 中文 5 |
6 | 7 | --- 8 | 9 |
10 | English 11 | 12 | ### Overview 13 | DupCheck targets broader duplicate/tamper detection needs: It works for insurance claim review, content moderation, e-commerce authenticity checks, and copyright protection. It was originally a submodule built to stop third-party repair contractors from re-uploading maintenance photos to claim duplicate reimbursements; I later spun it out, optimised it, and generalised it for additional scenarios. The system compares every new image against a reference gallery, flags exact copies, crops, rotations/flips, and subtle edits, then produces reviewer-friendly evidence. 14 | 15 | The implementation is pure Python and depends only on widely available imaging libraries, which keeps integration with existing intake or back-office pipelines straightforward. 16 | 17 | ### Detection flow 18 | 1. **Index build** – gallery images are converted to multi-orientation pHash, multi-scale tile hashes, cached ORB descriptors, and optional ResNet-18 / CLIP embeddings so geometric tweaks and coarse semantics remain discoverable. 19 | 2. **Candidate recall** – a new upload is matched through pHash buckets, tile voting, and optional FAISS (ResNet-18/CLIP) vector search; if necessary, orientation-aware ORB matching pulls in additional suspects. 20 | 3. **Verification** – the best orientation pair runs ORB + RANSAC. We warp the database image via the estimated homography, crop a small ROI around the inlier hull, and run ZNCC on that aligned patch; high correlation promotes the match to `exact_patch`. 21 | 4. **Reporting** – matches are recorded in `dup_report.csv`, and the CLI can render side-by-side evidence images for manual review. 22 | 23 | > **Scaling tip:** Set `DUPC_VECTOR_INDEX=ivf_pq` or `hnsw` to switch the built-in FAISS index; for very large galleries or cluster deployments, replace the in-process FAISS index with an external vector database (e.g., Milvus, Qdrant, Pinecone). A natural hook is the `duplicate_check/indexer.py::build_index` / `load_index_from_db` functions—swap the FAISS creation for remote writes, and query that service inside `matcher.recall_candidates` before running ORB reranking. 24 | > **Performance tip:** Adjust `DUPC_TILE_SCALES` (e.g., `1.0,0.6`) and `DUPC_TILE_GRID` to balance multi-scale accuracy against runtime when processing massive galleries. 25 | 26 | ### Project layout 27 | - `duplicate_check/` — core modules (`features`, `indexer`, `matcher`, `report`). 28 | - `dupcheck_cli.py` — main CLI with in-memory and SQLite index support. 29 | - `duplicate_check.py` — legacy entrypoint kept for backward compatibility. 30 | - `tools/` — helpers for synthetic data generation and threshold tuning. 31 | - `tests/` - quick test. 32 | - `data/` — synthetic dataset used in docs and examples. 33 | 34 | ### Requirements 35 | Install the dependencies from `requirements.txt` inside a Python 3.9+ environment. Pillow, OpenCV, imagehash, `torch`, `torchvision`, and (optionally) `faiss-cpu` unlock the full feature set; the code degrades gracefully if some extras are missing. 36 | 37 | ```bash 38 | python -m venv .venv 39 | source .venv/bin/activate 40 | pip install -r requirements.txt 41 | ``` 42 | 43 | Optional extras: install `faiss-cpu` (for ANN recall) and either `open-clip-torch` or `clip` if you want CLIP-ViT embeddings in addition to ResNet. 44 | 45 | ### Quick start 46 | 1. Generate the demo dataset: 47 | ```bash 48 | python tools/generate_synthetic.py --out_dir data --count 5 49 | ``` 50 | 2. 
Rebuild the SQLite index and run detection: 51 | ```bash 52 | python dupcheck_cli.py \ 53 | --db_dir data/synth_db \ 54 | --input_dir data/synth_new \ 55 | --out_dir reports \ 56 | --index_db ./index.db \ 57 | --rebuild_index \ 58 | --vector_score_thresh 0.3 59 | ``` 60 | 3. Inspect `reports/dup_report.csv` alongside the generated evidence JPEGs. 61 | 4. (Optional) Benchmark on the synthetic labels and inspect mismatches: 62 | ```bash 63 | python tools/verify_synthetic.py \ 64 | --db_dir data/synth_db \ 65 | --input_dir data/synth_new \ 66 | --labels data/synth_labels.csv \ 67 | --phash_thresh 16 \ 68 | --orb_inliers_thresh 6 \ 69 | --ncc_thresh 0.88 \ 70 | --roi_margin_ratio 0.12 \ 71 | --max_roi_matches 60 72 | ``` 73 | 5. (Optional) Run a threshold grid search to tune the pipeline: 74 | ```bash 75 | python tools/tune_thresholds.py \ 76 | --labels data/synth_labels.csv \ 77 | --db_dir data/synth_db \ 78 | --input_dir data/synth_new \ 79 | --out_dir reports/tune_out 80 | ``` 81 | 82 | Drop `--rebuild_index` to reuse a cached index. Tune `--phash_thresh`, `--orb_inliers_thresh`, and `--ncc_thresh` to explore different precision/recall tradeoffs. 83 | 84 | ### CLI examples 85 | ```bash 86 | # Rebuild index for fresh data 87 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db --rebuild_index 88 | 89 | # Run with custom thresholds 90 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --phash_thresh 12 --orb_inliers_thresh 30 --ncc_thresh 0.94 91 | 92 | # Quick scan using the cached index 93 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db 94 | ``` 95 | 96 | ### Threshold tuning 97 | Use `tools/tune_thresholds.py` with the synthetic labels to sweep detection thresholds: 98 | 99 | ```bash 100 | python tools/tune_thresholds.py \ 101 | --labels data/synth_labels.csv \ 102 | --db_dir data/synth_db \ 103 | --input_dir data/synth_new \ 104 | --out_dir reports/tune_out 105 | ``` 106 | 107 | The script writes `tune_results.csv` containing TP/FP/FN counts for each parameter combo so you can lock in thresholds for your own dataset. 108 | 109 | ## License 110 | 111 | This project is released under the [MIT License](LICENSE). 112 | 113 |
114 | 115 |
116 | 中文 117 | 118 | ### 项目简介 119 | DupCheck 面向广义的“图库去重 / 篡改检测”场景:不仅可用于理赔审核,也适合内容审核、电商验真、图像版权保护等业务。它最初用于防止第三方维修工重复上传维修照片骗取维修资金,原为某项目的子模块;后来我将其独立化、优化,并扩展为通用的重复与篡改检测工具,可适用于更多场景。系统会把新上传图片与历史图库逐一比对,识别完全重复、局部重复、旋转/翻转及轻度改动的图像,并输出便于人工复核的证据。 120 | 121 | 项目依赖常见的 Python 图像 / 深度学习库,可嵌入各类上传管线或后台审核流程。 122 | 123 | ### 检测流程 124 | 1. **构建索引**:对图库图片计算多姿态 pHash(原图、旋转、翻转)、多尺度块哈希、缓存 ORB 关键点,并可生成 ResNet-18 / CLIP 嵌入,确保几何和粗语义变化也能被召回。 125 | 2. **召回候选**:新图片通过 pHash/块哈希匹配,并可结合基于 ResNet-18/CLIP 的 FAISS 向量检索;如有需要再执行多姿态 ORB 匹配,把旋转、翻转的嫌疑图拉入候选集。 126 | 3. **精排验证**:对最佳姿态组合执行 ORB + RANSAC,将数据库图像按单应变换对齐到查询图坐标系后,在内点凸包附近裁剪 ROI,计算对齐区域的 ZNCC,判定是否为 `exact_patch`。 127 | 4. **结果输出**:检测结论写入 `dup_report.csv`,命令行可生成对照证据图,辅助人工审核。 128 | 5. **阈值调优**:可选地运行 `tools/tune_thresholds.py` 做网格搜索,针对不同场景选择更合适的 `phash/ORB/NCC` 参数。 129 | 130 | > **扩展建议**:可通过设置环境变量 `DUPC_VECTOR_INDEX=ivf_pq` 或 `hnsw` 切换内置 FAISS 索引;若图库规模巨大或需集群部署,可在 `duplicate_check/indexer.py` / `load_index_from_db` 中替换 FAISS,为 Milvus、Qdrant、Pinecone 等外部向量库写入,并在 `matcher.recall_candidates` 中调用该服务。 131 | 132 | ### 目录结构 133 | - `duplicate_check/` —— 核心模块(`features`、`indexer`、`matcher`、`report`)。 134 | - `dupcheck_cli.py` —— 主命令行工具,支持内存或 SQLite 索引。 135 | - `duplicate_check.py` —— 保留的兼容性入口脚本。 136 | - `tools/` —— 合成数据生成、阈值调参等辅助脚本。 137 | - `tests/` —— 测试。 138 | - `data/` —— 文档示例所用的合成数据集。 139 | 140 | ### 环境依赖 141 | 建议在 Python 3.9+ 中创建虚拟环境,并安装 `requirements.txt` 列出的依赖。OpenCV、Pillow、imagehash、`torch`、`torchvision` 与可选的 `faiss-cpu` 能启用全部功能,缺失时流程会自动降级。 142 | 143 | ```bash 144 | python -m venv .venv 145 | source .venv/bin/activate 146 | pip install -r requirements.txt 147 | ``` 148 | 149 | 可选依赖:`faiss-cpu`(向量召回),以及 `open-clip-torch` 或 `clip`(启用 CLIP-ViT 向量)。 150 | 151 | ### 快速体验 152 | 1. 生成示例数据集: 153 | ```bash 154 | python tools/generate_synthetic.py --out_dir data --count 5 155 | ``` 156 | 2. 重建 SQLite 索引并运行检测: 157 | ```bash 158 | python dupcheck_cli.py \ 159 | --db_dir data/synth_db \ 160 | --input_dir data/synth_new \ 161 | --out_dir reports \ 162 | --index_db ./index.db \ 163 | --rebuild_index 164 | ``` 165 | 3. 查看 `reports/dup_report.csv` 及生成的证据图片。 166 | 4. (可选)对合成标注集进行评估,查看召回差异: 167 | ```bash 168 | python tools/verify_synthetic.py \ 169 | --db_dir data/synth_db \ 170 | --input_dir data/synth_new \ 171 | --labels data/synth_labels.csv \ 172 | --phash_thresh 16 \ 173 | --orb_inliers_thresh 6 \ 174 | --ncc_thresh 0.88 \ 175 | --roi_margin_ratio 0.12 \ 176 | --max_roi_matches 60 177 | ``` 178 | 179 | 如需复用已有索引,可省略 `--rebuild_index`。可通过 `--phash_thresh`、`--orb_inliers_thresh`、`--ncc_thresh` 调整查准率与召回率之间的权衡。 180 | 181 | ### 常用命令 182 | ```bash 183 | # 重建索引 184 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db --rebuild_index 185 | 186 | # 自定义阈值运行 187 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --phash_thresh 12 --orb_inliers_thresh 30 --ncc_thresh 0.94 188 | 189 | # 使用已有索引快速扫描 190 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db 191 | ``` 192 | 193 | ### 阈值调参 194 | 使用 `tools/tune_thresholds.py` 对阈值组合进行网格搜索: 195 | 196 | ```bash 197 | python tools/tune_thresholds.py \ 198 | --labels data/synth_labels.csv \ 199 | --db_dir data/synth_db \ 200 | --input_dir data/synth_new \ 201 | --out_dir reports/tune_out 202 | ``` 203 | 204 | 脚本会输出 `tune_results.csv`,包含每组参数的 TP/FP/FN 统计,可据此锁定适合业务数据的阈值。 205 | 206 |
207 | -------------------------------------------------------------------------------- /duplicate_check/indexer.py: -------------------------------------------------------------------------------- 1 | """Index utilities with in-memory + SQLite backends supporting tile & vector lookups.""" 2 | from contextlib import closing 3 | import json 4 | import os 5 | import sqlite3 6 | from pathlib import Path 7 | from typing import Any, Dict, List, Optional 8 | 9 | try: 10 | import numpy as np 11 | except Exception: 12 | np = None 13 | 14 | try: 15 | import faiss 16 | except Exception: 17 | faiss = None 18 | 19 | from duplicate_check.features import ( 20 | compute_phash, 21 | compute_tile_hashes, 22 | compute_phash_variants, 23 | compute_embedding, 24 | ) 25 | 26 | VECTOR_INDEX_TYPE = os.environ.get("DUPC_VECTOR_INDEX", "flat").lower() 27 | VECTOR_INDEX_NLIST = int(os.environ.get("DUPC_VECTOR_NLIST", "1024")) 28 | VECTOR_INDEX_PQ_M = int(os.environ.get("DUPC_VECTOR_PQ_M", "16")) 29 | VECTOR_HNSW_M = int(os.environ.get("DUPC_VECTOR_HNSW_M", "32")) 30 | VECTOR_HNSW_EF = int(os.environ.get("DUPC_VECTOR_HNSW_EF", "64")) 31 | 32 | 33 | def _build_vector_index(embeddings: List[Any], ids: List[str]) -> Optional[Dict[str, Any]]: 34 | if not embeddings or faiss is None or np is None: 35 | return None 36 | try: 37 | mat = np.stack(embeddings).astype("float32") 38 | except Exception: 39 | return None 40 | if mat.size == 0: 41 | return None 42 | dim = mat.shape[1] 43 | index = None 44 | metric = "ip" 45 | index_type = VECTOR_INDEX_TYPE 46 | try: 47 | if index_type == "ivf_pq" and mat.shape[0] > VECTOR_INDEX_PQ_M: 48 | nlist = min(max(1, VECTOR_INDEX_NLIST), mat.shape[0]) 49 | quantizer = faiss.IndexFlatIP(dim) 50 | index = faiss.IndexIVFPQ(quantizer, dim, nlist, VECTOR_INDEX_PQ_M, 8) 51 | index.train(mat) 52 | index.add(mat) 53 | index.nprobe = max(1, min(nlist, nlist // 10 or 1)) 54 | elif index_type == "hnsw": 55 | hnsw_m = max(2, VECTOR_HNSW_M) 56 | index = faiss.IndexHNSWFlat(dim, hnsw_m) 57 | index.hnsw.efConstruction = max(hnsw_m, VECTOR_HNSW_EF) 58 | index.add(mat) 59 | index.hnsw.efSearch = max(hnsw_m, VECTOR_HNSW_EF) 60 | else: 61 | index = faiss.IndexFlatIP(dim) 62 | index.add(mat) 63 | index_type = "flat" 64 | except Exception: 65 | index = None 66 | if index is None: 67 | return None 68 | return {"index": index, "ids": ids, "metric": metric, "type": index_type} 69 | 70 | 71 | def build_index(db_dir: Path, tile_grid: int = 8) -> Dict[str, Any]: 72 | """Build an in-memory index containing multi-scale hashes and optional vectors.""" 73 | idx = {"by_id": {}, "by_phash": {}, "by_tile": {}, "vector": None} 74 | use_vectors = faiss is not None and np is not None 75 | vector_embeddings = [] 76 | vector_ids = [] 77 | for p in sorted(db_dir.iterdir()): 78 | if not p.is_file(): 79 | continue 80 | pid = p.name 81 | ph_variants = compute_phash_variants(p) 82 | primary_ph = ph_variants[0] 83 | tiles = compute_tile_hashes(p, grid=tile_grid) 84 | idx["by_id"][pid] = { 85 | "path": str(p), 86 | "phash": primary_ph, 87 | "phash_variants": ph_variants, 88 | "tiles": tiles, 89 | } 90 | for ph in ph_variants: 91 | bucket = idx["by_phash"].setdefault(ph, []) 92 | if pid not in bucket: 93 | bucket.append(pid) 94 | for tile in tiles: 95 | th = tile.get("hash") 96 | if not th: 97 | continue 98 | entry = { 99 | "img_id": pid, 100 | "bbox": tile.get("bbox", (0, 0, 0, 0)), 101 | "scale": tile.get("scale", 1.0), 102 | } 103 | idx["by_tile"].setdefault(th, []).append(entry) 104 | if use_vectors and np is not None: 105 | try: 106 
| emb_val = compute_embedding(p) 107 | if emb_val is None: 108 | continue 109 | emb = np.asarray(emb_val, dtype=np.float32) 110 | if emb.ndim == 1 and emb.size > 0: 111 | vector_embeddings.append(emb) 112 | vector_ids.append(pid) 113 | except Exception: 114 | continue 115 | if use_vectors and vector_embeddings: 116 | idx["vector"] = _build_vector_index(vector_embeddings, vector_ids) 117 | return idx 118 | 119 | 120 | def load_index(path: Path) -> Dict[str, Any]: 121 | with open(path, "r", encoding="utf-8") as f: 122 | return json.load(f) 123 | 124 | 125 | def save_index(idx: Dict[str, Any], path: Path) -> None: 126 | to_dump = dict(idx) 127 | if "vector" in to_dump: 128 | to_dump["vector"] = None 129 | with open(path, "w", encoding="utf-8") as f: 130 | json.dump(to_dump, f) 131 | 132 | 133 | def init_sqlite(db_path: Path) -> None: 134 | db_path.parent.mkdir(parents=True, exist_ok=True) 135 | conn = sqlite3.connect(str(db_path)) 136 | with closing(conn): 137 | cur = conn.cursor() 138 | cur.execute("PRAGMA journal_mode=WAL") 139 | cur.execute( 140 | "CREATE TABLE IF NOT EXISTS images(\n" 141 | " img_id TEXT PRIMARY KEY,\n" 142 | " path TEXT,\n" 143 | " phash TEXT,\n" 144 | " w INTEGER,\n" 145 | " h INTEGER\n" 146 | ")" 147 | ) 148 | cur.execute( 149 | "CREATE TABLE IF NOT EXISTS tiles(\n" 150 | " img_id TEXT,\n" 151 | " tile_hash TEXT,\n" 152 | " x0 INTEGER,\n" 153 | " y0 INTEGER,\n" 154 | " x1 INTEGER,\n" 155 | " y1 INTEGER,\n" 156 | " scale REAL DEFAULT 1.0\n" 157 | ")" 158 | ) 159 | try: 160 | cur.execute("PRAGMA table_info(tiles)") 161 | existing_cols = {row[1] for row in cur.fetchall()} 162 | if "scale" not in existing_cols: 163 | cur.execute("ALTER TABLE tiles ADD COLUMN scale REAL DEFAULT 1.0") 164 | except Exception: 165 | pass 166 | cur.execute("CREATE INDEX IF NOT EXISTS idx_tiles_hash ON tiles(tile_hash)") 167 | conn.commit() 168 | 169 | 170 | def add_image_to_db(db_path: Path, image_path: Path, tile_grid: int = 8) -> None: 171 | conn = sqlite3.connect(str(db_path)) 172 | with closing(conn): 173 | cur = conn.cursor() 174 | ph = compute_phash(image_path) 175 | tiles = compute_tile_hashes(image_path, grid=tile_grid) 176 | try: 177 | from PIL import Image 178 | 179 | w, h = Image.open(str(image_path)).size 180 | except Exception: 181 | w, h = 0, 0 182 | img_id = image_path.name 183 | cur.execute( 184 | "INSERT OR REPLACE INTO images(img_id,path,phash,w,h) VALUES (?,?,?,?,?)", 185 | (img_id, str(image_path), ph, w, h), 186 | ) 187 | cur.execute("DELETE FROM tiles WHERE img_id = ?", (img_id,)) 188 | tile_rows = [] 189 | for tile in tiles: 190 | th = tile.get("hash") 191 | bbox = tile.get("bbox", (0, 0, 0, 0)) 192 | scale = tile.get("scale", 1.0) 193 | tile_rows.append((img_id, th, bbox[0], bbox[1], bbox[2], bbox[3], float(scale))) 194 | cur.executemany( 195 | "INSERT INTO tiles(img_id,tile_hash,x0,y0,x1,y1,scale) VALUES (?,?,?,?,?,?,?)", 196 | tile_rows, 197 | ) 198 | conn.commit() 199 | 200 | 201 | def build_index_db(db_dir: Path, db_path: Path, tile_grid: int = 8) -> None: 202 | init_sqlite(db_path) 203 | for p in sorted(db_dir.iterdir()): 204 | if not p.is_file(): 205 | continue 206 | add_image_to_db(db_path, p, tile_grid=tile_grid) 207 | 208 | 209 | def load_index_from_db(db_path: Path) -> Dict[str, Any]: 210 | conn = sqlite3.connect(str(db_path)) 211 | idx = {"by_id": {}, "by_phash": {}, "by_tile": {}, "vector": None} 212 | with closing(conn): 213 | cur = conn.cursor() 214 | for img_id, path, phash, w, h in cur.execute( 215 | "SELECT img_id,path,phash,w,h FROM images" 216 | ): 217 
| idx["by_id"][img_id] = {"path": path, "phash": phash, "phash_variants": [phash], "tiles": []} 218 | idx["by_phash"].setdefault(phash, []).append(img_id) 219 | try: 220 | tile_rows = cur.execute( 221 | "SELECT img_id,tile_hash,x0,y0,x1,y1,scale FROM tiles" 222 | ) 223 | scale_included = True 224 | except sqlite3.OperationalError: 225 | tile_rows = cur.execute( 226 | "SELECT img_id,tile_hash,x0,y0,x1,y1 FROM tiles" 227 | ) 228 | scale_included = False 229 | for row in tile_rows: 230 | if scale_included: 231 | img_id, th, x0, y0, x1, y1, scale = row 232 | else: 233 | img_id, th, x0, y0, x1, y1 = row 234 | scale = 1.0 235 | tile_entry = { 236 | "hash": th, 237 | "bbox": (x0, y0, x1, y1), 238 | "scale": float(scale), 239 | } 240 | rec = idx["by_id"].setdefault( 241 | img_id, 242 | {"path": "", "phash": "", "phash_variants": [], "tiles": []}, 243 | ) 244 | rec.setdefault("tiles", []).append(tile_entry) 245 | idx["by_tile"].setdefault(th, []).append( 246 | {"img_id": img_id, "bbox": tile_entry["bbox"], "scale": tile_entry["scale"]} 247 | ) 248 | 249 | # Augment with variant phashes for better recall 250 | for img_id, rec in list(idx["by_id"].items()): 251 | path = Path(rec.get("path", "")) 252 | try: 253 | variants = compute_phash_variants(path) 254 | except Exception: 255 | variants = [rec.get("phash")] 256 | rec["phash_variants"] = variants or [rec.get("phash")] 257 | for ph in rec["phash_variants"]: 258 | if not ph: 259 | continue 260 | bucket = idx["by_phash"].setdefault(ph, []) 261 | if img_id not in bucket: 262 | bucket.append(img_id) 263 | # Build vector index on demand 264 | use_vectors = faiss is not None and np is not None 265 | if use_vectors: 266 | vector_embeddings: List[np.ndarray] = [] 267 | vector_ids: List[str] = [] 268 | for img_id, rec in idx["by_id"].items(): 269 | path = rec.get("path") 270 | if not path: 271 | continue 272 | try: 273 | emb_val = compute_embedding(Path(path)) 274 | except Exception: 275 | emb_val = None 276 | if emb_val is None: 277 | continue 278 | try: 279 | arr = np.asarray(emb_val, dtype=np.float32) 280 | except Exception: 281 | continue 282 | if arr.ndim != 1 or arr.size == 0: 283 | continue 284 | vector_embeddings.append(arr) 285 | vector_ids.append(img_id) 286 | idx["vector"] = _build_vector_index(vector_embeddings, vector_ids) 287 | return idx 288 | -------------------------------------------------------------------------------- /duplicate_check/features.py: -------------------------------------------------------------------------------- 1 | """Feature extraction utilities: pHash, tile-hash, ORB descriptors. 2 | 3 | This module implements: 4 | - compute_phash(image_path) -> hex string 5 | - compute_tile_hashes(image_path, grid) -> list of (hex, bbox) 6 | - compute_orb_descriptors(image_path, max_features) -> {kps, descs} 7 | - compute_features(image_path) -> ImageFeatures 8 | 9 | If OpenCV/imagehash/Pillow are missing, functions will raise ImportError. 
10 | 11 | 特征提取工具:pHash、块哈希(tile-hash)、ORB 特征。 12 | 13 | 本模块实现: 14 | - compute_phash(image_path) -> 十六进制字符串 15 | - compute_tile_hashes(image_path, grid) -> 返回 (hash, bbox) 列表 16 | - compute_orb_descriptors(image_path, max_features) -> 返回 {kps, descs} 17 | - compute_features(image_path) -> 返回 ImageFeatures 18 | 19 | 如果系统缺少 OpenCV/imagehash/Pillow,函数会进行降级或抛出异常。 20 | """ 21 | from dataclasses import dataclass 22 | from typing import Any, Dict, List, Tuple, Optional 23 | from pathlib import Path 24 | import hashlib 25 | import io 26 | import os 27 | 28 | # Optional dependencies 29 | # 可选依赖 30 | try: 31 | import imagehash 32 | from PIL import Image 33 | PIL_AVAILABLE = True 34 | except Exception: 35 | imagehash = None 36 | Image = None 37 | PIL_AVAILABLE = False 38 | 39 | try: 40 | import numpy as np 41 | except Exception: 42 | np = None 43 | 44 | try: 45 | import cv2 46 | except Exception: 47 | cv2 = None 48 | 49 | try: 50 | import clip 51 | CLIP_AVAILABLE = True 52 | except Exception: 53 | clip = None 54 | CLIP_AVAILABLE = False 55 | 56 | try: 57 | import torch 58 | from torchvision import models 59 | TORCH_AVAILABLE = True 60 | except Exception: 61 | torch = None 62 | models = None 63 | TORCH_AVAILABLE = False 64 | 65 | _EMBED_MODEL = None 66 | _EMBED_TRANSFORM = None 67 | _CLIP_MODEL = None 68 | _CLIP_PREPROCESS = None 69 | 70 | def _parse_scales(env_name: str, default: Tuple[float, ...]) -> Tuple[float, ...]: 71 | raw = os.getenv(env_name) 72 | if not raw: 73 | return default 74 | values: List[float] = [] 75 | for part in raw.split(","): 76 | part = part.strip() 77 | if not part: 78 | continue 79 | try: 80 | val = float(part) 81 | except ValueError: 82 | continue 83 | if val > 0: 84 | values.append(val) 85 | return tuple(values) if values else default 86 | 87 | 88 | MULTISCALE_LEVELS: Tuple[float, ...] = _parse_scales("DUPC_TILE_SCALES", (1.0, 0.75)) 89 | DEFAULT_TILE_GRID = max(1, int(os.getenv("DUPC_TILE_GRID", "8"))) 90 | 91 | @dataclass 92 | class ImageFeatures: 93 | phash: str 94 | orb: Dict[str, Any] 95 | size: Tuple[int, int] 96 | embedding: Optional[Any] = None 97 | tiles: Optional[List[Dict[str, Any]]] = None 98 | 99 | 100 | def compute_phash(image_path: Path, hash_size: int = 8) -> str: 101 | """Compute pHash for the image and return as hex string.""" 102 | if PIL_AVAILABLE and imagehash is not None: 103 | img = Image.open(str(image_path)).convert("RGB") 104 | ph = imagehash.phash(img, hash_size=hash_size) 105 | return ph.__str__() 106 | # Fallback: use SHA1 of file contents and return truncated hex 107 | h = hashlib.sha1() 108 | with open(str(image_path), "rb") as f: 109 | for chunk in iter(lambda: f.read(8192), b""): 110 | h.update(chunk) 111 | return h.hexdigest()[:16] 112 | 113 | 114 | def compute_phash_variants( 115 | image_path: Path, 116 | hash_size: int = 8, 117 | scales: Tuple[float, ...] 
= MULTISCALE_LEVELS, 118 | ) -> List[str]: 119 | """Return a list of pHash values with multi-scale + orientation variants.""" 120 | if not PIL_AVAILABLE or imagehash is None: 121 | return [compute_phash(image_path, hash_size=hash_size)] 122 | variants: List[str] = [] 123 | with Image.open(str(image_path)) as img: 124 | base = img.convert("RGB") 125 | transforms: List[Image.Image] = [] 126 | for scale in scales: 127 | if scale <= 0: 128 | continue 129 | if scale == 1.0: 130 | scaled = base 131 | else: 132 | w = max(1, int(base.width * scale)) 133 | h = max(1, int(base.height * scale)) 134 | scaled = base.resize((w, h)) 135 | transforms.extend( 136 | [ 137 | scaled, 138 | scaled.rotate(90, expand=True), 139 | scaled.rotate(180, expand=True), 140 | scaled.rotate(270, expand=True), 141 | scaled.transpose(Image.FLIP_LEFT_RIGHT), 142 | scaled.transpose(Image.FLIP_TOP_BOTTOM), 143 | ] 144 | ) 145 | for im in transforms: 146 | variants.append(imagehash.phash(im, hash_size=hash_size).__str__()) 147 | # deduplicate while preserving order 148 | seen: List[str] = [] 149 | for v in variants: 150 | if v not in seen: 151 | seen.append(v) 152 | return seen 153 | 154 | 155 | def _load_embedder(): 156 | global _EMBED_MODEL, _EMBED_TRANSFORM 157 | if not TORCH_AVAILABLE: 158 | return None, None 159 | if _EMBED_MODEL is not None and _EMBED_TRANSFORM is not None: 160 | return _EMBED_MODEL, _EMBED_TRANSFORM 161 | try: 162 | weights = None 163 | try: 164 | weights = models.ResNet18_Weights.DEFAULT # type: ignore[attr-defined] 165 | except Exception: 166 | weights = None 167 | if weights is not None: 168 | model = models.resnet18(weights=weights) 169 | transform = weights.transforms() 170 | else: 171 | model = models.resnet18(pretrained=True) 172 | from torchvision import transforms 173 | 174 | transform = transforms.Compose( 175 | [ 176 | transforms.Resize(256), 177 | transforms.CenterCrop(224), 178 | transforms.ToTensor(), 179 | transforms.Normalize( 180 | mean=[0.485, 0.456, 0.406], 181 | std=[0.229, 0.224, 0.225], 182 | ), 183 | ] 184 | ) 185 | model.fc = torch.nn.Identity() 186 | model.eval() 187 | model.to("cpu") 188 | _EMBED_MODEL = model 189 | _EMBED_TRANSFORM = transform 190 | except Exception: 191 | _EMBED_MODEL = None 192 | _EMBED_TRANSFORM = None 193 | return _EMBED_MODEL, _EMBED_TRANSFORM 194 | 195 | 196 | def _load_clip_model(): 197 | global _CLIP_MODEL, _CLIP_PREPROCESS 198 | if not CLIP_AVAILABLE or not TORCH_AVAILABLE: 199 | return None, None 200 | if _CLIP_MODEL is not None and _CLIP_PREPROCESS is not None: 201 | return _CLIP_MODEL, _CLIP_PREPROCESS 202 | try: 203 | device = "cpu" 204 | model, preprocess = clip.load("ViT-B/32", device=device) 205 | model.eval() 206 | _CLIP_MODEL = model 207 | _CLIP_PREPROCESS = preprocess 208 | except Exception: 209 | _CLIP_MODEL = None 210 | _CLIP_PREPROCESS = None 211 | return _CLIP_MODEL, _CLIP_PREPROCESS 212 | 213 | 214 | def _fallback_embedding(image_path: Path, size: int = 64) -> Optional[Any]: 215 | if np is None or not PIL_AVAILABLE or Image is None: 216 | return None 217 | try: 218 | img = Image.open(str(image_path)).convert("RGB") 219 | img = img.resize((size, size)) 220 | arr = np.asarray(img, dtype=np.float32) 221 | if arr.size == 0: 222 | return None 223 | arr = arr / 255.0 224 | emb = arr.reshape(-1) 225 | norm = np.linalg.norm(emb) 226 | if norm > 0: 227 | emb = emb / norm 228 | return emb 229 | except Exception: 230 | return None 231 | 232 | 233 | def compute_embedding(image_path: Path) -> Optional[Any]: 234 | """Compute fused embeddings 
(ResNet18 + optional CLIP) for ANN recall.""" 235 | if np is None: 236 | return _fallback_embedding(image_path) 237 | embeddings: List[np.ndarray] = [] 238 | 239 | model, transform = _load_embedder() 240 | if model is not None and transform is not None and Image is not None: 241 | try: 242 | img = Image.open(str(image_path)).convert("RGB") 243 | tensor = transform(img).unsqueeze(0) 244 | with torch.no_grad(): 245 | vec = model(tensor.to("cpu")).squeeze(0).numpy() 246 | norm = np.linalg.norm(vec) 247 | if norm > 0: 248 | vec = vec / norm 249 | embeddings.append(vec.astype("float32")) 250 | except Exception: 251 | pass 252 | 253 | clip_model, clip_preprocess = _load_clip_model() 254 | if clip_model is not None and clip_preprocess is not None: 255 | try: 256 | img = Image.open(str(image_path)).convert("RGB") 257 | tensor = clip_preprocess(img).unsqueeze(0) 258 | with torch.no_grad(): 259 | vec = clip_model.encode_image(tensor.to("cpu")).squeeze(0).cpu().numpy() 260 | norm = np.linalg.norm(vec) 261 | if norm > 0: 262 | vec = vec / norm 263 | embeddings.append(vec.astype("float32")) 264 | except Exception: 265 | pass 266 | 267 | if embeddings: 268 | try: 269 | fused = np.concatenate(embeddings) 270 | norm = np.linalg.norm(fused) 271 | if norm > 0: 272 | fused = fused / norm 273 | return fused.astype("float32") 274 | except Exception: 275 | pass 276 | 277 | return _fallback_embedding(image_path) 278 | 279 | 280 | def compute_tile_hashes( 281 | image_path: Path, 282 | grid: int = DEFAULT_TILE_GRID, 283 | hash_size: int = 8, 284 | scales: Tuple[float, ...] = MULTISCALE_LEVELS, 285 | ) -> List[Dict[str, Any]]: 286 | """Split image into grid x grid tiles across multiple scales and compute pHash per tile.""" 287 | tiles: List[Dict[str, Any]] = [] 288 | if not PIL_AVAILABLE or Image is None or imagehash is None: 289 | ph = compute_phash(image_path, hash_size=hash_size) 290 | tiles.append({"hash": ph, "bbox": (0, 0, 0, 0), "scale": 1.0}) 291 | return tiles 292 | 293 | base = Image.open(str(image_path)).convert("RGB") 294 | w_base, h_base = base.size 295 | for scale in scales: 296 | if scale <= 0: 297 | continue 298 | if scale == 1.0: 299 | img = base 300 | w, h = w_base, h_base 301 | else: 302 | w = max(1, int(w_base * scale)) 303 | h = max(1, int(h_base * scale)) 304 | img = base.resize((w, h)) 305 | if w == 0 or h == 0: 306 | continue 307 | 308 | tile_w = max(1, w // grid) 309 | tile_h = max(1, h // grid) 310 | for yi in range(grid): 311 | for xi in range(grid): 312 | x0 = xi * tile_w 313 | y0 = yi * tile_h 314 | x1 = x0 + tile_w if xi < grid - 1 else w 315 | y1 = y0 + tile_h if yi < grid - 1 else h 316 | crop = img.crop((x0, y0, x1, y1)) 317 | ph = imagehash.phash(crop, hash_size=hash_size) 318 | inv = 1.0 / scale if scale != 0 else 1.0 319 | bbox = ( 320 | int(x0 * inv), 321 | int(y0 * inv), 322 | int(x1 * inv), 323 | int(y1 * inv), 324 | ) 325 | tiles.append({ 326 | "hash": ph.__str__(), 327 | "bbox": bbox, 328 | "scale": float(scale), 329 | }) 330 | return tiles 331 | 332 | 333 | def compute_orb_descriptors(image_path: Path, max_features: int = 2000) -> Dict: 334 | """Extract ORB keypoints and descriptors using OpenCV. 
335 | 336 | Returns dict {"kps": list of cv2.KeyPoint, "descs": np.ndarray} 337 | """ 338 | if cv2 is None: 339 | # Graceful fallback: return empty descriptors 340 | # 优雅降级:返回空的关键点/描述子 341 | return {"kps": [], "descs": None} 342 | img = cv2.imread(str(image_path)) 343 | if img is None: 344 | raise IOError(f"Unable to read image: {image_path}") 345 | gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) 346 | orb = cv2.ORB_create(nfeatures=max_features) 347 | kps, descs = orb.detectAndCompute(gray, None) 348 | return {"kps": kps, "descs": descs} 349 | 350 | 351 | def compute_features(image_path: Path, orb_max_features: int = 2000, tile_grid: int = DEFAULT_TILE_GRID) -> ImageFeatures: 352 | ph = compute_phash(image_path) 353 | orb = {} 354 | try: 355 | orb = compute_orb_descriptors(image_path, max_features=orb_max_features) 356 | except Exception: 357 | orb = {"kps": [], "descs": None} 358 | size = (0, 0) 359 | embedding = None 360 | if PIL_AVAILABLE and Image is not None: 361 | try: 362 | img = Image.open(str(image_path)) 363 | size = img.size 364 | except Exception: 365 | size = (0, 0) 366 | try: 367 | embedding = compute_embedding(image_path) 368 | except Exception: 369 | embedding = None 370 | try: 371 | tiles = compute_tile_hashes(image_path, grid=tile_grid) 372 | except Exception: 373 | tiles = [] 374 | feats = ImageFeatures(phash=ph, orb=orb, size=size, embedding=embedding) 375 | feats.tiles = tiles 376 | try: 377 | feats._path = str(image_path) 378 | except Exception: 379 | pass 380 | return feats 381 | -------------------------------------------------------------------------------- /duplicate_check/matcher.py: -------------------------------------------------------------------------------- 1 | """Matcher: recall via phash and tile-hash, precise verification via ORB+RANSAC and NCC. 
2 | 3 | Matcher 模块:通过 phash 和 tile-hash 召回候选,使用 ORB+RANSAC 与 NCC 做精排与判定。 4 | """ 5 | from pathlib import Path 6 | from typing import Dict, Any, List, Tuple, Optional 7 | from duplicate_check.features import ImageFeatures 8 | import hashlib 9 | 10 | try: 11 | import numpy as np 12 | except Exception: 13 | np = None 14 | 15 | try: 16 | import cv2 17 | except Exception: 18 | cv2 = None 19 | 20 | try: 21 | import faiss 22 | except Exception: 23 | faiss = None 24 | 25 | 26 | _DB_FEATURE_VARIANT_CACHE: Dict[str, List[Dict[str, Any]]] = {} 27 | 28 | 29 | def hamming_distance_hex(a: str, b: str) -> int: 30 | # imagehash returns hex string; convert to int 31 | # imagehash 返回十六进制字符串;将其转换为整数并计算汉明距离 32 | ai = int(a, 16) 33 | bi = int(b, 16) 34 | x = ai ^ bi 35 | return x.bit_count() 36 | 37 | 38 | def _has_descriptors(variant: Dict[str, Any]) -> bool: 39 | desc = variant.get("descs") 40 | try: 41 | return desc is not None and len(desc) > 0 42 | except Exception: 43 | return False 44 | 45 | 46 | def _count_good_matches(desc1, desc2, ratio: float = 0.75) -> int: 47 | if cv2 is None or np is None: 48 | return 0 49 | if desc1 is None or desc2 is None: 50 | return 0 51 | try: 52 | if len(desc1) == 0 or len(desc2) == 0: 53 | return 0 54 | except Exception: 55 | return 0 56 | dtype1 = getattr(desc1, "dtype", None) 57 | dtype2 = getattr(desc2, "dtype", None) 58 | norm = cv2.NORM_HAMMING 59 | if dtype1 is not None: 60 | if dtype1 == np.float32: 61 | norm = cv2.NORM_L2 62 | elif dtype1 == np.uint8: 63 | norm = cv2.NORM_HAMMING 64 | if dtype1 is not None and dtype2 is not None and dtype1 != dtype2: 65 | try: 66 | desc2 = desc2.astype(dtype1) 67 | except Exception: 68 | pass 69 | bf = cv2.BFMatcher(norm, crossCheck=False) 70 | try: 71 | matches = bf.knnMatch(desc1, desc2, k=2) 72 | except cv2.error: 73 | return 0 74 | good = 0 75 | for pair in matches: 76 | if len(pair) != 2: 77 | continue 78 | m, n = pair 79 | if m.distance < ratio * n.distance: 80 | good += 1 81 | return good 82 | 83 | 84 | def _variant_orientation(name: Optional[str]) -> str: 85 | if not name: 86 | return "" 87 | parts = name.split("_", 1) 88 | return parts[1] if len(parts) == 2 else parts[0] 89 | 90 | 91 | def _filter_inlier_matches(matches: List[Any], mask: Optional[List[int]]) -> List[Any]: 92 | if not matches: 93 | return [] 94 | if not mask: 95 | return matches 96 | return [m for m, keep in zip(matches, mask) if keep] 97 | 98 | 99 | def _limit_matches(matches: List[Any], max_count: int) -> List[Any]: 100 | if max_count <= 0 or not matches: 101 | return matches 102 | if len(matches) <= max_count: 103 | return matches 104 | stride = max(1, len(matches) // max_count) 105 | limited = matches[::stride] 106 | if len(limited) > max_count: 107 | limited = limited[:max_count] 108 | if not limited: 109 | return matches[:max_count] 110 | return limited 111 | 112 | 113 | def _compute_roi_from_matches( 114 | matches: List[Any], 115 | keypoints: List[Any], 116 | image_path: Path, 117 | margin_ratio: float = 0.15, 118 | *, 119 | index_attr: str = "trainIdx", 120 | max_fraction: float = 0.6, 121 | min_size: int = 16, 122 | ) -> Optional[Tuple[int, int, int, int]]: 123 | if cv2 is None or not matches or not keypoints: 124 | return None 125 | img = cv2.imread(str(image_path), cv2.IMREAD_COLOR) 126 | if img is None: 127 | return None 128 | h, w = img.shape[:2] 129 | xs: List[float] = [] 130 | ys: List[float] = [] 131 | for m in matches: 132 | idx = getattr(m, index_attr, None) 133 | if idx is None or idx >= len(keypoints): 134 | continue 135 | pt = 
keypoints[idx].pt 136 | xs.append(float(pt[0])) 137 | ys.append(float(pt[1])) 138 | if len(xs) < 2 or len(ys) < 2: 139 | return None 140 | min_x, max_x = min(xs), max(xs) 141 | min_y, max_y = min(ys), max(ys) 142 | width = max_x - min_x 143 | height = max_y - min_y 144 | if width <= 0 or height <= 0: 145 | return None 146 | margin_x = max(10.0, width * margin_ratio) 147 | margin_y = max(10.0, height * margin_ratio) 148 | x0 = max(0, int(min_x - margin_x)) 149 | y0 = max(0, int(min_y - margin_y)) 150 | x1 = min(w, int(max_x + margin_x)) 151 | y1 = min(h, int(max_y + margin_y)) 152 | roi_w = x1 - x0 153 | roi_h = y1 - y0 154 | if roi_w <= 0 or roi_h <= 0: 155 | return None 156 | max_w = max(min_size, int(w * max_fraction)) 157 | max_h = max(min_size, int(h * max_fraction)) 158 | if roi_w > max_w: 159 | cx = (x0 + x1) / 2.0 160 | half = max_w / 2.0 161 | x0 = max(0, int(round(cx - half))) 162 | x1 = min(w, int(round(cx + half))) 163 | if roi_h > max_h: 164 | cy = (y0 + y1) / 2.0 165 | half = max_h / 2.0 166 | y0 = max(0, int(round(cy - half))) 167 | y1 = min(h, int(round(cy + half))) 168 | if x1 - x0 <= 0 or y1 - y0 <= 0: 169 | return None 170 | return (x0, y0, x1, y1) 171 | 172 | 173 | def _compute_feature_variants_for_path( 174 | path: Path, 175 | cache: Dict[str, List[Dict[str, Any]]] | None = None, 176 | max_features: int = 2000, 177 | ) -> List[Dict[str, Any]]: 178 | key = str(path) 179 | if cache is not None and key in cache: 180 | return cache[key] 181 | 182 | variants: List[Dict[str, Any]] = [] 183 | if cv2 is None: 184 | variants.append({"name": "orb_rot0", "algo": "orb", "kps": [], "descs": None}) 185 | variants.append({"name": "akaze_rot0", "algo": "akaze", "kps": [], "descs": None}) 186 | else: 187 | img = cv2.imread(str(path)) 188 | if img is None: 189 | variants.append({"name": "orb_rot0", "algo": "orb", "kps": [], "descs": None}) 190 | variants.append({"name": "akaze_rot0", "algo": "akaze", "kps": [], "descs": None}) 191 | else: 192 | detectors: List[Tuple[str, Any]] = [] 193 | try: 194 | detectors.append(("orb", cv2.ORB_create(nfeatures=max_features))) 195 | except Exception: 196 | detectors.append(("orb", None)) 197 | try: 198 | detectors.append(("akaze", cv2.AKAZE_create())) 199 | except Exception: 200 | detectors.append(("akaze", None)) 201 | 202 | transforms = [ 203 | ("rot0", img), 204 | ("rot90", cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE)), 205 | ("rot180", cv2.rotate(img, cv2.ROTATE_180)), 206 | ("rot270", cv2.rotate(img, cv2.ROTATE_90_COUNTERCLOCKWISE)), 207 | ("flip0", cv2.flip(img, 1)), 208 | ] 209 | 210 | for algo, detector in detectors: 211 | seen = set() 212 | for name, mat in transforms: 213 | variant_name = f"{algo}_{name}" 214 | if mat is None or variant_name in seen: 215 | continue 216 | seen.add(variant_name) 217 | if detector is None: 218 | variants.append({"name": variant_name, "algo": algo, "kps": [], "descs": None}) 219 | continue 220 | try: 221 | gray = cv2.cvtColor(mat, cv2.COLOR_BGR2GRAY) 222 | except Exception: 223 | variants.append({"name": variant_name, "algo": algo, "kps": [], "descs": None}) 224 | continue 225 | kps, descs = detector.detectAndCompute(gray, None) 226 | variants.append({"name": variant_name, "algo": algo, "kps": kps or [], "descs": descs}) 227 | 228 | if cache is not None: 229 | cache[key] = variants 230 | return variants 231 | 232 | 233 | def _get_db_feature_variants(path: Path) -> List[Dict[str, Any]]: 234 | return _compute_feature_variants_for_path(path, _DB_FEATURE_VARIANT_CACHE) 235 | 236 | 237 | def _best_orb_match(q_variants: 
List[Dict[str, Any]], db_variants: List[Dict[str, Any]]) -> Tuple[int, int, Optional[Tuple[str, str]]]: 238 | best_good = 0 239 | best_len = 1 240 | best_pair: Optional[Tuple[str, str]] = None 241 | for q_var in q_variants: 242 | if not _has_descriptors(q_var): 243 | continue 244 | q_desc = q_var.get("descs") 245 | q_len = len(q_desc) 246 | for d_var in db_variants: 247 | if not _has_descriptors(d_var): 248 | continue 249 | good = _count_good_matches(q_desc, d_var.get("descs")) 250 | if good > best_good: 251 | best_good = good 252 | best_len = max(1, q_len) 253 | best_pair = (q_var.get("name"), d_var.get("name")) 254 | return best_good, best_len, best_pair 255 | 256 | def recall_candidates( 257 | features: ImageFeatures, 258 | index: Dict, 259 | topk: int = 50, 260 | phash_thresh: int = 10, 261 | tile_match_count: int = 3, 262 | vector_score_thresh: float = 0.0, 263 | ) -> List[Dict[str, Any]]: 264 | """Recall candidates by global pHash and tile-hash. Returns list of dicts with scores. 265 | 266 | 通过全局 pHash 和块哈希召回候选,返回包含分数的字典列表。 267 | """ 268 | ph = features.phash 269 | hits: Dict[str, Dict[str, Any]] = {} 270 | # global phash exact-ish match 271 | for phash_key, ids in index.get("by_phash", {}).items(): 272 | d = hamming_distance_hex(ph, phash_key) 273 | if d <= phash_thresh: 274 | for i in ids: 275 | hits.setdefault(i, {"score": 0.0, "reason": []}) 276 | hits[i]["score"] += max(0, (phash_thresh - d) / phash_thresh) 277 | hits[i]["reason"].append(("phash", d)) 278 | 279 | # tile recall: compute query tile hashes (if possible) and count matches 280 | q_tiles = getattr(features, "tiles", None) 281 | if q_tiles is None: 282 | try: 283 | from duplicate_check.features import compute_tile_hashes, DEFAULT_TILE_GRID 284 | 285 | if hasattr(features, "_path") and features._path: 286 | q_tiles = compute_tile_hashes(Path(features._path), grid=DEFAULT_TILE_GRID) 287 | features.tiles = q_tiles 288 | except Exception: 289 | q_tiles = None 290 | 291 | if q_tiles: 292 | tile_counts: Dict[str, int] = {} 293 | for tile in q_tiles: 294 | th = tile.get("hash") 295 | if not th: 296 | continue 297 | for entry in index.get("by_tile", {}).get(th, []): 298 | img_id = entry.get("img_id") 299 | if img_id is None: 300 | continue 301 | tile_counts.setdefault(img_id, 0) 302 | tile_counts[img_id] += 1 303 | for img_id, cnt in tile_counts.items(): 304 | entry = hits.setdefault(img_id, {"score": 0.0, "reason": []}) 305 | entry["score"] += cnt / (len(q_tiles) or 1) 306 | entry.setdefault("reason", []).append(("tiles", cnt)) 307 | 308 | # Vector-based recall via FAISS (optional) 309 | vector_index = index.get("vector") if isinstance(index, dict) else None 310 | if vector_index and np is not None and faiss is not None: 311 | q_emb = getattr(features, "embedding", None) 312 | if q_emb is None and hasattr(features, "_path"): 313 | try: 314 | from duplicate_check.features import compute_embedding 315 | 316 | q_emb = compute_embedding(Path(features._path)) 317 | except Exception: 318 | q_emb = None 319 | try: 320 | if q_emb is not None: 321 | vec = np.asarray(q_emb, dtype=np.float32) 322 | if vec.ndim == 1 and vec.size > 0: 323 | norm = np.linalg.norm(vec) 324 | if norm > 0: 325 | vec = vec / norm 326 | vec = vec.reshape(1, -1) 327 | index_obj = vector_index.get("index") 328 | ids = vector_index.get("ids", []) 329 | metric = vector_index.get("metric", "ip") 330 | if index_obj is not None and len(ids): 331 | topn = min(max(topk * 2, 32), len(ids)) 332 | D, I = index_obj.search(vec, topn) 333 | for dist, idx_id in zip(D[0], 

    # Vector-based recall via FAISS (optional)
    vector_index = index.get("vector") if isinstance(index, dict) else None
    if vector_index and np is not None and faiss is not None:
        q_emb = getattr(features, "embedding", None)
        if q_emb is None and hasattr(features, "_path"):
            try:
                from duplicate_check.features import compute_embedding

                q_emb = compute_embedding(Path(features._path))
            except Exception:
                q_emb = None
        try:
            if q_emb is not None:
                vec = np.asarray(q_emb, dtype=np.float32)
                if vec.ndim == 1 and vec.size > 0:
                    norm = np.linalg.norm(vec)
                    if norm > 0:
                        vec = vec / norm
                    vec = vec.reshape(1, -1)
                    index_obj = vector_index.get("index")
                    ids = vector_index.get("ids", [])
                    metric = vector_index.get("metric", "ip")
                    if index_obj is not None and len(ids):
                        topn = min(max(topk * 2, 32), len(ids))
                        D, I = index_obj.search(vec, topn)
                        for dist, idx_id in zip(D[0], I[0]):
                            if idx_id < 0 or idx_id >= len(ids):
                                continue
                            db_id = ids[idx_id]
                            if metric == "ip":
                                score = float(dist)
                            else:
                                score = float(1.0 / (1.0 + dist))
                            if score <= 0:
                                continue
                            if score >= vector_score_thresh:
                                entry = hits.setdefault(db_id, {"score": 0.0, "reason": []})
                                entry["score"] += score
                                entry.setdefault("reason", []).append(("vector", score))
        except Exception:
            pass

    # Orientation-aware ORB scoring
    query_path = None
    if hasattr(features, "_path") and features._path:
        try:
            query_path = Path(features._path)
        except Exception:
            query_path = None

    q_variants: List[Dict[str, Any]] = []
    if query_path is not None:
        q_variants = getattr(features, "_feature_variants", None) or []
        if not q_variants:
            try:
                q_variants = _compute_feature_variants_for_path(query_path)
                features._feature_variants = q_variants
            except Exception:
                q_variants = []

    has_query_orb = any(_has_descriptors(v) for v in q_variants)

    if has_query_orb:
        for img_id in list(hits.keys()):
            rec = index.get("by_id", {}).get(img_id)
            if rec is None:
                continue
            db_variants = _get_db_feature_variants(Path(rec["path"]))
            best_good, best_len, best_pair = _best_orb_match(q_variants, db_variants)
            if best_good <= 0 or best_pair is None:
                continue
            entry = hits.setdefault(img_id, {"score": 0.0, "reason": []})
            entry["score"] += min(1.0, best_good / max(1, best_len))
            entry.setdefault("reason", []).append(("orb", best_good))
            entry["best_orient"] = best_pair

    # Fallback: add strong ORB matches not yet recalled
    if len(hits) < topk:
        ORB_FALLBACK_MIN = 25
        for img_id, rec in index.get("by_id", {}).items():
            if img_id in hits:
                continue
            db_variants = _get_db_feature_variants(Path(rec["path"]))
            best_good, best_len, best_pair = _best_orb_match(q_variants, db_variants)
            if best_good < ORB_FALLBACK_MIN or best_pair is None:
                continue
            entry = hits.setdefault(img_id, {"score": 0.0, "reason": []})
            entry["score"] += min(1.0, best_good / max(1, best_len))
            entry.setdefault("reason", []).append(("orb", best_good))
            entry["best_orient"] = best_pair
            if len(hits) >= topk:
                break

    # Convert hits to sorted list
    out = []
    for img_id, v in hits.items():
        out.append(
            {
                "db_id": img_id,
                "score": v.get("score", 0.0),
                "reason": v.get("reason", []),
                "orientation": v.get("best_orient"),
            }
        )
    out.sort(key=lambda x: x["score"], reverse=True)
    return out[:topk]
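
# Typical flow (sketch; the exact wiring lives in the CLI/entry modules and the helper
# name below is an assumption): recall first, then verify only the shortlist.
#
#     cands = recall_candidates(feats, index, topk=50)            # cheap: hashes/vectors
#     rows = rerank_and_verify(Path("new.jpg"), cands, index)     # ORB + RANSAC + NCC
#
# Here "feats" stands for an ImageFeatures object produced elsewhere (e.g. by
# duplicate_check.features); it is not constructed in this module.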


def _orb_ransac_inliers(kps1, desc1, kps2, desc2, ratio=0.75, ransac_thresh=5.0):
    """Match descriptors using BFMatcher and compute RANSAC homography inliers.

    Returns (inlier_count, inlier_ratio, matches_mask, H, good_matches).
    """
    if cv2 is None or np is None or desc1 is None or desc2 is None:
        return 0, 0.0, None, None, []
    try:
        if len(desc1) == 0 or len(desc2) == 0:
            return 0, 0.0, None, None, []
    except Exception:
        return 0, 0.0, None, None, []
    dtype1 = getattr(desc1, "dtype", None)
    dtype2 = getattr(desc2, "dtype", None)
    if dtype1 is not None and dtype2 is not None and dtype1 != dtype2:
        try:
            desc2 = desc2.astype(dtype1)
            dtype2 = dtype1
        except Exception:
            pass
    if dtype1 is None or dtype2 is None:
        return 0, 0.0, None, None, []
    if desc1.shape[1] != desc2.shape[1]:
        return 0, 0.0, None, None, []
    norm = cv2.NORM_HAMMING if dtype1 == np.uint8 else cv2.NORM_L2
    bf = cv2.BFMatcher(norm, crossCheck=False)
    try:
        matches = bf.knnMatch(desc1, desc2, k=2)
    except cv2.error:
        return 0, 0.0, None, None, []
    good = []
    for m_n in matches:
        if len(m_n) != 2:
            continue
        m, n = m_n
        if m.distance < ratio * n.distance:
            good.append(m)
    if len(good) < 4:
        return 0, 0.0, None, None, good
    src_pts = np.float32([kps1[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
    dst_pts = np.float32([kps2[m.trainIdx].pt for m in good]).reshape(-1, 1, 2)
    method = getattr(cv2, "USAC_MAGSAC", cv2.RANSAC)
    try:
        H, mask = cv2.findHomography(src_pts, dst_pts, method, ransac_thresh)
    except Exception:
        H, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, ransac_thresh)
    if mask is None and method != cv2.RANSAC:
        try:
            H, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, ransac_thresh)
        except Exception:
            mask = None
    if mask is None:
        return 0, 0.0, None, H, good
    inliers = int(mask.sum())
    inlier_ratio = inliers / max(1, len(good))
    return inliers, inlier_ratio, mask.ravel().tolist(), H, good
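
# Threshold walk-through (numbers are illustrative): with ratio=0.75 a knn match is kept
# only if its best distance is below 0.75x the second-best. If 40 matches survive the
# ratio test and RANSAC keeps 30 as homography inliers, inlier_ratio = 30 / 40 = 0.75,
# which clears the rerank_and_verify defaults (orb_inliers_thresh=25,
# orb_inlier_ratio=0.25), so the pair is labelled at least "partial_duplicate".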


def _ncc_peak(
    img_query_path: Path,
    db_path: Path,
    bbox_query: Tuple[int, int, int, int],
    bbox_db: Tuple[int, int, int, int],
    *,
    min_size: int = 16,
) -> float:
    """Compute the normalized cross-correlation peak between the query and db patches.

    For simplicity, both images are loaded via OpenCV, both bboxes are cropped, the query
    patch is resized to the db patch size, and cv2.matchTemplate is run with
    TM_CCOEFF_NORMED.
    """
    if cv2 is None or np is None:
        return 0.0
    q = cv2.imread(str(img_query_path), cv2.IMREAD_COLOR)
    d = cv2.imread(str(db_path), cv2.IMREAD_COLOR)
    if q is None or d is None:
        return 0.0
    qx0, qy0, qx1, qy1 = bbox_query
    dx0, dy0, dx1, dy1 = bbox_db
    q_patch = q[qy0:qy1, qx0:qx1]
    d_patch = d[dy0:dy1, dx0:dx1]
    if q_patch.size == 0 or d_patch.size == 0:
        return 0.0
    if q_patch.shape[0] < min_size or q_patch.shape[1] < min_size:
        return 0.0
    if d_patch.shape[0] < min_size or d_patch.shape[1] < min_size:
        return 0.0
    # Resize query ROI to the database ROI size for comparison
    q_resized = cv2.resize(q_patch, (d_patch.shape[1], d_patch.shape[0]))
    qf = cv2.cvtColor(q_resized, cv2.COLOR_BGR2GRAY)
    pf = cv2.cvtColor(d_patch, cv2.COLOR_BGR2GRAY)
    res = cv2.matchTemplate(pf, qf, cv2.TM_CCOEFF_NORMED)
    return float(res.max()) if res.size else 0.0
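
# Decision sketch: TM_CCOEFF_NORMED peaks lie in [-1, 1]; a peak >= ncc_thresh (0.92 by
# default) upgrades a verified pair from "partial_duplicate" to "exact_patch", and
# candidates recalled only by pHash with no usable descriptors fall back to
# "phash_duplicate". An output row (values here are illustrative) looks like:
#
#     {"new_image": "new_5_rot.jpg", "matched_image": "base_5",
#      "final_label": "partial_duplicate", "score": 0.93, "inliers": 57,
#      "inlier_ratio": 0.43, "ncc_peak": 0.0, "evidence_img_path": "",
#      "match_pairs": [...], "orientation": "orb_rot270->orb_rot0"}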


def rerank_and_verify(
    input_path: Path,
    candidates: List[Dict[str, Any]],
    index: Dict,
    orb_inliers_thresh: int = 25,
    orb_inlier_ratio: float = 0.25,
    ncc_thresh: float = 0.92,
    roi_margin_ratio: float = 0.12,
    max_roi_matches: int = 60,
) -> List[Dict[str, Any]]:
    """For each candidate, run ORB matching + RANSAC and NCC to generate final decision rows."""
    rows: List[Dict[str, Any]] = []

    try:
        q_variants = _compute_feature_variants_for_path(input_path)
    except Exception:
        q_variants = []
    q_map = {var.get("name"): var for var in q_variants}
    has_query_orb = any(_has_descriptors(v) for v in q_variants)

    for c in candidates:
        db_id = c.get("db_id")
        db_rec = index.get("by_id", {}).get(db_id) if db_id else None
        if db_rec is None:
            continue
        db_path = Path(db_rec["path"])
        db_variants = _get_db_feature_variants(db_path)
        d_map = {var.get("name"): var for var in db_variants}
        has_db_orb = any(_has_descriptors(v) for v in db_variants)

        orientation_hint = c.get("orientation")
        pair_order: List[Tuple[str, str]] = []
        if orientation_hint and isinstance(orientation_hint, (tuple, list)) and len(orientation_hint) == 2:
            q_name, d_name = orientation_hint
            if q_name in q_map and d_name in d_map:
                pair_order.append((q_name, d_name))

        for q_var in q_variants:
            for d_var in db_variants:
                pair = (q_var.get("name"), d_var.get("name"))
                if pair not in pair_order:
                    pair_order.append(pair)

        best = None
        for q_name, d_name in pair_order:
            q_var = q_map.get(q_name)
            d_var = d_map.get(d_name)
            if not q_var or not d_var:
                continue
            if not _has_descriptors(q_var) or not _has_descriptors(d_var):
                continue
            inliers, inlier_ratio, mask, H, good_matches = _orb_ransac_inliers(
                q_var["kps"],
                q_var["descs"],
                d_var["kps"],
                d_var["descs"],
            )
            if best is None or inliers > best["inliers"]:
                best = {
                    "q": q_var,
                    "d": d_var,
                    "q_name": q_name,
                    "d_name": d_name,
                    "algo_q": q_var.get("algo", "orb"),
                    "algo_d": d_var.get("algo", "orb"),
                    "inliers": inliers,
                    "inlier_ratio": inlier_ratio,
                    "matches": good_matches,
                    "mask": mask,
                }

        has_descriptors = has_query_orb and has_db_orb

        if best is None:
            if not has_descriptors:
                reasons = {r[0] for r in c.get("reason", [])}
                if "phash" in reasons:
                    rows.append(
                        {
                            "new_image": str(input_path.name),
                            "matched_image": db_id,
                            "final_label": "phash_duplicate",
                            "score": float(max(c.get("score", 0.5), 0.5)),
                            "inliers": 0,
                            "inlier_ratio": 0.0,
                            "ncc_peak": 0.0,
                            "evidence_img_path": "",
                            "match_pairs": [],
                            "orientation": "",
                        }
                    )
            continue

        label = "unique"
        score = c.get("score", 0.0)
        ncc_peak = 0.0
        evidence = ""

        if (
            best["inliers"] >= orb_inliers_thresh
            and best["inlier_ratio"] >= orb_inlier_ratio
        ):
            label = "partial_duplicate"
            score = max(score, min(0.99, 0.5 + best["inlier_ratio"]))
            matches_for_roi = _filter_inlier_matches(best.get("matches") or [], best.get("mask"))
            matches_for_roi = _limit_matches(matches_for_roi, max_roi_matches)
            if (
                matches_for_roi
                and len(matches_for_roi) >= 4
                and best.get("algo_q") == "orb"
                and best.get("algo_d") == "orb"
                and _variant_orientation(best.get("q_name")) == "rot0"
                and _variant_orientation(best.get("d_name")) == "rot0"
            ):
                q_bbox = _compute_roi_from_matches(
                    matches_for_roi,
                    best["q"]["kps"],
                    input_path,
                    margin_ratio=roi_margin_ratio,
                    index_attr="queryIdx",
                )
                d_bbox = _compute_roi_from_matches(
                    matches_for_roi,
                    best["d"]["kps"],
                    db_path,
                    margin_ratio=roi_margin_ratio,
                    index_attr="trainIdx",
                )
                if q_bbox and d_bbox:
                    try:
                        ncc_peak = _ncc_peak(input_path, db_path, q_bbox, d_bbox)
                    except Exception:
                        ncc_peak = 0.0
                    if ncc_peak >= ncc_thresh:
                        label = "exact_patch"
                        score = 0.99
        else:
            continue

        match_pairs: List[Tuple[Tuple[float, float], Tuple[float, float]]] = []
        try:
            matches = best.get("matches") or []
            q_kps = best["q"]["kps"]
            d_kps = best["d"]["kps"]
            if matches and q_kps and d_kps:
                for m in matches:
                    pt_q = q_kps[m.queryIdx].pt
                    pt_d = d_kps[m.trainIdx].pt
                    match_pairs.append(((float(pt_q[0]), float(pt_q[1])), (float(pt_d[0]), float(pt_d[1]))))
        except Exception:
            match_pairs = []

        rows.append(
            {
                "new_image": str(input_path.name),
                "matched_image": db_id,
                "final_label": label,
                "score": float(score),
                "inliers": int(best["inliers"]),
                "inlier_ratio": float(best["inlier_ratio"]),
                "ncc_peak": float(ncc_peak),
                "evidence_img_path": evidence,
                "match_pairs": match_pairs,
                "orientation": f"{best['q_name']}->{best['d_name']}",
            }
        )

    return rows

--------------------------------------------------------------------------------