├── index.db ├── data ├── synth_db │ ├── base_1.jpg │ ├── base_2.jpg │ ├── base_3.jpg │ ├── base_4.jpg │ └── base_5.jpg ├── synth_new │ ├── new_1_copy.jpg │ ├── new_1_crop.jpg │ ├── new_1_flip.jpg │ ├── new_1_ps.jpg │ ├── new_1_rot.jpg │ ├── new_2_copy.jpg │ ├── new_2_crop.jpg │ ├── new_2_flip.jpg │ ├── new_2_ps.jpg │ ├── new_2_rot.jpg │ ├── new_3_copy.jpg │ ├── new_3_crop.jpg │ ├── new_3_flip.jpg │ ├── new_3_ps.jpg │ ├── new_3_rot.jpg │ ├── new_4_copy.jpg │ ├── new_4_crop.jpg │ ├── new_4_flip.jpg │ ├── new_4_ps.jpg │ ├── new_4_rot.jpg │ ├── new_5_copy.jpg │ ├── new_5_crop.jpg │ ├── new_5_flip.jpg │ ├── new_5_ps.jpg │ ├── new_5_rot.jpg │ ├── new_1_bright.jpg │ ├── new_1_jpeg30.jpg │ ├── new_2_bright.jpg │ ├── new_2_jpeg30.jpg │ ├── new_3_bright.jpg │ ├── new_3_jpeg30.jpg │ ├── new_4_bright.jpg │ ├── new_4_jpeg30.jpg │ ├── new_5_bright.jpg │ ├── new_5_jpeg30.jpg │ ├── new_unique_1.jpg │ ├── new_unique_2.jpg │ ├── new_unique_3.jpg │ ├── new_unique_4.jpg │ └── new_unique_5.jpg └── synth_labels.csv ├── reports ├── new_1_copy__VS__base_1.jpg ├── new_1_crop__VS__base_1.jpg ├── new_1_flip__VS__base_1.jpg ├── new_1_ps__VS__base_1.jpg ├── new_2_copy__VS__base_2.jpg ├── new_2_copy__VS__base_3.jpg ├── new_2_crop__VS__base_2.jpg ├── new_2_flip__VS__base_2.jpg ├── new_2_flip__VS__base_3.jpg ├── new_2_ps__VS__base_2.jpg ├── new_3_copy__VS__base_2.jpg ├── new_3_copy__VS__base_3.jpg ├── new_3_crop__VS__base_3.jpg ├── new_3_flip__VS__base_2.jpg ├── new_3_flip__VS__base_3.jpg ├── new_3_ps__VS__base_1.jpg ├── new_3_ps__VS__base_2.jpg ├── new_3_ps__VS__base_3.jpg ├── new_4_copy__VS__base_4.jpg ├── new_4_crop__VS__base_4.jpg ├── new_4_flip__VS__base_4.jpg ├── new_4_ps__VS__base_4.jpg ├── new_5_copy__VS__base_5.jpg ├── new_5_crop__VS__base_5.jpg ├── new_5_flip__VS__base_5.jpg ├── new_5_ps__VS__base_5.jpg ├── new_5_rot__VS__base_5.jpg ├── new_1_bright__VS__base_1.jpg ├── new_2_bright__VS__base_2.jpg ├── new_2_jpeg30__VS__base_2.jpg ├── new_3_bright__VS__base_3.jpg ├── 
new_3_jpeg30__VS__base_3.jpg ├── new_4_bright__VS__base_4.jpg ├── new_4_jpeg30__VS__base_4.jpg ├── new_5_bright__VS__base_5.jpg ├── new_5_jpeg30__VS__base_5.jpg ├── tune_out │ └── tune_results.csv └── dup_report.csv ├── __pycache__ └── duplicate_check.cpython-313.pyc ├── tools ├── __pycache__ │ └── tune_thresholds.cpython-312.pyc ├── generate_synthetic.py ├── tune_thresholds.py └── verify_synthetic.py ├── duplicate_check ├── __pycache__ │ ├── indexer.cpython-312.pyc │ ├── matcher.cpython-312.pyc │ ├── report.cpython-312.pyc │ ├── __init__.cpython-312.pyc │ └── features.cpython-312.pyc ├── __init__.py ├── report.py ├── indexer.py ├── features.py └── matcher.py ├── requirements.txt ├── config.yaml ├── tests ├── test_matcher.py └── test_features.py ├── LICENSE ├── duplicate_check.py ├── run_smoke.py ├── dupcheck_cli.py ├── README_zh.md ├── README_en.md └── README.md /index.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/index.db -------------------------------------------------------------------------------- /data/synth_db/base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_db/base_1.jpg -------------------------------------------------------------------------------- /data/synth_db/base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_db/base_2.jpg -------------------------------------------------------------------------------- /data/synth_db/base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_db/base_3.jpg -------------------------------------------------------------------------------- /data/synth_db/base_4.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_db/base_4.jpg -------------------------------------------------------------------------------- /data/synth_db/base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_db/base_5.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_copy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_copy.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_crop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_crop.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_flip.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_flip.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_ps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_ps.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_rot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_rot.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_copy.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_copy.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_crop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_crop.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_flip.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_flip.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_ps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_ps.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_rot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_rot.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_copy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_copy.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_crop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_crop.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_flip.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_flip.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_ps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_ps.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_rot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_rot.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_copy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_copy.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_crop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_crop.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_flip.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_flip.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_ps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_ps.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_rot.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_rot.jpg -------------------------------------------------------------------------------- /data/synth_new/new_5_copy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_copy.jpg -------------------------------------------------------------------------------- /data/synth_new/new_5_crop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_crop.jpg -------------------------------------------------------------------------------- /data/synth_new/new_5_flip.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_flip.jpg -------------------------------------------------------------------------------- /data/synth_new/new_5_ps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_ps.jpg -------------------------------------------------------------------------------- /data/synth_new/new_5_rot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_rot.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_bright.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_bright.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_jpeg30.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_jpeg30.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_bright.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_bright.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_jpeg30.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_jpeg30.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_bright.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_bright.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_jpeg30.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_jpeg30.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_bright.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_bright.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_jpeg30.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_jpeg30.jpg -------------------------------------------------------------------------------- 
/data/synth_new/new_5_bright.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_bright.jpg -------------------------------------------------------------------------------- /data/synth_new/new_5_jpeg30.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_jpeg30.jpg -------------------------------------------------------------------------------- /data/synth_new/new_unique_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_unique_1.jpg -------------------------------------------------------------------------------- /data/synth_new/new_unique_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_unique_2.jpg -------------------------------------------------------------------------------- /data/synth_new/new_unique_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_unique_3.jpg -------------------------------------------------------------------------------- /data/synth_new/new_unique_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_unique_4.jpg -------------------------------------------------------------------------------- /data/synth_new/new_unique_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_unique_5.jpg 
-------------------------------------------------------------------------------- /reports/new_1_copy__VS__base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_1_copy__VS__base_1.jpg -------------------------------------------------------------------------------- /reports/new_1_crop__VS__base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_1_crop__VS__base_1.jpg -------------------------------------------------------------------------------- /reports/new_1_flip__VS__base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_1_flip__VS__base_1.jpg -------------------------------------------------------------------------------- /reports/new_1_ps__VS__base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_1_ps__VS__base_1.jpg -------------------------------------------------------------------------------- /reports/new_2_copy__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_copy__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_2_copy__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_copy__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_2_crop__VS__base_2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_crop__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_2_flip__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_flip__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_2_flip__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_flip__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_2_ps__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_ps__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_3_copy__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_copy__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_3_copy__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_copy__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_3_crop__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_crop__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_3_flip__VS__base_2.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_flip__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_3_flip__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_flip__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_3_ps__VS__base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_ps__VS__base_1.jpg -------------------------------------------------------------------------------- /reports/new_3_ps__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_ps__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_3_ps__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_ps__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_4_copy__VS__base_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_4_copy__VS__base_4.jpg -------------------------------------------------------------------------------- /reports/new_4_crop__VS__base_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_4_crop__VS__base_4.jpg 
-------------------------------------------------------------------------------- /reports/new_4_flip__VS__base_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_4_flip__VS__base_4.jpg -------------------------------------------------------------------------------- /reports/new_4_ps__VS__base_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_4_ps__VS__base_4.jpg -------------------------------------------------------------------------------- /reports/new_5_copy__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_copy__VS__base_5.jpg -------------------------------------------------------------------------------- /reports/new_5_crop__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_crop__VS__base_5.jpg -------------------------------------------------------------------------------- /reports/new_5_flip__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_flip__VS__base_5.jpg -------------------------------------------------------------------------------- /reports/new_5_ps__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_ps__VS__base_5.jpg -------------------------------------------------------------------------------- /reports/new_5_rot__VS__base_5.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_rot__VS__base_5.jpg -------------------------------------------------------------------------------- /reports/new_1_bright__VS__base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_1_bright__VS__base_1.jpg -------------------------------------------------------------------------------- /reports/new_2_bright__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_bright__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_2_jpeg30__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_jpeg30__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_3_bright__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_bright__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_3_jpeg30__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_jpeg30__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_4_bright__VS__base_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_4_bright__VS__base_4.jpg -------------------------------------------------------------------------------- /reports/new_4_jpeg30__VS__base_4.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_4_jpeg30__VS__base_4.jpg -------------------------------------------------------------------------------- /reports/new_5_bright__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_bright__VS__base_5.jpg -------------------------------------------------------------------------------- /reports/new_5_jpeg30__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_jpeg30__VS__base_5.jpg -------------------------------------------------------------------------------- /__pycache__/duplicate_check.cpython-313.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/__pycache__/duplicate_check.cpython-313.pyc -------------------------------------------------------------------------------- /tools/__pycache__/tune_thresholds.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/tools/__pycache__/tune_thresholds.cpython-312.pyc -------------------------------------------------------------------------------- /duplicate_check/__pycache__/indexer.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/duplicate_check/__pycache__/indexer.cpython-312.pyc -------------------------------------------------------------------------------- /duplicate_check/__pycache__/matcher.cpython-312.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/summerness/dupcheck/HEAD/duplicate_check/__pycache__/matcher.cpython-312.pyc -------------------------------------------------------------------------------- /duplicate_check/__pycache__/report.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/duplicate_check/__pycache__/report.cpython-312.pyc -------------------------------------------------------------------------------- /duplicate_check/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/duplicate_check/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /duplicate_check/__pycache__/features.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/duplicate_check/__pycache__/features.cpython-312.pyc -------------------------------------------------------------------------------- /duplicate_check/__init__.py: -------------------------------------------------------------------------------- 1 | """duplicate_check package init for the skeleton project.""" 2 | from . 
"""Tests for the descriptor-matching helper in duplicate_check.matcher."""
import numpy as np
import pytest

from duplicate_check import matcher


@pytest.mark.parametrize("dtype", [np.uint8, np.float32], ids=["uint8", "float32"])
def test_count_good_matches_dtype_handling(dtype):
    """_count_good_matches should accept either descriptor dtype and return a non-negative int."""
    pytest.importorskip("cv2")
    gen = np.random.default_rng(42)
    scale = 255 if dtype == np.uint8 else 1
    query_desc = (gen.random((32, 64)) * scale).astype(dtype)
    db_desc = query_desc.copy().astype(dtype)
    count = matcher._count_good_matches(query_desc, db_desc)
    assert isinstance(count, int)
    assert count >= 0


def test_count_good_matches_mixed_dtype():
    """_count_good_matches should tolerate a uint8/float32 dtype mismatch between inputs."""
    pytest.importorskip("cv2")
    gen = np.random.default_rng(7)
    query_desc = (gen.random((16, 32)) * 255).astype(np.uint8)
    db_desc = query_desc.astype(np.float32) / 255.0
    count = matcher._count_good_matches(query_desc, db_desc)
    assert isinstance(count, int)
    assert count >= 0
#!/usr/bin/env python3
"""Entrypoint for the duplicate image checking skeleton.

This script wires the components together and provides a simple CLI.
"""
import argparse
from pathlib import Path


def parse_args():
    """Build and parse the CLI arguments for the skeleton pipeline."""
    parser = argparse.ArgumentParser(description="Duplicate image check skeleton")
    parser.add_argument("--db_dir", required=True, help="Path to image database directory")
    parser.add_argument("--input_dir", required=True, help="Path to new images to check")
    parser.add_argument("--out_dir", required=True, help="Output reports directory")
    parser.add_argument("--topk", type=int, default=50)
    return parser.parse_args()


def main():
    """Index the gallery, check every input image, and write dup_report.csv."""
    args = parse_args()
    db_dir = Path(args.db_dir)
    input_dir = Path(args.input_dir)
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Lazy imports to keep CLI responsive if modules missing
    from duplicate_check import indexer, features, matcher, report

    print(f"Indexing DB: {db_dir}")
    index = indexer.build_index(db_dir)

    print(f"Processing inputs from: {input_dir}")
    all_rows = []
    for img_path in sorted(input_dir.iterdir()):
        # Skip sub-directories and other non-file entries.
        if not img_path.is_file():
            continue
        print(f"Checking {img_path.name}...")
        feats = features.compute_features(img_path)
        recalled = matcher.recall_candidates(feats, index, topk=args.topk)
        verified = matcher.rerank_and_verify(img_path, recalled, index)
        all_rows.extend(verified)

    csv_path = out_dir / "dup_report.csv"
    report.write_csv(all_rows, csv_path)
    print(f"Done. Report: {csv_path}")


if __name__ == "__main__":
    main()
"""Tests for the feature-extraction helpers in duplicate_check.features."""
import numpy as np
import pytest
from pathlib import Path
from PIL import Image

from duplicate_check import features


@pytest.fixture()
def sample_image(tmp_path: Path) -> Path:
    """Write a deterministic 96x80 gradient PNG and return its path."""
    out = tmp_path / "sample.png"
    canvas = Image.new("RGB", (96, 80), color=(128, 128, 128))
    for col in range(96):
        for row in range(80):
            canvas.putpixel((col, row), (col % 256, row % 256, (col + row) % 256))
    canvas.save(out)
    return out


def test_compute_phash_variants_multiscale(sample_image: Path):
    """Hash variants must cover every configured multi-scale level."""
    variants = features.compute_phash_variants(sample_image)
    distinct = {h for h in variants if h}
    assert len(variants) >= len(features.MULTISCALE_LEVELS), "expect multi-scale hashes"
    assert len(distinct) >= len(features.MULTISCALE_LEVELS), "hashes should cover multiple scales/orientations"


def test_compute_tile_hashes_structure(sample_image: Path):
    """Every tile entry carries a scale and an in-bounds bbox tuple."""
    tiles = features.compute_tile_hashes(sample_image, grid=4)
    assert tiles, "tiles should not be empty"
    seen_scales = {entry.get("scale") for entry in tiles}
    assert features.MULTISCALE_LEVELS[0] in seen_scales
    w, h = Image.open(sample_image).size
    for entry in tiles:
        bbox = entry.get("bbox")
        assert isinstance(bbox, tuple) and len(bbox) == 4
        x0, y0, x1, y1 = bbox
        assert 0 <= x0 <= x1 <= w
        assert 0 <= y0 <= y1 <= h


def test_compute_embedding_returns_vector(sample_image: Path):
    """compute_embedding yields a non-empty one-dimensional vector."""
    emb = features.compute_embedding(sample_image)
    assert emb is not None
    vec = np.asarray(emb)
    assert vec.ndim == 1 and vec.size > 0


def test_compute_features_attaches_tiles(sample_image: Path):
    """compute_features must attach a non-empty list of tile dicts."""
    feats = features.compute_features(sample_image)
    assert feats.tiles is not None and len(feats.tiles) > 0
    assert isinstance(feats.tiles[0], dict)
/ "new_1.jpg") 35 | print("Recalling candidates...") 36 | cands = matcher.recall_candidates(feats, idx) 37 | print("Reranking/verifying...") 38 | rows = matcher.rerank_and_verify(inp / "new_1.jpg", cands, idx) 39 | csvp = outp / "dup_report.csv" 40 | report.write_csv(rows, csvp) 41 | print(f"Smoke run complete. Report: {csvp}") 42 | 43 | 44 | if __name__ == "__main__": 45 | run() 46 | -------------------------------------------------------------------------------- /dupcheck_cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Simple CLI for running duplicate detection pipeline. 3 | 4 | Usage example: 5 | python dupcheck_cli.py --db_dir ./images_db --input_dir ./images_new --out_dir ./reports 6 | """ 7 | import argparse 8 | from pathlib import Path 9 | 10 | 11 | def parse_args(): 12 | p = argparse.ArgumentParser() 13 | p.add_argument("--db_dir", required=True) 14 | p.add_argument("--input_dir", required=True) 15 | p.add_argument("--out_dir", required=True) 16 | p.add_argument("--topk", type=int, default=50) 17 | p.add_argument("--index_db", default="./index.db", help="Path to sqlite index DB") 18 | p.add_argument("--rebuild_index", action="store_true", help="Rebuild sqlite index from db_dir") 19 | p.add_argument("--phash_thresh", type=int, default=10) 20 | p.add_argument("--orb_inliers_thresh", type=int, default=25) 21 | p.add_argument("--ncc_thresh", type=float, default=0.92) 22 | p.add_argument("--vector_score_thresh", type=float, default=0.0, help="Minimum FAISS similarity to accept a vector candidate") 23 | return p.parse_args() 24 | 25 | 26 | def main(): 27 | args = parse_args() 28 | db_dir = Path(args.db_dir) 29 | input_dir = Path(args.input_dir) 30 | out_dir = Path(args.out_dir) 31 | out_dir.mkdir(parents=True, exist_ok=True) 32 | 33 | from duplicate_check import indexer, features, matcher, report 34 | # import modules from package 35 | # 从包中导入模块 36 | 37 | # use sqlite-backed index if 
available 38 | # 优先使用 SQLite 索引以支持持久化和增量更新 39 | idx = None 40 | db_path = Path(args.index_db) 41 | if db_path.exists() and not args.rebuild_index: 42 | print(f"Loading index from {db_path}...") 43 | try: 44 | idx = indexer.load_index_from_db(db_path) 45 | except Exception: 46 | idx = None 47 | 48 | if idx is None: 49 | if args.rebuild_index or not db_path.exists(): 50 | print("Building sqlite index...") 51 | indexer.build_index_db(db_dir, db_path) 52 | else: 53 | print("Building in-memory index...") 54 | idx = indexer.load_index_from_db(db_path) if db_path.exists() else indexer.build_index(db_dir) 55 | 56 | results = [] 57 | for p in sorted(input_dir.iterdir()): 58 | if not p.is_file(): 59 | continue 60 | print(f"Checking {p.name}...") 61 | feats = features.compute_features(p) 62 | cands = matcher.recall_candidates( 63 | feats, 64 | idx, 65 | topk=args.topk, 66 | phash_thresh=args.phash_thresh, 67 | vector_score_thresh=args.vector_score_thresh, 68 | ) 69 | rows = matcher.rerank_and_verify(p, cands, idx, orb_inliers_thresh=args.orb_inliers_thresh, ncc_thresh=args.ncc_thresh) 70 | # generate evidence images for rows 71 | for r in rows: 72 | if r.get("matched_image"): 73 | dbp = Path(idx["by_id"][r["matched_image"]]["path"]) 74 | evid = out_dir / f"{p.stem}__VS__{dbp.stem}.jpg" 75 | report.make_evidence_image(p, dbp, evid, draw_matches=True, matches=r.get("match_pairs")) 76 | r["evidence_img_path"] = str(evid) 77 | results.extend(rows) 78 | 79 | csvp = out_dir / "dup_report.csv" 80 | report.write_csv(results, csvp) 81 | print(f"Done. Report: {csvp}") 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /tools/generate_synthetic.py: -------------------------------------------------------------------------------- 1 | """Generate synthetic dataset for duplicate detection experiments. 
"""Generate synthetic dataset for duplicate detection experiments.

Creates two folders under project `data/synth_db` and `data/synth_new` and a
labels CSV `data/synth_labels.csv` listing ground-truth matches.

Usage:
  python tools/generate_synthetic.py --out_dir ./data --count 5

This reproduces the same patterns used in the interactive session.
"""
import argparse
from pathlib import Path
from PIL import Image, ImageDraw, ImageEnhance
import random
import csv


def generate(out_dir: Path, count: int = 5):
    """Create `count` textured base images plus transformed variants and a labels CSV."""
    db_dir = out_dir / 'synth_db'
    new_dir = out_dir / 'synth_new'
    db_dir.mkdir(parents=True, exist_ok=True)
    new_dir.mkdir(parents=True, exist_ok=True)

    pairs = []
    for i in range(1, count + 1):
        # Deterministic base image: tinted background plus a sparse dot texture.
        base_img = Image.new('RGB', (400, 300), (200 + i * 5, 180 + i * 3, 160 + i * 2))
        painter = ImageDraw.Draw(base_img)
        for x in range(50, 350, 6):
            for y in range(60, 240, 6):
                if (x * y + i) % 13 < 4:
                    painter.point((x, y), (0, 0, 0))
        base_path = db_dir / f'base_{i}.jpg'
        base_img.save(base_path)

        # Exact duplicate.
        base_img.save(new_dir / f'new_{i}_copy.jpg')
        pairs.append((f'new_{i}_copy.jpg', base_path.name))

        # Cropped variant.
        base_img.crop((80, 70, 320, 230)).save(new_dir / f'new_{i}_crop.jpg')
        pairs.append((f'new_{i}_crop.jpg', base_path.name))

        # Rotated variant.
        base_img.rotate(15, expand=True, fillcolor=(200, 200, 200)).save(new_dir / f'new_{i}_rot.jpg')
        pairs.append((f'new_{i}_rot.jpg', base_path.name))

        # Brightness-shifted variant.
        ImageEnhance.Brightness(base_img).enhance(1.3).save(new_dir / f'new_{i}_bright.jpg')
        pairs.append((f'new_{i}_bright.jpg', base_path.name))

        # Heavily recompressed variant.
        base_img.save(new_dir / f'new_{i}_jpeg30.jpg', quality=30)
        pairs.append((f'new_{i}_jpeg30.jpg', base_path.name))

        # "Photoshopped" variant: white rectangle pasted over the middle.
        edited = base_img.copy()
        ImageDraw.Draw(edited).rectangle((120, 90, 220, 160), fill=(255, 255, 255))
        edited.save(new_dir / f'new_{i}_ps.jpg')
        pairs.append((f'new_{i}_ps.jpg', base_path.name))

        # Mirrored variant.
        base_img.transpose(Image.FLIP_LEFT_RIGHT).save(new_dir / f'new_{i}_flip.jpg')
        pairs.append((f'new_{i}_flip.jpg', base_path.name))

    # Add images with no counterpart in the DB (negatives for the matcher).
    for j in range(1, count + 1):
        fill = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
        Image.new('RGB', (300, 200), fill).save(new_dir / f'new_unique_{j}.jpg')
        pairs.append((f'new_unique_{j}.jpg', ''))

    # Write the ground-truth labels file.
    labels_path = out_dir / 'synth_labels.csv'
    with open(labels_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['new_image', 'matched_image', 'label'])
        for new_name, db_name in pairs:
            writer.writerow([new_name, db_name, 'unique' if db_name == '' else 'partial_duplicate'])

    print('Synthetic dataset created:')
    print(' DB:', db_dir)
    print(' NEW:', new_dir)
    print(' Labels:', labels_path)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--out_dir', default='./data')
    parser.add_argument('--count', type=int, default=5)
    args = parser.parse_args()
    generate(Path(args.out_dir), count=args.count)
**阈值调优**:可选运行 `tools/tune_thresholds.py` 做网格搜索,为不同业务场景选取合适的 pHash/ORB/NCC 阈值组合。 14 | 15 | > **扩展建议**:若图库规模巨大或需集群部署,可在 `duplicate_check/indexer.py` / `load_index_from_db` 中替换内置 FAISS 索引,改写为向 Milvus、Qdrant、Pinecone 等外部向量数据库写入,再在 `matcher.recall_candidates` 中改为查询该服务。 16 | > **性能提示**:可调整 `DUPC_TILE_SCALES`(如 `1.0,0.6`)与 `DUPC_TILE_GRID`,在多尺度鲁棒性与运行速度之间取得平衡。 17 | 18 | ## 目录结构 19 | - `duplicate_check/` —— 核心库模块(`features`、`indexer`、`matcher`、`report`)。 20 | - `dupcheck_cli.py` —— 主命令行工具,支持内存索引或 SQLite 索引。 21 | - `duplicate_check.py` —— 兼容性入口脚本。 22 | - `tools/` —— 合成数据生成、阈值调参等辅助脚本。 23 | - `tests/` —— 测试文件夹。 24 | - `data/` —— 文档示例使用的合成数据集。 25 | 26 | ## 环境依赖 27 | 建议在 Python 3.9 及以上版本下创建虚拟环境,并安装 `requirements.txt` 中的依赖。OpenCV、Pillow、imagehash、`torch`、`torchvision` 与可选的 `faiss-cpu` 能启用全部功能,缺失时流程会自动降级。 28 | 29 | ```bash 30 | python -m venv .venv 31 | source .venv/bin/activate 32 | pip install -r requirements.txt 33 | ``` 34 | 35 | ## 快速体验 36 | 1. 生成示例数据集: 37 | ```bash 38 | python tools/generate_synthetic.py --out_dir data --count 5 39 | ``` 40 | 2. 重建 SQLite 索引并执行检测: 41 | ```bash 42 | python dupcheck_cli.py \ 43 | --db_dir data/synth_db \ 44 | --input_dir data/synth_new \ 45 | --out_dir reports \ 46 | --index_db ./index.db \ 47 | --rebuild_index \ 48 | --vector_score_thresh 0.3 49 | ``` 50 | 3. 查看 `reports/dup_report.csv` 以及生成的证据图片。 51 | 4. (可选)对合成标注集进行评估,查看召回差异: 52 | ```bash 53 | python tools/verify_synthetic.py \ 54 | --db_dir data/synth_db \ 55 | --input_dir data/synth_new \ 56 | --labels data/synth_labels.csv \ 57 | --phash_thresh 16 \ 58 | --orb_inliers_thresh 6 \ 59 | --ncc_thresh 0.85 60 | ``` 61 | 5. 
(可选)执行阈值网格搜索,找到更优配置: 62 | ```bash 63 | python tools/tune_thresholds.py \ 64 | --labels data/synth_labels.csv \ 65 | --db_dir data/synth_db \ 66 | --input_dir data/synth_new \ 67 | --out_dir reports/tune_out 68 | ``` 69 | 70 | 若要复用已有索引,可省略 `--rebuild_index`。通过调整 `--phash_thresh`、`--orb_inliers_thresh`、`--ncc_thresh` 等参数探索查准率和召回率的平衡。 71 | 72 | ## 常用命令 73 | ```bash 74 | # 重建索引 75 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db --rebuild_index 76 | 77 | # 自定义阈值运行 78 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --phash_thresh 12 --orb_inliers_thresh 30 --ncc_thresh 0.94 79 | 80 | # 直接使用缓存索引 81 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db 82 | ``` 83 | 84 | ## 阈值调参 85 | 使用 `tools/tune_thresholds.py` 对多个阈值组合做网格搜索: 86 | 87 | ```bash 88 | python tools/tune_thresholds.py \ 89 | --labels data/synth_labels.csv \ 90 | --db_dir data/synth_db \ 91 | --input_dir data/synth_new \ 92 | --out_dir reports/tune_out 93 | ``` 94 | 95 | 脚本会输出 `tune_results.csv`,其中包含每组参数的 TP/FP/FN 统计,可据此锁定最适合的数据集配置。 96 | 97 | ## 许可协议 98 | 99 | 本项目以 [MIT License](LICENSE) 开源发布。 100 | ``` 101 | -------------------------------------------------------------------------------- /duplicate_check/report.py: -------------------------------------------------------------------------------- 1 | """Reporting utilities: CSV output and evidence image generation (stub). 
"""Reporting utilities: CSV output and evidence image generation.

Writes the final CSV report and renders side-by-side evidence images,
optionally drawing match lines between the query and database images.
"""
import csv
from pathlib import Path
from typing import List, Dict
from shutil import copyfile

try:
    import cv2
except Exception:
    # OpenCV is optional; evidence rendering degrades to a plain file copy.
    cv2 = None


# Column order of the output CSV; rows may omit any of these keys.
CSV_FIELDS = [
    "new_image",
    "matched_image",
    "final_label",
    "score",
    "inliers",
    "inlier_ratio",
    "ncc_peak",
    "evidence_img_path",
]


def _copy_fallback(src: Path, dst: Path) -> None:
    """Best-effort copy of the query image as a degraded evidence artifact.

    Evidence images are auxiliary output, so failures are swallowed rather
    than allowed to break the detection pipeline.
    """
    try:
        copyfile(str(src), str(dst))
    except Exception:
        pass


def write_csv(rows: List[Dict], out_path: Path):
    """Write `rows` to `out_path`, keeping only the CSV_FIELDS columns.

    Missing keys are emitted as empty strings; extra keys are dropped.
    """
    with out_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_FIELDS)
        writer.writeheader()
        for r in rows:
            writer.writerow({k: r.get(k, "") for k in CSV_FIELDS})


def make_evidence_image(new_img_path: Path, db_img_path: Path, out_path: Path, draw_matches: bool = False, matches=None):
    """Create a side-by-side evidence image.

    If OpenCV is available and `matches` — a list of ((xq, yq), (xd, yd))
    point pairs in original-image coordinates — is given with
    draw_matches=True, the pairs are drawn as green lines with red endpoint
    dots. On any failure the function degrades to copying the query image.
    """
    if cv2 is None:
        _copy_fallback(new_img_path, out_path)
        return

    na = cv2.imread(str(new_img_path))
    db = cv2.imread(str(db_img_path))
    if na is None or db is None:
        _copy_fallback(new_img_path, out_path)
        return

    # Resize both panels to a common height before concatenation.
    h = max(na.shape[0], db.shape[0])

    def resize_keep(asrc, height):
        h0, w0 = asrc.shape[:2]
        scale = height / h0
        return cv2.resize(asrc, (int(w0 * scale), height))

    na_r = resize_keep(na, h)
    db_r = resize_keep(db, h)
    concat = cv2.hconcat([na_r, db_r])

    if draw_matches and matches:
        wq = na_r.shape[1]
        # Match coordinates refer to the *original* images; rescale them to
        # the resized panels (db points are shifted right by the query width).
        na_scale_x = na_r.shape[1] / max(1, na.shape[1])
        na_scale_y = na_r.shape[0] / max(1, na.shape[0])
        db_scale_x = db_r.shape[1] / max(1, db.shape[1])
        db_scale_y = db_r.shape[0] / max(1, db.shape[0])
        for (xq, yq), (xd, yd) in matches:
            pt1 = (int(xq * na_scale_x), int(yq * na_scale_y))
            pt2 = (int(wq + xd * db_scale_x), int(yd * db_scale_y))
            cv2.line(concat, pt1, pt2, (0, 255, 0), 1)
            cv2.circle(concat, pt1, 3, (0, 0, 255), -1)
            cv2.circle(concat, pt2, 3, (0, 0, 255), -1)

    try:
        cv2.imwrite(str(out_path), concat)
    except Exception:
        _copy_fallback(new_img_path, out_path)
"""Threshold tuning helper.

Usage:
  python tools/tune_thresholds.py --labels labels.csv --db_dir ./images_db --input_dir ./images_new --out_dir ./reports

labels.csv should contain columns: new_image, matched_image, label
(unique/partial_duplicate/exact_patch).

This script sweeps phash_thresh, orb_inliers_thresh, ncc_thresh and reports
simple TP/FP/FN counts against the ground truth. NCC now operates on a warped
ROI; use --roi_margin_ratio / --max_roi_matches to keep tuning aligned.
"""
import sys
import argparse
import csv
from pathlib import Path

# Ensure repo root is on sys.path so `duplicate_check` package is importable
_ROOT = Path(__file__).resolve().parents[1]
if str(_ROOT) not in sys.path:
    sys.path.insert(0, str(_ROOT))

from duplicate_check import indexer, features, matcher


def parse_args():
    """Parse tuning options: dataset locations plus ROI settings."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--labels", required=True)
    parser.add_argument("--db_dir", required=True)
    parser.add_argument("--input_dir", required=True)
    parser.add_argument("--out_dir", required=True)
    parser.add_argument("--roi_margin_ratio", type=float, default=0.12)
    parser.add_argument("--max_roi_matches", type=int, default=60)
    return parser.parse_args()


def load_labels(path):
    """Read the labels CSV into a dict keyed by new_image filename."""
    by_name = {}
    with open(path, newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            by_name[row['new_image']] = row
    return by_name


def main():
    """Grid-search the three thresholds and write per-combination TP/FP/FN."""
    args = parse_args()
    labels = load_labels(args.labels)
    db_dir = Path(args.db_dir)
    input_dir = Path(args.input_dir)
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    idx = indexer.build_index(db_dir)

    # Exhaustive sweep over a small grid of threshold values.
    phash_values = [6, 8, 10, 12]
    orb_values = [10, 25, 50]
    ncc_values = [0.85, 0.9, 0.92, 0.95]

    results = []
    for ph in phash_values:
        for orb_th in orb_values:
            for ncc in ncc_values:
                tp = fp = fn = 0
                for img_path in input_dir.iterdir():
                    if not img_path.is_file():
                        continue
                    feats = features.compute_features(img_path)
                    cands = matcher.recall_candidates(feats, idx, phash_thresh=ph)
                    rows = matcher.rerank_and_verify(
                        img_path,
                        cands,
                        idx,
                        orb_inliers_thresh=orb_th,
                        ncc_thresh=ncc,
                        roi_margin_ratio=args.roi_margin_ratio,
                        max_roi_matches=args.max_roi_matches,
                    )
                    # Score only the top-ranked prediction per image.
                    predicted = rows[0]['matched_image'] if rows else None
                    expected = labels.get(img_path.name, {}).get('matched_image')
                    if expected and predicted == expected:
                        tp += 1
                    elif expected and predicted != expected:
                        fn += 1
                    elif not expected and predicted:
                        fp += 1
                results.append((ph, orb_th, ncc, tp, fp, fn))

    out_csv = out_dir / 'tune_results.csv'
    with open(out_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['phash', 'orb', 'ncc', 'tp', 'fp', 'fn'])
        writer.writerows(results)
    print('Done. Results:', out_csv)


if __name__ == '__main__':
    main()
new_2_bright.jpg,base_2.jpg,partial_duplicate,14.278943573362971,162,0.3894230769230769,0.0,reports/new_2_bright__VS__base_2.jpg 8 | new_2_copy.jpg,base_2.jpg,partial_duplicate,16.791666785875954,1414,1.0,0.0,reports/new_2_copy__VS__base_2.jpg 9 | new_2_copy.jpg,base_3.jpg,partial_duplicate,13.86416643242519,27,0.29347826086956524,0.0,reports/new_2_copy__VS__base_3.jpg 10 | new_2_crop.jpg,base_2.jpg,partial_duplicate,1.1774003977885679,75,0.39473684210526316,-0.002573883393779397,reports/new_2_crop__VS__base_2.jpg 11 | new_2_flip.jpg,base_2.jpg,partial_duplicate,16.010963896910347,1414,1.0,0.0,reports/new_2_flip__VS__base_2.jpg 12 | new_2_flip.jpg,base_3.jpg,partial_duplicate,13.815720035438547,27,0.29347826086956524,0.0,reports/new_2_flip__VS__base_3.jpg 13 | new_2_jpeg30.jpg,base_2.jpg,partial_duplicate,13.72376012705414,154,0.4425287356321839,0.0,reports/new_2_jpeg30__VS__base_2.jpg 14 | new_2_ps.jpg,base_2.jpg,partial_duplicate,15.997115687025545,1006,0.9599236641221374,0.0,reports/new_2_ps__VS__base_2.jpg 15 | new_3_bright.jpg,base_3.jpg,partial_duplicate,13.73724901047934,161,0.4086294416243655,0.0020003009121865034,reports/new_3_bright__VS__base_3.jpg 16 | new_3_copy.jpg,base_3.jpg,partial_duplicate,16.274227647299178,1414,1.0,0.0020335863810032606,reports/new_3_copy__VS__base_3.jpg 17 | new_3_copy.jpg,base_2.jpg,partial_duplicate,13.877052221405371,31,0.27927927927927926,0.0,reports/new_3_copy__VS__base_2.jpg 18 | new_3_crop.jpg,base_3.jpg,partial_duplicate,1.186078881467139,85,0.43147208121827413,0.0,reports/new_3_crop__VS__base_3.jpg 19 | new_3_flip.jpg,base_3.jpg,partial_duplicate,15.493055590306328,1414,1.0,0.0,reports/new_3_flip__VS__base_3.jpg 20 | new_3_jpeg30.jpg,base_3.jpg,partial_duplicate,13.432698663339925,144,0.4161849710982659,0.0016456048469990492,reports/new_3_jpeg30__VS__base_3.jpg 21 | new_3_ps.jpg,base_3.jpg,partial_duplicate,15.450083545037916,968,0.944390243902439,0.0,reports/new_3_ps__VS__base_3.jpg 22 | 
new_3_ps.jpg,base_1.jpg,partial_duplicate,15.28378456336357,25,0.25510204081632654,0.0,reports/new_3_ps__VS__base_1.jpg 23 | new_3_ps.jpg,base_2.jpg,partial_duplicate,14.509196431986936,28,0.27450980392156865,0.0,reports/new_3_ps__VS__base_2.jpg 24 | new_4_bright.jpg,base_4.jpg,partial_duplicate,13.11780427361023,123,0.36607142857142855,0.004998629447072744,reports/new_4_bright__VS__base_4.jpg 25 | new_4_copy.jpg,base_4.jpg,partial_duplicate,15.802083333333334,1442,1.0,0.0,reports/new_4_copy__VS__base_4.jpg 26 | new_4_crop.jpg,base_4.jpg,partial_duplicate,1.1693956007455526,77,0.39086294416243655,0.0,reports/new_4_crop__VS__base_4.jpg 27 | new_4_flip.jpg,base_4.jpg,partial_duplicate,14.595425144247953,1270,0.959214501510574,0.0,reports/new_4_flip__VS__base_4.jpg 28 | new_4_jpeg30.jpg,base_4.jpg,partial_duplicate,13.432213366064898,146,0.37823834196891193,0.0,reports/new_4_jpeg30__VS__base_4.jpg 29 | new_4_ps.jpg,base_4.jpg,partial_duplicate,15.009747378792026,1015,0.9424326833797586,0.0,reports/new_4_ps__VS__base_4.jpg 30 | new_5_bright.jpg,base_5.jpg,partial_duplicate,15.370038690126217,103,0.356401384083045,0.0,reports/new_5_bright__VS__base_5.jpg 31 | new_5_copy.jpg,base_5.jpg,partial_duplicate,17.821538426124057,1449,1.0,0.00033754599280655384,reports/new_5_copy__VS__base_5.jpg 32 | new_5_crop.jpg,base_5.jpg,partial_duplicate,1.1523947505389942,73,0.3989071038251366,-0.0007181827677413821,reports/new_5_crop__VS__base_5.jpg 33 | new_5_flip.jpg,base_5.jpg,partial_duplicate,17.01505248709818,1449,1.0,0.0,reports/new_5_flip__VS__base_5.jpg 34 | new_5_jpeg30.jpg,base_5.jpg,partial_duplicate,14.749811130209066,131,0.37110481586402266,0.0,reports/new_5_jpeg30__VS__base_5.jpg 35 | new_5_ps.jpg,base_5.jpg,partial_duplicate,17.09211623273774,1021,0.9453703703703704,0.0,reports/new_5_ps__VS__base_5.jpg 36 | new_5_rot.jpg,base_5.jpg,partial_duplicate,7.367151720660745,42,0.2781456953642384,0.0,reports/new_5_rot__VS__base_5.jpg 37 | 
-------------------------------------------------------------------------------- /README_en.md: -------------------------------------------------------------------------------- 1 | # DupCheck — Duplicate & Tamper Detection 2 | 3 | ## Overview 4 | DupCheck solves broad “duplicate / tamper detection” needs: It works in insurance claim review, content moderation, e-commerce authenticity checks, and copyright protection. It began as a submodule designed to stop third-party repair contractors from re-uploading maintenance photos to claim duplicate reimbursements; I later spun it out, optimised it, and expanded it into a general-purpose toolkit. Uploads are compared against a reference gallery to flag exact copies, crops, rotations, flips, and lightly edited variants, producing reviewer-friendly evidence. 5 | 6 | The pipeline is pure Python with minimal dependencies, making it easy to embed into intake pipelines or back-office review systems. 7 | 8 | ## Detection flow 9 | 1. **Index build** – each gallery image is converted to multiple perceptual hashes (original, rotations, flips), multi-scale tile hashes, cached ORB descriptors, and optional ResNet-18 / CLIP embeddings to support geometric and coarse semantic changes. 10 | 2. **Candidate recall** – a new upload is compared with the index via pHash buckets, tile voting, and optional FAISS (ResNet-18/CLIP) vector search; if needed, multi-orientation ORB matching pulls in additional suspects. 11 | 3. **Verification** – the best orientation pair runs ORB + RANSAC. When the homography is reliable, NCC on the corresponding patch upgrades matches to `exact_patch`. 12 | 4. **Reporting** – results are written to `dup_report.csv`, and the CLI can render side-by-side evidence images for manual review. 13 | 5. **Threshold tuning** – optionally run `tools/tune_thresholds.py` to grid-search `phash/ORB/NCC` thresholds and pick the best configuration for your data. 
14 | 15 | > **Scaling tip:** Set `DUPC_VECTOR_INDEX=ivf_pq` or `hnsw` to switch the built-in FAISS index; for even larger deployments, replace the FAISS block in `duplicate_check/indexer.py` / `load_index_from_db` with writes to Milvus, Qdrant, Pinecone, etc., and query that service from `matcher.recall_candidates` before ORB reranking. 16 | > **Performance tip:** Tune `DUPC_TILE_SCALES` (e.g., `1.0,0.6`) and `DUPC_TILE_GRID` to trade multi-scale robustness for runtime when processing massive galleries. 17 | 18 | ## Project layout 19 | - `duplicate_check/` — core library modules (`features`, `indexer`, `matcher`, `report`). 20 | - `dupcheck_cli.py` — main CLI wrapper supporting in-memory or SQLite indices. 21 | - `duplicate_check.py` — minimal entry point kept for backwards compatibility. 22 | - `tools/` — utilities for synthetic data generation and threshold tuning. 23 | - `tests/` — quick test. 24 | - `data/` — sample synthetic dataset used by the documentation examples. 25 | 26 | ## Requirements 27 | Install dependencies listed in `requirements.txt` inside a Python 3.9+ environment. OpenCV, Pillow, imagehash, `torch`, `torchvision`, and (optionally) `faiss-cpu` enable the full feature set; the pipeline falls back gracefully if some extras are unavailable. 28 | 29 | ```bash 30 | python -m venv .venv 31 | source .venv/bin/activate 32 | pip install -r requirements.txt 33 | ``` 34 | 35 | Optional extras: install `faiss-cpu` (for ANN recall) and either `open-clip-torch` or `clip` if you want CLIP-ViT embeddings in addition to ResNet. 36 | 37 | ## Quick start 38 | 1. Generate the demo dataset: 39 | ```bash 40 | python tools/generate_synthetic.py --out_dir data --count 5 41 | ``` 42 | 2. Rebuild the SQLite index and run detection: 43 | ```bash 44 | python dupcheck_cli.py \ 45 | --db_dir data/synth_db \ 46 | --input_dir data/synth_new \ 47 | --out_dir reports \ 48 | --index_db ./index.db \ 49 | --rebuild_index \ 50 | --vector_score_thresh 0.3 51 | ``` 52 | 3. 
Inspect `reports/dup_report.csv` and the generated evidence JPEGs. 53 | 4. (Optional) Benchmark on the labelled synthetic set and review mismatches: 54 | ```bash 55 | python tools/verify_synthetic.py \ 56 | --db_dir data/synth_db \ 57 | --input_dir data/synth_new \ 58 | --labels data/synth_labels.csv \ 59 | --phash_thresh 16 \ 60 | --orb_inliers_thresh 6 \ 61 | --ncc_thresh 0.85 62 | ``` 63 | 5. (Optional) Launch a grid search over thresholds: 64 | ```bash 65 | python tools/tune_thresholds.py \ 66 | --labels data/synth_labels.csv \ 67 | --db_dir data/synth_db \ 68 | --input_dir data/synth_new \ 69 | --out_dir reports/tune_out 70 | ``` 71 | 72 | To reuse an existing index, drop the `--rebuild_index` flag. Tweak `--phash_thresh`, `--orb_inliers_thresh`, and `--ncc_thresh` to experiment with precision/recall. 73 | 74 | ## CLI examples 75 | ```bash 76 | # Rebuild index for fresh data 77 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db --rebuild_index 78 | 79 | # Run with custom thresholds 80 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --phash_thresh 12 --orb_inliers_thresh 30 --ncc_thresh 0.94 81 | 82 | # Quick scan using the cached index 83 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db 84 | ``` 85 | 86 | ## Threshold tuning 87 | Use `tools/tune_thresholds.py` with the synthetic labels to grid-search thresholds: 88 | 89 | ```bash 90 | python tools/tune_thresholds.py \ 91 | --labels data/synth_labels.csv \ 92 | --db_dir data/synth_db \ 93 | --input_dir data/synth_new \ 94 | --out_dir reports/tune_out 95 | ``` 96 | 97 | The script writes `tune_results.csv` with TP/FP/FN counts for each parameter combo, making it easy to lock in settings for your own data. 98 | 99 | ## License 100 | 101 | This project is released under the [MIT License](LICENSE). 
102 | -------------------------------------------------------------------------------- /tools/verify_synthetic.py: -------------------------------------------------------------------------------- 1 | """Quick evaluator for the synthetic DupCheck dataset. 2 | 3 | Usage example: 4 | python tools/verify_synthetic.py \ 5 | --db_dir data/synth_db \ 6 | --input_dir data/synth_new \ 7 | --labels data/synth_labels.csv 8 | 9 | The script runs the duplicate detection pipeline against the synthetic 10 | dataset and reports how many annotated duplicates / uniques are detected 11 | correctly along with any mismatches it finds. 12 | """ 13 | import argparse 14 | import csv 15 | import sys 16 | from pathlib import Path 17 | from typing import Dict, List 18 | 19 | ROOT = Path(__file__).resolve().parents[1] 20 | if str(ROOT) not in sys.path: 21 | sys.path.insert(0, str(ROOT)) 22 | 23 | from duplicate_check import features, indexer, matcher 24 | 25 | 26 | def check_dependencies() -> List[str]: 27 | missing: List[str] = [] 28 | if not getattr(features, "PIL_AVAILABLE", False) or getattr(features, "imagehash", None) is None: 29 | missing.append("Pillow + imagehash (needed for perceptual hash and tile hashing)") 30 | if getattr(features, "cv2", None) is None: 31 | missing.append("opencv-python (needed for ORB matching and NCC verification)") 32 | return missing 33 | 34 | 35 | def load_labels(path: Path) -> Dict[str, Dict[str, str]]: 36 | rows: Dict[str, Dict[str, str]] = {} 37 | with path.open(newline="", encoding="utf-8") as f: 38 | reader = csv.DictReader(f) 39 | for row in reader: 40 | rows[row["new_image"]] = row 41 | return rows 42 | 43 | 44 | def evaluate( 45 | db_dir: Path, 46 | input_dir: Path, 47 | labels_path: Path, 48 | *, 49 | topk: int, 50 | phash_thresh: int, 51 | orb_inliers_thresh: int, 52 | ncc_thresh: float, 53 | vector_score_thresh: float, 54 | roi_margin_ratio: float, 55 | max_roi_matches: int, 56 | ) -> Dict[str, object]: 57 | labels = load_labels(labels_path) 58 | 
idx = indexer.build_index(db_dir) 59 | 60 | stats = { 61 | "duplicate_total": 0, 62 | "duplicate_hits": 0, 63 | "unique_total": 0, 64 | "unique_hits": 0, 65 | "mismatches": [], 66 | } 67 | 68 | for img_path in sorted(input_dir.iterdir()): 69 | if not img_path.is_file(): 70 | continue 71 | feats = features.compute_features(img_path) 72 | cands = matcher.recall_candidates( 73 | feats, 74 | idx, 75 | topk=topk, 76 | phash_thresh=phash_thresh, 77 | vector_score_thresh=vector_score_thresh, 78 | ) 79 | rows = matcher.rerank_and_verify( 80 | img_path, 81 | cands, 82 | idx, 83 | orb_inliers_thresh=orb_inliers_thresh, 84 | ncc_thresh=ncc_thresh, 85 | roi_margin_ratio=roi_margin_ratio, 86 | max_roi_matches=max_roi_matches, 87 | ) 88 | 89 | meta = labels.get(img_path.name, {"matched_image": "", "label": "unique"}) 90 | gt_match = meta.get("matched_image") or "" 91 | gt_label = meta.get("label", "unique") 92 | 93 | predicted_label = rows[0]["final_label"] if rows else "unique" 94 | predicted_match = rows[0]["matched_image"] if rows else "" 95 | if predicted_label == "unique": 96 | predicted_match = "" 97 | 98 | if gt_match: 99 | stats["duplicate_total"] += 1 100 | if predicted_match == gt_match: 101 | stats["duplicate_hits"] += 1 102 | else: 103 | stats["mismatches"].append( 104 | { 105 | "image": img_path.name, 106 | "expected_match": gt_match, 107 | "expected_label": gt_label, 108 | "predicted_match": rows[0]["matched_image"] if rows else "", 109 | "predicted_label": predicted_label, 110 | } 111 | ) 112 | else: 113 | stats["unique_total"] += 1 114 | if not predicted_match: 115 | stats["unique_hits"] += 1 116 | else: 117 | stats["mismatches"].append( 118 | { 119 | "image": img_path.name, 120 | "expected_match": "", 121 | "expected_label": gt_label, 122 | "predicted_match": rows[0]["matched_image"] if rows else "", 123 | "predicted_label": predicted_label, 124 | } 125 | ) 126 | 127 | return stats 128 | 129 | 130 | def format_summary(stats: Dict[str, object]) -> str: 131 | 
dup_total = stats["duplicate_total"] or 1 132 | uniq_total = stats["unique_total"] or 1 133 | lines: List[str] = [] 134 | lines.append( 135 | f"Duplicate accuracy: {stats['duplicate_hits']}/{stats['duplicate_total']}" 136 | f" ({stats['duplicate_hits']/dup_total:.1%})" 137 | ) 138 | lines.append( 139 | f"Unique accuracy: {stats['unique_hits']}/{stats['unique_total']}" 140 | f" ({stats['unique_hits']/uniq_total:.1%})" 141 | ) 142 | mismatches = stats["mismatches"] 143 | if mismatches: 144 | lines.append("\nMismatches:") 145 | for miss in mismatches: 146 | lines.append( 147 | f" - {miss['image']}: expected {miss['expected_match'] or 'unique'}" 148 | f" → predicted {miss['predicted_match'] or miss['predicted_label']}" 149 | ) 150 | else: 151 | lines.append("\nAll samples matched expected labels.") 152 | return "\n".join(lines) 153 | 154 | 155 | def parse_args() -> argparse.Namespace: 156 | p = argparse.ArgumentParser(description="Evaluate synthetic DupCheck dataset") 157 | p.add_argument("--db_dir", default="data/synth_db") 158 | p.add_argument("--input_dir", default="data/synth_new") 159 | p.add_argument("--labels", default="data/synth_labels.csv") 160 | p.add_argument("--topk", type=int, default=50) 161 | p.add_argument("--phash_thresh", type=int, default=10) 162 | p.add_argument("--orb_inliers_thresh", type=int, default=25) 163 | p.add_argument("--ncc_thresh", type=float, default=0.92) 164 | p.add_argument("--vector_score_thresh", type=float, default=0.0) 165 | p.add_argument("--roi_margin_ratio", type=float, default=0.12) 166 | p.add_argument("--max_roi_matches", type=int, default=60) 167 | return p.parse_args() 168 | 169 | 170 | def main() -> None: 171 | args = parse_args() 172 | missing = check_dependencies() 173 | if missing: 174 | print("Warning: required imaging dependencies missing; results will be unreliable.") 175 | for item in missing: 176 | print(f" - {item}") 177 | print("Install them via `pip install -r requirements.txt` and re-run this script.") 178 | 
return 179 | stats = evaluate( 180 | Path(args.db_dir), 181 | Path(args.input_dir), 182 | Path(args.labels), 183 | topk=args.topk, 184 | phash_thresh=args.phash_thresh, 185 | orb_inliers_thresh=args.orb_inliers_thresh, 186 | ncc_thresh=args.ncc_thresh, 187 | vector_score_thresh=args.vector_score_thresh, 188 | roi_margin_ratio=args.roi_margin_ratio, 189 | max_roi_matches=args.max_roi_matches, 190 | ) 191 | print(format_summary(stats)) 192 | 193 | 194 | if __name__ == "__main__": 195 | main() 196 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DupCheck — Duplicate & Tamper Detection / 图片重复与伪造检测 2 | 3 |
6 | 7 | --- 8 | 9 |