├── index.db ├── data ├── synth_db │ ├── base_1.jpg │ ├── base_2.jpg │ ├── base_3.jpg │ ├── base_4.jpg │ └── base_5.jpg ├── synth_new │ ├── new_1_copy.jpg │ ├── new_1_crop.jpg │ ├── new_1_flip.jpg │ ├── new_1_ps.jpg │ ├── new_1_rot.jpg │ ├── new_2_copy.jpg │ ├── new_2_crop.jpg │ ├── new_2_flip.jpg │ ├── new_2_ps.jpg │ ├── new_2_rot.jpg │ ├── new_3_copy.jpg │ ├── new_3_crop.jpg │ ├── new_3_flip.jpg │ ├── new_3_ps.jpg │ ├── new_3_rot.jpg │ ├── new_4_copy.jpg │ ├── new_4_crop.jpg │ ├── new_4_flip.jpg │ ├── new_4_ps.jpg │ ├── new_4_rot.jpg │ ├── new_5_copy.jpg │ ├── new_5_crop.jpg │ ├── new_5_flip.jpg │ ├── new_5_ps.jpg │ ├── new_5_rot.jpg │ ├── new_1_bright.jpg │ ├── new_1_jpeg30.jpg │ ├── new_2_bright.jpg │ ├── new_2_jpeg30.jpg │ ├── new_3_bright.jpg │ ├── new_3_jpeg30.jpg │ ├── new_4_bright.jpg │ ├── new_4_jpeg30.jpg │ ├── new_5_bright.jpg │ ├── new_5_jpeg30.jpg │ ├── new_unique_1.jpg │ ├── new_unique_2.jpg │ ├── new_unique_3.jpg │ ├── new_unique_4.jpg │ └── new_unique_5.jpg └── synth_labels.csv ├── reports ├── new_1_copy__VS__base_1.jpg ├── new_1_crop__VS__base_1.jpg ├── new_1_flip__VS__base_1.jpg ├── new_1_ps__VS__base_1.jpg ├── new_2_copy__VS__base_2.jpg ├── new_2_copy__VS__base_3.jpg ├── new_2_crop__VS__base_2.jpg ├── new_2_flip__VS__base_2.jpg ├── new_2_flip__VS__base_3.jpg ├── new_2_ps__VS__base_2.jpg ├── new_3_copy__VS__base_2.jpg ├── new_3_copy__VS__base_3.jpg ├── new_3_crop__VS__base_3.jpg ├── new_3_flip__VS__base_2.jpg ├── new_3_flip__VS__base_3.jpg ├── new_3_ps__VS__base_1.jpg ├── new_3_ps__VS__base_2.jpg ├── new_3_ps__VS__base_3.jpg ├── new_4_copy__VS__base_4.jpg ├── new_4_crop__VS__base_4.jpg ├── new_4_flip__VS__base_4.jpg ├── new_4_ps__VS__base_4.jpg ├── new_5_copy__VS__base_5.jpg ├── new_5_crop__VS__base_5.jpg ├── new_5_flip__VS__base_5.jpg ├── new_5_ps__VS__base_5.jpg ├── new_5_rot__VS__base_5.jpg ├── new_1_bright__VS__base_1.jpg ├── new_2_bright__VS__base_2.jpg ├── new_2_jpeg30__VS__base_2.jpg ├── new_3_bright__VS__base_3.jpg ├── new_3_jpeg30__VS__base_3.jpg ├── new_4_bright__VS__base_4.jpg ├── new_4_jpeg30__VS__base_4.jpg ├── new_5_bright__VS__base_5.jpg ├── new_5_jpeg30__VS__base_5.jpg ├── tune_out │ └── tune_results.csv └── dup_report.csv ├── __pycache__ └── duplicate_check.cpython-313.pyc ├── tools ├── __pycache__ │ └── tune_thresholds.cpython-312.pyc ├── generate_synthetic.py ├── tune_thresholds.py └── verify_synthetic.py ├── duplicate_check ├── __pycache__ │ ├── indexer.cpython-312.pyc │ ├── matcher.cpython-312.pyc │ ├── report.cpython-312.pyc │ ├── __init__.cpython-312.pyc │ └── features.cpython-312.pyc ├── __init__.py ├── report.py ├── indexer.py ├── features.py └── matcher.py ├── requirements.txt ├── config.yaml ├── tests ├── test_matcher.py └── test_features.py ├── LICENSE ├── duplicate_check.py ├── run_smoke.py ├── dupcheck_cli.py ├── README_zh.md ├── README_en.md └── README.md /index.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/index.db -------------------------------------------------------------------------------- /data/synth_db/base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_db/base_1.jpg -------------------------------------------------------------------------------- /data/synth_db/base_2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_db/base_2.jpg -------------------------------------------------------------------------------- /data/synth_db/base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_db/base_3.jpg -------------------------------------------------------------------------------- /data/synth_db/base_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_db/base_4.jpg -------------------------------------------------------------------------------- /data/synth_db/base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_db/base_5.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_copy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_copy.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_crop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_crop.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_flip.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_flip.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_ps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_ps.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_rot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_rot.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_copy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_copy.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_crop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_crop.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_flip.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_flip.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_ps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_ps.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_rot.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_rot.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_copy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_copy.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_crop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_crop.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_flip.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_flip.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_ps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_ps.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_rot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_rot.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_copy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_copy.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_crop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_crop.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_flip.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_flip.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_ps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_ps.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_rot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_rot.jpg -------------------------------------------------------------------------------- /data/synth_new/new_5_copy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_copy.jpg -------------------------------------------------------------------------------- /data/synth_new/new_5_crop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_crop.jpg 
-------------------------------------------------------------------------------- /data/synth_new/new_5_flip.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_flip.jpg -------------------------------------------------------------------------------- /data/synth_new/new_5_ps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_ps.jpg -------------------------------------------------------------------------------- /data/synth_new/new_5_rot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_rot.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_bright.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_bright.jpg -------------------------------------------------------------------------------- /data/synth_new/new_1_jpeg30.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_1_jpeg30.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_bright.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_bright.jpg -------------------------------------------------------------------------------- /data/synth_new/new_2_jpeg30.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_2_jpeg30.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_bright.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_bright.jpg -------------------------------------------------------------------------------- /data/synth_new/new_3_jpeg30.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_3_jpeg30.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_bright.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_bright.jpg -------------------------------------------------------------------------------- /data/synth_new/new_4_jpeg30.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_4_jpeg30.jpg -------------------------------------------------------------------------------- /data/synth_new/new_5_bright.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_bright.jpg -------------------------------------------------------------------------------- /data/synth_new/new_5_jpeg30.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_5_jpeg30.jpg -------------------------------------------------------------------------------- /data/synth_new/new_unique_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_unique_1.jpg -------------------------------------------------------------------------------- /data/synth_new/new_unique_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_unique_2.jpg -------------------------------------------------------------------------------- /data/synth_new/new_unique_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_unique_3.jpg -------------------------------------------------------------------------------- /data/synth_new/new_unique_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_unique_4.jpg -------------------------------------------------------------------------------- /data/synth_new/new_unique_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/data/synth_new/new_unique_5.jpg -------------------------------------------------------------------------------- /reports/new_1_copy__VS__base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_1_copy__VS__base_1.jpg -------------------------------------------------------------------------------- /reports/new_1_crop__VS__base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_1_crop__VS__base_1.jpg -------------------------------------------------------------------------------- /reports/new_1_flip__VS__base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_1_flip__VS__base_1.jpg -------------------------------------------------------------------------------- /reports/new_1_ps__VS__base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_1_ps__VS__base_1.jpg -------------------------------------------------------------------------------- /reports/new_2_copy__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_copy__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_2_copy__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_copy__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_2_crop__VS__base_2.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_crop__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_2_flip__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_flip__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_2_flip__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_flip__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_2_ps__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_ps__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_3_copy__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_copy__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_3_copy__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_copy__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_3_crop__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_crop__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_3_flip__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_flip__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_3_flip__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_flip__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_3_ps__VS__base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_ps__VS__base_1.jpg -------------------------------------------------------------------------------- /reports/new_3_ps__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_ps__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_3_ps__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_ps__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_4_copy__VS__base_4.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_4_copy__VS__base_4.jpg -------------------------------------------------------------------------------- /reports/new_4_crop__VS__base_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_4_crop__VS__base_4.jpg -------------------------------------------------------------------------------- /reports/new_4_flip__VS__base_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_4_flip__VS__base_4.jpg -------------------------------------------------------------------------------- /reports/new_4_ps__VS__base_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_4_ps__VS__base_4.jpg -------------------------------------------------------------------------------- /reports/new_5_copy__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_copy__VS__base_5.jpg -------------------------------------------------------------------------------- /reports/new_5_crop__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_crop__VS__base_5.jpg -------------------------------------------------------------------------------- /reports/new_5_flip__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_flip__VS__base_5.jpg -------------------------------------------------------------------------------- /reports/new_5_ps__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_ps__VS__base_5.jpg -------------------------------------------------------------------------------- /reports/new_5_rot__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_rot__VS__base_5.jpg -------------------------------------------------------------------------------- /reports/new_1_bright__VS__base_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_1_bright__VS__base_1.jpg -------------------------------------------------------------------------------- /reports/new_2_bright__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_bright__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_2_jpeg30__VS__base_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_2_jpeg30__VS__base_2.jpg -------------------------------------------------------------------------------- /reports/new_3_bright__VS__base_3.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_bright__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_3_jpeg30__VS__base_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_3_jpeg30__VS__base_3.jpg -------------------------------------------------------------------------------- /reports/new_4_bright__VS__base_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_4_bright__VS__base_4.jpg -------------------------------------------------------------------------------- /reports/new_4_jpeg30__VS__base_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_4_jpeg30__VS__base_4.jpg -------------------------------------------------------------------------------- /reports/new_5_bright__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_bright__VS__base_5.jpg -------------------------------------------------------------------------------- /reports/new_5_jpeg30__VS__base_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/reports/new_5_jpeg30__VS__base_5.jpg -------------------------------------------------------------------------------- /__pycache__/duplicate_check.cpython-313.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/__pycache__/duplicate_check.cpython-313.pyc -------------------------------------------------------------------------------- /tools/__pycache__/tune_thresholds.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/tools/__pycache__/tune_thresholds.cpython-312.pyc -------------------------------------------------------------------------------- /duplicate_check/__pycache__/indexer.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/duplicate_check/__pycache__/indexer.cpython-312.pyc -------------------------------------------------------------------------------- /duplicate_check/__pycache__/matcher.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/duplicate_check/__pycache__/matcher.cpython-312.pyc -------------------------------------------------------------------------------- /duplicate_check/__pycache__/report.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/duplicate_check/__pycache__/report.cpython-312.pyc -------------------------------------------------------------------------------- /duplicate_check/__pycache__/__init__.cpython-312.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/summerness/dupcheck/HEAD/duplicate_check/__pycache__/__init__.cpython-312.pyc -------------------------------------------------------------------------------- /duplicate_check/__pycache__/features.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/summerness/dupcheck/HEAD/duplicate_check/__pycache__/features.cpython-312.pyc -------------------------------------------------------------------------------- /duplicate_check/__init__.py: -------------------------------------------------------------------------------- 1 | """duplicate_check package init for the skeleton project.""" 2 | from . import features, indexer, matcher, report 3 | 4 | __all__ = ["features", "indexer", "matcher", "report"] 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python 2 | numpy 3 | Pillow 4 | imagehash 5 | faiss-cpu 6 | torch 7 | torchvision 8 | # Optional CLIP support (install from source if wheel unavailable) 9 | clip-anytorch 10 | tqdm 11 | matplotlib 12 | piexif 13 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | phash_bits: 64 2 | phash_thresh: 10 3 | tile_grid: 8 4 | tile_hamming_thresh: 6 5 | orb_max_features: 2000 6 | orb_inliers_thresh: 25 7 | orb_inlier_ratio: 0.25 8 | ncc_thresh: 0.92 9 | roi_margin_ratio: 0.12 10 | max_roi_matches: 60 11 | topk_recall: 50 12 | -------------------------------------------------------------------------------- /tests/test_matcher.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from duplicate_check import matcher 5 | 6 | 7 | @pytest.mark.parametrize("dtype", [np.uint8, np.float32], ids=["uint8", "float32"]) 8 | def test_count_good_matches_dtype_handling(dtype): 9 | cv2 = pytest.importorskip("cv2") 10 | rng = np.random.default_rng(42) 11 | desc1 = (rng.random((32, 64)) * (255 if dtype == np.uint8 else 1)).astype(dtype) 12 | desc2 = desc1.copy().astype(dtype) 13 | result = matcher._count_good_matches(desc1, desc2) 14 | assert isinstance(result, int) 15 | assert result >= 0 16 | 17 | 18 | def test_count_good_matches_mixed_dtype(): 19 | pytest.importorskip("cv2") 20 | rng = np.random.default_rng(7) 21 | desc1 = (rng.random((16, 32)) * 255).astype(np.uint8) 22 | desc2 = desc1.astype(np.float32) / 255.0 23 | result = matcher._count_good_matches(desc1, desc2) 24 | assert isinstance(result, int) 25 | assert result >= 0 26 | -------------------------------------------------------------------------------- /reports/tune_out/tune_results.csv: -------------------------------------------------------------------------------- 1 | phash,orb,ncc,tp,fp,fn 2 | 6,10,0.85,5,0,30 3 | 6,10,0.9,5,0,30 4 | 6,10,0.92,5,0,30 5 | 6,10,0.95,5,0,30 6 | 6,25,0.85,5,0,30 7 | 6,25,0.9,5,0,30 8 | 6,25,0.92,5,0,30 9 | 6,25,0.95,5,0,30 10 | 6,50,0.85,5,0,30 11 | 6,50,0.9,5,0,30 12 | 6,50,0.92,5,0,30 13 | 6,50,0.95,5,0,30 14 | 8,10,0.85,5,0,30 15 | 8,10,0.9,5,0,30 16 | 8,10,0.92,5,0,30 17 | 8,10,0.95,5,0,30 18 | 8,25,0.85,5,0,30 19 | 8,25,0.9,5,0,30 20 | 8,25,0.92,5,0,30 21 | 8,25,0.95,5,0,30 22 | 8,50,0.85,5,0,30 23 | 8,50,0.9,5,0,30 24 | 8,50,0.92,5,0,30 25 | 8,50,0.95,5,0,30 26 | 10,10,0.85,8,0,27 27 | 10,10,0.9,8,0,27 28 | 
10,10,0.92,8,0,27 29 | 10,10,0.95,8,0,27 30 | 10,25,0.85,8,0,27 31 | 10,25,0.9,8,0,27 32 | 10,25,0.92,8,0,27 33 | 10,25,0.95,8,0,27 34 | 10,50,0.85,8,0,27 35 | 10,50,0.9,8,0,27 36 | 10,50,0.92,8,0,27 37 | 10,50,0.95,8,0,27 38 | 12,10,0.85,8,0,27 39 | 12,10,0.9,8,0,27 40 | 12,10,0.92,8,0,27 41 | 12,10,0.95,8,0,27 42 | 12,25,0.85,8,0,27 43 | 12,25,0.9,8,0,27 44 | 12,25,0.92,8,0,27 45 | 12,25,0.95,8,0,27 46 | 12,50,0.85,8,0,27 47 | 12,50,0.9,8,0,27 48 | 12,50,0.92,8,0,27 49 | 12,50,0.95,8,0,27 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 DupCheck contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /duplicate_check.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Entrypoint for the duplicate image checking skeleton. 3 | 4 | This script wires the components together and provides a simple CLI. 5 | """ 6 | import argparse 7 | from pathlib import Path 8 | 9 | # ...existing code... 
10 | def parse_args(): 11 | p = argparse.ArgumentParser(description="Duplicate image check skeleton") 12 | p.add_argument("--db_dir", required=True, help="Path to image database directory") 13 | p.add_argument("--input_dir", required=True, help="Path to new images to check") 14 | p.add_argument("--out_dir", required=True, help="Output reports directory") 15 | p.add_argument("--topk", type=int, default=50) 16 | return p.parse_args() 17 | 18 | 19 | def main(): 20 | args = parse_args() 21 | db_dir = Path(args.db_dir) 22 | input_dir = Path(args.input_dir) 23 | out_dir = Path(args.out_dir) 24 | out_dir.mkdir(parents=True, exist_ok=True) 25 | 26 | # Lazy imports to keep CLI responsive if modules missing 27 | from duplicate_check import indexer, features, matcher, report 28 | 29 | print(f"Indexing DB: {db_dir}") 30 | idx = indexer.build_index(db_dir) 31 | 32 | print(f"Processing inputs from: {input_dir}") 33 | results = [] 34 | for img_path in sorted(input_dir.iterdir()): 35 | if not img_path.is_file(): 36 | continue 37 | print(f"Checking {img_path.name}...") 38 | feats = features.compute_features(img_path) 39 | cand = matcher.recall_candidates(feats, idx, topk=args.topk) 40 | detailed = matcher.rerank_and_verify(img_path, cand, idx) 41 | results.extend(detailed) 42 | 43 | csv_path = out_dir / "dup_report.csv" 44 | report.write_csv(results, csv_path) 45 | print(f"Done. Report: {csv_path}") 46 | 47 | 48 | if __name__ == "__main__": 49 | main() 50 | -------------------------------------------------------------------------------- /data/synth_labels.csv: -------------------------------------------------------------------------------- 1 | new_image,matched_image,label 2 | new_1_copy.jpg,base_1.jpg,partial_duplicate 3 | new_1_crop.jpg,base_1.jpg,partial_duplicate 4 | new_1_rot.jpg,base_1.jpg,partial_duplicate 5 | new_1_bright.jpg,base_1.jpg,partial_duplicate 6 | new_1_jpeg30.jpg,base_1.jpg,partial_duplicate 7 | new_1_ps.jpg,base_1.jpg,partial_duplicate 8 | new_1_flip.jpg,base_1.jpg,partial_duplicate 9 | new_2_copy.jpg,base_2.jpg,partial_duplicate 10 | new_2_crop.jpg,base_2.jpg,partial_duplicate 11 | new_2_rot.jpg,base_2.jpg,partial_duplicate 12 | new_2_bright.jpg,base_2.jpg,partial_duplicate 13 | new_2_jpeg30.jpg,base_2.jpg,partial_duplicate 14 | new_2_ps.jpg,base_2.jpg,partial_duplicate 15 | new_2_flip.jpg,base_2.jpg,partial_duplicate 16 | new_3_copy.jpg,base_3.jpg,partial_duplicate 17 | new_3_crop.jpg,base_3.jpg,partial_duplicate 18 | new_3_rot.jpg,base_3.jpg,partial_duplicate 19 | new_3_bright.jpg,base_3.jpg,partial_duplicate 20 | new_3_jpeg30.jpg,base_3.jpg,partial_duplicate 21 | new_3_ps.jpg,base_3.jpg,partial_duplicate 22 | new_3_flip.jpg,base_3.jpg,partial_duplicate 23 | new_4_copy.jpg,base_4.jpg,partial_duplicate 24 | new_4_crop.jpg,base_4.jpg,partial_duplicate 25 | new_4_rot.jpg,base_4.jpg,partial_duplicate 26 | new_4_bright.jpg,base_4.jpg,partial_duplicate 27 | new_4_jpeg30.jpg,base_4.jpg,partial_duplicate 28 | new_4_ps.jpg,base_4.jpg,partial_duplicate 29 | new_4_flip.jpg,base_4.jpg,partial_duplicate 30 | new_5_copy.jpg,base_5.jpg,partial_duplicate 31 | new_5_crop.jpg,base_5.jpg,partial_duplicate 32 | new_5_rot.jpg,base_5.jpg,partial_duplicate 33 | new_5_bright.jpg,base_5.jpg,partial_duplicate 34 | new_5_jpeg30.jpg,base_5.jpg,partial_duplicate 35 | new_5_ps.jpg,base_5.jpg,partial_duplicate 36 | new_5_flip.jpg,base_5.jpg,partial_duplicate 37 | new_unique_1.jpg,,unique 38 | new_unique_2.jpg,,unique 39 | new_unique_3.jpg,,unique 40 | new_unique_4.jpg,,unique 41 | new_unique_5.jpg,,unique 42 | 
-------------------------------------------------------------------------------- /tests/test_features.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from pathlib import Path 4 | from PIL import Image 5 | 6 | from duplicate_check import features 7 | 8 | 9 | @pytest.fixture() 10 | def sample_image(tmp_path: Path) -> Path: 11 | path = tmp_path / "sample.png" 12 | img = Image.new("RGB", (96, 80), color=(128, 128, 128)) 13 | for x in range(96): 14 | for y in range(80): 15 | img.putpixel((x, y), (x % 256, y % 256, (x + y) % 256)) 16 | img.save(path) 17 | return path 18 | 19 | 20 | def test_compute_phash_variants_multiscale(sample_image: Path): 21 | variants = features.compute_phash_variants(sample_image) 22 | unique = {v for v in variants if v} 23 | assert len(variants) >= len(features.MULTISCALE_LEVELS), "expect multi-scale hashes" 24 | assert len(unique) >= len(features.MULTISCALE_LEVELS), "hashes should cover multiple scales/orientations" 25 | 26 | 27 | def test_compute_tile_hashes_structure(sample_image: Path): 28 | tiles = features.compute_tile_hashes(sample_image, grid=4) 29 | assert tiles, "tiles should not be empty" 30 | scales = {tile.get("scale") for tile in tiles} 31 | assert features.MULTISCALE_LEVELS[0] in scales 32 | w, h = Image.open(sample_image).size 33 | for tile in tiles: 34 | bbox = tile.get("bbox") 35 | assert isinstance(bbox, tuple) and len(bbox) == 4 36 | x0, y0, x1, y1 = bbox 37 | assert 0 <= x0 <= x1 <= w 38 | assert 0 <= y0 <= y1 <= h 39 | 40 | 41 | def test_compute_embedding_returns_vector(sample_image: Path): 42 | emb = features.compute_embedding(sample_image) 43 | assert emb is not None 44 | arr = np.asarray(emb) 45 | assert arr.ndim == 1 and arr.size > 0 46 | 47 | 48 | def test_compute_features_attaches_tiles(sample_image: Path): 49 | feats = features.compute_features(sample_image) 50 | assert feats.tiles is not None and len(feats.tiles) > 0 51 | assert isinstance(feats.tiles[0], dict) 52 | -------------------------------------------------------------------------------- /run_smoke.py: -------------------------------------------------------------------------------- 1 | """Run a simple smoke test of the duplicate check pipeline without pytest. 2 | 3 | Creates temporary directories with tiny JPEG fixtures and runs the main flow. 
4 | """ 5 | import base64 6 | import tempfile 7 | from pathlib import Path 8 | 9 | from duplicate_check import indexer, features, matcher, report 10 | 11 | 12 | _TINY_JPEG_B64 = ( 13 | "/9j/4AAQSkZJRgABAQAAAQABAAD/2wCEAAkGBxISEBUQEBAVFRUVFRUVFRUVFRUVFRUXFhUVFRUYHSggGBolGxUVITEhJSkrLi4uFx8zODMsNygtLisBCgoKDg0OGhAQGy0lICYtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLf/AABEIAJ8BPgMBIgACEQEDEQH/xAAbAAABBQEBAAAAAAAAAAAAAAAAAQIEBQYDB//EADwQAAEDAgQDBgMHAwMFAAAAAAEAAgMEEQUSITEGE0FRMmFxgZGh8COhsUIjUmKyweHxFSNDU5LxJENT/8QAGQEAAwEBAQAAAAAAAAAAAAAAAAECAwQF/8QAJhEBAAICAgIBAwUAAAAAAAAAAAECAxESIQQxQVEiUYGh8GH/2gAMAwEAAhEDEQA/AO4gAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD//Z" 14 | ) 15 | 16 | 17 | def _write_tiny_jpeg(path: Path) -> None: 18 | """Write a minimal 1x1 JPEG so PIL/OpenCV can read it.""" 19 | path.write_bytes(base64.b64decode(_TINY_JPEG_B64)) 20 | 21 | 22 | def run(): 23 | with tempfile.TemporaryDirectory() as db_dir, tempfile.TemporaryDirectory() as in_dir, tempfile.TemporaryDirectory() as out_dir: 24 | dbp = Path(db_dir) 25 | inp = Path(in_dir) 26 | outp = Path(out_dir) 27 | # create tiny but valid JPEG fixtures 28 | _write_tiny_jpeg(dbp / "db_1.jpg") 29 | _write_tiny_jpeg(inp / "new_1.jpg") 30 | 31 | print("Building index...") 32 | idx = indexer.build_index(dbp) 33 | print("Computing features for input...") 34 | feats = features.compute_features(inp / "new_1.jpg") 35 | print("Recalling candidates...") 36 | cands = matcher.recall_candidates(feats, idx) 37 | print("Reranking/verifying...") 38 | rows = matcher.rerank_and_verify(inp / "new_1.jpg", cands, idx) 39 | csvp = outp / "dup_report.csv" 40 | report.write_csv(rows, csvp) 41 | print(f"Smoke run complete. Report: {csvp}") 42 | 43 | 44 | if __name__ == "__main__": 45 | run() 46 | -------------------------------------------------------------------------------- /dupcheck_cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Simple CLI for running duplicate detection pipeline. 
3 | 4 | Usage example: 5 | python dupcheck_cli.py --db_dir ./images_db --input_dir ./images_new --out_dir ./reports 6 | """ 7 | import argparse 8 | from pathlib import Path 9 | 10 | 11 | def parse_args(): 12 | p = argparse.ArgumentParser() 13 | p.add_argument("--db_dir", required=True) 14 | p.add_argument("--input_dir", required=True) 15 | p.add_argument("--out_dir", required=True) 16 | p.add_argument("--topk", type=int, default=50) 17 | p.add_argument("--index_db", default="./index.db", help="Path to sqlite index DB") 18 | p.add_argument("--rebuild_index", action="store_true", help="Rebuild sqlite index from db_dir") 19 | p.add_argument("--phash_thresh", type=int, default=10) 20 | p.add_argument("--orb_inliers_thresh", type=int, default=25) 21 | p.add_argument("--ncc_thresh", type=float, default=0.92) 22 | p.add_argument("--vector_score_thresh", type=float, default=0.0, help="Minimum FAISS similarity to accept a vector candidate") 23 | return p.parse_args() 24 | 25 | 26 | def main(): 27 | args = parse_args() 28 | db_dir = Path(args.db_dir) 29 | input_dir = Path(args.input_dir) 30 | out_dir = Path(args.out_dir) 31 | out_dir.mkdir(parents=True, exist_ok=True) 32 | 33 | from duplicate_check import indexer, features, matcher, report 34 | # import modules from package 35 | # 从包中导入模块 36 | 37 | # use sqlite-backed index if available 38 | # 优先使用 SQLite 索引以支持持久化和增量更新 39 | idx = None 40 | db_path = Path(args.index_db) 41 | if db_path.exists() and not args.rebuild_index: 42 | print(f"Loading index from {db_path}...") 43 | try: 44 | idx = indexer.load_index_from_db(db_path) 45 | except Exception: 46 | idx = None 47 | 48 | if idx is None: 49 | if args.rebuild_index or not db_path.exists(): 50 | print("Building sqlite index...") 51 | indexer.build_index_db(db_dir, db_path) 52 | else: 53 | print("Building in-memory index...") 54 | idx = indexer.load_index_from_db(db_path) if db_path.exists() else indexer.build_index(db_dir) 55 | 56 | results = [] 57 | for p in sorted(input_dir.iterdir()): 58 | if not p.is_file(): 59 | continue 60 | print(f"Checking {p.name}...") 61 | feats = features.compute_features(p) 62 | cands = matcher.recall_candidates( 63 | feats, 64 | idx, 65 | topk=args.topk, 66 | phash_thresh=args.phash_thresh, 67 | vector_score_thresh=args.vector_score_thresh, 68 | ) 69 | rows = matcher.rerank_and_verify(p, cands, idx, orb_inliers_thresh=args.orb_inliers_thresh, ncc_thresh=args.ncc_thresh) 70 | # generate evidence images for rows 71 | for r in rows: 72 | if r.get("matched_image"): 73 | dbp = Path(idx["by_id"][r["matched_image"]]["path"]) 74 | evid = out_dir / f"{p.stem}__VS__{dbp.stem}.jpg" 75 | report.make_evidence_image(p, dbp, evid, draw_matches=True, matches=r.get("match_pairs")) 76 | r["evidence_img_path"] = str(evid) 77 | results.extend(rows) 78 | 79 | csvp = out_dir / "dup_report.csv" 80 | report.write_csv(results, csvp) 81 | print(f"Done. Report: {csvp}") 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /tools/generate_synthetic.py: -------------------------------------------------------------------------------- 1 | """Generate synthetic dataset for duplicate detection experiments. 2 | 3 | Creates two folders under project `data/synth_db` and `data/synth_new` and a 4 | labels CSV `data/synth_labels.csv` listing ground-truth matches. 5 | 6 | Usage: 7 | python tools/generate_synthetic.py --out_dir ./data --count 5 8 | 9 | This reproduces the same patterns used in the interactive session. 
10 | """ 11 | import argparse 12 | from pathlib import Path 13 | from PIL import Image, ImageDraw, ImageEnhance 14 | import random 15 | import csv 16 | 17 | 18 | def generate(out_dir: Path, count: int = 5): 19 | db = out_dir / 'synth_db' 20 | new = out_dir / 'synth_new' 21 | db.mkdir(parents=True, exist_ok=True) 22 | new.mkdir(parents=True, exist_ok=True) 23 | 24 | labels = [] 25 | for i in range(1, count+1): 26 | img = Image.new('RGB',(400,300),(200+i*5,180+i*3,160+i*2)) 27 | draw = ImageDraw.Draw(img) 28 | for x in range(50,350,6): 29 | for y in range(60,240,6): 30 | if (x*y+i) % 13 < 4: 31 | draw.point((x,y),(0,0,0)) 32 | base = db / f'base_{i}.jpg' 33 | img.save(base) 34 | 35 | # exact copy 36 | img.save(new / f'new_{i}_copy.jpg') 37 | labels.append((f'new_{i}_copy.jpg', base.name)) 38 | 39 | # cropped 40 | crop = img.crop((80,70,320,230)) 41 | crop.save(new / f'new_{i}_crop.jpg') 42 | labels.append((f'new_{i}_crop.jpg', base.name)) 43 | 44 | # rotated 45 | rot = img.rotate(15, expand=True, fillcolor=(200,200,200)) 46 | rot.save(new / f'new_{i}_rot.jpg') 47 | labels.append((f'new_{i}_rot.jpg', base.name)) 48 | 49 | # brightness 50 | bright = ImageEnhance.Brightness(img).enhance(1.3) 51 | bright.save(new / f'new_{i}_bright.jpg') 52 | labels.append((f'new_{i}_bright.jpg', base.name)) 53 | 54 | # compressed 55 | img.save(new / f'new_{i}_jpeg30.jpg', quality=30) 56 | labels.append((f'new_{i}_jpeg30.jpg', base.name)) 57 | 58 | # ps overlay (draw rectangle) 59 | ps = img.copy() 60 | d = ImageDraw.Draw(ps) 61 | d.rectangle((120,90,220,160), fill=(255,255,255)) 62 | ps.save(new / f'new_{i}_ps.jpg') 63 | labels.append((f'new_{i}_ps.jpg', base.name)) 64 | 65 | # flipped 66 | flip = img.transpose(Image.FLIP_LEFT_RIGHT) 67 | flip.save(new / f'new_{i}_flip.jpg') 68 | labels.append((f'new_{i}_flip.jpg', base.name)) 69 | 70 | # add some unique images 71 | for j in range(1, count+1): 72 | u = Image.new('RGB',(300,200),(random.randint(0,255),random.randint(0,255),random.randint(0,255))) 73 | u.save(new / f'new_unique_{j}.jpg') 74 | labels.append((f'new_unique_{j}.jpg','')) 75 | 76 | # write labels.csv 77 | labp = out_dir / 'synth_labels.csv' 78 | with open(labp, 'w', newline='', encoding='utf-8') as f: 79 | w = csv.writer(f) 80 | w.writerow(['new_image','matched_image','label']) 81 | for newn, dbn in labels: 82 | lab = 'unique' if dbn=='' else 'partial_duplicate' 83 | w.writerow([newn, dbn, lab]) 84 | 85 | print('Synthetic dataset created:') 86 | print(' DB:', db) 87 | print(' NEW:', new) 88 | print(' Labels:', labp) 89 | 90 | 91 | if __name__ == '__main__': 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument('--out_dir', default='./data') 94 | parser.add_argument('--count', type=int, default=5) 95 | args = parser.parse_args() 96 | generate(Path(args.out_dir), count=args.count) 97 | -------------------------------------------------------------------------------- /README_zh.md: -------------------------------------------------------------------------------- 1 | # DupCheck — 图片重复与伪造检测 2 | 3 | ## 项目简介 4 | DupCheck 面向广义的“图库去重 / 篡改检测”需求:不仅适用于理赔审核,也可服务内容审核、电商验真、版权保护等场景;项目最初用于防止第三方维修工重复上传维修照片骗取维修资金,原是某项目的子模块,后来被我独立化、优化并扩展为通用工具。系统会把新上传图片与历史图库逐一比对,识别完全重复、局部重复以及轻度改动的图像,并输出可供人工复核的证据。 5 | 6 | 项目依赖常见的 Python 图像 / 深度学习库,便于集成到各类上传管线或后台审核系统。 7 | 8 | ## 检测流程 9 | 1. **构建索引**:对图库图片计算多姿态 pHash(原图、旋转、翻转)、块哈希、缓存 ORB 特征,并可生成 ResNet-18 嵌入,确保几何和粗语义变换仍可召回。 10 | 2. **召回候选**:新上传图片通过 pHash/块哈希匹配,并可结合基于 ResNet-18 的 FAISS 向量检索;如有需要再进行多姿态 ORB 比对,将旋转、翻转的嫌疑图拉入候选集。 11 | 3. 
**精排验证**：对最佳姿态组合执行 ORB + RANSAC，若单应关系可靠，则在对应区域做 NCC，判断是否为 `exact_patch`。 12 | 4. **结果输出**：检测结论写入 `dup_report.csv`，命令行可生成对照证据图，辅助人工审核。 13 | 5. **阈值调优**：可选运行 `tools/tune_thresholds.py` 做网格搜索，为不同业务场景选取合适的 pHash/ORB/NCC 阈值组合。 14 | 15 | > **扩展建议**：若图库规模巨大或需集群部署，可在 `duplicate_check/indexer.py` / `load_index_from_db` 中替换内置 FAISS 索引，改写为向 Milvus、Qdrant、Pinecone 等外部向量数据库写入，再在 `matcher.recall_candidates` 中改为查询该服务。 16 | > **性能提示**：可调整 `DUPC_TILE_SCALES`（如 `1.0,0.6`）与 `DUPC_TILE_GRID`，在多尺度鲁棒性与运行速度之间取得平衡。 17 | 18 | ## 目录结构 19 | - `duplicate_check/` —— 核心库模块（`features`、`indexer`、`matcher`、`report`）。 20 | - `dupcheck_cli.py` —— 主命令行工具，支持内存索引或 SQLite 索引。 21 | - `duplicate_check.py` —— 兼容性入口脚本。 22 | - `tools/` —— 合成数据生成、阈值调参等辅助脚本。 23 | - `tests/` —— 测试文件夹。 24 | - `data/` —— 文档示例使用的合成数据集。 25 | 26 | ## 环境依赖 27 | 建议在 Python 3.9 及以上版本下创建虚拟环境，并安装 `requirements.txt` 中的依赖。OpenCV、Pillow、imagehash、`torch`、`torchvision` 与可选的 `faiss-cpu` 能启用全部功能，缺失时流程会自动降级。 28 | 29 | ```bash 30 | python -m venv .venv 31 | source .venv/bin/activate 32 | pip install -r requirements.txt 33 | ``` 34 | 35 | ## 快速体验 36 | 1. 生成示例数据集： 37 | ```bash 38 | python tools/generate_synthetic.py --out_dir data --count 5 39 | ``` 40 | 2. 重建 SQLite 索引并执行检测： 41 | ```bash 42 | python dupcheck_cli.py \ 43 | --db_dir data/synth_db \ 44 | --input_dir data/synth_new \ 45 | --out_dir reports \ 46 | --index_db ./index.db \ 47 | --rebuild_index \ 48 | --vector_score_thresh 0.3 49 | ``` 50 | 3. 查看 `reports/dup_report.csv` 以及生成的证据图片。 51 | 4. （可选）对合成标注集进行评估，查看召回差异： 52 | ```bash 53 | python tools/verify_synthetic.py \ 54 | --db_dir data/synth_db \ 55 | --input_dir data/synth_new \ 56 | --labels data/synth_labels.csv \ 57 | --phash_thresh 16 \ 58 | --orb_inliers_thresh 6 \ 59 | --ncc_thresh 0.85 60 | ``` 61 | 5. （可选）执行阈值网格搜索，找到更优配置： 62 | ```bash 63 | python tools/tune_thresholds.py \ 64 | --labels data/synth_labels.csv \ 65 | --db_dir data/synth_db \ 66 | --input_dir data/synth_new \ 67 | --out_dir reports/tune_out 68 | ``` 69 | 70 | 若要复用已有索引，可省略 `--rebuild_index`。通过调整 `--phash_thresh`、`--orb_inliers_thresh`、`--ncc_thresh` 等参数探索查准率和召回率的平衡。 71 | 72 | ## 常用命令 73 | ```bash 74 | # 重建索引 75 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db --rebuild_index 76 | 77 | # 自定义阈值运行 78 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --phash_thresh 12 --orb_inliers_thresh 30 --ncc_thresh 0.94 79 | 80 | # 直接使用缓存索引 81 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db 82 | ``` 83 | 84 | ## 阈值调参 85 | 使用 `tools/tune_thresholds.py` 对多个阈值组合做网格搜索： 86 | 87 | ```bash 88 | python tools/tune_thresholds.py \ 89 | --labels data/synth_labels.csv \ 90 | --db_dir data/synth_db \ 91 | --input_dir data/synth_new \ 92 | --out_dir reports/tune_out 93 | ``` 94 | 95 | 脚本会输出 `tune_results.csv`，其中包含每组参数的 TP/FP/FN 统计，可据此锁定最适合的数据集配置。 96 | 97 | ## 许可协议 98 | 99 | 本项目以 [MIT License](LICENSE) 开源发布。 100 | -------------------------------------------------------------------------------- /duplicate_check/report.py: -------------------------------------------------------------------------------- 1 | """Reporting utilities: CSV output and evidence image generation (stub).
2 | 3 | 报告模块:生成 CSV 报表并创建证据图(并排显示、可绘制匹配连线)。 4 | """ 5 | import csv 6 | from pathlib import Path 7 | from typing import List, Dict 8 | from shutil import copyfile 9 | 10 | try: 11 | import cv2 12 | except Exception: 13 | cv2 = None 14 | 15 | 16 | CSV_FIELDS = [ 17 | "new_image", 18 | "matched_image", 19 | "final_label", 20 | "score", 21 | "inliers", 22 | "inlier_ratio", 23 | "ncc_peak", 24 | "evidence_img_path", 25 | ] 26 | 27 | 28 | def write_csv(rows: List[Dict], out_path: Path): 29 | with out_path.open("w", newline="", encoding="utf-8") as f: 30 | writer = csv.DictWriter(f, fieldnames=CSV_FIELDS) 31 | writer.writeheader() 32 | for r in rows: 33 | writer.writerow({k: r.get(k, "") for k in CSV_FIELDS}) 34 | 35 | 36 | def make_evidence_image(new_img_path: Path, db_img_path: Path, out_path: Path, draw_matches: bool = False, matches=None): 37 | """Create a side-by-side evidence image. If cv2 and matches provided, draw matches.""" 38 | if cv2 is None: 39 | # fallback: copy new image 40 | # 若未安装 OpenCV,则回退为直接复制新图作为证据图 41 | try: 42 | copyfile(str(new_img_path), str(out_path)) 43 | except Exception: 44 | pass 45 | return 46 | 47 | na = cv2.imread(str(new_img_path)) 48 | db = cv2.imread(str(db_img_path)) 49 | if na is None or db is None: 50 | try: 51 | copyfile(str(new_img_path), str(out_path)) 52 | except Exception: 53 | pass 54 | return 55 | 56 | # Resize to same height 57 | h = max(na.shape[0], db.shape[0]) 58 | def resize_keep(asrc, height): 59 | h0, w0 = asrc.shape[:2] 60 | scale = height / h0 61 | return cv2.resize(asrc, (int(w0 * scale), height)) 62 | 63 | na_r = resize_keep(na, h) 64 | db_r = resize_keep(db, h) 65 | 66 | if draw_matches and matches: 67 | # matches: list of ((xq,yq),(xd,yd)) pairs 68 | # build a canvas that is na_r + db_r side-by-side and draw lines 69 | concat = cv2.hconcat([na_r, db_r]) 70 | wq = na_r.shape[1] 71 | # compute scale factors from original images to resized ones 72 | hq_orig = na.shape[0] 73 | wq_orig = na.shape[1] 74 | hd_orig = db.shape[0] 75 | wd_orig = db.shape[1] 76 | h_res = h 77 | na_scale_x = na_r.shape[1] / max(1, wq_orig) 78 | na_scale_y = na_r.shape[0] / max(1, hq_orig) 79 | db_scale_x = db_r.shape[1] / max(1, wd_orig) 80 | db_scale_y = db_r.shape[0] / max(1, hd_orig) 81 | for (xq, yq), (xd, yd) in matches: 82 | pt1 = (int(xq * na_scale_x), int(yq * na_scale_y)) 83 | pt2 = (int(wq + xd * db_scale_x), int(yd * db_scale_y)) 84 | cv2.line(concat, pt1, pt2, (0, 255, 0), 1) 85 | cv2.circle(concat, pt1, 3, (0, 0, 255), -1) 86 | cv2.circle(concat, pt2, 3, (0, 0, 255), -1) 87 | try: 88 | cv2.imwrite(str(out_path), concat) 89 | except Exception: 90 | try: 91 | copyfile(str(new_img_path), str(out_path)) 92 | except Exception: 93 | pass 94 | return 95 | 96 | 97 | concat = cv2.hconcat([na_r, db_r]) 98 | try: 99 | cv2.imwrite(str(out_path), concat) 100 | except Exception: 101 | try: 102 | copyfile(str(new_img_path), str(out_path)) 103 | except Exception: 104 | pass 105 | -------------------------------------------------------------------------------- /tools/tune_thresholds.py: -------------------------------------------------------------------------------- 1 | """Threshold tuning helper. 2 | 3 | Usage: 4 | python tools/tune_thresholds.py --labels labels.csv --db_dir ./images_db --input_dir ./images_new --out_dir ./reports 5 | 6 | labels.csv should contain columns: new_image, matched_image, label (unique/partial_duplicate/exact_patch) 7 | 8 | This script sweeps phash_thresh, orb_inliers_thresh, ncc_thresh and reports simple match rate vs ground truth. 
NCC now operates on a warped ROI; use --roi_margin_ratio / --max_roi_matches to keep tuning aligned. 9 | 10 | 阈值调优脚本。 11 | 12 | 用法: 13 | python tools/tune_thresholds.py --labels labels.csv --db_dir ./images_db --input_dir ./images_new --out_dir ./reports 14 | 15 | labels.csv 应包含列:new_image, matched_image, label(unique/partial_duplicate/exact_patch) 16 | 17 | 本脚本对 phash_thresh、orb_inliers_thresh、ncc_thresh 做网格搜索,并报告与标注的 TP/FP/FN 统计。NCC 已改为基于单应 ROI 的对齐互相关,可通过 --roi_margin_ratio / --max_roi_matches 调整 ROI 设定。 18 | """ 19 | import sys 20 | import argparse 21 | import csv 22 | from pathlib import Path 23 | 24 | # Ensure repo root is on sys.path so `duplicate_check` package is importable 25 | _ROOT = Path(__file__).resolve().parents[1] 26 | if str(_ROOT) not in sys.path: 27 | sys.path.insert(0, str(_ROOT)) 28 | 29 | from duplicate_check import indexer, features, matcher 30 | 31 | 32 | def parse_args(): 33 | p = argparse.ArgumentParser() 34 | p.add_argument("--labels", required=True) 35 | p.add_argument("--db_dir", required=True) 36 | p.add_argument("--input_dir", required=True) 37 | p.add_argument("--out_dir", required=True) 38 | p.add_argument("--roi_margin_ratio", type=float, default=0.12) 39 | p.add_argument("--max_roi_matches", type=int, default=60) 40 | return p.parse_args() 41 | 42 | 43 | def load_labels(path): 44 | rows = {} 45 | with open(path, newline='', encoding='utf-8') as f: 46 | r = csv.DictReader(f) 47 | for row in r: 48 | rows[row['new_image']] = row 49 | return rows 50 | 51 | 52 | def main(): 53 | args = parse_args() 54 | labels = load_labels(args.labels) 55 | db_dir = Path(args.db_dir) 56 | input_dir = Path(args.input_dir) 57 | out_dir = Path(args.out_dir); out_dir.mkdir(parents=True, exist_ok=True) 58 | 59 | idx = indexer.build_index(db_dir) 60 | 61 | # simple sweep 62 | phash_range = [6,8,10,12] 63 | orb_range = [10,25,50] 64 | ncc_range = [0.85,0.9,0.92,0.95] 65 | 66 | results = [] 67 | for ph in phash_range: 68 | for orb_th in orb_range: 69 | for ncc in ncc_range: 70 | tp=0; fp=0; fn=0 71 | for p in input_dir.iterdir(): 72 | if not p.is_file(): 73 | continue 74 | feats = features.compute_features(p) 75 | cands = matcher.recall_candidates(feats, idx, phash_thresh=ph) 76 | rows = matcher.rerank_and_verify( 77 | p, 78 | cands, 79 | idx, 80 | orb_inliers_thresh=orb_th, 81 | ncc_thresh=ncc, 82 | roi_margin_ratio=args.roi_margin_ratio, 83 | max_roi_matches=args.max_roi_matches, 84 | ) 85 | predicted = rows[0]['matched_image'] if rows else None 86 | gt = labels.get(p.name, {}).get('matched_image') 87 | if gt and predicted == gt: 88 | tp+=1 89 | elif gt and predicted != gt: 90 | fn+=1 91 | elif not gt and predicted: 92 | fp+=1 93 | results.append((ph,orb_th,ncc,tp,fp,fn)) 94 | # write out 95 | outp = out_dir / 'tune_results.csv' 96 | with open(outp, 'w', newline='', encoding='utf-8') as f: 97 | w=csv.writer(f) 98 | w.writerow(['phash','orb','ncc','tp','fp','fn']) 99 | for r in results: 100 | w.writerow(r) 101 | print('Done. 
Results:', outp) 102 | 103 | if __name__=='__main__': 104 | main() 105 | -------------------------------------------------------------------------------- /reports/dup_report.csv: -------------------------------------------------------------------------------- 1 | new_image,matched_image,final_label,score,inliers,inlier_ratio,ncc_peak,evidence_img_path 2 | new_1_bright.jpg,base_1.jpg,partial_duplicate,15.648811834888072,166,0.35319148936170214,0.0,reports/new_1_bright__VS__base_1.jpg 3 | new_1_copy.jpg,base_1.jpg,partial_duplicate,18.364583452542625,1411,1.0,0.0038532966282218695,reports/new_1_copy__VS__base_1.jpg 4 | new_1_crop.jpg,base_1.jpg,partial_duplicate,1.157927119731903,90,0.4090909090909091,0.009402623400092125,reports/new_1_crop__VS__base_1.jpg 5 | new_1_flip.jpg,base_1.jpg,partial_duplicate,17.548451742953002,1405,0.9992887624466572,0.0,reports/new_1_flip__VS__base_1.jpg 6 | new_1_ps.jpg,base_1.jpg,partial_duplicate,17.587277013366506,990,0.9482758620689655,-0.0046783494763076305,reports/new_1_ps__VS__base_1.jpg 7 | new_2_bright.jpg,base_2.jpg,partial_duplicate,14.278943573362971,162,0.3894230769230769,0.0,reports/new_2_bright__VS__base_2.jpg 8 | new_2_copy.jpg,base_2.jpg,partial_duplicate,16.791666785875954,1414,1.0,0.0,reports/new_2_copy__VS__base_2.jpg 9 | new_2_copy.jpg,base_3.jpg,partial_duplicate,13.86416643242519,27,0.29347826086956524,0.0,reports/new_2_copy__VS__base_3.jpg 10 | new_2_crop.jpg,base_2.jpg,partial_duplicate,1.1774003977885679,75,0.39473684210526316,-0.002573883393779397,reports/new_2_crop__VS__base_2.jpg 11 | new_2_flip.jpg,base_2.jpg,partial_duplicate,16.010963896910347,1414,1.0,0.0,reports/new_2_flip__VS__base_2.jpg 12 | new_2_flip.jpg,base_3.jpg,partial_duplicate,13.815720035438547,27,0.29347826086956524,0.0,reports/new_2_flip__VS__base_3.jpg 13 | new_2_jpeg30.jpg,base_2.jpg,partial_duplicate,13.72376012705414,154,0.4425287356321839,0.0,reports/new_2_jpeg30__VS__base_2.jpg 14 | new_2_ps.jpg,base_2.jpg,partial_duplicate,15.997115687025545,1006,0.9599236641221374,0.0,reports/new_2_ps__VS__base_2.jpg 15 | new_3_bright.jpg,base_3.jpg,partial_duplicate,13.73724901047934,161,0.4086294416243655,0.0020003009121865034,reports/new_3_bright__VS__base_3.jpg 16 | new_3_copy.jpg,base_3.jpg,partial_duplicate,16.274227647299178,1414,1.0,0.0020335863810032606,reports/new_3_copy__VS__base_3.jpg 17 | new_3_copy.jpg,base_2.jpg,partial_duplicate,13.877052221405371,31,0.27927927927927926,0.0,reports/new_3_copy__VS__base_2.jpg 18 | new_3_crop.jpg,base_3.jpg,partial_duplicate,1.186078881467139,85,0.43147208121827413,0.0,reports/new_3_crop__VS__base_3.jpg 19 | new_3_flip.jpg,base_3.jpg,partial_duplicate,15.493055590306328,1414,1.0,0.0,reports/new_3_flip__VS__base_3.jpg 20 | new_3_jpeg30.jpg,base_3.jpg,partial_duplicate,13.432698663339925,144,0.4161849710982659,0.0016456048469990492,reports/new_3_jpeg30__VS__base_3.jpg 21 | new_3_ps.jpg,base_3.jpg,partial_duplicate,15.450083545037916,968,0.944390243902439,0.0,reports/new_3_ps__VS__base_3.jpg 22 | new_3_ps.jpg,base_1.jpg,partial_duplicate,15.28378456336357,25,0.25510204081632654,0.0,reports/new_3_ps__VS__base_1.jpg 23 | new_3_ps.jpg,base_2.jpg,partial_duplicate,14.509196431986936,28,0.27450980392156865,0.0,reports/new_3_ps__VS__base_2.jpg 24 | new_4_bright.jpg,base_4.jpg,partial_duplicate,13.11780427361023,123,0.36607142857142855,0.004998629447072744,reports/new_4_bright__VS__base_4.jpg 25 | new_4_copy.jpg,base_4.jpg,partial_duplicate,15.802083333333334,1442,1.0,0.0,reports/new_4_copy__VS__base_4.jpg 26 | 
new_4_crop.jpg,base_4.jpg,partial_duplicate,1.1693956007455526,77,0.39086294416243655,0.0,reports/new_4_crop__VS__base_4.jpg 27 | new_4_flip.jpg,base_4.jpg,partial_duplicate,14.595425144247953,1270,0.959214501510574,0.0,reports/new_4_flip__VS__base_4.jpg 28 | new_4_jpeg30.jpg,base_4.jpg,partial_duplicate,13.432213366064898,146,0.37823834196891193,0.0,reports/new_4_jpeg30__VS__base_4.jpg 29 | new_4_ps.jpg,base_4.jpg,partial_duplicate,15.009747378792026,1015,0.9424326833797586,0.0,reports/new_4_ps__VS__base_4.jpg 30 | new_5_bright.jpg,base_5.jpg,partial_duplicate,15.370038690126217,103,0.356401384083045,0.0,reports/new_5_bright__VS__base_5.jpg 31 | new_5_copy.jpg,base_5.jpg,partial_duplicate,17.821538426124057,1449,1.0,0.00033754599280655384,reports/new_5_copy__VS__base_5.jpg 32 | new_5_crop.jpg,base_5.jpg,partial_duplicate,1.1523947505389942,73,0.3989071038251366,-0.0007181827677413821,reports/new_5_crop__VS__base_5.jpg 33 | new_5_flip.jpg,base_5.jpg,partial_duplicate,17.01505248709818,1449,1.0,0.0,reports/new_5_flip__VS__base_5.jpg 34 | new_5_jpeg30.jpg,base_5.jpg,partial_duplicate,14.749811130209066,131,0.37110481586402266,0.0,reports/new_5_jpeg30__VS__base_5.jpg 35 | new_5_ps.jpg,base_5.jpg,partial_duplicate,17.09211623273774,1021,0.9453703703703704,0.0,reports/new_5_ps__VS__base_5.jpg 36 | new_5_rot.jpg,base_5.jpg,partial_duplicate,7.367151720660745,42,0.2781456953642384,0.0,reports/new_5_rot__VS__base_5.jpg 37 | -------------------------------------------------------------------------------- /README_en.md: -------------------------------------------------------------------------------- 1 | # DupCheck — Duplicate & Tamper Detection 2 | 3 | ## Overview 4 | DupCheck solves broad “duplicate / tamper detection” needs: It works in insurance claim review, content moderation, e-commerce authenticity checks, and copyright protection. It began as a submodule designed to stop third-party repair contractors from re-uploading maintenance photos to claim duplicate reimbursements; I later spun it out, optimised it, and expanded it into a general-purpose toolkit. Uploads are compared against a reference gallery to flag exact copies, crops, rotations, flips, and lightly edited variants, producing reviewer-friendly evidence. 5 | 6 | The pipeline is pure Python with minimal dependencies, making it easy to embed into intake pipelines or back-office review systems. 7 | 8 | ## Detection flow 9 | 1. **Index build** – each gallery image is converted to multiple perceptual hashes (original, rotations, flips), multi-scale tile hashes, cached ORB descriptors, and optional ResNet-18 / CLIP embeddings to support geometric and coarse semantic changes. 10 | 2. **Candidate recall** – a new upload is compared with the index via pHash buckets, tile voting, and optional FAISS (ResNet-18/CLIP) vector search; if needed, multi-orientation ORB matching pulls in additional suspects. 11 | 3. **Verification** – the best orientation pair runs ORB + RANSAC. When the homography is reliable, NCC on the corresponding patch upgrades matches to `exact_patch`. 12 | 4. **Reporting** – results are written to `dup_report.csv`, and the CLI can render side-by-side evidence images for manual review. 13 | 5. **Threshold tuning** – optionally run `tools/tune_thresholds.py` to grid-search `phash/ORB/NCC` thresholds and pick the best configuration for your data. 
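To make the recall stage above concrete, the sketch below compares one query image against one gallery image using the same primitives the pipeline relies on: `features.compute_phash` for the perceptual hash and `matcher.hamming_distance_hex` for the bit-difference count. The paths come from the synthetic demo dataset, and the threshold of 10 simply mirrors the default `phash_thresh` in `matcher.recall_candidates`; treat both as illustrative rather than recommended production settings.

```python
from pathlib import Path

from duplicate_check.features import compute_phash
from duplicate_check.matcher import hamming_distance_hex

# Demo images from the synthetic dataset; substitute your own gallery/query files.
query = Path("data/synth_new/new_1_copy.jpg")
gallery = Path("data/synth_db/base_1.jpg")

# compute_phash returns the hash as a hex string (it falls back to a truncated
# SHA1 of the file contents when Pillow/imagehash are unavailable).
q_hash = compute_phash(query)
g_hash = compute_phash(gallery)

# Hamming distance between the two hashes; small distances indicate the images
# are near-identical at a global level.
dist = hamming_distance_hex(q_hash, g_hash)
print(f"pHash Hamming distance: {dist}")

# Illustrative cut-off (the library default in matcher.recall_candidates is 10).
if dist <= 10:
    print("candidate duplicate -> hand off to ORB + RANSAC / NCC verification")
```

In the full pipeline this comparison runs against every pHash bucket in the index, and tile voting plus the optional vector search widen the candidate set before the expensive verification step.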
14 | 15 | > **Scaling tip:** Set `DUPC_VECTOR_INDEX=ivf_pq` or `hnsw` to switch the built-in FAISS index; for even larger deployments, replace the FAISS block in `duplicate_check/indexer.py` / `load_index_from_db` with writes to Milvus, Qdrant, Pinecone, etc., and query that service from `matcher.recall_candidates` before ORB reranking. 16 | > **Performance tip:** Tune `DUPC_TILE_SCALES` (e.g., `1.0,0.6`) and `DUPC_TILE_GRID` to trade multi-scale robustness for runtime when processing massive galleries. 17 | 18 | ## Project layout 19 | - `duplicate_check/` — core library modules (`features`, `indexer`, `matcher`, `report`). 20 | - `dupcheck_cli.py` — main CLI wrapper supporting in-memory or SQLite indices. 21 | - `duplicate_check.py` — minimal entry point kept for backwards compatibility. 22 | - `tools/` — utilities for synthetic data generation and threshold tuning. 23 | - `tests/` — quick test. 24 | - `data/` — sample synthetic dataset used by the documentation examples. 25 | 26 | ## Requirements 27 | Install dependencies listed in `requirements.txt` inside a Python 3.9+ environment. OpenCV, Pillow, imagehash, `torch`, `torchvision`, and (optionally) `faiss-cpu` enable the full feature set; the pipeline falls back gracefully if some extras are unavailable. 28 | 29 | ```bash 30 | python -m venv .venv 31 | source .venv/bin/activate 32 | pip install -r requirements.txt 33 | ``` 34 | 35 | Optional extras: install `faiss-cpu` (for ANN recall) and either `open-clip-torch` or `clip` if you want CLIP-ViT embeddings in addition to ResNet. 36 | 37 | ## Quick start 38 | 1. Generate the demo dataset: 39 | ```bash 40 | python tools/generate_synthetic.py --out_dir data --count 5 41 | ``` 42 | 2. Rebuild the SQLite index and run detection: 43 | ```bash 44 | python dupcheck_cli.py \ 45 | --db_dir data/synth_db \ 46 | --input_dir data/synth_new \ 47 | --out_dir reports \ 48 | --index_db ./index.db \ 49 | --rebuild_index \ 50 | --vector_score_thresh 0.3 51 | ``` 52 | 3. Inspect `reports/dup_report.csv` and the generated evidence JPEGs. 53 | 4. (Optional) Benchmark on the labelled synthetic set and review mismatches: 54 | ```bash 55 | python tools/verify_synthetic.py \ 56 | --db_dir data/synth_db \ 57 | --input_dir data/synth_new \ 58 | --labels data/synth_labels.csv \ 59 | --phash_thresh 16 \ 60 | --orb_inliers_thresh 6 \ 61 | --ncc_thresh 0.85 62 | ``` 63 | 5. (Optional) Launch a grid search over thresholds: 64 | ```bash 65 | python tools/tune_thresholds.py \ 66 | --labels data/synth_labels.csv \ 67 | --db_dir data/synth_db \ 68 | --input_dir data/synth_new \ 69 | --out_dir reports/tune_out 70 | ``` 71 | 72 | To reuse an existing index, drop the `--rebuild_index` flag. Tweak `--phash_thresh`, `--orb_inliers_thresh`, and `--ncc_thresh` to experiment with precision/recall. 
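## Python usage

The CLI wraps a small library API, so the same pipeline can be embedded in your own intake or review service. The sketch below mirrors what `tools/verify_synthetic.py` does: build an index over the gallery, then recall and verify each new upload. Paths and thresholds are illustrative (the numbers shown are the defaults used by `tools/verify_synthetic.py`), and `rerank_and_verify` accepts further knobs such as `roi_margin_ratio` and `max_roi_matches`.

```python
from pathlib import Path

from duplicate_check import features, indexer, matcher

db_dir = Path("data/synth_db")      # reference gallery
input_dir = Path("data/synth_new")  # new uploads to screen

# Build the in-memory index (multi-orientation pHash, tile hashes, optional vectors).
idx = indexer.build_index(db_dir)

for img_path in sorted(input_dir.iterdir()):
    if not img_path.is_file():
        continue
    feats = features.compute_features(img_path)
    # Cheap recall: pHash buckets, tile voting, optional vector search.
    cands = matcher.recall_candidates(feats, idx, topk=50, phash_thresh=10)
    # Expensive verification: ORB + RANSAC, then NCC on the aligned ROI.
    rows = matcher.rerank_and_verify(
        img_path,
        cands,
        idx,
        orb_inliers_thresh=25,
        ncc_thresh=0.92,
    )
    if rows:
        best = rows[0]
        print(img_path.name, "->", best["matched_image"], f"({best['final_label']})")
    else:
        print(img_path.name, "-> unique")
```

For batch jobs or a long-running service, prefer the SQLite index (`--index_db` in the CLI, or `indexer.build_index_db` / `indexer.load_index_from_db` in code) so gallery features are not recomputed on every run.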
73 | 74 | ## CLI examples 75 | ```bash 76 | # Rebuild index for fresh data 77 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db --rebuild_index 78 | 79 | # Run with custom thresholds 80 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --phash_thresh 12 --orb_inliers_thresh 30 --ncc_thresh 0.94 81 | 82 | # Quick scan using the cached index 83 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db 84 | ``` 85 | 86 | ## Threshold tuning 87 | Use `tools/tune_thresholds.py` with the synthetic labels to grid-search thresholds: 88 | 89 | ```bash 90 | python tools/tune_thresholds.py \ 91 | --labels data/synth_labels.csv \ 92 | --db_dir data/synth_db \ 93 | --input_dir data/synth_new \ 94 | --out_dir reports/tune_out 95 | ``` 96 | 97 | The script writes `tune_results.csv` with TP/FP/FN counts for each parameter combo, making it easy to lock in settings for your own data. 98 | 99 | ## License 100 | 101 | This project is released under the [MIT License](LICENSE). 102 | -------------------------------------------------------------------------------- /tools/verify_synthetic.py: -------------------------------------------------------------------------------- 1 | """Quick evaluator for the synthetic DupCheck dataset. 2 | 3 | Usage example: 4 | python tools/verify_synthetic.py \ 5 | --db_dir data/synth_db \ 6 | --input_dir data/synth_new \ 7 | --labels data/synth_labels.csv 8 | 9 | The script runs the duplicate detection pipeline against the synthetic 10 | dataset and reports how many annotated duplicates / uniques are detected 11 | correctly along with any mismatches it finds. 12 | """ 13 | import argparse 14 | import csv 15 | import sys 16 | from pathlib import Path 17 | from typing import Dict, List 18 | 19 | ROOT = Path(__file__).resolve().parents[1] 20 | if str(ROOT) not in sys.path: 21 | sys.path.insert(0, str(ROOT)) 22 | 23 | from duplicate_check import features, indexer, matcher 24 | 25 | 26 | def check_dependencies() -> List[str]: 27 | missing: List[str] = [] 28 | if not getattr(features, "PIL_AVAILABLE", False) or getattr(features, "imagehash", None) is None: 29 | missing.append("Pillow + imagehash (needed for perceptual hash and tile hashing)") 30 | if getattr(features, "cv2", None) is None: 31 | missing.append("opencv-python (needed for ORB matching and NCC verification)") 32 | return missing 33 | 34 | 35 | def load_labels(path: Path) -> Dict[str, Dict[str, str]]: 36 | rows: Dict[str, Dict[str, str]] = {} 37 | with path.open(newline="", encoding="utf-8") as f: 38 | reader = csv.DictReader(f) 39 | for row in reader: 40 | rows[row["new_image"]] = row 41 | return rows 42 | 43 | 44 | def evaluate( 45 | db_dir: Path, 46 | input_dir: Path, 47 | labels_path: Path, 48 | *, 49 | topk: int, 50 | phash_thresh: int, 51 | orb_inliers_thresh: int, 52 | ncc_thresh: float, 53 | vector_score_thresh: float, 54 | roi_margin_ratio: float, 55 | max_roi_matches: int, 56 | ) -> Dict[str, object]: 57 | labels = load_labels(labels_path) 58 | idx = indexer.build_index(db_dir) 59 | 60 | stats = { 61 | "duplicate_total": 0, 62 | "duplicate_hits": 0, 63 | "unique_total": 0, 64 | "unique_hits": 0, 65 | "mismatches": [], 66 | } 67 | 68 | for img_path in sorted(input_dir.iterdir()): 69 | if not img_path.is_file(): 70 | continue 71 | feats = features.compute_features(img_path) 72 | cands = matcher.recall_candidates( 73 | feats, 74 | idx, 75 | topk=topk, 76 | 
phash_thresh=phash_thresh, 77 | vector_score_thresh=vector_score_thresh, 78 | ) 79 | rows = matcher.rerank_and_verify( 80 | img_path, 81 | cands, 82 | idx, 83 | orb_inliers_thresh=orb_inliers_thresh, 84 | ncc_thresh=ncc_thresh, 85 | roi_margin_ratio=roi_margin_ratio, 86 | max_roi_matches=max_roi_matches, 87 | ) 88 | 89 | meta = labels.get(img_path.name, {"matched_image": "", "label": "unique"}) 90 | gt_match = meta.get("matched_image") or "" 91 | gt_label = meta.get("label", "unique") 92 | 93 | predicted_label = rows[0]["final_label"] if rows else "unique" 94 | predicted_match = rows[0]["matched_image"] if rows else "" 95 | if predicted_label == "unique": 96 | predicted_match = "" 97 | 98 | if gt_match: 99 | stats["duplicate_total"] += 1 100 | if predicted_match == gt_match: 101 | stats["duplicate_hits"] += 1 102 | else: 103 | stats["mismatches"].append( 104 | { 105 | "image": img_path.name, 106 | "expected_match": gt_match, 107 | "expected_label": gt_label, 108 | "predicted_match": rows[0]["matched_image"] if rows else "", 109 | "predicted_label": predicted_label, 110 | } 111 | ) 112 | else: 113 | stats["unique_total"] += 1 114 | if not predicted_match: 115 | stats["unique_hits"] += 1 116 | else: 117 | stats["mismatches"].append( 118 | { 119 | "image": img_path.name, 120 | "expected_match": "", 121 | "expected_label": gt_label, 122 | "predicted_match": rows[0]["matched_image"] if rows else "", 123 | "predicted_label": predicted_label, 124 | } 125 | ) 126 | 127 | return stats 128 | 129 | 130 | def format_summary(stats: Dict[str, object]) -> str: 131 | dup_total = stats["duplicate_total"] or 1 132 | uniq_total = stats["unique_total"] or 1 133 | lines: List[str] = [] 134 | lines.append( 135 | f"Duplicate accuracy: {stats['duplicate_hits']}/{stats['duplicate_total']}" 136 | f" ({stats['duplicate_hits']/dup_total:.1%})" 137 | ) 138 | lines.append( 139 | f"Unique accuracy: {stats['unique_hits']}/{stats['unique_total']}" 140 | f" ({stats['unique_hits']/uniq_total:.1%})" 141 | ) 142 | mismatches = stats["mismatches"] 143 | if mismatches: 144 | lines.append("\nMismatches:") 145 | for miss in mismatches: 146 | lines.append( 147 | f" - {miss['image']}: expected {miss['expected_match'] or 'unique'}" 148 | f" → predicted {miss['predicted_match'] or miss['predicted_label']}" 149 | ) 150 | else: 151 | lines.append("\nAll samples matched expected labels.") 152 | return "\n".join(lines) 153 | 154 | 155 | def parse_args() -> argparse.Namespace: 156 | p = argparse.ArgumentParser(description="Evaluate synthetic DupCheck dataset") 157 | p.add_argument("--db_dir", default="data/synth_db") 158 | p.add_argument("--input_dir", default="data/synth_new") 159 | p.add_argument("--labels", default="data/synth_labels.csv") 160 | p.add_argument("--topk", type=int, default=50) 161 | p.add_argument("--phash_thresh", type=int, default=10) 162 | p.add_argument("--orb_inliers_thresh", type=int, default=25) 163 | p.add_argument("--ncc_thresh", type=float, default=0.92) 164 | p.add_argument("--vector_score_thresh", type=float, default=0.0) 165 | p.add_argument("--roi_margin_ratio", type=float, default=0.12) 166 | p.add_argument("--max_roi_matches", type=int, default=60) 167 | return p.parse_args() 168 | 169 | 170 | def main() -> None: 171 | args = parse_args() 172 | missing = check_dependencies() 173 | if missing: 174 | print("Warning: required imaging dependencies missing; results will be unreliable.") 175 | for item in missing: 176 | print(f" - {item}") 177 | print("Install them via `pip install -r requirements.txt` and 
re-run this script.") 178 | return 179 | stats = evaluate( 180 | Path(args.db_dir), 181 | Path(args.input_dir), 182 | Path(args.labels), 183 | topk=args.topk, 184 | phash_thresh=args.phash_thresh, 185 | orb_inliers_thresh=args.orb_inliers_thresh, 186 | ncc_thresh=args.ncc_thresh, 187 | vector_score_thresh=args.vector_score_thresh, 188 | roi_margin_ratio=args.roi_margin_ratio, 189 | max_roi_matches=args.max_roi_matches, 190 | ) 191 | print(format_summary(stats)) 192 | 193 | 194 | if __name__ == "__main__": 195 | main() 196 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DupCheck — Duplicate & Tamper Detection / 图片重复与伪造检测 2 | 3 |
4 | English | 中文 5 |
6 | 7 | --- 8 | 9 |
10 | English 11 | 12 | ### Overview 13 | DupCheck targets broader duplicate/tamper detection needs: It works for insurance claim review, content moderation, e-commerce authenticity checks, and copyright protection. It was originally a submodule built to stop third-party repair contractors from re-uploading maintenance photos to claim duplicate reimbursements; I later spun it out, optimised it, and generalised it for additional scenarios. The system compares every new image against a reference gallery, flags exact copies, crops, rotations/flips, and subtle edits, then produces reviewer-friendly evidence. 14 | 15 | The implementation is pure Python and depends only on widely available imaging libraries, which keeps integration with existing intake or back-office pipelines straightforward. 16 | 17 | ### Detection flow 18 | 1. **Index build** – gallery images are converted to multi-orientation pHash, multi-scale tile hashes, cached ORB descriptors, and optional ResNet-18 / CLIP embeddings so geometric tweaks and coarse semantics remain discoverable. 19 | 2. **Candidate recall** – a new upload is matched through pHash buckets, tile voting, and optional FAISS (ResNet-18/CLIP) vector search; if necessary, orientation-aware ORB matching pulls in additional suspects. 20 | 3. **Verification** – the best orientation pair runs ORB + RANSAC. We warp the database image via the estimated homography, crop a small ROI around the inlier hull, and run ZNCC on that aligned patch; high correlation promotes the match to `exact_patch`. 21 | 4. **Reporting** – matches are recorded in `dup_report.csv`, and the CLI can render side-by-side evidence images for manual review. 22 | 23 | > **Scaling tip:** Set `DUPC_VECTOR_INDEX=ivf_pq` or `hnsw` to switch the built-in FAISS index; for very large galleries or cluster deployments, replace the in-process FAISS index with an external vector database (e.g., Milvus, Qdrant, Pinecone). A natural hook is the `duplicate_check/indexer.py::build_index` / `load_index_from_db` functions—swap the FAISS creation for remote writes, and query that service inside `matcher.recall_candidates` before running ORB reranking. 24 | > **Performance tip:** Adjust `DUPC_TILE_SCALES` (e.g., `1.0,0.6`) and `DUPC_TILE_GRID` to balance multi-scale accuracy against runtime when processing massive galleries. 25 | 26 | ### Project layout 27 | - `duplicate_check/` — core modules (`features`, `indexer`, `matcher`, `report`). 28 | - `dupcheck_cli.py` — main CLI with in-memory and SQLite index support. 29 | - `duplicate_check.py` — legacy entrypoint kept for backward compatibility. 30 | - `tools/` — helpers for synthetic data generation and threshold tuning. 31 | - `tests/` - quick test. 32 | - `data/` — synthetic dataset used in docs and examples. 33 | 34 | ### Requirements 35 | Install the dependencies from `requirements.txt` inside a Python 3.9+ environment. Pillow, OpenCV, imagehash, `torch`, `torchvision`, and (optionally) `faiss-cpu` unlock the full feature set; the code degrades gracefully if some extras are missing. 36 | 37 | ```bash 38 | python -m venv .venv 39 | source .venv/bin/activate 40 | pip install -r requirements.txt 41 | ``` 42 | 43 | Optional extras: install `faiss-cpu` (for ANN recall) and either `open-clip-torch` or `clip` if you want CLIP-ViT embeddings in addition to ResNet. 44 | 45 | ### Quick start 46 | 1. Generate the demo dataset: 47 | ```bash 48 | python tools/generate_synthetic.py --out_dir data --count 5 49 | ``` 50 | 2. 
Rebuild the SQLite index and run detection: 51 | ```bash 52 | python dupcheck_cli.py \ 53 | --db_dir data/synth_db \ 54 | --input_dir data/synth_new \ 55 | --out_dir reports \ 56 | --index_db ./index.db \ 57 | --rebuild_index \ 58 | --vector_score_thresh 0.3 59 | ``` 60 | 3. Inspect `reports/dup_report.csv` alongside the generated evidence JPEGs. 61 | 4. (Optional) Benchmark on the synthetic labels and inspect mismatches: 62 | ```bash 63 | python tools/verify_synthetic.py \ 64 | --db_dir data/synth_db \ 65 | --input_dir data/synth_new \ 66 | --labels data/synth_labels.csv \ 67 | --phash_thresh 16 \ 68 | --orb_inliers_thresh 6 \ 69 | --ncc_thresh 0.88 \ 70 | --roi_margin_ratio 0.12 \ 71 | --max_roi_matches 60 72 | ``` 73 | 5. (Optional) Run a threshold grid search to tune the pipeline: 74 | ```bash 75 | python tools/tune_thresholds.py \ 76 | --labels data/synth_labels.csv \ 77 | --db_dir data/synth_db \ 78 | --input_dir data/synth_new \ 79 | --out_dir reports/tune_out 80 | ``` 81 | 82 | Drop `--rebuild_index` to reuse a cached index. Tune `--phash_thresh`, `--orb_inliers_thresh`, and `--ncc_thresh` to explore different precision/recall tradeoffs. 83 | 84 | ### CLI examples 85 | ```bash 86 | # Rebuild index for fresh data 87 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db --rebuild_index 88 | 89 | # Run with custom thresholds 90 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --phash_thresh 12 --orb_inliers_thresh 30 --ncc_thresh 0.94 91 | 92 | # Quick scan using the cached index 93 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db 94 | ``` 95 | 96 | ### Threshold tuning 97 | Use `tools/tune_thresholds.py` with the synthetic labels to sweep detection thresholds: 98 | 99 | ```bash 100 | python tools/tune_thresholds.py \ 101 | --labels data/synth_labels.csv \ 102 | --db_dir data/synth_db \ 103 | --input_dir data/synth_new \ 104 | --out_dir reports/tune_out 105 | ``` 106 | 107 | The script writes `tune_results.csv` containing TP/FP/FN counts for each parameter combo so you can lock in thresholds for your own dataset. 108 | 109 | ## License 110 | 111 | This project is released under the [MIT License](LICENSE). 112 | 113 |
114 | 115 |
116 | 中文 117 | 118 | ### 项目简介 119 | DupCheck 面向广义的“图库去重 / 篡改检测”场景:不仅可用于理赔审核,也适合内容审核、电商验真、图像版权保护等业务。它最初用于防止第三方维修工重复上传维修照片骗取维修资金,原为某项目的子模块;后来我将其独立化、优化,并扩展为通用的重复与篡改检测工具,可适用于更多场景。系统会把新上传图片与历史图库逐一比对,识别完全重复、局部重复、旋转/翻转及轻度改动的图像,并输出便于人工复核的证据。 120 | 121 | 项目依赖常见的 Python 图像 / 深度学习库,可嵌入各类上传管线或后台审核流程。 122 | 123 | ### 检测流程 124 | 1. **构建索引**:对图库图片计算多姿态 pHash(原图、旋转、翻转)、多尺度块哈希、缓存 ORB 关键点,并可生成 ResNet-18 / CLIP 嵌入,确保几何和粗语义变化也能被召回。 125 | 2. **召回候选**:新图片通过 pHash/块哈希匹配,并可结合基于 ResNet-18/CLIP 的 FAISS 向量检索;如有需要再执行多姿态 ORB 匹配,把旋转、翻转的嫌疑图拉入候选集。 126 | 3. **精排验证**:对最佳姿态组合执行 ORB + RANSAC,将数据库图像按单应变换对齐到查询图坐标系后,在内点凸包附近裁剪 ROI,计算对齐区域的 ZNCC,判定是否为 `exact_patch`。 127 | 4. **结果输出**:检测结论写入 `dup_report.csv`,命令行可生成对照证据图,辅助人工审核。 128 | 5. **阈值调优**:可选地运行 `tools/tune_thresholds.py` 做网格搜索,针对不同场景选择更合适的 `phash/ORB/NCC` 参数。 129 | 130 | > **扩展建议**:可通过设置环境变量 `DUPC_VECTOR_INDEX=ivf_pq` 或 `hnsw` 切换内置 FAISS 索引;若图库规模巨大或需集群部署,可在 `duplicate_check/indexer.py` / `load_index_from_db` 中替换 FAISS,为 Milvus、Qdrant、Pinecone 等外部向量库写入,并在 `matcher.recall_candidates` 中调用该服务。 131 | 132 | ### 目录结构 133 | - `duplicate_check/` —— 核心模块(`features`、`indexer`、`matcher`、`report`)。 134 | - `dupcheck_cli.py` —— 主命令行工具,支持内存或 SQLite 索引。 135 | - `duplicate_check.py` —— 保留的兼容性入口脚本。 136 | - `tools/` —— 合成数据生成、阈值调参等辅助脚本。 137 | - `tests/` —— 测试。 138 | - `data/` —— 文档示例所用的合成数据集。 139 | 140 | ### 环境依赖 141 | 建议在 Python 3.9+ 中创建虚拟环境,并安装 `requirements.txt` 列出的依赖。OpenCV、Pillow、imagehash、`torch`、`torchvision` 与可选的 `faiss-cpu` 能启用全部功能,缺失时流程会自动降级。 142 | 143 | ```bash 144 | python -m venv .venv 145 | source .venv/bin/activate 146 | pip install -r requirements.txt 147 | ``` 148 | 149 | 可选依赖:`faiss-cpu`(向量召回),以及 `open-clip-torch` 或 `clip`(启用 CLIP-ViT 向量)。 150 | 151 | ### 快速体验 152 | 1. 生成示例数据集: 153 | ```bash 154 | python tools/generate_synthetic.py --out_dir data --count 5 155 | ``` 156 | 2. 重建 SQLite 索引并运行检测: 157 | ```bash 158 | python dupcheck_cli.py \ 159 | --db_dir data/synth_db \ 160 | --input_dir data/synth_new \ 161 | --out_dir reports \ 162 | --index_db ./index.db \ 163 | --rebuild_index 164 | ``` 165 | 3. 查看 `reports/dup_report.csv` 及生成的证据图片。 166 | 4. (可选)对合成标注集进行评估,查看召回差异: 167 | ```bash 168 | python tools/verify_synthetic.py \ 169 | --db_dir data/synth_db \ 170 | --input_dir data/synth_new \ 171 | --labels data/synth_labels.csv \ 172 | --phash_thresh 16 \ 173 | --orb_inliers_thresh 6 \ 174 | --ncc_thresh 0.88 \ 175 | --roi_margin_ratio 0.12 \ 176 | --max_roi_matches 60 177 | ``` 178 | 179 | 如需复用已有索引,可省略 `--rebuild_index`。可通过 `--phash_thresh`、`--orb_inliers_thresh`、`--ncc_thresh` 调整查准率与召回率之间的权衡。 180 | 181 | ### 常用命令 182 | ```bash 183 | # 重建索引 184 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db --rebuild_index 185 | 186 | # 自定义阈值运行 187 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --phash_thresh 12 --orb_inliers_thresh 30 --ncc_thresh 0.94 188 | 189 | # 使用已有索引快速扫描 190 | python dupcheck_cli.py --db_dir data/synth_db --input_dir data/synth_new --out_dir reports --index_db ./index.db 191 | ``` 192 | 193 | ### 阈值调参 194 | 使用 `tools/tune_thresholds.py` 对阈值组合进行网格搜索: 195 | 196 | ```bash 197 | python tools/tune_thresholds.py \ 198 | --labels data/synth_labels.csv \ 199 | --db_dir data/synth_db \ 200 | --input_dir data/synth_new \ 201 | --out_dir reports/tune_out 202 | ``` 203 | 204 | 脚本会输出 `tune_results.csv`,包含每组参数的 TP/FP/FN 统计,可据此锁定适合业务数据的阈值。 205 | 206 |
207 | -------------------------------------------------------------------------------- /duplicate_check/indexer.py: -------------------------------------------------------------------------------- 1 | """Index utilities with in-memory + SQLite backends supporting tile & vector lookups.""" 2 | from contextlib import closing 3 | import json 4 | import os 5 | import sqlite3 6 | from pathlib import Path 7 | from typing import Any, Dict, List, Optional 8 | 9 | try: 10 | import numpy as np 11 | except Exception: 12 | np = None 13 | 14 | try: 15 | import faiss 16 | except Exception: 17 | faiss = None 18 | 19 | from duplicate_check.features import ( 20 | compute_phash, 21 | compute_tile_hashes, 22 | compute_phash_variants, 23 | compute_embedding, 24 | ) 25 | 26 | VECTOR_INDEX_TYPE = os.environ.get("DUPC_VECTOR_INDEX", "flat").lower() 27 | VECTOR_INDEX_NLIST = int(os.environ.get("DUPC_VECTOR_NLIST", "1024")) 28 | VECTOR_INDEX_PQ_M = int(os.environ.get("DUPC_VECTOR_PQ_M", "16")) 29 | VECTOR_HNSW_M = int(os.environ.get("DUPC_VECTOR_HNSW_M", "32")) 30 | VECTOR_HNSW_EF = int(os.environ.get("DUPC_VECTOR_HNSW_EF", "64")) 31 | 32 | 33 | def _build_vector_index(embeddings: List[Any], ids: List[str]) -> Optional[Dict[str, Any]]: 34 | if not embeddings or faiss is None or np is None: 35 | return None 36 | try: 37 | mat = np.stack(embeddings).astype("float32") 38 | except Exception: 39 | return None 40 | if mat.size == 0: 41 | return None 42 | dim = mat.shape[1] 43 | index = None 44 | metric = "ip" 45 | index_type = VECTOR_INDEX_TYPE 46 | try: 47 | if index_type == "ivf_pq" and mat.shape[0] > VECTOR_INDEX_PQ_M: 48 | nlist = min(max(1, VECTOR_INDEX_NLIST), mat.shape[0]) 49 | quantizer = faiss.IndexFlatIP(dim) 50 | index = faiss.IndexIVFPQ(quantizer, dim, nlist, VECTOR_INDEX_PQ_M, 8) 51 | index.train(mat) 52 | index.add(mat) 53 | index.nprobe = max(1, min(nlist, nlist // 10 or 1)) 54 | elif index_type == "hnsw": 55 | hnsw_m = max(2, VECTOR_HNSW_M) 56 | index = faiss.IndexHNSWFlat(dim, hnsw_m) 57 | index.hnsw.efConstruction = max(hnsw_m, VECTOR_HNSW_EF) 58 | index.add(mat) 59 | index.hnsw.efSearch = max(hnsw_m, VECTOR_HNSW_EF) 60 | else: 61 | index = faiss.IndexFlatIP(dim) 62 | index.add(mat) 63 | index_type = "flat" 64 | except Exception: 65 | index = None 66 | if index is None: 67 | return None 68 | return {"index": index, "ids": ids, "metric": metric, "type": index_type} 69 | 70 | 71 | def build_index(db_dir: Path, tile_grid: int = 8) -> Dict[str, Any]: 72 | """Build an in-memory index containing multi-scale hashes and optional vectors.""" 73 | idx = {"by_id": {}, "by_phash": {}, "by_tile": {}, "vector": None} 74 | use_vectors = faiss is not None and np is not None 75 | vector_embeddings = [] 76 | vector_ids = [] 77 | for p in sorted(db_dir.iterdir()): 78 | if not p.is_file(): 79 | continue 80 | pid = p.name 81 | ph_variants = compute_phash_variants(p) 82 | primary_ph = ph_variants[0] 83 | tiles = compute_tile_hashes(p, grid=tile_grid) 84 | idx["by_id"][pid] = { 85 | "path": str(p), 86 | "phash": primary_ph, 87 | "phash_variants": ph_variants, 88 | "tiles": tiles, 89 | } 90 | for ph in ph_variants: 91 | bucket = idx["by_phash"].setdefault(ph, []) 92 | if pid not in bucket: 93 | bucket.append(pid) 94 | for tile in tiles: 95 | th = tile.get("hash") 96 | if not th: 97 | continue 98 | entry = { 99 | "img_id": pid, 100 | "bbox": tile.get("bbox", (0, 0, 0, 0)), 101 | "scale": tile.get("scale", 1.0), 102 | } 103 | idx["by_tile"].setdefault(th, []).append(entry) 104 | if use_vectors and np is not None: 105 | try: 106 
| emb_val = compute_embedding(p) 107 | if emb_val is None: 108 | continue 109 | emb = np.asarray(emb_val, dtype=np.float32) 110 | if emb.ndim == 1 and emb.size > 0: 111 | vector_embeddings.append(emb) 112 | vector_ids.append(pid) 113 | except Exception: 114 | continue 115 | if use_vectors and vector_embeddings: 116 | idx["vector"] = _build_vector_index(vector_embeddings, vector_ids) 117 | return idx 118 | 119 | 120 | def load_index(path: Path) -> Dict[str, Any]: 121 | with open(path, "r", encoding="utf-8") as f: 122 | return json.load(f) 123 | 124 | 125 | def save_index(idx: Dict[str, Any], path: Path) -> None: 126 | to_dump = dict(idx) 127 | if "vector" in to_dump: 128 | to_dump["vector"] = None 129 | with open(path, "w", encoding="utf-8") as f: 130 | json.dump(to_dump, f) 131 | 132 | 133 | def init_sqlite(db_path: Path) -> None: 134 | db_path.parent.mkdir(parents=True, exist_ok=True) 135 | conn = sqlite3.connect(str(db_path)) 136 | with closing(conn): 137 | cur = conn.cursor() 138 | cur.execute("PRAGMA journal_mode=WAL") 139 | cur.execute( 140 | "CREATE TABLE IF NOT EXISTS images(\n" 141 | " img_id TEXT PRIMARY KEY,\n" 142 | " path TEXT,\n" 143 | " phash TEXT,\n" 144 | " w INTEGER,\n" 145 | " h INTEGER\n" 146 | ")" 147 | ) 148 | cur.execute( 149 | "CREATE TABLE IF NOT EXISTS tiles(\n" 150 | " img_id TEXT,\n" 151 | " tile_hash TEXT,\n" 152 | " x0 INTEGER,\n" 153 | " y0 INTEGER,\n" 154 | " x1 INTEGER,\n" 155 | " y1 INTEGER,\n" 156 | " scale REAL DEFAULT 1.0\n" 157 | ")" 158 | ) 159 | try: 160 | cur.execute("PRAGMA table_info(tiles)") 161 | existing_cols = {row[1] for row in cur.fetchall()} 162 | if "scale" not in existing_cols: 163 | cur.execute("ALTER TABLE tiles ADD COLUMN scale REAL DEFAULT 1.0") 164 | except Exception: 165 | pass 166 | cur.execute("CREATE INDEX IF NOT EXISTS idx_tiles_hash ON tiles(tile_hash)") 167 | conn.commit() 168 | 169 | 170 | def add_image_to_db(db_path: Path, image_path: Path, tile_grid: int = 8) -> None: 171 | conn = sqlite3.connect(str(db_path)) 172 | with closing(conn): 173 | cur = conn.cursor() 174 | ph = compute_phash(image_path) 175 | tiles = compute_tile_hashes(image_path, grid=tile_grid) 176 | try: 177 | from PIL import Image 178 | 179 | w, h = Image.open(str(image_path)).size 180 | except Exception: 181 | w, h = 0, 0 182 | img_id = image_path.name 183 | cur.execute( 184 | "INSERT OR REPLACE INTO images(img_id,path,phash,w,h) VALUES (?,?,?,?,?)", 185 | (img_id, str(image_path), ph, w, h), 186 | ) 187 | cur.execute("DELETE FROM tiles WHERE img_id = ?", (img_id,)) 188 | tile_rows = [] 189 | for tile in tiles: 190 | th = tile.get("hash") 191 | bbox = tile.get("bbox", (0, 0, 0, 0)) 192 | scale = tile.get("scale", 1.0) 193 | tile_rows.append((img_id, th, bbox[0], bbox[1], bbox[2], bbox[3], float(scale))) 194 | cur.executemany( 195 | "INSERT INTO tiles(img_id,tile_hash,x0,y0,x1,y1,scale) VALUES (?,?,?,?,?,?,?)", 196 | tile_rows, 197 | ) 198 | conn.commit() 199 | 200 | 201 | def build_index_db(db_dir: Path, db_path: Path, tile_grid: int = 8) -> None: 202 | init_sqlite(db_path) 203 | for p in sorted(db_dir.iterdir()): 204 | if not p.is_file(): 205 | continue 206 | add_image_to_db(db_path, p, tile_grid=tile_grid) 207 | 208 | 209 | def load_index_from_db(db_path: Path) -> Dict[str, Any]: 210 | conn = sqlite3.connect(str(db_path)) 211 | idx = {"by_id": {}, "by_phash": {}, "by_tile": {}, "vector": None} 212 | with closing(conn): 213 | cur = conn.cursor() 214 | for img_id, path, phash, w, h in cur.execute( 215 | "SELECT img_id,path,phash,w,h FROM images" 216 | ): 217 
| idx["by_id"][img_id] = {"path": path, "phash": phash, "phash_variants": [phash], "tiles": []} 218 | idx["by_phash"].setdefault(phash, []).append(img_id) 219 | try: 220 | tile_rows = cur.execute( 221 | "SELECT img_id,tile_hash,x0,y0,x1,y1,scale FROM tiles" 222 | ) 223 | scale_included = True 224 | except sqlite3.OperationalError: 225 | tile_rows = cur.execute( 226 | "SELECT img_id,tile_hash,x0,y0,x1,y1 FROM tiles" 227 | ) 228 | scale_included = False 229 | for row in tile_rows: 230 | if scale_included: 231 | img_id, th, x0, y0, x1, y1, scale = row 232 | else: 233 | img_id, th, x0, y0, x1, y1 = row 234 | scale = 1.0 235 | tile_entry = { 236 | "hash": th, 237 | "bbox": (x0, y0, x1, y1), 238 | "scale": float(scale), 239 | } 240 | rec = idx["by_id"].setdefault( 241 | img_id, 242 | {"path": "", "phash": "", "phash_variants": [], "tiles": []}, 243 | ) 244 | rec.setdefault("tiles", []).append(tile_entry) 245 | idx["by_tile"].setdefault(th, []).append( 246 | {"img_id": img_id, "bbox": tile_entry["bbox"], "scale": tile_entry["scale"]} 247 | ) 248 | 249 | # Augment with variant phashes for better recall 250 | for img_id, rec in list(idx["by_id"].items()): 251 | path = Path(rec.get("path", "")) 252 | try: 253 | variants = compute_phash_variants(path) 254 | except Exception: 255 | variants = [rec.get("phash")] 256 | rec["phash_variants"] = variants or [rec.get("phash")] 257 | for ph in rec["phash_variants"]: 258 | if not ph: 259 | continue 260 | bucket = idx["by_phash"].setdefault(ph, []) 261 | if img_id not in bucket: 262 | bucket.append(img_id) 263 | # Build vector index on demand 264 | use_vectors = faiss is not None and np is not None 265 | if use_vectors: 266 | vector_embeddings: List[np.ndarray] = [] 267 | vector_ids: List[str] = [] 268 | for img_id, rec in idx["by_id"].items(): 269 | path = rec.get("path") 270 | if not path: 271 | continue 272 | try: 273 | emb_val = compute_embedding(Path(path)) 274 | except Exception: 275 | emb_val = None 276 | if emb_val is None: 277 | continue 278 | try: 279 | arr = np.asarray(emb_val, dtype=np.float32) 280 | except Exception: 281 | continue 282 | if arr.ndim != 1 or arr.size == 0: 283 | continue 284 | vector_embeddings.append(arr) 285 | vector_ids.append(img_id) 286 | idx["vector"] = _build_vector_index(vector_embeddings, vector_ids) 287 | return idx 288 | -------------------------------------------------------------------------------- /duplicate_check/features.py: -------------------------------------------------------------------------------- 1 | """Feature extraction utilities: pHash, tile-hash, ORB descriptors. 2 | 3 | This module implements: 4 | - compute_phash(image_path) -> hex string 5 | - compute_tile_hashes(image_path, grid) -> list of (hex, bbox) 6 | - compute_orb_descriptors(image_path, max_features) -> {kps, descs} 7 | - compute_features(image_path) -> ImageFeatures 8 | 9 | If OpenCV/imagehash/Pillow are missing, functions will raise ImportError. 
10 | 11 | 特征提取工具:pHash、块哈希(tile-hash)、ORB 特征。 12 | 13 | 本模块实现: 14 | - compute_phash(image_path) -> 十六进制字符串 15 | - compute_tile_hashes(image_path, grid) -> 返回 (hash, bbox) 列表 16 | - compute_orb_descriptors(image_path, max_features) -> 返回 {kps, descs} 17 | - compute_features(image_path) -> 返回 ImageFeatures 18 | 19 | 如果系统缺少 OpenCV/imagehash/Pillow,函数会进行降级或抛出异常。 20 | """ 21 | from dataclasses import dataclass 22 | from typing import Any, Dict, List, Tuple, Optional 23 | from pathlib import Path 24 | import hashlib 25 | import io 26 | import os 27 | 28 | # Optional dependencies 29 | # 可选依赖 30 | try: 31 | import imagehash 32 | from PIL import Image 33 | PIL_AVAILABLE = True 34 | except Exception: 35 | imagehash = None 36 | Image = None 37 | PIL_AVAILABLE = False 38 | 39 | try: 40 | import numpy as np 41 | except Exception: 42 | np = None 43 | 44 | try: 45 | import cv2 46 | except Exception: 47 | cv2 = None 48 | 49 | try: 50 | import clip 51 | CLIP_AVAILABLE = True 52 | except Exception: 53 | clip = None 54 | CLIP_AVAILABLE = False 55 | 56 | try: 57 | import torch 58 | from torchvision import models 59 | TORCH_AVAILABLE = True 60 | except Exception: 61 | torch = None 62 | models = None 63 | TORCH_AVAILABLE = False 64 | 65 | _EMBED_MODEL = None 66 | _EMBED_TRANSFORM = None 67 | _CLIP_MODEL = None 68 | _CLIP_PREPROCESS = None 69 | 70 | def _parse_scales(env_name: str, default: Tuple[float, ...]) -> Tuple[float, ...]: 71 | raw = os.getenv(env_name) 72 | if not raw: 73 | return default 74 | values: List[float] = [] 75 | for part in raw.split(","): 76 | part = part.strip() 77 | if not part: 78 | continue 79 | try: 80 | val = float(part) 81 | except ValueError: 82 | continue 83 | if val > 0: 84 | values.append(val) 85 | return tuple(values) if values else default 86 | 87 | 88 | MULTISCALE_LEVELS: Tuple[float, ...] = _parse_scales("DUPC_TILE_SCALES", (1.0, 0.75)) 89 | DEFAULT_TILE_GRID = max(1, int(os.getenv("DUPC_TILE_GRID", "8"))) 90 | 91 | @dataclass 92 | class ImageFeatures: 93 | phash: str 94 | orb: Dict[str, Any] 95 | size: Tuple[int, int] 96 | embedding: Optional[Any] = None 97 | tiles: Optional[List[Dict[str, Any]]] = None 98 | 99 | 100 | def compute_phash(image_path: Path, hash_size: int = 8) -> str: 101 | """Compute pHash for the image and return as hex string.""" 102 | if PIL_AVAILABLE and imagehash is not None: 103 | img = Image.open(str(image_path)).convert("RGB") 104 | ph = imagehash.phash(img, hash_size=hash_size) 105 | return ph.__str__() 106 | # Fallback: use SHA1 of file contents and return truncated hex 107 | h = hashlib.sha1() 108 | with open(str(image_path), "rb") as f: 109 | for chunk in iter(lambda: f.read(8192), b""): 110 | h.update(chunk) 111 | return h.hexdigest()[:16] 112 | 113 | 114 | def compute_phash_variants( 115 | image_path: Path, 116 | hash_size: int = 8, 117 | scales: Tuple[float, ...] 
= MULTISCALE_LEVELS, 118 | ) -> List[str]: 119 | """Return a list of pHash values with multi-scale + orientation variants.""" 120 | if not PIL_AVAILABLE or imagehash is None: 121 | return [compute_phash(image_path, hash_size=hash_size)] 122 | variants: List[str] = [] 123 | with Image.open(str(image_path)) as img: 124 | base = img.convert("RGB") 125 | transforms: List[Image.Image] = [] 126 | for scale in scales: 127 | if scale <= 0: 128 | continue 129 | if scale == 1.0: 130 | scaled = base 131 | else: 132 | w = max(1, int(base.width * scale)) 133 | h = max(1, int(base.height * scale)) 134 | scaled = base.resize((w, h)) 135 | transforms.extend( 136 | [ 137 | scaled, 138 | scaled.rotate(90, expand=True), 139 | scaled.rotate(180, expand=True), 140 | scaled.rotate(270, expand=True), 141 | scaled.transpose(Image.FLIP_LEFT_RIGHT), 142 | scaled.transpose(Image.FLIP_TOP_BOTTOM), 143 | ] 144 | ) 145 | for im in transforms: 146 | variants.append(imagehash.phash(im, hash_size=hash_size).__str__()) 147 | # deduplicate while preserving order 148 | seen: List[str] = [] 149 | for v in variants: 150 | if v not in seen: 151 | seen.append(v) 152 | return seen 153 | 154 | 155 | def _load_embedder(): 156 | global _EMBED_MODEL, _EMBED_TRANSFORM 157 | if not TORCH_AVAILABLE: 158 | return None, None 159 | if _EMBED_MODEL is not None and _EMBED_TRANSFORM is not None: 160 | return _EMBED_MODEL, _EMBED_TRANSFORM 161 | try: 162 | weights = None 163 | try: 164 | weights = models.ResNet18_Weights.DEFAULT # type: ignore[attr-defined] 165 | except Exception: 166 | weights = None 167 | if weights is not None: 168 | model = models.resnet18(weights=weights) 169 | transform = weights.transforms() 170 | else: 171 | model = models.resnet18(pretrained=True) 172 | from torchvision import transforms 173 | 174 | transform = transforms.Compose( 175 | [ 176 | transforms.Resize(256), 177 | transforms.CenterCrop(224), 178 | transforms.ToTensor(), 179 | transforms.Normalize( 180 | mean=[0.485, 0.456, 0.406], 181 | std=[0.229, 0.224, 0.225], 182 | ), 183 | ] 184 | ) 185 | model.fc = torch.nn.Identity() 186 | model.eval() 187 | model.to("cpu") 188 | _EMBED_MODEL = model 189 | _EMBED_TRANSFORM = transform 190 | except Exception: 191 | _EMBED_MODEL = None 192 | _EMBED_TRANSFORM = None 193 | return _EMBED_MODEL, _EMBED_TRANSFORM 194 | 195 | 196 | def _load_clip_model(): 197 | global _CLIP_MODEL, _CLIP_PREPROCESS 198 | if not CLIP_AVAILABLE or not TORCH_AVAILABLE: 199 | return None, None 200 | if _CLIP_MODEL is not None and _CLIP_PREPROCESS is not None: 201 | return _CLIP_MODEL, _CLIP_PREPROCESS 202 | try: 203 | device = "cpu" 204 | model, preprocess = clip.load("ViT-B/32", device=device) 205 | model.eval() 206 | _CLIP_MODEL = model 207 | _CLIP_PREPROCESS = preprocess 208 | except Exception: 209 | _CLIP_MODEL = None 210 | _CLIP_PREPROCESS = None 211 | return _CLIP_MODEL, _CLIP_PREPROCESS 212 | 213 | 214 | def _fallback_embedding(image_path: Path, size: int = 64) -> Optional[Any]: 215 | if np is None or not PIL_AVAILABLE or Image is None: 216 | return None 217 | try: 218 | img = Image.open(str(image_path)).convert("RGB") 219 | img = img.resize((size, size)) 220 | arr = np.asarray(img, dtype=np.float32) 221 | if arr.size == 0: 222 | return None 223 | arr = arr / 255.0 224 | emb = arr.reshape(-1) 225 | norm = np.linalg.norm(emb) 226 | if norm > 0: 227 | emb = emb / norm 228 | return emb 229 | except Exception: 230 | return None 231 | 232 | 233 | def compute_embedding(image_path: Path) -> Optional[Any]: 234 | """Compute fused embeddings 
(ResNet18 + optional CLIP) for ANN recall.""" 235 | if np is None: 236 | return _fallback_embedding(image_path) 237 | embeddings: List[np.ndarray] = [] 238 | 239 | model, transform = _load_embedder() 240 | if model is not None and transform is not None and Image is not None: 241 | try: 242 | img = Image.open(str(image_path)).convert("RGB") 243 | tensor = transform(img).unsqueeze(0) 244 | with torch.no_grad(): 245 | vec = model(tensor.to("cpu")).squeeze(0).numpy() 246 | norm = np.linalg.norm(vec) 247 | if norm > 0: 248 | vec = vec / norm 249 | embeddings.append(vec.astype("float32")) 250 | except Exception: 251 | pass 252 | 253 | clip_model, clip_preprocess = _load_clip_model() 254 | if clip_model is not None and clip_preprocess is not None: 255 | try: 256 | img = Image.open(str(image_path)).convert("RGB") 257 | tensor = clip_preprocess(img).unsqueeze(0) 258 | with torch.no_grad(): 259 | vec = clip_model.encode_image(tensor.to("cpu")).squeeze(0).cpu().numpy() 260 | norm = np.linalg.norm(vec) 261 | if norm > 0: 262 | vec = vec / norm 263 | embeddings.append(vec.astype("float32")) 264 | except Exception: 265 | pass 266 | 267 | if embeddings: 268 | try: 269 | fused = np.concatenate(embeddings) 270 | norm = np.linalg.norm(fused) 271 | if norm > 0: 272 | fused = fused / norm 273 | return fused.astype("float32") 274 | except Exception: 275 | pass 276 | 277 | return _fallback_embedding(image_path) 278 | 279 | 280 | def compute_tile_hashes( 281 | image_path: Path, 282 | grid: int = DEFAULT_TILE_GRID, 283 | hash_size: int = 8, 284 | scales: Tuple[float, ...] = MULTISCALE_LEVELS, 285 | ) -> List[Dict[str, Any]]: 286 | """Split image into grid x grid tiles across multiple scales and compute pHash per tile.""" 287 | tiles: List[Dict[str, Any]] = [] 288 | if not PIL_AVAILABLE or Image is None or imagehash is None: 289 | ph = compute_phash(image_path, hash_size=hash_size) 290 | tiles.append({"hash": ph, "bbox": (0, 0, 0, 0), "scale": 1.0}) 291 | return tiles 292 | 293 | base = Image.open(str(image_path)).convert("RGB") 294 | w_base, h_base = base.size 295 | for scale in scales: 296 | if scale <= 0: 297 | continue 298 | if scale == 1.0: 299 | img = base 300 | w, h = w_base, h_base 301 | else: 302 | w = max(1, int(w_base * scale)) 303 | h = max(1, int(h_base * scale)) 304 | img = base.resize((w, h)) 305 | if w == 0 or h == 0: 306 | continue 307 | 308 | tile_w = max(1, w // grid) 309 | tile_h = max(1, h // grid) 310 | for yi in range(grid): 311 | for xi in range(grid): 312 | x0 = xi * tile_w 313 | y0 = yi * tile_h 314 | x1 = x0 + tile_w if xi < grid - 1 else w 315 | y1 = y0 + tile_h if yi < grid - 1 else h 316 | crop = img.crop((x0, y0, x1, y1)) 317 | ph = imagehash.phash(crop, hash_size=hash_size) 318 | inv = 1.0 / scale if scale != 0 else 1.0 319 | bbox = ( 320 | int(x0 * inv), 321 | int(y0 * inv), 322 | int(x1 * inv), 323 | int(y1 * inv), 324 | ) 325 | tiles.append({ 326 | "hash": ph.__str__(), 327 | "bbox": bbox, 328 | "scale": float(scale), 329 | }) 330 | return tiles 331 | 332 | 333 | def compute_orb_descriptors(image_path: Path, max_features: int = 2000) -> Dict: 334 | """Extract ORB keypoints and descriptors using OpenCV. 
335 | 336 | Returns dict {"kps": list of cv2.KeyPoint, "descs": np.ndarray} 337 | """ 338 | if cv2 is None: 339 | # Graceful fallback: return empty descriptors 340 | # 优雅降级:返回空的关键点/描述子 341 | return {"kps": [], "descs": None} 342 | img = cv2.imread(str(image_path)) 343 | if img is None: 344 | raise IOError(f"Unable to read image: {image_path}") 345 | gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) 346 | orb = cv2.ORB_create(nfeatures=max_features) 347 | kps, descs = orb.detectAndCompute(gray, None) 348 | return {"kps": kps, "descs": descs} 349 | 350 | 351 | def compute_features(image_path: Path, orb_max_features: int = 2000, tile_grid: int = DEFAULT_TILE_GRID) -> ImageFeatures: 352 | ph = compute_phash(image_path) 353 | orb = {} 354 | try: 355 | orb = compute_orb_descriptors(image_path, max_features=orb_max_features) 356 | except Exception: 357 | orb = {"kps": [], "descs": None} 358 | size = (0, 0) 359 | embedding = None 360 | if PIL_AVAILABLE and Image is not None: 361 | try: 362 | img = Image.open(str(image_path)) 363 | size = img.size 364 | except Exception: 365 | size = (0, 0) 366 | try: 367 | embedding = compute_embedding(image_path) 368 | except Exception: 369 | embedding = None 370 | try: 371 | tiles = compute_tile_hashes(image_path, grid=tile_grid) 372 | except Exception: 373 | tiles = [] 374 | feats = ImageFeatures(phash=ph, orb=orb, size=size, embedding=embedding) 375 | feats.tiles = tiles 376 | try: 377 | feats._path = str(image_path) 378 | except Exception: 379 | pass 380 | return feats 381 | -------------------------------------------------------------------------------- /duplicate_check/matcher.py: -------------------------------------------------------------------------------- 1 | """Matcher: recall via phash and tile-hash, precise verification via ORB+RANSAC and NCC. 
2 | 3 | Matcher 模块:通过 phash 和 tile-hash 召回候选,使用 ORB+RANSAC 与 NCC 做精排与判定。 4 | """ 5 | from pathlib import Path 6 | from typing import Dict, Any, List, Tuple, Optional 7 | from duplicate_check.features import ImageFeatures 8 | import hashlib 9 | 10 | try: 11 | import numpy as np 12 | except Exception: 13 | np = None 14 | 15 | try: 16 | import cv2 17 | except Exception: 18 | cv2 = None 19 | 20 | try: 21 | import faiss 22 | except Exception: 23 | faiss = None 24 | 25 | 26 | _DB_FEATURE_VARIANT_CACHE: Dict[str, List[Dict[str, Any]]] = {} 27 | 28 | 29 | def hamming_distance_hex(a: str, b: str) -> int: 30 | # imagehash returns hex string; convert to int 31 | # imagehash 返回十六进制字符串;将其转换为整数并计算汉明距离 32 | ai = int(a, 16) 33 | bi = int(b, 16) 34 | x = ai ^ bi 35 | return x.bit_count() 36 | 37 | 38 | def _has_descriptors(variant: Dict[str, Any]) -> bool: 39 | desc = variant.get("descs") 40 | try: 41 | return desc is not None and len(desc) > 0 42 | except Exception: 43 | return False 44 | 45 | 46 | def _count_good_matches(desc1, desc2, ratio: float = 0.75) -> int: 47 | if cv2 is None or np is None: 48 | return 0 49 | if desc1 is None or desc2 is None: 50 | return 0 51 | try: 52 | if len(desc1) == 0 or len(desc2) == 0: 53 | return 0 54 | except Exception: 55 | return 0 56 | dtype1 = getattr(desc1, "dtype", None) 57 | dtype2 = getattr(desc2, "dtype", None) 58 | norm = cv2.NORM_HAMMING 59 | if dtype1 is not None: 60 | if dtype1 == np.float32: 61 | norm = cv2.NORM_L2 62 | elif dtype1 == np.uint8: 63 | norm = cv2.NORM_HAMMING 64 | if dtype1 is not None and dtype2 is not None and dtype1 != dtype2: 65 | try: 66 | desc2 = desc2.astype(dtype1) 67 | except Exception: 68 | pass 69 | bf = cv2.BFMatcher(norm, crossCheck=False) 70 | try: 71 | matches = bf.knnMatch(desc1, desc2, k=2) 72 | except cv2.error: 73 | return 0 74 | good = 0 75 | for pair in matches: 76 | if len(pair) != 2: 77 | continue 78 | m, n = pair 79 | if m.distance < ratio * n.distance: 80 | good += 1 81 | return good 82 | 83 | 84 | def _variant_orientation(name: Optional[str]) -> str: 85 | if not name: 86 | return "" 87 | parts = name.split("_", 1) 88 | return parts[1] if len(parts) == 2 else parts[0] 89 | 90 | 91 | def _filter_inlier_matches(matches: List[Any], mask: Optional[List[int]]) -> List[Any]: 92 | if not matches: 93 | return [] 94 | if not mask: 95 | return matches 96 | return [m for m, keep in zip(matches, mask) if keep] 97 | 98 | 99 | def _limit_matches(matches: List[Any], max_count: int) -> List[Any]: 100 | if max_count <= 0 or not matches: 101 | return matches 102 | if len(matches) <= max_count: 103 | return matches 104 | stride = max(1, len(matches) // max_count) 105 | limited = matches[::stride] 106 | if len(limited) > max_count: 107 | limited = limited[:max_count] 108 | if not limited: 109 | return matches[:max_count] 110 | return limited 111 | 112 | 113 | def _compute_roi_from_matches( 114 | matches: List[Any], 115 | keypoints: List[Any], 116 | image_path: Path, 117 | margin_ratio: float = 0.15, 118 | *, 119 | index_attr: str = "trainIdx", 120 | max_fraction: float = 0.6, 121 | min_size: int = 16, 122 | ) -> Optional[Tuple[int, int, int, int]]: 123 | if cv2 is None or not matches or not keypoints: 124 | return None 125 | img = cv2.imread(str(image_path), cv2.IMREAD_COLOR) 126 | if img is None: 127 | return None 128 | h, w = img.shape[:2] 129 | xs: List[float] = [] 130 | ys: List[float] = [] 131 | for m in matches: 132 | idx = getattr(m, index_attr, None) 133 | if idx is None or idx >= len(keypoints): 134 | continue 135 | pt = 
keypoints[idx].pt 136 | xs.append(float(pt[0])) 137 | ys.append(float(pt[1])) 138 | if len(xs) < 2 or len(ys) < 2: 139 | return None 140 | min_x, max_x = min(xs), max(xs) 141 | min_y, max_y = min(ys), max(ys) 142 | width = max_x - min_x 143 | height = max_y - min_y 144 | if width <= 0 or height <= 0: 145 | return None 146 | margin_x = max(10.0, width * margin_ratio) 147 | margin_y = max(10.0, height * margin_ratio) 148 | x0 = max(0, int(min_x - margin_x)) 149 | y0 = max(0, int(min_y - margin_y)) 150 | x1 = min(w, int(max_x + margin_x)) 151 | y1 = min(h, int(max_y + margin_y)) 152 | roi_w = x1 - x0 153 | roi_h = y1 - y0 154 | if roi_w <= 0 or roi_h <= 0: 155 | return None 156 | max_w = max(min_size, int(w * max_fraction)) 157 | max_h = max(min_size, int(h * max_fraction)) 158 | if roi_w > max_w: 159 | cx = (x0 + x1) / 2.0 160 | half = max_w / 2.0 161 | x0 = max(0, int(round(cx - half))) 162 | x1 = min(w, int(round(cx + half))) 163 | if roi_h > max_h: 164 | cy = (y0 + y1) / 2.0 165 | half = max_h / 2.0 166 | y0 = max(0, int(round(cy - half))) 167 | y1 = min(h, int(round(cy + half))) 168 | if x1 - x0 <= 0 or y1 - y0 <= 0: 169 | return None 170 | return (x0, y0, x1, y1) 171 | 172 | 173 | def _compute_feature_variants_for_path( 174 | path: Path, 175 | cache: Dict[str, List[Dict[str, Any]]] | None = None, 176 | max_features: int = 2000, 177 | ) -> List[Dict[str, Any]]: 178 | key = str(path) 179 | if cache is not None and key in cache: 180 | return cache[key] 181 | 182 | variants: List[Dict[str, Any]] = [] 183 | if cv2 is None: 184 | variants.append({"name": "orb_rot0", "algo": "orb", "kps": [], "descs": None}) 185 | variants.append({"name": "akaze_rot0", "algo": "akaze", "kps": [], "descs": None}) 186 | else: 187 | img = cv2.imread(str(path)) 188 | if img is None: 189 | variants.append({"name": "orb_rot0", "algo": "orb", "kps": [], "descs": None}) 190 | variants.append({"name": "akaze_rot0", "algo": "akaze", "kps": [], "descs": None}) 191 | else: 192 | detectors: List[Tuple[str, Any]] = [] 193 | try: 194 | detectors.append(("orb", cv2.ORB_create(nfeatures=max_features))) 195 | except Exception: 196 | detectors.append(("orb", None)) 197 | try: 198 | detectors.append(("akaze", cv2.AKAZE_create())) 199 | except Exception: 200 | detectors.append(("akaze", None)) 201 | 202 | transforms = [ 203 | ("rot0", img), 204 | ("rot90", cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE)), 205 | ("rot180", cv2.rotate(img, cv2.ROTATE_180)), 206 | ("rot270", cv2.rotate(img, cv2.ROTATE_90_COUNTERCLOCKWISE)), 207 | ("flip0", cv2.flip(img, 1)), 208 | ] 209 | 210 | for algo, detector in detectors: 211 | seen = set() 212 | for name, mat in transforms: 213 | variant_name = f"{algo}_{name}" 214 | if mat is None or variant_name in seen: 215 | continue 216 | seen.add(variant_name) 217 | if detector is None: 218 | variants.append({"name": variant_name, "algo": algo, "kps": [], "descs": None}) 219 | continue 220 | try: 221 | gray = cv2.cvtColor(mat, cv2.COLOR_BGR2GRAY) 222 | except Exception: 223 | variants.append({"name": variant_name, "algo": algo, "kps": [], "descs": None}) 224 | continue 225 | kps, descs = detector.detectAndCompute(gray, None) 226 | variants.append({"name": variant_name, "algo": algo, "kps": kps or [], "descs": descs}) 227 | 228 | if cache is not None: 229 | cache[key] = variants 230 | return variants 231 | 232 | 233 | def _get_db_feature_variants(path: Path) -> List[Dict[str, Any]]: 234 | return _compute_feature_variants_for_path(path, _DB_FEATURE_VARIANT_CACHE) 235 | 236 | 237 | def _best_orb_match(q_variants: 
List[Dict[str, Any]], db_variants: List[Dict[str, Any]]) -> Tuple[int, int, Optional[Tuple[str, str]]]: 238 | best_good = 0 239 | best_len = 1 240 | best_pair: Optional[Tuple[str, str]] = None 241 | for q_var in q_variants: 242 | if not _has_descriptors(q_var): 243 | continue 244 | q_desc = q_var.get("descs") 245 | q_len = len(q_desc) 246 | for d_var in db_variants: 247 | if not _has_descriptors(d_var): 248 | continue 249 | good = _count_good_matches(q_desc, d_var.get("descs")) 250 | if good > best_good: 251 | best_good = good 252 | best_len = max(1, q_len) 253 | best_pair = (q_var.get("name"), d_var.get("name")) 254 | return best_good, best_len, best_pair 255 | 256 | def recall_candidates( 257 | features: ImageFeatures, 258 | index: Dict, 259 | topk: int = 50, 260 | phash_thresh: int = 10, 261 | tile_match_count: int = 3, 262 | vector_score_thresh: float = 0.0, 263 | ) -> List[Dict[str, Any]]: 264 | """Recall candidates by global pHash and tile-hash. Returns list of dicts with scores. 265 | 266 | 通过全局 pHash 和块哈希召回候选,返回包含分数的字典列表。 267 | """ 268 | ph = features.phash 269 | hits: Dict[str, Dict[str, Any]] = {} 270 | # global phash exact-ish match 271 | for phash_key, ids in index.get("by_phash", {}).items(): 272 | d = hamming_distance_hex(ph, phash_key) 273 | if d <= phash_thresh: 274 | for i in ids: 275 | hits.setdefault(i, {"score": 0.0, "reason": []}) 276 | hits[i]["score"] += max(0, (phash_thresh - d) / phash_thresh) 277 | hits[i]["reason"].append(("phash", d)) 278 | 279 | # tile recall: compute query tile hashes (if possible) and count matches 280 | q_tiles = getattr(features, "tiles", None) 281 | if q_tiles is None: 282 | try: 283 | from duplicate_check.features import compute_tile_hashes, DEFAULT_TILE_GRID 284 | 285 | if hasattr(features, "_path") and features._path: 286 | q_tiles = compute_tile_hashes(Path(features._path), grid=DEFAULT_TILE_GRID) 287 | features.tiles = q_tiles 288 | except Exception: 289 | q_tiles = None 290 | 291 | if q_tiles: 292 | tile_counts: Dict[str, int] = {} 293 | for tile in q_tiles: 294 | th = tile.get("hash") 295 | if not th: 296 | continue 297 | for entry in index.get("by_tile", {}).get(th, []): 298 | img_id = entry.get("img_id") 299 | if img_id is None: 300 | continue 301 | tile_counts.setdefault(img_id, 0) 302 | tile_counts[img_id] += 1 303 | for img_id, cnt in tile_counts.items(): 304 | entry = hits.setdefault(img_id, {"score": 0.0, "reason": []}) 305 | entry["score"] += cnt / (len(q_tiles) or 1) 306 | entry.setdefault("reason", []).append(("tiles", cnt)) 307 | 308 | # Vector-based recall via FAISS (optional) 309 | vector_index = index.get("vector") if isinstance(index, dict) else None 310 | if vector_index and np is not None and faiss is not None: 311 | q_emb = getattr(features, "embedding", None) 312 | if q_emb is None and hasattr(features, "_path"): 313 | try: 314 | from duplicate_check.features import compute_embedding 315 | 316 | q_emb = compute_embedding(Path(features._path)) 317 | except Exception: 318 | q_emb = None 319 | try: 320 | if q_emb is not None: 321 | vec = np.asarray(q_emb, dtype=np.float32) 322 | if vec.ndim == 1 and vec.size > 0: 323 | norm = np.linalg.norm(vec) 324 | if norm > 0: 325 | vec = vec / norm 326 | vec = vec.reshape(1, -1) 327 | index_obj = vector_index.get("index") 328 | ids = vector_index.get("ids", []) 329 | metric = vector_index.get("metric", "ip") 330 | if index_obj is not None and len(ids): 331 | topn = min(max(topk * 2, 32), len(ids)) 332 | D, I = index_obj.search(vec, topn) 333 | for dist, idx_id in zip(D[0], 

    # Vector-based recall via FAISS (optional)
    vector_index = index.get("vector") if isinstance(index, dict) else None
    if vector_index and np is not None and faiss is not None:
        q_emb = getattr(features, "embedding", None)
        if q_emb is None and hasattr(features, "_path"):
            try:
                from duplicate_check.features import compute_embedding

                q_emb = compute_embedding(Path(features._path))
            except Exception:
                q_emb = None
        try:
            if q_emb is not None:
                vec = np.asarray(q_emb, dtype=np.float32)
                if vec.ndim == 1 and vec.size > 0:
                    norm = np.linalg.norm(vec)
                    if norm > 0:
                        vec = vec / norm
                    vec = vec.reshape(1, -1)
                    index_obj = vector_index.get("index")
                    ids = vector_index.get("ids", [])
                    metric = vector_index.get("metric", "ip")
                    if index_obj is not None and len(ids):
                        topn = min(max(topk * 2, 32), len(ids))
                        D, I = index_obj.search(vec, topn)
                        for dist, idx_id in zip(D[0], I[0]):
                            if idx_id < 0 or idx_id >= len(ids):
                                continue
                            db_id = ids[idx_id]
                            if metric == "ip":
                                score = float(dist)
                            else:
                                score = float(1.0 / (1.0 + dist))
                            if score <= 0:
                                continue
                            if score >= vector_score_thresh:
                                entry = hits.setdefault(db_id, {"score": 0.0, "reason": []})
                                entry["score"] += score
                                entry.setdefault("reason", []).append(("vector", score))
        except Exception:
            pass

    # Orientation-aware ORB scoring
    query_path = None
    if hasattr(features, "_path") and features._path:
        try:
            query_path = Path(features._path)
        except Exception:
            query_path = None

    q_variants: List[Dict[str, Any]] = []
    if query_path is not None:
        q_variants = getattr(features, "_feature_variants", None) or []
        if not q_variants:
            try:
                q_variants = _compute_feature_variants_for_path(query_path)
                features._feature_variants = q_variants
            except Exception:
                q_variants = []

    has_query_orb = any(_has_descriptors(v) for v in q_variants)

    if has_query_orb:
        for img_id in list(hits.keys()):
            rec = index.get("by_id", {}).get(img_id)
            if rec is None:
                continue
            db_variants = _get_db_feature_variants(Path(rec["path"]))
            best_good, best_len, best_pair = _best_orb_match(q_variants, db_variants)
            if best_good <= 0 or best_pair is None:
                continue
            entry = hits.setdefault(img_id, {"score": 0.0, "reason": []})
            entry["score"] += min(1.0, best_good / max(1, best_len))
            entry.setdefault("reason", []).append(("orb", best_good))
            entry["best_orient"] = best_pair

    # Fallback: add strong ORB matches not yet recalled
    if len(hits) < topk:
        ORB_FALLBACK_MIN = 25
        for img_id, rec in index.get("by_id", {}).items():
            if img_id in hits:
                continue
            db_variants = _get_db_feature_variants(Path(rec["path"]))
            best_good, best_len, best_pair = _best_orb_match(q_variants, db_variants)
            if best_good < ORB_FALLBACK_MIN or best_pair is None:
                continue
            entry = hits.setdefault(img_id, {"score": 0.0, "reason": []})
            entry["score"] += min(1.0, best_good / max(1, best_len))
            entry.setdefault("reason", []).append(("orb", best_good))
            entry["best_orient"] = best_pair
            if len(hits) >= topk:
                break

    # Convert hits to sorted list
    out = []
    for img_id, v in hits.items():
        out.append(
            {
                "db_id": img_id,
                "score": v.get("score", 0.0),
                "reason": v.get("reason", []),
                "orientation": v.get("best_orient"),
            }
        )
    out.sort(key=lambda x: x["score"], reverse=True)
    return out[:topk]
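
# Typical flow (sketch; the exact wiring lives in the CLI/entry modules and the helper
# name below is an assumption): recall first, then verify only the shortlist.
#
#     cands = recall_candidates(feats, index, topk=50)            # cheap: hashes/vectors
#     rows = rerank_and_verify(Path("new.jpg"), cands, index)     # ORB + RANSAC + NCC
#
# Here "feats" stands for an ImageFeatures object produced elsewhere (e.g. by
# duplicate_check.features); it is not constructed in this module.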


def _orb_ransac_inliers(kps1, desc1, kps2, desc2, ratio=0.75, ransac_thresh=5.0):
    """Match descriptors using BFMatcher and compute RANSAC homography inliers.

    Returns (inlier_count, inlier_ratio, matches_mask, H, good_matches).
    """
    if cv2 is None or np is None or desc1 is None or desc2 is None:
        return 0, 0.0, None, None, []
    try:
        if len(desc1) == 0 or len(desc2) == 0:
            return 0, 0.0, None, None, []
    except Exception:
        return 0, 0.0, None, None, []
    dtype1 = getattr(desc1, "dtype", None)
    dtype2 = getattr(desc2, "dtype", None)
    if dtype1 is not None and dtype2 is not None and dtype1 != dtype2:
        try:
            desc2 = desc2.astype(dtype1)
            dtype2 = dtype1
        except Exception:
            pass
    if dtype1 is None or dtype2 is None:
        return 0, 0.0, None, None, []
    if desc1.shape[1] != desc2.shape[1]:
        return 0, 0.0, None, None, []
    norm = cv2.NORM_HAMMING if dtype1 == np.uint8 else cv2.NORM_L2
    bf = cv2.BFMatcher(norm, crossCheck=False)
    try:
        matches = bf.knnMatch(desc1, desc2, k=2)
    except cv2.error:
        return 0, 0.0, None, None, []
    good = []
    for m_n in matches:
        if len(m_n) != 2:
            continue
        m, n = m_n
        if m.distance < ratio * n.distance:
            good.append(m)
    if len(good) < 4:
        return 0, 0.0, None, None, good
    src_pts = np.float32([kps1[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
    dst_pts = np.float32([kps2[m.trainIdx].pt for m in good]).reshape(-1, 1, 2)
    method = getattr(cv2, "USAC_MAGSAC", cv2.RANSAC)
    try:
        H, mask = cv2.findHomography(src_pts, dst_pts, method, ransac_thresh)
    except Exception:
        H, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, ransac_thresh)
    if mask is None and method != cv2.RANSAC:
        try:
            H, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, ransac_thresh)
        except Exception:
            mask = None
    if mask is None:
        return 0, 0.0, None, H, good
    inliers = int(mask.sum())
    inlier_ratio = inliers / max(1, len(good))
    return inliers, inlier_ratio, mask.ravel().tolist(), H, good
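
# Threshold walk-through (numbers are illustrative): with ratio=0.75 a knn match is kept
# only if its best distance is below 0.75x the second-best. If 40 matches survive the
# ratio test and RANSAC keeps 30 as homography inliers, inlier_ratio = 30 / 40 = 0.75,
# which clears the rerank_and_verify defaults (orb_inliers_thresh=25,
# orb_inlier_ratio=0.25), so the pair is labelled at least "partial_duplicate".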


def _ncc_peak(
    img_query_path: Path,
    db_path: Path,
    bbox_query: Tuple[int, int, int, int],
    bbox_db: Tuple[int, int, int, int],
    *,
    min_size: int = 16,
) -> float:
    """Compute the normalized cross-correlation peak between the query and db patches.

    For simplicity, both images are loaded via OpenCV, both bboxes are cropped, the query
    patch is resized to the db patch size, and cv2.matchTemplate is run with
    TM_CCOEFF_NORMED.
    """
    if cv2 is None or np is None:
        return 0.0
    q = cv2.imread(str(img_query_path), cv2.IMREAD_COLOR)
    d = cv2.imread(str(db_path), cv2.IMREAD_COLOR)
    if q is None or d is None:
        return 0.0
    qx0, qy0, qx1, qy1 = bbox_query
    dx0, dy0, dx1, dy1 = bbox_db
    q_patch = q[qy0:qy1, qx0:qx1]
    d_patch = d[dy0:dy1, dx0:dx1]
    if q_patch.size == 0 or d_patch.size == 0:
        return 0.0
    if q_patch.shape[0] < min_size or q_patch.shape[1] < min_size:
        return 0.0
    if d_patch.shape[0] < min_size or d_patch.shape[1] < min_size:
        return 0.0
    # Resize query ROI to the database ROI size for comparison
    q_resized = cv2.resize(q_patch, (d_patch.shape[1], d_patch.shape[0]))
    qf = cv2.cvtColor(q_resized, cv2.COLOR_BGR2GRAY)
    pf = cv2.cvtColor(d_patch, cv2.COLOR_BGR2GRAY)
    res = cv2.matchTemplate(pf, qf, cv2.TM_CCOEFF_NORMED)
    return float(res.max()) if res.size else 0.0
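
# Decision sketch: TM_CCOEFF_NORMED peaks lie in [-1, 1]; a peak >= ncc_thresh (0.92 by
# default) upgrades a verified pair from "partial_duplicate" to "exact_patch", and
# candidates recalled only by pHash with no usable descriptors fall back to
# "phash_duplicate". An output row (values here are illustrative) looks like:
#
#     {"new_image": "new_5_rot.jpg", "matched_image": "base_5",
#      "final_label": "partial_duplicate", "score": 0.93, "inliers": 57,
#      "inlier_ratio": 0.43, "ncc_peak": 0.0, "evidence_img_path": "",
#      "match_pairs": [...], "orientation": "orb_rot270->orb_rot0"}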


def rerank_and_verify(
    input_path: Path,
    candidates: List[Dict[str, Any]],
    index: Dict,
    orb_inliers_thresh: int = 25,
    orb_inlier_ratio: float = 0.25,
    ncc_thresh: float = 0.92,
    roi_margin_ratio: float = 0.12,
    max_roi_matches: int = 60,
) -> List[Dict[str, Any]]:
    """For each candidate, run ORB matching + RANSAC and NCC to generate final decision rows."""
    rows: List[Dict[str, Any]] = []

    try:
        q_variants = _compute_feature_variants_for_path(input_path)
    except Exception:
        q_variants = []
    q_map = {var.get("name"): var for var in q_variants}
    has_query_orb = any(_has_descriptors(v) for v in q_variants)

    for c in candidates:
        db_id = c.get("db_id")
        db_rec = index.get("by_id", {}).get(db_id) if db_id else None
        if db_rec is None:
            continue
        db_path = Path(db_rec["path"])
        db_variants = _get_db_feature_variants(db_path)
        d_map = {var.get("name"): var for var in db_variants}
        has_db_orb = any(_has_descriptors(v) for v in db_variants)

        orientation_hint = c.get("orientation")
        pair_order: List[Tuple[str, str]] = []
        if orientation_hint and isinstance(orientation_hint, (tuple, list)) and len(orientation_hint) == 2:
            q_name, d_name = orientation_hint
            if q_name in q_map and d_name in d_map:
                pair_order.append((q_name, d_name))

        for q_var in q_variants:
            for d_var in db_variants:
                pair = (q_var.get("name"), d_var.get("name"))
                if pair not in pair_order:
                    pair_order.append(pair)

        best = None
        for q_name, d_name in pair_order:
            q_var = q_map.get(q_name)
            d_var = d_map.get(d_name)
            if not q_var or not d_var:
                continue
            if not _has_descriptors(q_var) or not _has_descriptors(d_var):
                continue
            inliers, inlier_ratio, mask, H, good_matches = _orb_ransac_inliers(
                q_var["kps"],
                q_var["descs"],
                d_var["kps"],
                d_var["descs"],
            )
            if best is None or inliers > best["inliers"]:
                best = {
                    "q": q_var,
                    "d": d_var,
                    "q_name": q_name,
                    "d_name": d_name,
                    "algo_q": q_var.get("algo", "orb"),
                    "algo_d": d_var.get("algo", "orb"),
                    "inliers": inliers,
                    "inlier_ratio": inlier_ratio,
                    "matches": good_matches,
                    "mask": mask,
                }

        has_descriptors = has_query_orb and has_db_orb

        if best is None:
            if not has_descriptors:
                reasons = {r[0] for r in c.get("reason", [])}
                if "phash" in reasons:
                    rows.append(
                        {
                            "new_image": str(input_path.name),
                            "matched_image": db_id,
                            "final_label": "phash_duplicate",
                            "score": float(max(c.get("score", 0.5), 0.5)),
                            "inliers": 0,
                            "inlier_ratio": 0.0,
                            "ncc_peak": 0.0,
                            "evidence_img_path": "",
                            "match_pairs": [],
                            "orientation": "",
                        }
                    )
            continue

        label = "unique"
        score = c.get("score", 0.0)
        ncc_peak = 0.0
        evidence = ""

        if (
            best["inliers"] >= orb_inliers_thresh
            and best["inlier_ratio"] >= orb_inlier_ratio
        ):
            label = "partial_duplicate"
            score = max(score, min(0.99, 0.5 + best["inlier_ratio"]))
            matches_for_roi = _filter_inlier_matches(best.get("matches") or [], best.get("mask"))
            matches_for_roi = _limit_matches(matches_for_roi, max_roi_matches)
            if (
                matches_for_roi
                and len(matches_for_roi) >= 4
                and best.get("algo_q") == "orb"
                and best.get("algo_d") == "orb"
                and _variant_orientation(best.get("q_name")) == "rot0"
                and _variant_orientation(best.get("d_name")) == "rot0"
            ):
                q_bbox = _compute_roi_from_matches(
                    matches_for_roi,
                    best["q"]["kps"],
                    input_path,
                    margin_ratio=roi_margin_ratio,
                    index_attr="queryIdx",
                )
                d_bbox = _compute_roi_from_matches(
                    matches_for_roi,
                    best["d"]["kps"],
                    db_path,
                    margin_ratio=roi_margin_ratio,
                    index_attr="trainIdx",
                )
                if q_bbox and d_bbox:
                    try:
                        ncc_peak = _ncc_peak(input_path, db_path, q_bbox, d_bbox)
                    except Exception:
                        ncc_peak = 0.0
                    if ncc_peak >= ncc_thresh:
                        label = "exact_patch"
                        score = 0.99
        else:
            continue

        match_pairs: List[Tuple[Tuple[float, float], Tuple[float, float]]] = []
        try:
            matches = best.get("matches") or []
            q_kps = best["q"]["kps"]
            d_kps = best["d"]["kps"]
            if matches and q_kps and d_kps:
                for m in matches:
                    pt_q = q_kps[m.queryIdx].pt
                    pt_d = d_kps[m.trainIdx].pt
                    match_pairs.append(((float(pt_q[0]), float(pt_q[1])), (float(pt_d[0]), float(pt_d[1]))))
        except Exception:
            match_pairs = []

        rows.append(
            {
                "new_image": str(input_path.name),
                "matched_image": db_id,
                "final_label": label,
                "score": float(score),
                "inliers": int(best["inliers"]),
                "inlier_ratio": float(best["inlier_ratio"]),
                "ncc_peak": float(ncc_peak),
                "evidence_img_path": evidence,
                "match_pairs": match_pairs,
                "orientation": f"{best['q_name']}->{best['d_name']}",
            }
        )

    return rows

--------------------------------------------------------------------------------