├── .gitignore
├── fiftyone.yml
├── assets
├── exact_dup.svg
├── approx_dup.svg
├── view_groups.svg
├── delete.svg
└── representative.svg
├── .pre-commit-config.yaml
├── README.md
├── exact_dups.py
├── approx_dups.py
└── __init__.py
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
--------------------------------------------------------------------------------
/fiftyone.yml:
--------------------------------------------------------------------------------
1 | fiftyone:
2 | version: ">=0.23.7"
3 | name: "@jacobmarks/image_deduplication"
4 | version: "1.0.1"
5 | description: "Find and remove duplicate images"
6 | url: "https://github.com/jacobmarks/image-dedup-plugin/blob/main/README.md"
7 | operators:
8 | - find_approximate_duplicate_images
9 | - find_exact_duplicate_images
10 | - display_approximate_duplicate_groups
11 | - display_exact_duplicate_groups
12 | - remove_all_approximate_duplicates
13 | - remove_all_exact_duplicates
14 | - deduplicate_approximate_duplicates
15 | - deduplicate_exact_duplicates
16 |
--------------------------------------------------------------------------------
/assets/exact_dup.svg:
--------------------------------------------------------------------------------
1 |
11 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/asottile/blacken-docs
3 | rev: v1.12.0
4 | hooks:
5 | - id: blacken-docs
6 | additional_dependencies: [black==21.12b0]
7 | args: ["-l 79"]
8 | exclude: index.umd.js
9 | - repo: https://github.com/ambv/black
10 | rev: 22.3.0
11 | hooks:
12 | - id: black
13 | language_version: python3
14 | args: ["-l 79"]
15 | exclude: index.umd.js
16 | - repo: local
17 | hooks:
18 | - id: pylint
19 | name: pylint
20 | language: system
21 | files: \.py$
22 | entry: pylint
23 | args: ["--errors-only"]
24 | exclude: index.umd.js
25 | - repo: local
26 | hooks:
27 | - id: ipynb-strip
28 | name: ipynb-strip
29 | language: system
30 | files: \.ipynb$
31 | entry: jupyter nbconvert --clear-output --ClearOutputPreprocessor.enabled=True
32 | args: ["--log-level=ERROR"]
33 | - repo: https://github.com/pre-commit/mirrors-prettier
34 | rev: v2.6.2
35 | hooks:
36 | - id: prettier
37 | exclude: index.umd.js
38 | language_version: system
39 |
--------------------------------------------------------------------------------
/assets/approx_dup.svg:
--------------------------------------------------------------------------------
1 |
17 |
--------------------------------------------------------------------------------
/assets/view_groups.svg:
--------------------------------------------------------------------------------
1 |
4 |
--------------------------------------------------------------------------------
/assets/delete.svg:
--------------------------------------------------------------------------------
1 |
13 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Image Deduplication Plugin
2 |
3 | This is a Python plugin for FiftyOne that streamlines image deduplication workflows!
4 |
5 | With this plugin, you can:
6 |
7 | - Find _exact_ duplicate images using a hash function
8 | - Find _near_ duplicate images using an embedding model and similarity threshold
9 | - View and interact with duplicate images in the App
10 | - Remove all duplicates, or keep a representative image from each duplicate set
11 |
12 | ## Watch on YouTube
13 | [Watch the plugin walkthrough on YouTube](https://www.youtube.com/watch?v=aingeh0KdPw&list=PLuREAXoPgT0RZrUaT0UpX_HzwKkoB-S9j&index=5)
14 |
15 |
16 | ## Installation
17 |
18 | ```shell
19 | fiftyone plugins download https://github.com/jacobmarks/image-deduplication-plugin
20 | ```
21 |
22 | ## Operators
23 |
24 | ### `find_approximate_duplicate_images`
25 | 
26 |
27 |
28 | This operator finds near-duplicate images in a dataset using a specified similarity index paired with either a distance threshold or a fraction of samples to mark as duplicates.
29 |
30 | ### `find_exact_duplicate_images`
31 |
32 | 
33 |
34 | This operator finds exact duplicate images in a dataset using a hash function.
35 |
36 | ### `display_approximate_duplicate_groups`
37 | 
38 |
39 | This operator displays the images in a dataset that are near-duplicates of each other, grouped together.
40 |
41 | ### `display_exact_duplicate_groups`
42 | 
43 |
44 | This operator displays the images in a dataset that are exact duplicates of each other, grouped together.
45 |
46 | ### `remove_all_approximate_duplicates`
47 | 
48 |
49 | This operator removes all near-duplicate images from a dataset.
50 |
51 | ### `remove_all_exact_duplicates`
52 | 
53 |
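54 | This operator removes all exact duplicate images from a dataset. Every image whose file hash is shared with another image is deleted, so no copy is kept; use `deduplicate_exact_duplicates` to keep one image per duplicate set.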
55 |
56 | ### `deduplicate_approximate_duplicates`
57 |
58 | 
59 |
60 | This operator removes near-duplicate images from a dataset, _keeping a representative image_ from each duplicate set.
61 |
62 | ### `deduplicate_exact_duplicates`
63 |
64 | 
65 |
66 | This operator removes exact duplicate images from a dataset, _keeping a representative image_ from each duplicate set.
67 |
--------------------------------------------------------------------------------
/exact_dups.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 |
3 | import fiftyone as fo
4 | import fiftyone.core.utils as fou
5 | from fiftyone import ViewField as F
6 |
7 |
def get_filepath(sample):
    """Return the path from which a sample's media should be read.

    Args:
        sample: an object exposing ``filepath`` and, optionally,
            ``local_path``

    Returns:
        ``sample.local_path`` when the sample has that attribute, otherwise
        ``sample.filepath``
    """
    if hasattr(sample, "local_path"):
        return sample.local_path

    return sample.filepath
12 |
13 |
def compute_filehashes(sample_collection):
    """Store a file hash on every sample of the collection.

    Each sample's ``filehash`` field is set to the string form of
    ``fou.compute_filehash()`` applied to the path returned by
    ``get_filepath()``. Samples are iterated with ``autosave=True``, so no
    explicit save call is made here.

    Args:
        sample_collection: the collection whose samples should be hashed
    """
    samples = sample_collection.iter_samples(autosave=True)
    for sample in samples:
        digest = fou.compute_filehash(get_filepath(sample))
        sample["filehash"] = str(digest)
18 |
19 |
20 | def _need_to_compute_filehashes(sample_collection):
21 | return (
22 | True
23 | if "filehash" not in sample_collection.get_field_schema()
24 | else False
25 | )
26 |
27 |
def find_exact_duplicates(sample_collection):
    """Find samples whose file hash is shared with at least one other sample.

    File hashes are computed first when ``_need_to_compute_filehashes()``
    says so. The matching samples, sorted by ``filehash``, are saved on the
    underlying dataset as the saved view ``"exact_dup_view"`` (overwriting
    any existing view of that name).

    Args:
        sample_collection: the collection to search

    Returns:
        a dict with keys:

        -   ``"num_images_with_exact_dups"``: the number of samples in the
            saved view
        -   ``"num_dups"``: that number minus the number of distinct
            duplicated hashes
    """
    if _need_to_compute_filehashes(sample_collection):
        compute_filehashes(sample_collection)

    # Aggregate only the hash values instead of loading every sample, and
    # skip samples without a hash so that they are not counted as
    # duplicates of one another
    filehash_counts = Counter(
        filehash
        for filehash in sample_collection.values("filehash")
        if filehash is not None
    )
    dup_filehashes = [
        filehash for filehash, count in filehash_counts.items() if count > 1
    ]

    exact_dup_view = sample_collection.match(
        F("filehash").is_in(dup_filehashes)
    ).sort_by("filehash")

    # Persist the view so the display/remove/deduplicate helpers can load it
    dataset = sample_collection._dataset
    dataset.save_view("exact_dup_view", exact_dup_view, overwrite=True)

    num_images_with_exact_dups = len(exact_dup_view)
    num_dups = num_images_with_exact_dups - len(dup_filehashes)

    return {
        "num_images_with_exact_dups": num_images_with_exact_dups,
        "num_dups": num_dups,
    }
51 |
52 |
def get_exact_duplicate_groups(sample_collection):
    """Return the exact duplicates grouped by file hash.

    If the ``"exact_dup_view"`` saved view does not exist yet, exact
    duplicates are computed first, which matches what
    ``remove_all_exact_duplicates()`` and ``deduplicate_exact_duplicates()``
    do.

    Args:
        sample_collection: a collection whose ``_dataset`` holds (or will
            hold) the ``"exact_dup_view"`` saved view

    Returns:
        the saved view grouped by ``filehash``
    """
    dataset = sample_collection._dataset

    # Compute on demand rather than failing when "find" has not been run
    if "exact_dup_view" not in dataset.list_saved_views():
        find_exact_duplicates(sample_collection)

    exact_dup_view = dataset.load_saved_view("exact_dup_view")
    return exact_dup_view.group_by("filehash")
58 |
59 |
def remove_all_exact_duplicates(sample_collection):
    """Delete every sample contained in the ``"exact_dup_view"`` saved view.

    The view is computed via ``find_exact_duplicates()`` when it has not
    been saved yet. Note that no copy is kept: all samples in the view are
    deleted. The saved view itself is deleted afterwards.

    Args:
        sample_collection: a collection whose ``_dataset`` is modified
    """
    view_name = "exact_dup_view"
    dataset = sample_collection._dataset

    saved_views = dataset.list_saved_views()
    if view_name not in saved_views:
        find_exact_duplicates(sample_collection)

    duplicate_ids = dataset.load_saved_view(view_name).values("id")
    dataset.delete_samples(duplicate_ids)

    # The saved view is no longer meaningful once its samples are gone
    dataset.delete_saved_view(view_name)
71 |
72 |
def deduplicate_exact_duplicates(sample_collection):
    """Delete all but one sample from each set of exact duplicates.

    The ``"exact_dup_view"`` saved view is computed via
    ``find_exact_duplicates()`` when it does not exist yet. For every
    distinct ``filehash`` in that view, the first sample in view order is
    kept and the rest are deleted from the dataset. The saved view is
    deleted afterwards.

    Args:
        sample_collection: a collection whose ``_dataset`` is modified
    """
    dataset = sample_collection._dataset

    if "exact_dup_view" not in dataset.list_saved_views():
        find_exact_duplicates(sample_collection)

    exact_dup_view = dataset.load_saved_view("exact_dup_view")

    # Single pass over (id, filehash) pairs. The previous implementation
    # issued one match query per sample (not per distinct hash) and
    # collected every removal id multiple times
    sample_ids, filehashes = exact_dup_view.values(["id", "filehash"])

    seen_filehashes = set()
    remove_sample_ids = []
    for sample_id, filehash in zip(sample_ids, filehashes):
        if filehash in seen_filehashes:
            remove_sample_ids.append(sample_id)
        else:
            # Keep the first sample encountered for each hash
            seen_filehashes.add(filehash)

    dataset.delete_samples(remove_sample_ids)

    dataset.delete_saved_view("exact_dup_view")
92 |
--------------------------------------------------------------------------------
/assets/representative.svg:
--------------------------------------------------------------------------------
1 |
21 |
--------------------------------------------------------------------------------
/approx_dups.py:
--------------------------------------------------------------------------------
1 | import fiftyone as fo
2 | from fiftyone import ViewField as F
3 |
4 |
def get_filepath(sample):
    """Return the path from which a sample's media should be read.

    Args:
        sample: an object exposing ``filepath`` and, optionally,
            ``local_path``

    Returns:
        ``sample.local_path`` if the attribute is present, else
        ``sample.filepath``
    """
    has_local_path = hasattr(sample, "local_path")
    return sample.local_path if has_local_path else sample.filepath
9 |
10 |
def gen_approx_duplicate_groups_view(dataset, index):
    """Tag approximate duplicates with a group id and save a grouped view.

    For every ``(rep_id, dups)`` entry of ``index.neighbors_map``, the
    samples whose ids are ``rep_id`` or the first element of an entry in
    ``dups`` get their ``approx_dup_group_id`` field set to ``rep_id``. Only
    samples returned by ``index.duplicates_view()`` are considered. The
    result, grouped by ``approx_dup_group_id``, is saved on the dataset as
    ``"approx_dup_groups_view"`` (overwriting any existing view).

    Args:
        dataset: the dataset to tag and on which to save the view
        index: an object providing ``duplicates_view()`` and
            ``neighbors_map``
    """
    group_field = "approx_dup_group_id"

    duplicate_ids = index.duplicates_view().values("id")
    duplicates = dataset.select(duplicate_ids)

    for representative_id, neighbors in index.neighbors_map.items():
        member_ids = [representative_id]
        member_ids.extend(neighbor[0] for neighbor in neighbors)

        for member in duplicates.select(member_ids):
            member[group_field] = representative_id
            member.save()

    grouped_view = duplicates.group_by(group_field)
    dataset.save_view("approx_dup_groups_view", grouped_view, overwrite=True)
30 |
31 |
def find_approximate_duplicates(
    sample_collection, brain_key, threshold=None, fraction=None
):
    """Find approximate duplicates using a stored brain run.

    The brain results stored under ``brain_key`` are loaded from the
    underlying dataset and their ``find_duplicates()`` method is called with
    ``thresh=threshold`` when a threshold is given, otherwise with
    ``fraction=fraction``. Two saved views are written (overwriting existing
    ones): ``"approx_dup_view"`` and, via
    ``gen_approx_duplicate_groups_view()``, ``"approx_dup_groups_view"``.

    Args:
        sample_collection: a collection whose ``_dataset`` is used
        brain_key: the key of the brain run to load
        threshold (None): value passed as ``thresh``; takes precedence over
            ``fraction`` when not None
        fraction (None): value passed as ``fraction`` when ``threshold`` is
            None

    Returns:
        a dict with keys:

        -   ``"num_images_with_approx_dups"``: the number of samples in
            ``"approx_dup_view"``
        -   ``"num_dups"``: that number minus ``len(index.neighbors_map)``
    """
    dataset = sample_collection._dataset
    index = dataset.load_brain_results(brain_key)

    if threshold is None:
        index.find_duplicates(fraction=fraction)
    else:
        index.find_duplicates(thresh=threshold)

    # Save the full duplicates view, then work with the saved copy
    dataset.save_view(
        "approx_dup_view", index.duplicates_view(), overwrite=True
    )
    approx_dup_view = dataset.load_saved_view("approx_dup_view")

    # Save the grouped duplicates view
    gen_approx_duplicate_groups_view(dataset, index)

    image_count = len(approx_dup_view)
    group_count = len(index.neighbors_map)

    return {
        "num_images_with_approx_dups": image_count,
        "num_dups": image_count - group_count,
    }
62 |
63 |
def get_approximate_duplicate_groups(sample_collection):
    """Load the ``"approx_dup_groups_view"`` saved view.

    Args:
        sample_collection: a collection whose ``_dataset`` holds the saved
            view

    Returns:
        the result of loading the saved view from the dataset
    """
    return sample_collection._dataset.load_saved_view(
        "approx_dup_groups_view"
    )
68 |
69 |
def remove_all_approximate_duplicates(sample_collection):
    """Delete every sample contained in the ``"approx_dup_view"`` saved view.

    Both saved views created by the approximate duplicate search are deleted
    afterwards.

    Args:
        sample_collection: a collection whose ``_dataset`` is modified

    Raises:
        ValueError: if ``"approx_dup_view"`` has not been saved on the
            dataset
    """
    dataset = sample_collection._dataset

    has_results = "approx_dup_view" in dataset.list_saved_views()
    if not has_results:
        raise ValueError("Approximate duplicates have not been computed yet.")

    duplicate_ids = dataset.load_saved_view("approx_dup_view").values("id")
    dataset.delete_samples(duplicate_ids)

    # Drop the saved views that described the deleted samples
    for view_name in ("approx_dup_view", "approx_dup_groups_view"):
        dataset.delete_saved_view(view_name)
82 |
83 |
def deduplicate_approximate_duplicates(sample_collection):
    """Delete all but one sample from each approximate duplicate group.

    Groups are the distinct values of ``approx_dup_group_id`` within the
    ``"approx_dup_view"`` saved view. Within each group the samples are
    sorted by ``filepath``; the first one is kept and the others are deleted
    from the dataset. Both saved views are deleted afterwards.

    Args:
        sample_collection: a collection whose ``_dataset`` is modified

    Raises:
        ValueError: if ``"approx_dup_view"`` has not been saved on the
            dataset
    """
    dataset = sample_collection._dataset

    has_results = "approx_dup_view" in dataset.list_saved_views()
    if not has_results:
        raise ValueError("Approximate duplicates have not been computed yet.")

    duplicates = dataset.load_saved_view("approx_dup_view")

    ids_to_delete = []
    for group_id in duplicates.distinct("approx_dup_group_id"):
        in_group = F("approx_dup_group_id") == group_id
        ordered_ids = (
            duplicates.match(in_group).sort_by("filepath").values("id")
        )
        # Everything after the first sample of the group is removed
        ids_to_delete.extend(ordered_ids[1:])

    dataset.delete_samples(ids_to_delete)

    # Drop the saved views that described the deleted samples
    for view_name in ("approx_dup_view", "approx_dup_groups_view"):
        dataset.delete_saved_view(view_name)
105 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | """Image Deduplication plugin.
2 |
3 | | Copyright 2017-2023, Voxel51, Inc.
4 | | `voxel51.com <https://voxel51.com/>`_
5 | |
6 | """
7 | import os
8 |
9 | import fiftyone as fo
10 | from fiftyone.core.utils import add_sys_path
11 | import fiftyone.operators as foo
12 | from fiftyone.operators import types
13 |
14 |
def _execution_mode(ctx, inputs):
    """Add the "delegate execution" controls to an operator's input form.

    A required ``delegate`` checkbox (default False) is always added; its
    description depends on the current value of ``ctx.params["delegate"]``.
    When delegation is currently selected, a ``notice`` view explaining the
    need for a running delegated operation service is added as well.

    Args:
        ctx: the operator's execution context; only ``ctx.params`` is read
        inputs: the ``types.Object`` to which the properties are added
    """
    delegate = ctx.params.get("delegate", False)

    description = (
        "Uncheck this box to execute the operation immediately"
        if delegate
        else "Check this box to delegate execution of this task"
    )

    inputs.bool(
        "delegate",
        default=False,
        required=True,
        label="Delegate execution?",
        description=description,
        view=types.CheckboxView(),
    )

    if not delegate:
        return

    notice = types.Notice(
        label=(
            "You've chosen delegated execution. Note that you must "
            "have a delegated operation service running in order for "
            "this task to be processed. See "
            "https://docs.voxel51.com/plugins/index.html#operators "
            "for more information"
        )
    )
    inputs.view("notice", notice)
45 |
46 |
def get_similarity_runs(dataset):
    """
    Returns the brain keys on the given dataset whose run config class
    name contains "Similarity", in the order given by
    ``dataset.list_brain_runs()``.
    """
    return [
        brain_key
        for brain_key in dataset.list_brain_runs()
        if "Similarity" in dataset.get_brain_info(brain_key).config.cls
    ]
58 |
59 |
class FindExactDuplicates(foo.Operator):
    """Operator that finds exact duplicate images in the current dataset."""

    @property
    def config(self):
        """The operator's configuration, including its icon."""
        cfg = foo.OperatorConfig(
            name="find_exact_duplicate_images",
            label="Dedup: Find exact duplicates",
            description="Find exact duplicates in the dataset",
            dynamic=True,
        )
        cfg.icon = "/assets/exact_dup.svg"
        return cfg

    def resolve_delegation(self, ctx):
        """Return the ``delegate`` form parameter (False when unset)."""
        return ctx.params.get("delegate", False)

    def resolve_input(self, ctx):
        """Build the input form, which only holds the delegation controls."""
        inputs = types.Object()
        _execution_mode(ctx, inputs)

        form_view = types.View(
            label="Find exact duplicates",
            description="Find exact duplicates in the dataset",
        )
        return types.Property(inputs, view=form_view)

    def execute(self, ctx):
        """Run the search on ``ctx.dataset`` and return its summary dict."""
        plugin_dir = os.path.dirname(os.path.abspath(__file__))

        # The helper module lives next to this file, not on sys.path
        with add_sys_path(plugin_dir):
            # pylint: disable=no-name-in-module,import-error
            from exact_dups import find_exact_duplicates

        response = find_exact_duplicates(ctx.dataset)
        ctx.ops.reload_dataset()
        return response

    def resolve_output(self, ctx):
        """Describe the two result values shown after execution."""
        outputs = types.Object()
        outputs.str(
            "num_images_with_exact_dups",
            label="Number of images with exact duplicates",
        )
        outputs.str("num_dups", label="Number of exact duplicates")

        result_view = types.View(label="Exact Duplicate Results")
        return types.Property(outputs, view=result_view)
104 |
105 |
class DisplayExactDuplicates(foo.Operator):
    """Operator that shows the exact duplicate groups in the App."""

    @property
    def config(self):
        """The operator's configuration, including its icon."""
        cfg = foo.OperatorConfig(
            name="display_exact_duplicate_groups",
            label="Dedup: Display exact duplicates",
            description="Display exact duplicates in the dataset",
            dynamic=True,
        )
        cfg.icon = "/assets/view_groups.svg"
        return cfg

    def resolve_input(self, ctx):
        """Build an input form without any fields."""
        form_view = types.View(
            label="Display exact duplicates",
            description="Display exact duplicates in the dataset",
        )
        return types.Property(types.Object(), view=form_view)

    def execute(self, ctx):
        """Set the App's view to the grouped exact duplicates."""
        plugin_dir = os.path.dirname(os.path.abspath(__file__))

        # The helper module lives next to this file, not on sys.path
        with add_sys_path(plugin_dir):
            # pylint: disable=no-name-in-module,import-error
            from exact_dups import get_exact_duplicate_groups

        groups_view = get_exact_duplicate_groups(ctx.dataset)
        ctx.ops.set_view(view=groups_view)
133 |
134 |
class RemoveAllExactDuplicates(foo.Operator):
    """Operator that deletes all exact duplicates from the dataset."""

    @property
    def config(self):
        """The operator's configuration, including its icon."""
        cfg = foo.OperatorConfig(
            name="remove_all_exact_duplicates",
            label="Dedup: Remove all exact duplicates",
            description="Remove all exact duplicates from the dataset",
            dynamic=True,
        )
        cfg.icon = "/assets/delete.svg"
        return cfg

    def resolve_input(self, ctx):
        """Build an input form without any fields."""
        form_view = types.View(
            label="Remove all exact duplicates",
            description="Remove all exact duplicates from the dataset",
        )
        return types.Property(types.Object(), view=form_view)

    def execute(self, ctx):
        """Delete the duplicates from ``ctx.dataset`` and reload it."""
        plugin_dir = os.path.dirname(os.path.abspath(__file__))

        # The helper module lives next to this file, not on sys.path
        with add_sys_path(plugin_dir):
            # pylint: disable=no-name-in-module,import-error
            from exact_dups import remove_all_exact_duplicates

        remove_all_exact_duplicates(ctx.dataset)
        ctx.ops.reload_dataset()
162 |
163 |
class DeduplicateExactDuplicates(foo.Operator):
    """Operator that keeps one image per group of exact duplicates."""

    @property
    def config(self):
        """The operator's configuration, including its icon."""
        cfg = foo.OperatorConfig(
            name="deduplicate_exact_duplicates",
            label="Dedup: Deduplicate exact duplicates",
            description="Remove all but one copy from each group of exact duplicates in the dataset",
            dynamic=True,
        )
        cfg.icon = "/assets/representative.svg"
        return cfg

    def resolve_input(self, ctx):
        """Build an input form without any fields."""
        form_view = types.View(
            label="Deduplicate exact duplicates",
            description="Deduplicate exact duplicates in the dataset",
        )
        return types.Property(types.Object(), view=form_view)

    def execute(self, ctx):
        """Deduplicate ``ctx.dataset`` and reload it."""
        plugin_dir = os.path.dirname(os.path.abspath(__file__))

        # The helper module lives next to this file, not on sys.path
        with add_sys_path(plugin_dir):
            # pylint: disable=no-name-in-module,import-error
            from exact_dups import deduplicate_exact_duplicates

        deduplicate_exact_duplicates(ctx.dataset)
        ctx.ops.reload_dataset()
191 |
192 |
class FindApproximateDuplicates(foo.Operator):
    """Operator that finds approximate duplicates using a similarity run.

    The user picks a similarity run and either a distance threshold or a
    fraction of the dataset to mark as duplicates.
    """

    @property
    def config(self):
        """The operator's configuration, including its icon."""
        _config = foo.OperatorConfig(
            name="find_approximate_duplicate_images",
            label="Dedup: Find approximate duplicates",
            description="Find approximate duplicates in the dataset",
            dynamic=True,
        )
        _config.icon = "/assets/approx_dup.svg"
        return _config

    def resolve_delegation(self, ctx):
        """Return the ``delegate`` form parameter (False when unset)."""
        return ctx.params.get("delegate", False)

    def resolve_input(self, ctx):
        """Build the input form.

        The form defines these parameters, which ``execute()`` reads back:
        ``sim_choices`` (the brain key, only when similarity runs exist),
        ``method_choices`` (``"threshold"`` or ``"fraction"``), and either
        ``dup_fraction`` or ``threshold_value`` depending on the method.
        """
        inputs = types.Object()
        form_view = types.View(
            label="Find Approximate Duplicates",
            description="Find approximate duplicates in the dataset using embeddings",
        )

        sim_keys = get_similarity_runs(ctx.dataset)
        if not sim_keys:
            inputs.str(
                "no_similarity_run_warning",
                view=types.Warning(
                    label="No Similarity Runs",
                    description="You must generate a similarity index on the dataset before you can find approximate duplicates. \n\nSee ```fob.compute_similarity()```",
                ),
            )
        else:
            sim_choices = types.Dropdown(label="Similarity Run")
            for sim_key in sim_keys:
                sim_choices.add_choice(sim_key, label=sim_key)
            inputs.enum(
                "sim_choices",
                sim_choices.values(),
                default=sim_choices.choices[0].value,
                view=sim_choices,
            )

        method_choices = types.RadioGroup()
        method_choices.add_choice("threshold", label="Threshold")
        method_choices.add_choice("fraction", label="Fraction")
        inputs.enum(
            "method_choices",
            method_choices.values(),
            default=method_choices.choices[0].value,
            label="Approximate Duplicate Selection Method",
            view=method_choices,
        )

        # Only show the control that belongs to the selected method
        if ctx.params.get("method_choices") == "fraction":
            fraction_slider = types.SliderView(
                label="Fraction of dataset to select",
                description="Select the fraction of the dataset to mark as approximate duplicates",
                componentsProps={
                    "slider": {"min": 0, "max": 1, "step": 0.01}
                },
            )
            inputs.float("dup_fraction", default=0.1, view=fraction_slider)
        else:
            inputs.float(
                "threshold_value",
                default=0.5,
                label="Distance Threshold",
                description="Select the distance threshold for determining approximate duplicates",
            )

        _execution_mode(ctx, inputs)
        return types.Property(inputs, view=form_view)

    def execute(self, ctx):
        """Run the approximate duplicate search on ``ctx.dataset``.

        Returns:
            the summary dict returned by ``find_approximate_duplicates()``

        Raises:
            ValueError: if no similarity run was selected
        """
        with add_sys_path(os.path.dirname(os.path.abspath(__file__))):
            # pylint: disable=no-name-in-module,import-error
            from approx_dups import find_approximate_duplicates

        sample_collection = ctx.dataset

        brain_key = ctx.params.get("sim_choices", None)
        if brain_key is None:
            # The form offers no `sim_choices` field when the dataset has
            # no similarity runs; fail with an actionable message
            raise ValueError(
                "No similarity run selected. Generate a similarity index "
                "on the dataset before finding approximate duplicates."
            )

        # The form field is named `method_choices`; reading `method` always
        # fell back to the threshold branch, ignoring a "fraction" choice
        method = ctx.params.get("method_choices", "threshold")

        if method == "fraction":
            fraction = ctx.params.get("dup_fraction", 0.1)
            response = find_approximate_duplicates(
                sample_collection, brain_key, fraction=fraction
            )
        else:
            threshold = ctx.params.get("threshold_value", 0.5)
            response = find_approximate_duplicates(
                sample_collection, brain_key, threshold=threshold
            )

        # A field and saved views were written; refresh the App like the
        # exact-duplicate operator does
        ctx.ops.reload_dataset()
        return response

    def resolve_output(self, ctx):
        """Describe the two result values shown after execution."""
        outputs = types.Object()
        outputs.str(
            "num_images_with_approx_dups",
            label="Number of images with approximate duplicates",
        )
        outputs.str("num_dups", label="Number of approximate duplicates")
        header = "Approximate Duplicate Results"
        return types.Property(outputs, view=types.View(label=header))
297 |
298 |
class DisplayApproximateDuplicates(foo.Operator):
    """Operator that shows the approximate duplicate groups in the App."""

    @property
    def config(self):
        """The operator's configuration, including its icon."""
        cfg = foo.OperatorConfig(
            name="display_approximate_duplicate_groups",
            label="Dedup: Display approximate duplicates",
            description="Display approximate duplicates in the dataset",
            dynamic=True,
        )
        cfg.icon = "/assets/view_groups.svg"
        return cfg

    def resolve_input(self, ctx):
        """Build an input form without any fields."""
        form_view = types.View(
            label="Display approximate duplicates",
            description="Display approximate duplicates in the dataset",
        )
        return types.Property(types.Object(), view=form_view)

    def execute(self, ctx):
        """Set the App's view to the grouped approximate duplicates."""
        plugin_dir = os.path.dirname(os.path.abspath(__file__))

        # The helper module lives next to this file, not on sys.path
        with add_sys_path(plugin_dir):
            # pylint: disable=no-name-in-module,import-error
            from approx_dups import get_approximate_duplicate_groups

        groups_view = get_approximate_duplicate_groups(ctx.dataset)
        ctx.ops.set_view(view=groups_view)
326 |
327 |
class RemoveAllApproximateDuplicates(foo.Operator):
    """Operator that deletes all approximate duplicates from the dataset."""

    @property
    def config(self):
        """The operator's configuration, including its icon."""
        cfg = foo.OperatorConfig(
            name="remove_all_approximate_duplicates",
            label="Dedup: Remove all approximate duplicates",
            description="Remove all approximate duplicates from the dataset",
            dynamic=True,
        )
        cfg.icon = "/assets/delete.svg"
        return cfg

    def resolve_input(self, ctx):
        """Build an input form without any fields."""
        form_view = types.View(
            label="Remove all approximate duplicates",
            description="Remove all approximate duplicates from the dataset",
        )
        return types.Property(types.Object(), view=form_view)

    def execute(self, ctx):
        """Delete the duplicates from ``ctx.dataset`` and reload it."""
        plugin_dir = os.path.dirname(os.path.abspath(__file__))

        # The helper module lives next to this file, not on sys.path
        with add_sys_path(plugin_dir):
            # pylint: disable=no-name-in-module,import-error
            from approx_dups import remove_all_approximate_duplicates

        remove_all_approximate_duplicates(ctx.dataset)
        ctx.ops.reload_dataset()
355 |
356 |
class DeduplicateApproximateDuplicates(foo.Operator):
    """Operator that keeps one image per group of approximate duplicates."""

    @property
    def config(self):
        """The operator's configuration, including its icon."""
        cfg = foo.OperatorConfig(
            name="deduplicate_approximate_duplicates",
            label="Dedup: Deduplicate approximate duplicates",
            description="Remove all but one copy from each group of approximate duplicates in the dataset",
            dynamic=True,
        )
        cfg.icon = "/assets/representative.svg"
        return cfg

    def resolve_input(self, ctx):
        """Build an input form without any fields."""
        form_view = types.View(
            label="Deduplicate approximate duplicates",
            description="Deduplicate approximate duplicates in the dataset",
        )
        return types.Property(types.Object(), view=form_view)

    def execute(self, ctx):
        """Deduplicate ``ctx.dataset`` and reload it."""
        plugin_dir = os.path.dirname(os.path.abspath(__file__))

        # The helper module lives next to this file, not on sys.path
        with add_sys_path(plugin_dir):
            # pylint: disable=no-name-in-module,import-error
            from approx_dups import deduplicate_approximate_duplicates

        deduplicate_approximate_duplicates(ctx.dataset)
        ctx.ops.reload_dataset()
384 |
385 |
def register(plugin):
    """Register every operator of this plugin on ``plugin``.

    The exact-duplicate operators are registered first, followed by the
    approximate-duplicate operators.
    """
    operators = (
        FindExactDuplicates,
        DisplayExactDuplicates,
        RemoveAllExactDuplicates,
        DeduplicateExactDuplicates,
        FindApproximateDuplicates,
        DisplayApproximateDuplicates,
        RemoveAllApproximateDuplicates,
        DeduplicateApproximateDuplicates,
    )
    for operator_cls in operators:
        plugin.register(operator_cls)
395 |
--------------------------------------------------------------------------------