├── .build
├── transifex-pull.cmd
├── release.cmd
├── debug.cmd
├── generate-pot.cmd
└── build.cmd
├── .docs
├── menu.png
├── configuration.png
├── find-book-output-1.png
├── manage-exemptions.png
├── find-book-duplicates.png
├── library-preferences.png
├── find-library-duplicates.png
├── find-library-output-1.png
└── find-metadata-variations.png
├── images
├── next_result.png
├── find_duplicates.png
└── previous_result.png
├── plugin-import-name-find_duplicates.txt
├── .tx
└── config
├── translations
├── README.md
└── find-duplicates.pot
├── common_compatibility.py
├── __init__.py
├── README.md
├── common_icons.py
├── common_menus.py
├── changelog.txt
├── variation_algorithms.py
├── config.py
├── common_dialogs.py
├── common_widgets.py
├── action.py
├── book_algorithms.py
├── matching.py
└── LICENSE.md
/.build/transifex-pull.cmd:
--------------------------------------------------------------------------------
@rem Runs "tx.exe pull -f -a" from the parent of the directory this is started in.
@rem "pushd .." records the current directory before moving up, so the
@rem closing "popd" returns the caller to where they started. A bare
@rem "pushd" followed by "cd .." saves nothing, which left the caller
@rem stranded in the parent directory.
@pushd ..
tx.exe pull -f -a
@popd
--------------------------------------------------------------------------------
/.docs/menu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komali2/calibre-find-duplicates/HEAD/.docs/menu.png
--------------------------------------------------------------------------------
/.docs/configuration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komali2/calibre-find-duplicates/HEAD/.docs/configuration.png
--------------------------------------------------------------------------------
/images/next_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komali2/calibre-find-duplicates/HEAD/images/next_result.png
--------------------------------------------------------------------------------
/.docs/find-book-output-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komali2/calibre-find-duplicates/HEAD/.docs/find-book-output-1.png
--------------------------------------------------------------------------------
/.docs/manage-exemptions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komali2/calibre-find-duplicates/HEAD/.docs/manage-exemptions.png
--------------------------------------------------------------------------------
/images/find_duplicates.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komali2/calibre-find-duplicates/HEAD/images/find_duplicates.png
--------------------------------------------------------------------------------
/images/previous_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komali2/calibre-find-duplicates/HEAD/images/previous_result.png
--------------------------------------------------------------------------------
/plugin-import-name-find_duplicates.txt:
--------------------------------------------------------------------------------
1 | /home/caleb/src/calibre-find-duplicates/plugin-import-name-find_duplicates.txt
2 |
--------------------------------------------------------------------------------
/.docs/find-book-duplicates.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komali2/calibre-find-duplicates/HEAD/.docs/find-book-duplicates.png
--------------------------------------------------------------------------------
/.docs/library-preferences.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komali2/calibre-find-duplicates/HEAD/.docs/library-preferences.png
--------------------------------------------------------------------------------
/.docs/find-library-duplicates.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komali2/calibre-find-duplicates/HEAD/.docs/find-library-duplicates.png
--------------------------------------------------------------------------------
/.docs/find-library-output-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komali2/calibre-find-duplicates/HEAD/.docs/find-library-output-1.png
--------------------------------------------------------------------------------
/.docs/find-metadata-variations.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/komali2/calibre-find-duplicates/HEAD/.docs/find-metadata-variations.png
--------------------------------------------------------------------------------
/.build/release.cmd:
--------------------------------------------------------------------------------
1 | @echo off
2 | call build.cmd
3 |
4 | cd ..
5 |
6 | python ..\common\release.py "%CALIBRE_GITHUB_TOKEN%"
7 |
8 | cd .build
9 |
--------------------------------------------------------------------------------
/.build/debug.cmd:
--------------------------------------------------------------------------------
1 | @echo off
2 | call build.cmd
3 |
4 | set CALIBRE_DEVELOP_FROM=
5 | set CALIBRE_OVERRIDE_LANG=
6 |
7 | echo Starting calibre in debug mode
8 | calibre-debug -g
--------------------------------------------------------------------------------
/.tx/config:
--------------------------------------------------------------------------------
1 | [main]
2 | host = https://www.transifex.com
3 |
4 | [o:calibre:p:calibre-plugins:r:find-duplicates]
5 | file_filter = translations/ '+_('Are you sure you want to change your settings in this library for this plugin?')+' '+_('Any settings in other libraries or stored in a JSON file in your calibre plugins ' \
191 | 'folder will not be touched.')+'
'+_('Settings for this plugin in this library have been changed.')+'
' \ 202 | ''+_('Please restart calibre now.')+'
') 203 | self.close() 204 | if restart: 205 | self.gui.quit(restart=True) 206 | 207 | def _clear_settings(self): 208 | from calibre.gui2.dialogs.confirm_delete import confirm 209 | message = ''+_('Are you sure you want to clear your settings in this library for this plugin?')+'
' \ 210 | ''+_('Any settings in other libraries or stored in a JSON file in your calibre plugins ' \ 211 | 'folder will not be touched.')+'
' \ 212 | ''+_('You must restart calibre afterwards.')+'
' 213 | if not confirm(message, self.namespace+'_clear_settings', self): 214 | return 215 | 216 | ns_prefix = self._get_ns_prefix() 217 | keys = [k for k in six.iterkeys(self.db.prefs) if k.startswith(ns_prefix)] 218 | for k in keys: 219 | del self.db.prefs[k] 220 | self._populate_settings() 221 | restart = prompt_for_restart(self, _('Settings deleted'), 222 | ''+_('All settings for this plugin in this library have been cleared.')+'
' 223 | ''+_('Please restart calibre now.')+'
') 224 | self.close() 225 | if restart: 226 | self.gui.quit(restart=True) 227 | 228 | 229 | 230 | class ProgressBarDialog(QDialog): 231 | def __init__(self, parent=None, max_items=100, window_title='Progress Bar', 232 | label='Label goes here', on_top=False): 233 | if on_top: 234 | super(ProgressBarDialog, self).__init__(parent=parent, flags=Qt.WindowStaysOnTopHint) 235 | else: 236 | super(ProgressBarDialog, self).__init__(parent=parent) 237 | self.application = Application 238 | self.setWindowTitle(window_title) 239 | self.l = QVBoxLayout(self) 240 | self.setLayout(self.l) 241 | 242 | self.label = QLabel(label) 243 | # self.label.setAlignment(Qt.AlignHCenter) 244 | self.l.addWidget(self.label) 245 | 246 | self.progressBar = QProgressBar(self) 247 | self.progressBar.setRange(0, max_items) 248 | self.progressBar.setValue(0) 249 | self.l.addWidget(self.progressBar) 250 | 251 | def increment(self): 252 | self.progressBar.setValue(self.progressBar.value() + 1) 253 | self.refresh() 254 | 255 | def refresh(self): 256 | self.application.processEvents() 257 | 258 | def set_label(self, value): 259 | self.label.setText(value) 260 | self.refresh() 261 | 262 | def left_align_label(self): 263 | self.label.setAlignment(Qt.AlignLeft ) 264 | 265 | def set_maximum(self, value): 266 | self.progressBar.setMaximum(value) 267 | self.refresh() 268 | 269 | def set_value(self, value): 270 | self.progressBar.setValue(value) 271 | self.refresh() 272 | 273 | def set_progress_format(self, progress_format=None): 274 | pass 275 | 276 | 277 | class ViewLogDialog(QDialog): 278 | 279 | def __init__(self, title, html, parent=None): 280 | QDialog.__init__(self, parent) 281 | self.l = l = QVBoxLayout() 282 | self.setLayout(l) 283 | 284 | self.tb = QTextBrowser(self) 285 | QApplication.setOverrideCursor(Qt.WaitCursor) 286 | # Rather than formatting the text in blocks like the calibre
287 | # ViewLog does, instead just format it inside divs to keep style formatting
288 | html = html.replace('\t',' ').replace('\n', '
')
289 | html = html.replace('> ','> ')
290 | self.tb.setHtml('%s' % html)
291 | QApplication.restoreOverrideCursor()
292 | l.addWidget(self.tb)
293 |
294 | self.bb = QDialogButtonBox(QDialogButtonBox.Ok)
295 | self.bb.accepted.connect(self.accept)
296 | self.bb.rejected.connect(self.reject)
297 | self.copy_button = self.bb.addButton(_('Copy to clipboard'),
298 | self.bb.ActionRole)
299 | self.copy_button.setIcon(QIcon(I('edit-copy.png')))
300 | self.copy_button.clicked.connect(self.copy_to_clipboard)
301 | l.addWidget(self.bb)
302 | self.setModal(False)
303 | self.resize(QSize(700, 500))
304 | self.setWindowTitle(title)
305 | self.setWindowIcon(QIcon(I('debug.png')))
306 | self.show()
307 |
308 | def copy_to_clipboard(self):
309 | txt = self.tb.toPlainText()
310 | QApplication.clipboard().setText(txt)
311 |
--------------------------------------------------------------------------------
/common_widgets.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
3 | from __future__ import (unicode_literals, division, absolute_import,
4 | print_function)
5 |
6 | __license__ = 'GPL v3'
7 | __copyright__ = '2022, Grant Drake'
8 |
9 | from six import text_type as unicode
10 |
11 | try:
12 | from qt.core import (Qt, QTableWidgetItem, QComboBox, QHBoxLayout, QLabel, QFont,
13 | QDateTime, QStyledItemDelegate, QLineEdit)
14 | except ImportError:
15 | from PyQt5.Qt import (Qt, QTableWidgetItem, QComboBox, QHBoxLayout, QLabel, QFont,
16 | QDateTime, QStyledItemDelegate, QLineEdit)
17 |
18 | try:
19 | load_translations()
20 | except NameError:
21 | pass # load_translations() added in calibre 1.9
22 |
23 | from calibre.gui2 import error_dialog, UNDEFINED_QDATETIME
24 | from calibre.utils.date import now, format_date, UNDEFINED_DATE
25 |
26 | from calibre_plugins.find_duplicates.common_icons import get_pixmap
27 |
28 | # get_date_format
29 | #
30 | # CheckableTableWidgetItem
31 | # DateDelegate
32 | # DateTableWidgetItem
33 | # ImageTitleLayout
34 | # ReadOnlyTableWidgetItem
35 | # ReadOnlyTextIconWidgetItem
36 | # ReadOnlyCheckableTableWidgetItem
37 | # TextIconWidgetItem
38 | #
39 | # CustomColumnComboBox
40 | # KeyValueComboBox
41 | # NoWheelComboBox
42 | # ReadOnlyLineEdit
43 |
44 | # ----------------------------------------------
45 | # Functions
46 | # ----------------------------------------------
47 |
def get_date_format(tweak_name='gui_timestamp_display_format', default_fmt='dd MMM yyyy'):
    '''
    Look up a date display format in calibre's tweaks.

    tweak_name:  key to read from calibre.utils.config.tweaks
    default_fmt: format returned when the tweak is missing or set to None

    Returns the format stored under tweak_name, or default_fmt.
    '''
    # Imported lazily, exactly as the original did.
    from calibre.utils.config import tweaks
    try:
        fmt = tweaks[tweak_name]
    except KeyError:
        # An unknown tweak name is treated like an unset one rather than
        # propagating a KeyError to the caller.
        fmt = None
    if fmt is None:
        fmt = default_fmt
    return fmt
54 |
55 | # ----------------------------------------------
56 | # Widgets
57 | # ----------------------------------------------
58 |
class CheckableTableWidgetItem(QTableWidgetItem):
    '''
    For use in a table cell, displays a checkbox that can potentially be tristate
    '''
    def __init__(self, checked=False, is_tristate=False):
        '''
        checked:     True for checked, False for unchecked; None selects the
                     partially checked state when is_tristate is set
        is_tristate: when true, the user-tristate item flag is added
        '''
        super(CheckableTableWidgetItem, self).__init__('')
        # Build the complete flag set (including the optional tristate bit)
        # inside one try/except so that the unscoped-enum fallback also covers
        # the tristate flag. Previously only the base flags had a fallback.
        try:
            flags = (Qt.ItemFlag.ItemIsSelectable
                     | Qt.ItemFlag.ItemIsUserCheckable
                     | Qt.ItemFlag.ItemIsEnabled)
            if is_tristate:
                flags = flags | Qt.ItemFlag.ItemIsUserTristate
            self.setFlags(flags)
        except Exception:
            # For Qt backwards compatibility: unscoped enum names.
            flags = Qt.ItemFlags(Qt.ItemIsSelectable | Qt.ItemIsUserCheckable | Qt.ItemIsEnabled)
            if is_tristate:
                flags = flags | Qt.ItemIsUserTristate
            self.setFlags(flags)
        if checked:
            self.setCheckState(Qt.Checked)
        elif is_tristate and checked is None:
            self.setCheckState(Qt.PartiallyChecked)
        else:
            self.setCheckState(Qt.Unchecked)

    def get_boolean_value(self):
        '''
        Return a boolean value indicating whether checkbox is checked
        If this is a tristate checkbox, a partially checked value is returned as None
        '''
        state = self.checkState()
        if state == Qt.PartiallyChecked:
            return None
        return state == Qt.Checked
88 |
89 | from calibre.gui2.library.delegates import DateDelegate as _DateDelegate
class DateDelegate(_DateDelegate):
    '''
    Date column delegate. The display format is kept on the instance, so
    every column needs its own delegate object (unlike the other delegates).
    '''
    def __init__(self, parent, fmt='dd MMM yyyy', default_to_today=True):
        '''
        fmt:              fallback format handed to get_date_format()
        default_to_today: consulted by setEditorData() when the cell is undefined
        '''
        super(DateDelegate, self).__init__(parent)
        self.default_to_today = default_to_today
        self.format = get_date_format(default_fmt=fmt)

    def createEditor(self, parent, option, index):
        # Calls QStyledItemDelegate.createEditor directly rather than going
        # through the immediate base class.
        editor = QStyledItemDelegate.createEditor(self, parent, option, index)
        editor.setDisplayFormat(self.format)
        editor.setMinimumDateTime(UNDEFINED_QDATETIME)
        editor.setSpecialValueText(_('Undefined'))
        editor.setCalendarPopup(True)
        return editor

    def setEditorData(self, editor, index):
        current = index.model().data(index, Qt.DisplayRole)
        if current is None or current == UNDEFINED_QDATETIME:
            # NOTE(review): default_date is not assigned anywhere in this
            # class; presumably the base delegate provides it - confirm.
            current = self.default_date if self.default_to_today else UNDEFINED_QDATETIME
        editor.setDateTime(current)

    def setModelData(self, editor, model, index):
        chosen = editor.dateTime()
        # Anything at or below the sentinel is stored as the sentinel itself.
        if chosen <= UNDEFINED_QDATETIME:
            new_value = UNDEFINED_QDATETIME
        else:
            new_value = QDateTime(chosen)
        model.setData(index, new_value, Qt.EditRole)
124 |
125 |
class DateTableWidgetItem(QTableWidgetItem):
    '''
    Table cell holding a date as a QDateTime under the display role.
    '''
    def __init__(self, date_read, is_read_only=False, default_to_today=False, fmt=None):
        '''
        date_read:        the date to show; None is always replaced by now()
        is_read_only:     when true the cell gets formatted text and only the
                          selectable/enabled flags
        default_to_today: when true, UNDEFINED_DATE is also replaced by now()
        fmt:              passed through to format_date() for read-only cells
        '''
        # Parentheses spell out the precedence the expression already had:
        # None always falls back to now(); UNDEFINED_DATE only when asked to.
        if date_read is None or (date_read == UNDEFINED_DATE and default_to_today):
            date_read = now()

        cell_text = format_date(date_read, fmt) if is_read_only else ''
        super(DateTableWidgetItem, self).__init__(cell_text)
        if is_read_only:
            self.setFlags(Qt.ItemIsSelectable|Qt.ItemIsEnabled)
        self.setData(Qt.DisplayRole, QDateTime(date_read))
138 |
139 |
class ImageTitleLayout(QHBoxLayout):
    '''
    Horizontal layout showing an icon, then a title in a 16pt font, then stretch.
    '''
    def __init__(self, parent, icon_name, title):
        '''
        parent:    widget used as parent of the two labels
        icon_name: name handed to get_pixmap() for the icon
        title:     text of the title label
        '''
        super(ImageTitleLayout, self).__init__()
        self.title_image_label = QLabel(parent)
        self.update_title_icon(icon_name)
        self.addWidget(self.title_image_label)

        heading_font = QFont()
        heading_font.setPointSize(16)
        heading_label = QLabel(title, parent)
        heading_label.setFont(heading_font)
        self.addWidget(heading_label)
        self.insertStretch(-1)

    def update_title_icon(self, icon_name):
        '''
        Show the pixmap for icon_name, or an error dialog when it cannot be found.
        '''
        pixmap = get_pixmap(icon_name)
        if pixmap is not None:
            image_label = self.title_image_label
            image_label.setPixmap(pixmap)
            image_label.setMaximumSize(32, 32)
            image_label.setScaledContents(True)
            return
        error_dialog(self.parent(), _('Restart required'),
                     _('Title image not found - you must restart Calibre before using this plugin!'), show=True)
166 |
167 |
class ReadOnlyTableWidgetItem(QTableWidgetItem):
    '''
    Table cell showing text; only the selectable and enabled flags are set,
    so the item carries no editable flag.
    '''
    def __init__(self, text):
        # A None text is shown as an empty cell.
        shown = '' if text is None else text
        super(ReadOnlyTableWidgetItem, self).__init__(shown)
        self.setFlags(Qt.ItemIsSelectable|Qt.ItemIsEnabled)
177 |
178 |
class ReadOnlyTextIconWidgetItem(ReadOnlyTableWidgetItem):
    '''
    Read-only table cell showing text with an optional icon.
    '''
    def __init__(self, text, icon):
        super(ReadOnlyTextIconWidgetItem, self).__init__(text)
        # A falsy icon (e.g. None) leaves the cell without an icon.
        if not icon:
            return
        self.setIcon(icon)
187 |
class ReadOnlyCheckableTableWidgetItem(ReadOnlyTableWidgetItem):
    '''
    For use in a table cell, displays a checkbox next to some text the user cannot select or modify.
    '''
    def __init__(self, text, checked=False, is_tristate=False):
        '''
        text:        text displayed next to the checkbox
        checked:     True for checked, False for unchecked; None selects the
                     partially checked state when is_tristate is set
        is_tristate: when true, a tristate item flag is added
        '''
        super(ReadOnlyCheckableTableWidgetItem, self).__init__(text)
        try: # For Qt Backwards compatibility.
            self.setFlags(Qt.ItemFlag.ItemIsSelectable | Qt.ItemFlag.ItemIsUserCheckable | Qt.ItemFlag.ItemIsEnabled )
        except Exception:
            self.setFlags(Qt.ItemFlags(Qt.ItemIsSelectable | Qt.ItemIsUserCheckable | Qt.ItemIsEnabled ))
        if is_tristate:
            # Qt.ItemIsTristate is a deprecated alias of ItemIsAutoTristate
            # that newer Qt no longer provides. Keep using it where it
            # exists so behaviour is unchanged there, otherwise use the flag
            # it aliased instead of raising AttributeError.
            tristate_flag = getattr(Qt, 'ItemIsTristate', None)
            if tristate_flag is None:
                tristate_flag = Qt.ItemFlag.ItemIsAutoTristate
            self.setFlags(self.flags() | tristate_flag)
        if checked:
            self.setCheckState(Qt.Checked)
        elif is_tristate and checked is None:
            self.setCheckState(Qt.PartiallyChecked)
        else:
            self.setCheckState(Qt.Unchecked)

    def get_boolean_value(self):
        '''
        Return a boolean value indicating whether checkbox is checked
        If this is a tristate checkbox, a partially checked value is returned as None
        '''
        state = self.checkState()
        if state == Qt.PartiallyChecked:
            return None
        return state == Qt.Checked
217 |
218 |
class TextIconWidgetItem(QTableWidgetItem):
    '''
    Table cell that shows the given text together with the given icon.
    '''
    def __init__(self, text, icon):
        # The text goes to the base constructor; the icon is applied
        # unconditionally afterwards.
        super(TextIconWidgetItem, self).__init__(text)
        self.setIcon(icon)
226 |
227 |
228 | # ----------------------------------------------
229 | # Controls
230 | # ----------------------------------------------
231 |
232 |
class CustomColumnComboBox(QComboBox):
    '''
    Combo box listing some initial items followed by custom columns, with an
    optional trailing "Create new column" entry that invokes a callback.

    self.column_names runs parallel to the combo items and holds the value
    reported by get_selected_column() for each row.
    '''
    CREATE_NEW_COLUMN_ITEM = _("Create new column")

    # The mutable defaults are kept for interface compatibility; they are
    # only read, never modified.
    def __init__(self, parent, custom_columns={}, selected_column='', initial_items=[''], create_column_callback=None):
        '''
        custom_columns:         mapping of lookup key -> dict with a 'name' entry
        selected_column:        key (or initial item) to select after populating
        initial_items:          list of items, or dict of key -> display name,
                                placed before the custom columns
        create_column_callback: when given, a "Create new column" item is
                                appended and the callback is invoked when that
                                item becomes current
        '''
        super(CustomColumnComboBox, self).__init__(parent)
        self.create_column_callback = create_column_callback
        self.current_index = 0
        if create_column_callback is not None:
            self.currentTextChanged.connect(self.current_text_changed)
        self.populate_combo(custom_columns, selected_column, initial_items)

    def populate_combo(self, custom_columns, selected_column, initial_items=[''], show_lookup_name=True):
        '''
        Rebuild the combo contents and select selected_column if present
        (otherwise the first item).

        show_lookup_name: when true, custom columns display as "key (name)",
                          otherwise just the name.
        '''
        self.clear()
        self.column_names = []
        selected_idx = 0

        if isinstance(initial_items, dict):
            for key in sorted(initial_items.keys()):
                self.column_names.append(key)
                self.addItem(initial_items[key])
                if key == selected_column:
                    selected_idx = len(self.column_names) - 1
        else:
            for display_name in initial_items:
                self.column_names.append(display_name)
                self.addItem(display_name)
                if display_name == selected_column:
                    selected_idx = len(self.column_names) - 1

        for key in sorted(custom_columns.keys()):
            self.column_names.append(key)
            column_title = custom_columns[key]['name']
            display_name = '%s (%s)'%(key, column_title) if show_lookup_name else column_title
            self.addItem(display_name)
            if key == selected_column:
                selected_idx = len(self.column_names) - 1

        if self.create_column_callback is not None:
            self.addItem(self.CREATE_NEW_COLUMN_ITEM)
            self.column_names.append(self.CREATE_NEW_COLUMN_ITEM)

        self.setCurrentIndex(selected_idx)

    def get_selected_column(self):
        '''
        Return the entry of column_names for the current item, or None when
        nothing is selected or the "Create new column" item is current.
        '''
        idx = self.currentIndex()
        # currentIndex() is -1 when there is no current item. Indexing with
        # it would raise IndexError on an empty list or silently return the
        # last entry, so report "no selection" instead.
        if idx < 0 or idx >= len(self.column_names):
            return None
        selected_column = self.column_names[idx]
        if selected_column == self.CREATE_NEW_COLUMN_ITEM:
            return None
        return selected_column

    def current_text_changed(self, new_text):
        '''
        Slot for currentTextChanged: run the create-column callback when the
        "Create new column" item is chosen, restoring the previous index if
        the callback returns a falsy result; otherwise remember the new index.
        '''
        if new_text == self.CREATE_NEW_COLUMN_ITEM:
            result = self.create_column_callback()
            if not result:
                self.setCurrentIndex(self.current_index)
        else:
            self.current_index = self.currentIndex()
289 |
290 |
class KeyValueComboBox(QComboBox):
    '''
    Combo box populated from a mapping of key -> display text, reporting the
    key of the current item.
    '''

    def __init__(self, parent, values, selected_key):
        '''
        values:       mapping of key -> display text (iterated with .items())
        selected_key: key whose item should be current after populating
        '''
        QComboBox.__init__(self, parent)
        self.values = values
        self.populate_combo(selected_key)

    def populate_combo(self, selected_key):
        '''
        Rebuild the items from self.values and select the item for
        selected_key; when the key is absent the current index is set to -1.
        '''
        self.clear()
        # Keys in item order, so the current index maps straight back to a key.
        self._keys = []
        selected_idx = -1
        for key, value in self.values.items():
            self._keys.append(key)
            self.addItem(value)
            if key == selected_key:
                selected_idx = len(self._keys) - 1
        self.setCurrentIndex(selected_idx)

    def selected_key(self):
        '''
        Return the key of the current item, or None when no item is current.

        The lookup is by item position rather than by display text, so
        entries that share a display text, or whose text has surrounding
        whitespace, still resolve to the right key.
        '''
        idx = self.currentIndex()
        keys = getattr(self, '_keys', [])
        if 0 <= idx < len(keys):
            return keys[idx]
        return None
312 |
313 |
class NoWheelComboBox(QComboBox):
    '''
    Combo box that ignores mouse wheel events.

    Inside a table cell the wheel would otherwise be contested between
    scrolling the table and changing the combo value; subclass this to keep
    the wheel from changing the selection.
    '''
    def wheelEvent(self, event):
        # Mark the event as not handled instead of acting on it.
        event.ignore()
322 |
323 |
class ReadOnlyLineEdit(QLineEdit):
    '''
    Line edit created in the disabled state, showing the given text.
    '''

    def __init__(self, text, parent):
        # A None text is shown as an empty field.
        shown = '' if text is None else text
        super(ReadOnlyLineEdit, self).__init__(shown, parent)
        self.setEnabled(False)
331 |
--------------------------------------------------------------------------------
/action.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals, division, absolute_import, print_function
2 |
3 | __license__ = 'GPL v3'
4 | __copyright__ = '2011, Grant Drake'
5 | __copyright__ = '2021, Caleb Rogers'
6 |
7 | from functools import partial
8 |
9 | try:
10 | from qt.core import QMenu, QToolButton, QApplication, QUrl, Qt
11 | except ImportError:
12 | from PyQt5.Qt import QMenu, QToolButton, QApplication, QUrl, Qt
13 |
14 | import json, os
15 | from datetime import datetime
16 | try:
17 | from calibre.utils.iso8601 import local_tz
18 | except ImportError:
19 | from calibre.utils.date import local_tz
20 |
21 | from calibre.debug import iswindows
22 | from calibre.gui2 import info_dialog, error_dialog, open_url, choose_save_file
23 | from calibre.gui2.actions import InterfaceAction
24 | from calibre.gui2.dialogs.confirm_delete import confirm
25 |
26 | from calibre_plugins.find_duplicates.common_icons import set_plugin_icon_resources, get_icon
27 | from calibre_plugins.find_duplicates.common_menus import unregister_menu_actions, create_menu_action_unique
28 | from calibre_plugins.find_duplicates.dialogs import (FindBookDuplicatesDialog, FindVariationsDialog,
29 | FindLibraryDuplicatesDialog, ManageExemptionsDialog)
30 | from calibre_plugins.find_duplicates.duplicates import DuplicateFinder, CrossLibraryDuplicateFinder
31 |
32 | try:
33 | load_translations()
34 | except NameError:
35 | pass
36 |
37 | HELP_URL = 'https://github.com/kiwidude68/calibre_plugins/wiki/Find-Duplicates'
38 |
39 | PLUGIN_ICONS = ['images/find_duplicates.png',
40 | 'images/next_result.png', 'images/previous_result.png']
41 |
42 | class FindDuplicatesAction(InterfaceAction):
43 |
44 | name = 'Find Duplicates'
45 | # Create our top-level menu/toolbar action (text, icon_path, tooltip, keyboard shortcut)
46 | action_spec = (_('Find Duplicates'), None, None, None)
47 | popup_type = QToolButton.MenuButtonPopup
48 | action_type = 'current'
49 |
50 | def genesis(self):
51 | self.menu = QMenu(self.gui)
52 |
53 | # Read the plugin icons and store for potential sharing with the config widget
54 | icon_resources = self.load_resources(PLUGIN_ICONS)
55 | set_plugin_icon_resources(self.name, icon_resources)
56 |
57 | self.rebuild_menus()
58 |
59 | # Assign our menu to this action and an icon
60 | self.qaction.setMenu(self.menu)
61 | self.qaction.setIcon(get_icon(PLUGIN_ICONS[0]))
62 | self.qaction.triggered.connect(self.toolbar_button_clicked)
63 | self.menu.aboutToShow.connect(self.about_to_show_menu)
64 |
65 | def initialization_complete(self):
66 | # Delay instantiating our finder as we require access to the library view
67 | self.duplicate_finder = DuplicateFinder(self.gui)
68 | self.has_advanced_results = False
69 | self.update_actions_enabled()
70 | self.gui.search.cleared.connect(self.user_has_cleared_search)
71 | self.gui.search_restriction.currentIndexChanged.connect(self.user_has_changed_restriction)
72 |
73 | def library_changed(self, db):
74 | # We need to reset our duplicate finder after switching libraries
75 | self.duplicate_finder = DuplicateFinder(self.gui)
76 | self.update_actions_enabled()
77 |
78 | def shutting_down(self):
79 | if self.duplicate_finder.is_showing_duplicate_exemptions() or self.duplicate_finder.has_results():
80 | self.duplicate_finder.clear_duplicates_mode()
81 |
82 | def rebuild_menus(self):
83 | # Ensure any keyboard shortcuts from previous display of plugin menu are cleared
84 | unregister_menu_actions(self)
85 |
86 | m = self.menu
87 | m.clear()
88 | create_menu_action_unique(self, m, _('&Find book duplicates')+'...', image=PLUGIN_ICONS[0],
89 | triggered=self.find_book_duplicates)
90 | create_menu_action_unique(self, m, _('Find library duplicates')+'...', image='library.png',
91 | tooltip=_('Find books that are duplicated in another library compared to this one'),
92 | triggered=self.find_library_duplicates)
93 | m.addSeparator()
94 | create_menu_action_unique(self, m, _('Find metadata &variations')+'...', image='user_profile.png',
95 | tooltip=_('Find & rename variations in author, publisher, series or tags names that may indicate duplicates'),
96 | triggered=self.find_variations)
97 | m.addSeparator()
98 | self.next_group_action = create_menu_action_unique(self, m, _('&Next result'), image='images/next_result.png',
99 | tooltip=_('Display the next duplicate result group'),
100 | triggered=partial(self.show_next_result, forward=True))
101 | self.previous_group_action = create_menu_action_unique(self, m, _('&Previous result'), image='images/previous_result.png',
102 | tooltip=_('Display the previous duplicate result group'),
103 | triggered=partial(self.show_next_result, forward=False))
104 | m.addSeparator()
105 | self.mark_group_exempt_action = create_menu_action_unique(self, m, _('&Mark current group as exempt'),
106 | tooltip=_('Mark the current group as not duplicates and exempt from future consideration'),
107 | triggered=partial(self.mark_groups_as_duplicate_exemptions, all_groups=False))
108 | self.mark_all_groups_exempt_action = create_menu_action_unique(self, m,
109 | _('Mark &all groups as exempt'),
110 | tooltip=_('Mark all remaining duplicate groups as exempt from future consideration'),
111 | triggered=partial(self.mark_groups_as_duplicate_exemptions, all_groups=True))
112 | m.addSeparator()
113 | self.show_book_exempt_action = create_menu_action_unique(self, m,
114 | _('&Show all book duplicate exemptions'),
115 | tooltip=_('Show all books that have book duplicate exemption pairings'),
116 | triggered=partial(self.show_all_exemptions, for_books=True))
117 | self.show_author_exempt_action = create_menu_action_unique(self, m,
118 | _('&Show all author duplicate exemptions'),
119 | tooltip=_('Show all books that have author duplicate exemption pairings'),
120 | triggered=partial(self.show_all_exemptions, for_books=False))
121 | self.manage_exemptions_action = create_menu_action_unique(self, m,
122 | _('&Manage exemptions for this book'),
123 | tooltip=_('Show duplicate exemptions for this book to enable removal'),
124 | triggered=self.manage_exemptions_for_book)
125 | self.remove_exemptions_action = create_menu_action_unique(self, m,
126 | _('&Remove selected exemptions'),
127 | tooltip=_('Remove any duplicate book/author exemptions for the selected books'),
128 | triggered=self.remove_from_duplicate_exemptions)
129 | m.addSeparator()
130 | self.clear_duplicate_mode_action = create_menu_action_unique(self, m,
131 | _('&Clear duplicate results'), image='clear_left.png',
132 | tooltip=_('Exit duplicate search mode'),
133 | triggered=self.clear_duplicate_results)
134 | m.addSeparator()
135 | self.export_duplicates_action = create_menu_action_unique(self, m,
136 | _('&Export duplicate groups'),
137 | tooltip=_('Export duplicates groups to a json file'),
138 | triggered=self.export_duplicates)
139 | self.merge_all_groups_action = create_menu_action_unique(self, m,
140 | _('&Merge all groups'),
141 | tooltip=_('Merge all the groups, showing confirmation box for each group. Back up your library first.'),
142 | triggered=partial(self.merge_all_groups))
143 | m.addSeparator()
144 |
145 | create_menu_action_unique(self, m, _('&Customize plugin')+'...', 'config.png',
146 | shortcut=False, triggered=self.show_configuration)
147 | create_menu_action_unique(self, m, _('&Help'), 'help.png',
148 | shortcut=False, triggered=self.show_help)
149 |
150 | self.gui.keyboard.finalize()
151 |
152 | def about_to_show_menu(self):
153 | self.update_actions_enabled()
154 | # As we are showing a menu we can refine the enabled state of the
155 | # actions that are based on the selected rows
156 | has_duplicate_exemptions = self.duplicate_finder.has_duplicate_exemptions()
157 | if has_duplicate_exemptions:
158 | book_ids = self.gui.library_view.get_selected_ids()
159 | remove_enabled = len(book_ids) > 0
160 | manage_enabled = len(book_ids) == 1
161 | if manage_enabled:
162 | manage_enabled = self.duplicate_finder.is_book_in_exemption(book_ids[0])
163 | for book_id in book_ids:
164 | if not self.duplicate_finder.is_book_in_exemption(book_id):
165 | remove_enabled = False
166 | break
167 | self.manage_exemptions_action.setEnabled(manage_enabled)
168 | self.remove_exemptions_action.setEnabled(remove_enabled)
169 |
170 | def update_actions_enabled(self):
171 | has_results = self.duplicate_finder.has_results()
172 | self.next_group_action.setEnabled(has_results)
173 | self.previous_group_action.setEnabled(has_results)
174 | self.mark_group_exempt_action.setEnabled(has_results)
175 | self.mark_all_groups_exempt_action.setEnabled(has_results)
176 | self.merge_all_groups_action.setEnabled(has_results)
177 | is_showing_exemptions = self.duplicate_finder.is_showing_duplicate_exemptions()
178 | self.clear_duplicate_mode_action.setEnabled(has_results or is_showing_exemptions or self.has_advanced_results)
179 | self.export_duplicates_action.setEnabled(has_results)
180 |
181 | # As some actions could be via shortcut keys we need them enabled
182 | # regardless of row selections
183 | has_duplicate_exemptions = self.duplicate_finder.has_duplicate_exemptions()
184 | self.show_book_exempt_action.setEnabled(self.duplicate_finder.has_book_exemptions())
185 | self.show_author_exempt_action.setEnabled(self.duplicate_finder.has_author_exemptions())
186 | self.manage_exemptions_action.setEnabled(has_duplicate_exemptions)
187 | self.remove_exemptions_action.setEnabled(has_duplicate_exemptions)
188 |
189 | def find_book_duplicates(self):
190 | d = FindBookDuplicatesDialog(self.gui)
191 | if d.exec_() == d.Accepted:
192 | self.duplicate_finder.run_book_duplicates_check()
193 | self.update_actions_enabled()
194 |
def find_library_duplicates(self):
    '''
    Leave any current duplicate mode (or clear the search), show the
    library duplicates dialog and, if accepted, run the cross library
    comparison.
    '''
    if not self.clear_duplicate_mode_action.isEnabled():
        self.gui.search.clear()
    else:
        self.clear_duplicate_results()
    dialog = FindLibraryDuplicatesDialog(self.gui)
    if dialog.exec_() != dialog.Accepted:
        return
    self.library_finder = CrossLibraryDuplicateFinder(self.gui)
    self.library_finder.run_library_duplicates_check()
    # Remember whether the finder is displaying results in the GUI
    self.has_advanced_results = self.library_finder.display_results
    self.update_actions_enabled()
206 |
def find_variations(self):
    '''
    Show the metadata variations dialog, then refresh the library view
    if the dialog reports that changes were made.
    '''
    # NOTE(review): block nesting below was reconstructed from a listing
    # without indentation -- confirm against the original file.
    if self.clear_duplicate_mode_action.isEnabled():
        self.clear_duplicate_results()
    selected_ids = self.gui.library_view.get_selected_ids()
    previous_search = self.gui.search.text()
    dialog = FindVariationsDialog(self.gui)
    dialog.exec_()
    if dialog.is_changed():
        # Signal the library view and tags panel to refresh.
        QApplication.setOverrideCursor(Qt.WaitCursor)
        try:
            self.gui.library_view.model().refresh()
        finally:
            QApplication.restoreOverrideCursor()
        # When the dialog is displaying books alongside itself, leave the
        # search alone: the user may have cancelled on purpose in order to
        # work on the rows that are currently visible
        if not dialog.is_showing_books():
            self.gui.search.set_search_string(previous_search)
            self.gui.library_view.select_rows(selected_ids)
        self.gui.tags_view.recount()
    if dialog.is_showing_books():
        self.gui.search.do_search()
230 |
def toolbar_button_clicked(self):
    '''
    Handle a click on the toolbar button: start a book duplicates search
    when there are no results yet, otherwise move to the next result.
    '''
    if not self.duplicate_finder.has_results():
        return self.find_book_duplicates()
    # Holding Control or Shift while clicking reverses the direction
    modifiers = QApplication.keyboardModifiers()
    reverse = modifiers & Qt.ControlModifier or modifiers & Qt.ShiftModifier
    self.show_next_result(False if reverse else True)
240 |
def show_next_result(self, forward=True):
    '''
    Ask the duplicate finder to show the next result (or the previous one
    when forward is False), then refresh the action states.
    '''
    finder = self.duplicate_finder
    finder.show_next_result(forward)
    self.update_actions_enabled()
244 |
def mark_groups_as_duplicate_exemptions(self, all_groups):
    '''
    Mark either the current duplicate group or all groups as exempt from
    appearing together in future searches, after asking for confirmation.

    all_groups - when true every group is marked, otherwise only the
                 current group is marked.
    '''
    can_exempt = self.duplicate_finder.check_can_mark_exemption(all_groups)
    if can_exempt:
        # Ensure that the selection is moved onto the current duplicate group
        duplicate_ids = self.duplicate_finder.get_current_duplicate_group_ids()
        self.gui.library_view.select_rows(duplicate_ids)
        exemption_type = 'books'
        if self.duplicate_finder.is_searching_for_authors():
            exemption_type = 'authors'
        dialog_name = 'find_duplicates_mark_all_groups' if all_groups else 'find_duplicates_mark_group'
        # NOTE(review): the separators around the two sentences were damaged
        # (a string literal was split across physical lines, which is a
        # syntax error). They are restored here as '<p>' paragraph markup --
        # confirm against the original file.
        message = ('<p>' + _(
            'This action will ensure that each of the {0} in the group '
            'are exempt from appearing together again in future.').format(exemption_type)
            + '<p>' + _('Are you sure you want to proceed?'))
        if not confirm(message, dialog_name, self.gui):
            return
        if all_groups:
            self.duplicate_finder.mark_groups_as_duplicate_exemptions()
        else:
            self.duplicate_finder.mark_current_group_as_duplicate_exemptions()
    else:
        info_dialog(self.gui, _('No duplicates in group'),
                    _('There are no duplicates remaining in this group.'),
                    show=True, show_copy_button=False)
    self.update_actions_enabled()
270 |
def show_all_exemptions(self, for_books=True):
    '''
    Ask the duplicate finder to show all exemptions, passing for_books
    through unchanged, then refresh the action states.
    '''
    finder = self.duplicate_finder
    finder.show_all_exemptions(for_books)
    self.update_actions_enabled()
274 |
def merge_all_groups(self):
    '''
    Ask the duplicate finder to merge all groups, then refresh the
    action states.
    '''
    finder = self.duplicate_finder
    finder.merge_all_groups()
    self.update_actions_enabled()
278 |
def manage_exemptions_for_book(self):
    '''
    Show the exemptions dialog for the current book and remove whichever
    book/author exemptions the user ticked in it.
    '''
    current_index = self.gui.library_view.currentIndex()
    if not current_index.isValid():
        return error_dialog(self.gui, _('Cannot manage exemptions'),
                            _('No book selected'), show=True)
    book_id = self.gui.library_view.model().id(current_index)
    exemptions = self.duplicate_finder.get_exemptions_for_book(book_id)
    book_exemptions, author_exemptions_map = exemptions
    if not (book_exemptions or author_exemptions_map):
        return info_dialog(self.gui, _('Cannot manage exemptions'),
                           _('This book has no duplicate exemptions'), show=True)

    dialog = ManageExemptionsDialog(self.gui, self.gui.current_db,
                                    book_id, book_exemptions, author_exemptions_map)
    dialog.exec_()
    if dialog.result() == dialog.Accepted:
        checked_book_ids = dialog.get_checked_book_ids()
        if checked_book_ids:
            self.duplicate_finder.remove_from_book_exemptions(
                checked_book_ids, from_book_id=book_id)
        checked_authors_map = dialog.get_checked_authors_map()
        if checked_authors_map:
            # Iterate over a copy of the items, one removal call per author
            for author, exempt_authors in list(checked_authors_map.items()):
                self.duplicate_finder.remove_from_author_exemptions(
                    authors=exempt_authors, from_author=author)

    self.update_actions_enabled()
305 |
def remove_from_duplicate_exemptions(self):
    '''
    Remove the book and author duplicate exemptions for the selected
    books, after asking the user for confirmation.
    '''
    book_ids = self.gui.library_view.get_selected_ids()
    if len(book_ids) < 1:
        return error_dialog(self.gui, _('Invalid selection'),
                            _('You must select at least one book.'), show=True)
    # NOTE(review): the separators around the two sentences were damaged
    # (a string literal was split across physical lines, which is a syntax
    # error). They are restored here as '<p>' paragraph markup -- confirm
    # against the original file.
    message = ('<p>' + _(
        'This action will remove any duplicate exemptions for your '
        'selection. This will allow them to potentially appear '
        'as duplicates together in a future duplicate search.')
        + '<p>' + _('Are you sure you want to proceed?'))
    if not confirm(message, 'find_duplicates_remove_exemption', self.gui):
        return
    self.duplicate_finder.remove_from_book_exemptions(book_ids)
    self.duplicate_finder.remove_from_author_exemptions(book_ids)
    self.update_actions_enabled()
321 |
def clear_duplicate_results(self, clear_search=True, reapply_restriction=True):
    '''
    Leave duplicate mode. Does nothing when the clear action is disabled.
    Both arguments are passed through to whichever finder holds results.
    '''
    if self.clear_duplicate_mode_action.isEnabled():
        if not self.has_advanced_results:
            self.duplicate_finder.clear_duplicates_mode(clear_search, reapply_restriction)
        else:
            # Results came from the cross library comparison
            self.library_finder.clear_gui_duplicates_mode(clear_search, reapply_restriction)
            self.has_advanced_results = False
        self.update_actions_enabled()
331 |
def user_has_cleared_search(self):
    '''
    Clear the duplicate results, leaving the search untouched, when
    advanced results are held or the finder says clearing is valid.
    '''
    if not self.has_advanced_results and not self.duplicate_finder.is_valid_to_clear_search():
        return
    self.clear_duplicate_results(clear_search=False)
335 |
def user_has_changed_restriction(self, idx):
    '''
    Clear the duplicate results, leaving both the search and the
    restriction untouched, when advanced results are held or the finder
    says clearing is valid. The idx argument is not used.
    '''
    if not self.has_advanced_results and not self.duplicate_finder.is_valid_to_clear_search():
        return
    self.clear_duplicate_results(clear_search=False, reapply_restriction=False)
339 |
def show_configuration(self):
    '''
    Open the configuration dialog for this plugin.
    '''
    plugin = self.interface_action_base_plugin
    plugin.do_user_config(self.gui)
342 |
def export_duplicates(self):
    '''
    Export all duplicate groups to a json file chosen by the user.
    Returns without writing anything if no file is chosen.
    '''
    finder = self.duplicate_finder
    finder._cleanup_deleted_books()

    json_path = choose_save_file(self.gui, 'export-duplicates', _('Choose file'), filters=[
        (_('Saved duplicates'), ['json'])], all_files=False)
    if not json_path:
        return
    # Make sure the chosen file name carries the json extension
    if not json_path.lower().endswith('.json'):
        json_path += '.json'
    if iswindows:
        json_path = os.path.normpath(json_path)

    # Only books that belong to more than one group are listed here
    entangled_books = dict(
        (book_id, list(groups))
        for book_id, groups in finder._groups_for_book_map.items()
        if len(groups) > 1)

    current_db = self.gui.current_db
    data = {
        'books_for_group': finder._books_for_group_map,
        'entangled_groups_for_book': entangled_books,
        'library_uuid': current_db.library_id,
        'library_path': current_db.library_path,
        'timestamp': datetime.now().replace(tzinfo=local_tz).isoformat()
    }

    with open(json_path, 'w') as json_file:
        json.dump(data, json_file, indent=4)

    info_dialog(self.gui, _('Export completed'),
                _('Exported to: {}').format(json_path),
                show=True, show_copy_button=False)
379 |
def show_help(self):
    '''
    Open the plugin help page (HELP_URL).
    '''
    help_url = QUrl(HELP_URL)
    open_url(help_url)
382 |
--------------------------------------------------------------------------------
/book_algorithms.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals, division, absolute_import, print_function
2 |
3 | __license__ = 'GPL v3'
4 | __copyright__ = '2011, Grant Drake'
5 |
6 | import time, traceback
7 | from collections import OrderedDict, defaultdict
8 |
9 | try:
10 | from qt.core import QModelIndex
11 | except ImportError:
12 | from PyQt5.Qt import QModelIndex
13 |
14 | from calibre import prints
15 | from calibre.constants import DEBUG
16 |
17 | from calibre_plugins.find_duplicates.matching import (authors_to_list, similar_title_match,
18 | get_author_algorithm_fn, get_title_algorithm_fn)
19 |
20 | try:
21 | load_translations()
22 | except NameError:
23 | pass
24 |
25 | DUPLICATE_SEARCH_FOR_BOOK = 'BOOK'
26 | DUPLICATE_SEARCH_FOR_AUTHOR = 'AUTHOR'
27 |
28 | # --------------------------------------------------------------
29 | # Find Duplicate Book Algorithm Classes
30 | # --------------------------------------------------------------
31 |
class AlgorithmBase(object):
    '''
    Base class that every duplicate search algorithm derives from.
    It drives the overall search and leaves the per-book matching
    (find_candidate) to the derived classes.
    '''
    def __init__(self, gui, db, exemptions_map):
        self.gui, self.db = gui, db
        self.model = gui.library_view.model()
        self._exemptions_map = exemptions_map

    def duplicate_search_mode(self):
        return DUPLICATE_SEARCH_FOR_BOOK

    def run_duplicate_check(self, sort_groups_by_title=True, include_languages=False):
        '''
        Run the algorithm from start to finish.
        Returns a tuple of (books_for_groups_map, groups_for_book_map)
        '''
        book_ids = self.get_book_ids_to_consider()
        started_at = time.time()

        # Build the map of potential duplicate candidates
        status_text = _('Analysing {0} books for duplicates').format(len(book_ids))
        self.gui.status_bar.showMessage(status_text)
        candidates_map = self.find_candidates(book_ids, include_languages)

        # Quick pass to drop every group with fewer than two members
        self.shrink_candidates_map(candidates_map)

        # Order the candidate groups so that the numbered groups come out
        # in a consistent order
        candidates_map = self.sort_candidate_groups(candidates_map, sort_groups_by_title)

        # Turn the candidates into the final groups of more than one book
        group_maps = self.convert_candidates_to_groups(candidates_map)
        books_for_groups_map, groups_for_book_map = group_maps
        if DEBUG:
            prints('Completed duplicate analysis in:', time.time() - started_at)
            prints('Found %d duplicate groups covering %d books'%(len(books_for_groups_map),
                                                                  len(groups_for_book_map)))
        return books_for_groups_map, groups_for_book_map

    def get_book_ids_to_consider(self):
        '''
        Default implementation returns the id of every row currently in
        the library view model
        '''
        row_count = self.model.rowCount(QModelIndex())
        return [self.model.id(row) for row in range(row_count)]

    def find_candidates(self, book_ids, include_languages=False):
        '''
        Default implementation calls find_candidate for each book id in
        turn and returns the resulting dictionary of candidates.
        '''
        candidates_map = defaultdict(set)
        for book_id in book_ids:
            self.find_candidate(book_id, candidates_map, include_languages)
        return candidates_map

    def find_candidate(self, book_id, candidates_map, include_languages=False):
        '''
        Derived classes must provide an implementation
        '''
        pass

    def shrink_candidates_map(self, candidates_map):
        '''
        Remove (in place) every group that has fewer than two members
        '''
        undersized = [key for key, members in candidates_map.items()
                      if len(members) < 2]
        for key in undersized:
            del candidates_map[key]

    def convert_candidates_to_groups(self, candidates_map):
        '''
        Given a dictionary keyed by some sort of common duplicate group
        key (like a fuzzy of title/author), drop the groups with fewer
        than two members, repartition as required for any duplicate
        exemptions and return a tuple of:
          (books_for_group_map, groups_for_book_map)
        books_for_group_map - for each group id, contains a list of book ids
        groups_for_book_map - for each book id, contains a set of group ids
        '''
        books_for_group_map = {}
        groups_for_book_map = defaultdict(set)
        # clean_dup_groups gives a list of sets with subset groups removed
        for candidate_ids in self.clean_dup_groups(candidates_map):
            for partition in self.partition_using_exemptions(candidate_ids):
                if len(partition) < 2:
                    continue
                # Group ids are handed out sequentially starting from 1
                group_id = len(books_for_group_map) + 1
                member_ids = self.get_book_ids_for_candidate_group(partition)
                books_for_group_map[group_id] = member_ids
                for book_id in member_ids:
                    groups_for_book_map[book_id].add(group_id)
        return books_for_group_map, groups_for_book_map

    def clean_dup_groups(self, candidates_map):
        '''
        Given a dictionary of sets, return a list of sets (smallest first)
        leaving out any set that is a subset of a later set in that order.
        '''
        by_size = sorted((set(members) for members in candidates_map.values()), key=len)
        candidates_list = []
        for position, group in enumerate(by_size):
            later_groups = by_size[position+1:]
            if not any(group.issubset(other) for other in later_groups):
                candidates_list.append(group)
        return candidates_list

    def get_book_ids_for_candidate_group(self, candidate_group):
        '''
        Return the book ids representing this candidate group
        Default implementation is handed book ids so returns them as is
        '''
        return candidate_group

    def sort_candidate_groups(self, candidates_map, by_title=True):
        '''
        Responsible for returning an ordered dict of how to order the groups
        Default implementation sorts by the candidate key, or by group
        size (largest first) when by_title is false
        '''
        if by_title:
            ordered_keys = sorted(candidates_map.keys())
        else:
            def size_then_key(ckey):
                return '%04d%s' % (len(candidates_map[ckey]), ckey)
            ordered_keys = sorted(candidates_map.keys(), key=size_then_key, reverse=True)
        return OrderedDict((key, candidates_map[key]) for key in ordered_keys)

    def partition_using_exemptions(self, data_items):
        '''
        Given a set of data items, check whether any combination of them
        has been marked as not duplicates of each other. Items that must
        not appear together are split into separate groups. Returns a
        sorted list where each entry is the sorted list of data items for
        one partitioned group of more than one item.
        '''
        ordered_items = sorted(data_items)
        # Start with a single partition holding every item
        partitions = [set(ordered_items)]
        split_origins = [None]
        for item in ordered_items:
            if item not in self._exemptions_map:
                continue
            exempt_set = self._exemptions_map.merge_sets(item)
            # Partitions appended below are visited by this same loop
            for idx, group in enumerate(partitions):
                if item not in group:
                    continue
                # Either way this partition keeps the item and loses the
                # items it is exempt from
                partitions[idx] = (group - exempt_set) | {item}
                # If this item is the one that caused this partition to be
                # split off, partitioning again would only create subsets
                # of the partition it was split from. Consider (1,2,3,4)
                # with non-dups [(1,2), (2,3)]: the first split gives
                # (1,3,4) and (2,3,4). Splitting (2,3,4) again for (2,3)
                # would give (2,4) and (3,4), but (3,4) is a subset of
                # (1,3,4), so removing 3 from (2,3,4) is all that is needed.
                if item == split_origins[idx]:
                    continue
                for other in exempt_set:
                    # Only split for items larger than the current one. The
                    # non-dup map is complete, map[2] == (2,3) and
                    # map[3] == (2,3), so by the time 3 is processed the
                    # work for 2 has already been done.
                    if other > item and other in group:
                        partitions.append((group - exempt_set - {item}) | {other})
                        split_origins.append(other)
        return sorted(sorted(group) for group in partitions if len(group) > 1)
210 |
211 |
class IdentifierAlgorithm(AlgorithmBase):
    '''
    This algorithm simply finds books that have duplicate identifier values
    '''
    def __init__(self, gui, db, exemptions_map, identifier_type='isbn'):
        AlgorithmBase.__init__(self, gui, db, exemptions_map)
        self.identifier_type = identifier_type

    def get_book_ids_to_consider(self):
        '''
        Override base function as we will only consider books that have an identifier
        rather than every book in the library.
        '''
        return self.db.data.search_getting_ids('identifier:'+self.identifier_type+':True', self.db.data.search_restriction)

    def find_candidate(self, book_id, candidates_map, include_languages=False):
        '''
        Add the book to the group keyed by its identifier value. Books
        without a value for this identifier type are ignored.
        '''
        identifiers = self.db.get_identifiers(book_id, index_is_id=True)
        identifier = identifiers.get(self.identifier_type, '')
        if identifier:
            candidates_map[identifier].add(book_id)

    def sort_candidate_groups(self, candidates_map, by_title=True):
        '''
        Responsible for returning an ordered dict of how to order the groups
        Override to just do a fuzzy title sort to give a better sort than by identifier
        '''
        if by_title:
            # The fuzzy titles are only needed for this ordering, so they
            # are not computed when sorting by group size.
            title_map = {}
            for key in list(candidates_map.keys()):
                # Any one book of the group supplies the title to sort on
                book_id = list(candidates_map[key])[0]
                title_map[key] = similar_title_match(self.db.title(book_id, index_is_id=True))
            skeys = sorted(list(candidates_map.keys()), key=lambda identifier: title_map[identifier])
        else:
            # Largest groups first, identifier as the tie breaker
            skeys = sorted(list(candidates_map.keys()),
                           key=lambda identifier: '%04d%s' % (len(candidates_map[identifier]), identifier),
                           reverse=True)
        return OrderedDict([(identifier, candidates_map[identifier]) for identifier in skeys])
249 |
250 |
class BinaryCompareAlgorithm(IdentifierAlgorithm):
    '''
    This algorithm simply finds books that have binary duplicates of their format files
    Inheriting from IdentifierAlgorithm only to reuse the sort_candidate_groups override
    '''
    def get_book_ids_to_consider(self):
        '''
        Override base function as we will only consider books that have a format
        rather than every book in the library.
        '''
        return self.db.data.search_getting_ids('formats:True', self.db.data.search_restriction)

    def find_candidates(self, book_ids, include_languages=False):
        '''
        Override the default implementation so we can do multiple passes as a more
        efficient approach to finding binary duplicates.
        '''
        # Our first pass will be to find all books that have an identical file size
        candidates_size_map = defaultdict(set)
        formats_count = 0
        for book_id in book_ids:
            formats_count += self._find_candidate_by_file_size(book_id, candidates_size_map)

        # Perform a quick pass through removing all groups with < 2 members
        self.shrink_candidates_map(candidates_size_map)
        if DEBUG:
            prints('Pass 1: %d formats created %d size collisions' % (formats_count, len(candidates_size_map)))

        # Our final pass is to build our result set for this function
        candidates_map = defaultdict(set)
        hash_map = self.db.get_all_custom_book_data('find_duplicates', default={})
        result_hash_map = {}
        for size, size_group in list(candidates_size_map.items()):
            for book_id, fmt, mtime in size_group:
                self._find_candidate_by_hash(book_id, fmt, mtime, size, candidates_map, hash_map, result_hash_map)
        # Persist the hashes used in this run for future repeat scanning
        self.db.add_multiple_custom_book_data('find_duplicates', result_hash_map)
        return candidates_map

    def _find_candidate_by_file_size(self, book_id, candidates_map):
        '''
        Add a (book_id, fmt, mtime) entry keyed by file size for each
        format of the book. Returns the number of formats added.
        '''
        formats = self.db.formats(book_id, index_is_id=True, verify_formats=False)
        if not formats:
            # Nothing to split; avoids an AttributeError should no formats
            # value be returned for this book
            return 0
        count = 0
        for fmt in formats.split(','):
            try:
                stat_metadata = self.db.format_metadata(book_id, fmt)
                mtime = stat_metadata['mtime']
                size = stat_metadata['size']
                candidates_map[size].add((book_id, fmt, mtime))
                count += 1
            except Exception:
                # Best effort: report the problem and carry on with the rest
                traceback.print_exc()
        return count

    def _add_to_hash_map(self, hash_map, book_id, fmt, book_data):
        '''
        Store book_data under hash_map[book_id][fmt]
        '''
        if book_id not in hash_map:
            hash_map[book_id] = {}
        hash_map[book_id][fmt] = book_data

    def _find_candidate_by_hash(self, book_id, fmt, mtime, size, candidates_map, hash_map, result_hash_map):
        '''
        Add the book to the group keyed by (hash, size) of this format,
        reusing the hash stored by a previous run when the file
        modification time is unchanged.
        '''
        # Work out whether we need to calculate a hash for this file from
        # book plugin data from a previous run
        book_data = hash_map.get(book_id, {}).get(fmt, {})
        if book_data.get('mtime', None) == mtime:
            sha = book_data.get('sha', None)
            # Kept in its own name so that an incomplete stored entry cannot
            # replace the real file size used by the fallback below
            cached_size = book_data.get('size', None)
            if sha and cached_size is not None:
                candidates_map[(sha, cached_size)].add(book_id)
                self._add_to_hash_map(result_hash_map, book_id, fmt, book_data)
                return
        try:
            format_hash = self.db.format_hash(book_id, fmt)
            hash_key = (format_hash, size)
            candidates_map[hash_key].add(book_id)
            # Store our plugin book data for future repeat scanning
            book_data['mtime'] = mtime
            book_data['sha'] = format_hash
            book_data['size'] = size
            self._add_to_hash_map(result_hash_map, book_id, fmt, book_data)
        except Exception:
            # Best effort: report the problem and skip this format
            traceback.print_exc()
331 |
class TitleAuthorAlgorithm(AlgorithmBase):
    '''
    Algorithm for every permutation that evaluates the book title,
    optionally combined with an evaluation of the authors
    '''
    def __init__(self, gui, db, book_exemptions_map, title_eval, author_eval):
        AlgorithmBase.__init__(self, gui, db, exemptions_map=book_exemptions_map)
        self._title_eval, self._author_eval = title_eval, author_eval

    def find_candidate(self, book_id, candidates_map, include_languages=False):
        '''
        Add the book under a key built from its title hash, plus the
        author hash when authors are being evaluated.
        '''
        lang = self.db.languages(book_id, index_is_id=True) if include_languages else None
        title = self.db.title(book_id, index_is_id=True)
        title_hash = self._title_eval(title, lang)
        authors = authors_to_list(self.db, book_id) if self._author_eval else None
        if not authors:
            # No author evaluation, or no authors: group on the title alone
            candidates_map[title_hash].add(book_id)
            return
        for author in authors:
            author_hash, rev_author_hash = self._author_eval(author)
            candidates_map[title_hash+author_hash].add(book_id)
            # Also register the reversed form of the name when it differs
            if rev_author_hash and rev_author_hash != author_hash:
                candidates_map[title_hash+rev_author_hash].add(book_id)
357 |
358 |
class AuthorOnlyAlgorithm(AlgorithmBase):
    '''
    Algorithm for every permutation that evaluates the authors
    without taking the book titles into account.
    '''
    def __init__(self, gui, db, author_exemptions_map, author_eval):
        AlgorithmBase.__init__(self, gui, db, exemptions_map=author_exemptions_map)
        self._author_eval = author_eval
        # Book ids seen for each author, filled in by find_author_candidate
        self.author_bookids_map = defaultdict(set)

    def duplicate_search_mode(self):
        return DUPLICATE_SEARCH_FOR_AUTHOR

    def find_candidate(self, book_id, candidates_map, include_languages=False):
        '''
        Differs from the base implementation in several ways:
         - The candidates map holds authors per key, not book ids
         - The exclusions are per author rather than per book
        A book with no authors is not considered.
        '''
        for author in (authors_to_list(self.db, book_id) or ()):
            self.find_author_candidate(author, candidates_map, book_id)

    def find_author_candidate(self, author, candidates_map, book_id=None):
        '''
        Kept as a separate method (with the book id optional) so that the
        cross library duplicates comparison logic can re-use it
        '''
        hashes = self._author_eval(author)
        author_hash, rev_author_hash = hashes
        if book_id:
            self.author_bookids_map[author].add(book_id)
        candidates_map[author_hash].add(author)
        # Also register the reversed form of the name when it differs
        if rev_author_hash and rev_author_hash != author_hash:
            candidates_map[rev_author_hash].add(author)

    def get_book_ids_for_candidate_group(self, candidate_group):
        '''
        The candidate group holds authors here, so return the sorted
        book ids recorded for all of those authors
        '''
        collected_ids = set()
        for author in candidate_group:
            collected_ids.update(self.author_bookids_map[author])
        return sorted(collected_ids)
406 |
407 |
408 | # --------------------------------------------------------------
409 | # Find Duplicates Book Algorithm Factory
410 | # --------------------------------------------------------------
411 |
412 |
def create_algorithm(gui, db, search_type, identifier_type, title_match, author_match, bex_map, aex_map):
    '''
    Our factory responsible for returning the appropriate algorithm
    based on the permutation of title/author matching desired.
    Returns a tuple of the algorithm and a summary description

    bex_map - exemptions map handed to the book based algorithms
    aex_map - exemptions map handed to the author only algorithm
    '''
    if search_type == 'identifier':
        # Identifier types longer than 50 characters are shortened to 47
        # characters plus an ellipsis for display. The original comparison
        # was written as '<+ 50' (i.e. '< +50'), which also shortened a
        # name of exactly 50 characters to no benefit.
        display_identifier = identifier_type if len(identifier_type) <= 50 else identifier_type[0:47]+'...'
        return IdentifierAlgorithm(gui, db, bex_map, identifier_type), \
                    _("matching '{0}' identifier").format(display_identifier)
    elif search_type == 'binary':
        return BinaryCompareAlgorithm(gui, db, bex_map), \
                    _('binary compare')
    else:
        author_fn = get_author_algorithm_fn(author_match)
        if title_match == 'ignore':
            return AuthorOnlyAlgorithm(gui, db, aex_map, author_fn), \
                    _('ignore title, {0} author').format(author_match)
        else:
            title_fn = get_title_algorithm_fn(title_match)
            return TitleAuthorAlgorithm(gui, db, bex_map, title_fn, author_fn), \
                    _('{0} title, {1} author').format(title_match, author_match)
435 |
--------------------------------------------------------------------------------
/translations/find-duplicates.pot:
--------------------------------------------------------------------------------
1 | # SOME DESCRIPTIVE TITLE.
2 | # Copyright (C) YEAR ORGANIZATION
3 | # FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
4 | #
5 | msgid ""
6 | msgstr ""
7 | "Project-Id-Version: PACKAGE VERSION\n"
8 | "POT-Creation-Date: 2022-10-18 19:16+0100\n"
9 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
10 | "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
11 | "Language-Team: LANGUAGE <LL@li.org>\n"
12 | "MIME-Version: 1.0\n"
13 | "Content-Type: text/plain; charset=cp1252\n"
14 | "Content-Transfer-Encoding: 8bit\n"
15 | "Generated-By: pygettext.py 1.5\n"
16 |
17 |
18 | #: ..\common\common_dialogs.py:84 config.py:141
19 | msgid "Keyboard shortcuts"
20 | msgstr ""
21 |
22 | #: ..\common\common_dialogs.py:112
23 | msgid "Restart calibre now"
24 | msgstr ""
25 |
26 | #: ..\common\common_dialogs.py:128
27 | msgid "Preferences for:"
28 | msgstr ""
29 |
30 | #: ..\common\common_dialogs.py:160
31 | msgid "Clear"
32 | msgstr ""
33 |
34 | #: ..\common\common_dialogs.py:162
35 | msgid "Clear all settings for this plugin"
36 | msgstr ""
37 |
38 | #: ..\common\common_dialogs.py:189
39 | msgid "Are you sure you want to change your settings in this library for this plugin?"
40 | msgstr ""
41 |
42 | #: ..\common\common_dialogs.py:190 ..\common\common_dialogs.py:210
43 | msgid "Any settings in other libraries or stored in a JSON file in your calibre plugins folder will not be touched."
44 | msgstr ""
45 |
46 | #: ..\common\common_dialogs.py:192 ..\common\common_dialogs.py:212
47 | msgid "You must restart calibre afterwards."
48 | msgstr ""
49 |
50 | #: ..\common\common_dialogs.py:200
51 | msgid "Settings changed"
52 | msgstr ""
53 |
54 | #: ..\common\common_dialogs.py:201
55 | msgid "Settings for this plugin in this library have been changed."
56 | msgstr ""
57 |
58 | #: ..\common\common_dialogs.py:202 ..\common\common_dialogs.py:223
59 | msgid "Please restart calibre now."
60 | msgstr ""
61 |
62 | #: ..\common\common_dialogs.py:209
63 | msgid "Are you sure you want to clear your settings in this library for this plugin?"
64 | msgstr ""
65 |
66 | #: ..\common\common_dialogs.py:221
67 | msgid "Settings deleted"
68 | msgstr ""
69 |
70 | #: ..\common\common_dialogs.py:222
71 | msgid "All settings for this plugin in this library have been cleared."
72 | msgstr ""
73 |
74 | #: ..\common\common_dialogs.py:297
75 | msgid "Copy to clipboard"
76 | msgstr ""
77 |
78 | #: ..\common\common_widgets.py:93
79 | msgid "Undefined"
80 | msgstr ""
81 |
82 | #: ..\common\common_widgets.py:148
83 | msgid "Restart required"
84 | msgstr ""
85 |
86 | #: ..\common\common_widgets.py:149
87 | msgid "Title image not found - you must restart Calibre before using this plugin!"
88 | msgstr ""
89 |
90 | #: ..\common\common_widgets.py:222
91 | msgid "Create new column"
92 | msgstr ""
93 |
94 | #: action.py:45 dialogs.py:175 dialogs.py:1077 duplicates.py:222
95 | msgid "Find Duplicates"
96 | msgstr ""
97 |
98 | #: action.py:87
99 | msgid "&Find book duplicates"
100 | msgstr ""
101 |
102 | #: action.py:89
103 | msgid "Find library duplicates"
104 | msgstr ""
105 |
106 | #: action.py:90
107 | msgid "Find books that are duplicated in another library compared to this one"
108 | msgstr ""
109 |
110 | #: action.py:93
111 | msgid "Find metadata &variations"
112 | msgstr ""
113 |
114 | #: action.py:94
115 | msgid "Find & rename variations in author, publisher, series or tags names that may indicate duplicates"
116 | msgstr ""
117 |
118 | #: action.py:97
119 | msgid "&Next result"
120 | msgstr ""
121 |
122 | #: action.py:98
123 | msgid "Display the next duplicate result group"
124 | msgstr ""
125 |
126 | #: action.py:100
127 | msgid "&Previous result"
128 | msgstr ""
129 |
130 | #: action.py:101
131 | msgid "Display the previous duplicate result group"
132 | msgstr ""
133 |
134 | #: action.py:104
135 | msgid "&Mark current group as exempt"
136 | msgstr ""
137 |
138 | #: action.py:105
139 | msgid "Mark the current group as not duplicates and exempt from future consideration"
140 | msgstr ""
141 |
142 | #: action.py:108
143 | msgid "Mark &all groups as exempt"
144 | msgstr ""
145 |
146 | #: action.py:109
147 | msgid "Mark all remaining duplicate groups as exempt from future consideration"
148 | msgstr ""
149 |
150 | #: action.py:113
151 | msgid "&Show all book duplicate exemptions"
152 | msgstr ""
153 |
154 | #: action.py:114
155 | msgid "Show all books that have book duplicate exemption pairings"
156 | msgstr ""
157 |
158 | #: action.py:117
159 | msgid "&Show all author duplicate exemptions"
160 | msgstr ""
161 |
162 | #: action.py:118
163 | msgid "Show all books that have author duplicate exemption pairings"
164 | msgstr ""
165 |
166 | #: action.py:121
167 | msgid "&Manage exemptions for this book"
168 | msgstr ""
169 |
170 | #: action.py:122
171 | msgid "Show duplicate exemptions for this book to enable removal"
172 | msgstr ""
173 |
174 | #: action.py:125
175 | msgid "&Remove selected exemptions"
176 | msgstr ""
177 |
178 | #: action.py:126
179 | msgid "Remove any duplicate book/author exemptions for the selected books"
180 | msgstr ""
181 |
182 | #: action.py:130
183 | msgid "&Clear duplicate results"
184 | msgstr ""
185 |
186 | #: action.py:131
187 | msgid "Exit duplicate search mode"
188 | msgstr ""
189 |
190 | #: action.py:135
191 | msgid "&Export duplicate groups"
192 | msgstr ""
193 |
194 | #: action.py:136
195 | msgid "Export duplicates groups to a json file"
196 | msgstr ""
197 |
198 | #: action.py:140
199 | msgid "&Customize plugin"
200 | msgstr ""
201 |
202 | #: action.py:142
203 | msgid "&Help"
204 | msgstr ""
205 |
206 | #: action.py:248
207 | msgid "This action will ensure that each of the {0} in the group are exempt from appearing together again in future."
208 | msgstr ""
209 |
210 | #: action.py:251 action.py:304
211 | msgid "Are you sure you want to proceed?"
212 | msgstr ""
213 |
214 | #: action.py:259
215 | msgid "No duplicates in group"
216 | msgstr ""
217 |
218 | #: action.py:260
219 | msgid "There are no duplicates remaining in this group."
220 | msgstr ""
221 |
222 | #: action.py:271 action.py:276
223 | msgid "Cannot manage exemptions"
224 | msgstr ""
225 |
226 | #: action.py:272
227 | msgid "No book selected"
228 | msgstr ""
229 |
230 | #: action.py:277
231 | msgid "This book has no duplicate exemptions"
232 | msgstr ""
233 |
234 | #: action.py:298
235 | msgid "Invalid selection"
236 | msgstr ""
237 |
238 | #: action.py:299
239 | msgid "You must select at least one book."
240 | msgstr ""
241 |
242 | #: action.py:300
243 | msgid "This action will remove any duplicate exemptions for your selection. This will allow them to potentially appear as duplicates together in a future duplicate search."
244 | msgstr ""
245 |
246 | #: action.py:338
247 | msgid "Choose file"
248 | msgstr ""
249 |
250 | #: action.py:339
251 | msgid "Saved duplicates"
252 | msgstr ""
253 |
254 | #: action.py:365
255 | msgid "Export completed"
256 | msgstr ""
257 |
258 | #: action.py:366
259 | msgid "Exported to: {}"
260 | msgstr ""
261 |
262 | #: book_algorithms.py:53
263 | msgid "Analysing {0} books for duplicates"
264 | msgstr ""
265 |
266 | #: book_algorithms.py:422
267 | msgid "matching '{0}' identifier"
268 | msgstr ""
269 |
270 | #: book_algorithms.py:425
271 | msgid "binary compare"
272 | msgstr ""
273 |
274 | #: book_algorithms.py:430
275 | msgid "ignore title, {0} author"
276 | msgstr ""
277 |
278 | #: book_algorithms.py:434
279 | msgid "{0} title, {1} author"
280 | msgstr ""
281 |
282 | #: config.py:143
283 | msgid "Edit the keyboard shortcuts associated with this plugin"
284 | msgstr ""
285 |
286 | #: config.py:147
287 | msgid "Reset &confirmation dialogs"
288 | msgstr ""
289 |
290 | #: config.py:148
291 | msgid "Reset all show me again dialogs for the Find Duplicates plugin"
292 | msgstr ""
293 |
294 | #: config.py:152
295 | msgid "&View library preferences"
296 | msgstr ""
297 |
298 | #: config.py:153
299 | msgid "View data stored in the library database for this plugin"
300 | msgstr ""
301 |
302 | #: config.py:169
303 | msgid "Done"
304 | msgstr ""
305 |
306 | #: config.py:170
307 | msgid "Confirmation dialogs have all been reset"
308 | msgstr ""
309 |
310 | #: dialogs.py:51
311 | msgid "Book duplicate search
- Find groups of books which have an identical identifier such as an ISBN, amazon id, goodreads, uri etc.
- Marking a group as exempt will prevent those specific books from appearing together in future duplicate book searches."
312 | msgstr ""
313 |
314 | #: dialogs.py:57
315 | msgid "Book duplicate search
- Find groups of books which have a book format that is binary identical.
- Compares the actual file size of every book format in your library, computing an SHA hash to compare contents where sizes match.
- Books found using this search are guaranteed to be duplicates.
- Marking a group as exempt will prevent those specific books from appearing together in future duplicate book searches."
316 | msgstr ""
317 |
318 | #: dialogs.py:66
319 | msgid "Title duplicate search
- Find groups of books with an identical title and {0}
- Titles must match exactly excluding case.
- Marking a group as exempt will prevent those specific books from appearing together in future duplicate book searches."
320 | msgstr ""
321 |
322 | #: dialogs.py:71
323 | msgid "Title duplicate search
- Find groups of books with a similar title and {0}
- Similar title matches apply removal of common punctuation and prefixes and applies the same title matching logic as Automerge.
- Marking a group as exempt will prevent those specific books from appearing together in future duplicate book searches."
324 | msgstr ""
325 |
326 | #: dialogs.py:77
327 | msgid "Title duplicate search
- Find groups of books with a soundex title and {0}
- Soundex title matches are based on the same removal of punctuation and common prefixes as a similar title search.
- Marking a group as exempt will prevent those specific books from appearing together in future duplicate book searches."
328 | msgstr ""
329 |
330 | #: dialogs.py:83
331 | msgid "Title duplicate search
- Find groups of books with a fuzzy title and {0}
- Fuzzy title matches remove all punctuation, subtitles and any words after 'and', 'or' or 'aka' in the title.
- Marking a group as exempt will prevent those specific books from appearing together in future duplicate book searches."
332 | msgstr ""
333 |
334 | #: dialogs.py:89
335 | msgid "Author duplicate search
- Find groups of books ignoring title with {0}
- Ignore title searches are best to find variations of author names regardless of the books you have for each.
- Marking a group as exempt will prevent any books by those authors from appearing together in future duplicate author searches."
336 | msgstr ""
337 |
338 | #: dialogs.py:98
339 | msgid "an identical author.
- Authors must match exactly excluding case."
340 | msgstr ""
341 |
342 | #: dialogs.py:100
343 | msgid "a similar author.
- Similar authors differ only in punctuation, initials or order of their names."
344 | msgstr ""
345 |
346 | #: dialogs.py:103
347 | msgid "a soundex author.
- Soundex author matches start with the same removal of punctuation and ordering as a similar author search."
348 | msgstr ""
349 |
350 | #: dialogs.py:106
351 | msgid "a fuzzy match author.
- Fuzzy author matches compare using their surnames and only the first initial."
352 | msgstr ""
353 |
354 | #: dialogs.py:109
355 | msgid "ignoring the author."
356 | msgstr ""
357 |
358 | #: dialogs.py:178
359 | msgid "Duplicate Search Options"
360 | msgstr ""
361 |
362 | #: dialogs.py:182
363 | msgid "Duplicate Search Type"
364 | msgstr ""
365 |
366 | #: dialogs.py:188 dialogs.py:1104
367 | msgid "Binary Compare"
368 | msgstr ""
369 |
370 | #: dialogs.py:188 dialogs.py:1104
371 | msgid "Identifier"
372 | msgstr ""
373 |
374 | #: dialogs.py:188 dialogs.py:1104
375 | msgid "Title/Author"
376 | msgstr ""
377 |
378 | #: dialogs.py:203
379 | msgid "Title Matching"
380 | msgstr ""
381 |
382 | #: dialogs.py:215 dialogs.py:236 dialogs.py:686 dialogs.py:1131
383 | #: dialogs.py:1152
384 | msgid "Length:"
385 | msgstr ""
386 |
387 | #: dialogs.py:216 dialogs.py:687 dialogs.py:1132
388 | msgid ""
389 | "The shorter the soundex length, the greater likelihood of false positives.\n"
390 | "Large soundex values reduce your chances of matches"
391 | msgstr ""
392 |
393 | #: dialogs.py:224
394 | msgid "Author Matching"
395 | msgstr ""
396 |
397 | #: dialogs.py:249
398 | msgid "Result Options"
399 | msgstr ""
400 |
401 | #: dialogs.py:253
402 | msgid "Show all groups at once with highlighting"
403 | msgstr ""
404 |
405 | #: dialogs.py:254
406 | msgid "Show one group at a time"
407 | msgstr ""
408 |
409 | #: dialogs.py:257
410 | msgid "Highlight authors in the tag browser for ignore title searches"
411 | msgstr ""
412 |
413 | #: dialogs.py:258
414 | msgid ""
415 | "When checked, will ensure that the authors for the current group\n"
416 | "are shown in the tag browser and highlighted if multiple groups shown.\n"
417 | "Only applies for author duplicate searches."
418 | msgstr ""
419 |
420 | #: dialogs.py:262
421 | msgid "Sort groups by number of duplicates"
422 | msgstr ""
423 |
424 | #: dialogs.py:263
425 | msgid ""
426 | "When unchecked, will sort by an approximation of the title\n"
427 | "or by author if title is being ignored"
428 | msgstr ""
429 |
430 | #: dialogs.py:266 dialogs.py:1169
431 | msgid "Include languages metadata when comparing titles"
432 | msgstr ""
433 |
434 | #: dialogs.py:267 dialogs.py:1170
435 | msgid ""
436 | "When checked, books with identical titles but different\n"
437 | "languages metadata field values will not show as duplicates"
438 | msgstr ""
439 |
440 | #: dialogs.py:270
441 | msgid "When doing a Binary Compare, automatically remove duplicate formats"
442 | msgstr ""
443 |
444 | #: dialogs.py:272
445 | msgid ""
446 | "When checked and the Binary duplicate search is run, if duplicate formats are found\n"
447 | "then all except one are deleted. The format on the oldest book record will be kept.\n"
448 | "This is a convenience function for where you have multiple formats associated with\n"
449 | "each book and hence it is not readily obvious which of these is the duplicate.\n"
450 | "Note that the book records themselves are not deleted, and will still appear in the\n"
451 | "results for merging even if they now have no formats."
452 | msgstr ""
453 |
454 | #: dialogs.py:370
455 | msgid "Invalid Criteria"
456 | msgstr ""
457 |
458 | #: dialogs.py:371
459 | msgid "You must select an identifier type to search by Identifier."
460 | msgstr ""
461 |
462 | #: dialogs.py:504
463 | msgid "Manage Duplicate Exemptions"
464 | msgstr ""
465 |
466 | #: dialogs.py:507
467 | msgid "Manage Exemptions"
468 | msgstr ""
469 |
470 | #: dialogs.py:513
471 | msgid "The first book below will never appear as a duplicate with the following books.
To allow future duplicate consideration, tick the remove checkbox and click ok."
472 | msgstr ""
473 |
474 | #: dialogs.py:528
475 | msgid "The authors below will never appear as a duplicate with the following authors.
To allow future duplicate consideration, tick the remove checkbox and click ok."
476 | msgstr ""
477 |
478 | #: dialogs.py:645
479 | msgid "Find Duplicates Plugin"
480 | msgstr ""
481 |
482 | #: dialogs.py:649
483 | msgid "Find Metadata Variations"
484 | msgstr ""
485 |
486 | #: dialogs.py:653
487 | msgid "Choose metadata column:"
488 | msgstr ""
489 |
490 | #: dialogs.py:657
491 | msgid "Authors"
492 | msgstr ""
493 |
494 | #: dialogs.py:660
495 | msgid "Series"
496 | msgstr ""
497 |
498 | #: dialogs.py:663
499 | msgid "Publisher"
500 | msgstr ""
501 |
502 | #: dialogs.py:666
503 | msgid "Tags"
504 | msgstr ""
505 |
506 | #: dialogs.py:675
507 | msgid "Choose similarity level:"
508 | msgstr ""
509 |
510 | #: dialogs.py:679
511 | msgid "Similar"
512 | msgstr ""
513 |
514 | #: dialogs.py:682
515 | msgid "Soundex"
516 | msgstr ""
517 |
518 | #: dialogs.py:684
519 | msgid "Fuzzy"
520 | msgstr ""
521 |
522 | #: dialogs.py:691
523 | msgid "Search"
524 | msgstr ""
525 |
526 | #: dialogs.py:693
527 | msgid "Search for results"
528 | msgstr ""
529 |
530 | #: dialogs.py:704
531 | msgid "Search results:"
532 | msgstr ""
533 |
534 | #: dialogs.py:710
535 | msgid "Authors:"
536 | msgstr ""
537 |
538 | #: dialogs.py:711 dialogs.py:910
539 | msgid "Variations:"
540 | msgstr ""
541 |
542 | #: dialogs.py:726
543 | msgid "&Show matching books"
544 | msgstr ""
545 |
546 | #: dialogs.py:727
547 | msgid "As a group is selected, show the search results in the library view"
548 | msgstr ""
549 |
550 | #: dialogs.py:730
551 | msgid "Rename to:"
552 | msgstr ""
553 |
554 | #: dialogs.py:747
555 | msgid "&Rename"
556 | msgstr ""
557 |
558 | #: dialogs.py:748
559 | msgid "Rename all of the selected items to this name"
560 | msgstr ""
561 |
562 | #: dialogs.py:751
563 | msgid "&Ignore"
564 | msgstr ""
565 |
566 | #: dialogs.py:752
567 | msgid "Ignore all selected items from consideration at this time"
568 | msgstr ""
569 |
570 | #: dialogs.py:791
571 | msgid "No matches"
572 | msgstr ""
573 |
574 | #: dialogs.py:791
575 | msgid "You have no variations of {0} using this criteria"
576 | msgstr ""
577 |
578 | #: dialogs.py:851
579 | msgid "Use this variation name"
580 | msgstr ""
581 |
582 | #: dialogs.py:914
583 | msgid "Variations of: {0}"
584 | msgstr ""
585 |
586 | #: dialogs.py:950
587 | msgid "Are you sure you want to rename the selected {0} items to \"{1}\"?"
588 | msgstr ""
589 |
590 | #: dialogs.py:1037
591 | msgid "Book duplicate search
- Report books in this library which have an identical identifier for books in the target library.
"
592 | msgstr ""
593 |
594 | #: dialogs.py:1041
595 | msgid "Book duplicate search
- Report books in this library which are binary identical to books in your target library.
- Compares the actual file size of every book format in your libraries, computing an SHA hash to compare contents where sizes match.
- Books found using this search are guaranteed to be duplicates."
596 | msgstr ""
597 |
598 | #: dialogs.py:1048
599 | msgid "Title duplicate search
- Report books in this library compared to your target library with an identical title and {0}
- Titles must match exactly excluding case."
600 | msgstr ""
601 |
602 | #: dialogs.py:1051
603 | msgid "Title duplicate search
- Report books in this library compared to your target library with a similar title and {0}
- Similar title matches apply removal of common punctuation and prefixes and applies the same title matching logic as Automerge."
604 | msgstr ""
605 |
606 | #: dialogs.py:1055
607 | msgid "Title duplicate search
- Report books in this library compared to your target library with a soundex title and {0}
- Soundex title matches are based on the same removal of punctuation and common prefixes as a similar title search."
608 | msgstr ""
609 |
610 | #: dialogs.py:1059
611 | msgid "Title duplicate search
- Report books in this library compared to your target library with a fuzzy title and {0}
- Fuzzy title matches remove all punctuation, subtitles and any words after 'and', 'or' or 'aka' in the title."
612 | msgstr ""
613 |
614 | #: dialogs.py:1063
615 | msgid "Author duplicate search
- Report books in this library compared to your target library ignoring title with {0}
- Ignore title searches are best to find variations of author names regardless of the books you have for each."
616 | msgstr ""
617 |
618 | #: dialogs.py:1080
619 | msgid "Cross Library Search Options"
620 | msgstr ""
621 |
622 | #: dialogs.py:1084
623 | msgid "Compare With Library:"
624 | msgstr ""
625 |
626 | #: dialogs.py:1088
627 | msgid "Library:"
628 | msgstr ""
629 |
630 | #: dialogs.py:1098
631 | msgid "Duplicate Search Type:"
632 | msgstr ""
633 |
634 | #: dialogs.py:1119
635 | msgid "Title Matching:"
636 | msgstr ""
637 |
638 | #: dialogs.py:1140
639 | msgid "Author Matching:"
640 | msgstr ""
641 |
642 | #: dialogs.py:1165
643 | msgid "Compare Options:"
644 | msgstr ""
645 |
646 | #: dialogs.py:1173
647 | msgid "Display duplicate books when search completes"
648 | msgstr ""
649 |
650 | #: dialogs.py:1174
651 | msgid "Uncheck this option if you just want the output log"
652 | msgstr ""
653 |
654 | #: dialogs.py:1213
655 | msgid "Choose library location to compare against"
656 | msgstr ""
657 |
658 | #: dialogs.py:1264
659 | msgid "No library specified"
660 | msgstr ""
661 |
662 | #: dialogs.py:1265
663 | msgid "You must specify a library path"
664 | msgstr ""
665 |
666 | #: dialogs.py:1268
667 | msgid "Same as current"
668 | msgstr ""
669 |
670 | #: dialogs.py:1269
671 | msgid "The location {0} contains the current calibre library"
672 | msgstr ""
673 |
674 | #: dialogs.py:1271
675 | msgid "No existing library found"
676 | msgstr ""
677 |
678 | #: dialogs.py:1272
679 | msgid "There is no existing calibre library at {0}"
680 | msgstr ""
681 |
682 | #: dialogs.py:1295
683 | msgid "Save log"
684 | msgstr ""
685 |
686 | #: dialogs.py:1302
687 | msgid "Save Find Duplicates log"
688 | msgstr ""
689 |
690 | #: dialogs.py:1303
691 | msgid "Duplicates log file"
692 | msgstr ""
693 |
694 | #: duplicates.py:213
695 | msgid "No duplicate groups were found when searching with: {0}"
696 | msgstr ""
697 |
698 | #: duplicates.py:215 duplicates.py:328 duplicates.py:361 duplicates.py:368
699 | msgid "No duplicates"
700 | msgstr ""
701 |
702 | #: duplicates.py:217 duplicates.py:224 duplicates.py:330
703 | msgid "Show this information again"
704 | msgstr ""
705 |
706 | #: duplicates.py:220
707 | msgid "Found {0} duplicate groups when searching with: {1}"
708 | msgstr ""
709 |
710 | #: duplicates.py:327 duplicates.py:369
711 | msgid "No more duplicate groups exist from your search."
712 | msgstr ""
713 |
714 | #: duplicates.py:362
715 | msgid "The current duplicate group no longer exists. You cannot perform this action."
716 | msgstr ""
717 |
718 | #: duplicates.py:720
719 | msgid "Showing #{0} of {0} remaining duplicate groups for {0}"
720 | msgstr ""
721 |
722 | #: duplicates.py:827
723 | msgid ""
724 | "Results of {0} comparison:\n"
725 | " Source library: {1}\n"
726 | " Target library: {2}\n"
727 | "\n"
728 | "{3}"
729 | msgstr ""
730 |
731 | #: duplicates.py:899
732 | msgid "Click 'Show details' to see the results."
733 | msgstr ""
734 |
735 | #: duplicates.py:911 duplicates.py:1037
736 | msgid "Analysing duplicates in target database"
737 | msgstr ""
738 |
739 | #: duplicates.py:913 duplicates.py:1046
740 | msgid "Analysing duplicates in current database"
741 | msgstr ""
742 |
743 | #: duplicates.py:944
744 | msgid "Found {0} authors with potential duplicates using {1} against the library at: {2}"
745 | msgstr ""
746 |
747 | #: duplicates.py:1033
748 | msgid "Found {0} books with binary duplicates against the library at: {1}"
749 | msgstr ""
750 |
751 | #: duplicates.py:1070
752 | msgid "Found {0} books with potential duplicates using {1} against the library at: {2}"
753 | msgstr ""
754 |
755 |
--------------------------------------------------------------------------------
/matching.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals, division, absolute_import, print_function
2 |
3 | __license__ = 'GPL v3'
4 | __copyright__ = '2011, Grant Drake'
5 |
6 | import re
7 | from calibre import prints
8 | from calibre.utils.config import tweaks
9 | from calibre.utils.localization import get_udc
10 |
# Default soundex hash lengths for each kind of metadata. These are module
# level state and are reassigned by the set_*_soundex_length() functions.
title_soundex_length = 6
author_soundex_length = 8
publisher_soundex_length = 6
series_soundex_length = 6
tags_soundex_length = 4

# Lower-case name particles/suffixes that are dropped when tokenising authors.
ignore_author_words = ['von', 'van', 'jr', 'sr', 'i', 'ii', 'iii', 'second', 'third',
                       'md', 'phd']
# Same words as a dict so membership tests are a hash lookup.
IGNORE_AUTHOR_WORDS_MAP = dict.fromkeys(ignore_author_words, True)
20 |
def ids_for_field(db, ids_of_books, field_name):
    '''
    Return a list of (item_id, name) pairs for every distinct value of
    field_name found on the given books.

    db must provide all_field_for(field_name, book_ids), returning a mapping
    of book id to field value, and get_id_map(field_name), returning a mapping
    of item id to name. Tuple field values contribute each of their members;
    any other value is used as-is when it is truthy.

    Raises KeyError if a collected name is absent from get_id_map().
    The order of the returned pairs is not defined (names pass through a set).
    '''
    # Collect the distinct names with a single query to the database
    unique_names = set()
    for field_value in db.all_field_for(field_name, ids_of_books).values():
        if type(field_value) is tuple:
            unique_names.update(field_value)
        elif field_value:
            unique_names.add(field_value)
    # Reverse the map of ids to names so id_for_name[name] gives the id
    id_for_name = {name: item_id for item_id, name in db.get_id_map(field_name).items()}
    # Now build the pairs (id, name)
    return [(id_for_name[name], name) for name in unique_names]
39 |
def get_field_pairs(db, field):
    '''
    Return the (id, value) pairs of the given field for the books returned by
    an empty search that honours the current virtual library.
    '''
    # An empty query with use_virtual_library=True gives the ids in the current VL
    visible_book_ids = db.data.search_getting_ids('', '', use_virtual_library=True)
    # Prefer the new_api object when the database exposes one
    api = getattr(db, 'new_api', db)
    return ids_for_field(api, visible_book_ids, field)
47 |
def set_soundex_lengths(title_len, author_len):
    '''
    Set the module-level soundex lengths used for title and author hashes.
    '''
    global title_soundex_length, author_soundex_length
    title_soundex_length, author_soundex_length = title_len, author_len
53 |
def set_title_soundex_length(title_len):
    '''
    Set the module-level soundex length used for title hashes.
    '''
    global title_soundex_length
    title_soundex_length = title_len
57 |
def set_author_soundex_length(author_len):
    '''
    Set the module-level soundex length used for author hashes.
    '''
    global author_soundex_length
    author_soundex_length = author_len
61 |
def set_publisher_soundex_length(publisher_len):
    '''
    Set the module-level soundex length used for publisher hashes.
    '''
    global publisher_soundex_length
    publisher_soundex_length = publisher_len
65 |
def set_series_soundex_length(series_len):
    '''
    Set the module-level soundex length used for series hashes.
    '''
    global series_soundex_length
    series_soundex_length = series_len
69 |
def set_tags_soundex_length(tags_len):
    '''
    Set the module-level soundex length intended for tag hashes.
    '''
    global tags_soundex_length
    tags_soundex_length = tags_len
73 |
74 |
def authors_to_list(db, book_id):
    '''
    Return the authors of a book as a list of names.

    The value from db.authors() is split on commas; each piece is stripped
    and any '|' in it is turned back into ','. Returns an empty list when
    the book has no authors value.
    '''
    raw_authors = db.authors(book_id, index_is_id=True)
    if not raw_authors:
        return []
    return [name.strip().replace('|', ',') for name in raw_authors.split(',')]
80 |
def fuzzy_it(text, patterns=None):
    '''
    Normalise text by stripping, lower-casing and applying a sequence of
    regex substitutions.

    patterns is an optional sequence of (compiled_regex, replacement) pairs.
    When it is empty or None the default title patterns are used: drop
    common punctuation, drop a leading article (taken from the calibre
    'title_sort_articles' tweak when set), turn '-', '.' and '_' into
    spaces and collapse runs of whitespace.

    Returns the transformed text with surrounding whitespace removed.
    '''
    if not patterns:
        # Only build the defaults (and read the tweak) when the caller
        # has not supplied patterns of their own.
        patterns = [(re.compile(pat, re.IGNORECASE), repl) for pat, repl in
                [
                    (r'[\[\](){}<>\'";,:#]', ''),
                    (tweaks.get('title_sort_articles', r'^(a|the|an)\s+'), ''),
                    (r'[-._]', ' '),
                    (r'\s+', ' ')
                ]]
    text = text.strip().lower()
    for pat, repl in patterns:
        text = pat.sub(repl, text)
    return text.strip()
95 |
def soundex(name, length=4):
    '''
    soundex module conforming to Knuth's algorithm
    implementation 2000-12-24 by Gregory Jorgensen
    public domain
    http://code.activestate.com/recipes/52213-soundex-algorithm/

    Only the letters A-Z (after upper-casing) take part; every other
    character is skipped. The result is the first letter followed by the
    non-zero digits, padded with '0' and cut to exactly `length` characters.
    '''
    # Soundex digit for each letter, indexed by position in the alphabet:
    #        ABCDEFGHIJKLMNOPQRSTUVWXYZ
    codes = '01230120022455012623010202'
    first_letter = ''
    encoded = []

    for ch in name.upper():
        if not ('A' <= ch <= 'Z'):
            continue
        if not first_letter:
            # remember first letter
            first_letter = ch
        digit = codes[ord(ch) - ord('A')]
        # duplicate consecutive soundex digits are skipped
        if not encoded or encoded[-1] != digit:
            encoded.append(digit)

    # The first digit is replaced by the first letter; 0s are then dropped
    tail = ''.join(encoded[1:]).replace('0', '')

    # return soundex code padded to length characters
    return (first_letter + tail + (length * '0'))[:length]
129 |
130 |
131 | # --------------------------------------------------------------
132 | # Title Matching Algorithm Functions
133 | # --------------------------------------------------------------
134 |
def get_title_tokens(title, strip_subtitle=True, decode_non_ascii=True):
    '''
    Generate lower-cased tokens from a title, suitable for an AND search query.

    Optionally removes subtitles (bracketed text and anything after '/', ':'
    or a backslash), removes edition-style markers and punctuation, and skips
    the words 'a' and 'the'. Yields nothing for an empty title.
    '''
    if not title:
        return

    if strip_subtitle:
        without_subtitle = re.sub(r'([\(\[\{].*?[\)\]\}]|[/:\\].*$)', '', title)
        # Keep the full title if stripping would leave (almost) nothing
        if len(without_subtitle) > 1:
            title = without_subtitle

    substitutions = (
        # Remove things like: (2010) (Omnibus) etc.
        (r'(?i)[({\[](\d{4}|omnibus|anthology|hardcover|paperback|mass\s*market|edition|ed\.)[\])}]', ''),
        # Remove any strings that contain the substring edition inside
        # parentheses
        (r'(?i)[({\[].*?(edition|ed.).*?[\]})]', ''),
        # Remove commas used a separators in numbers
        (r'(\d+),(\d+)', r'\1\2'),
        # Remove hyphens only if they have whitespace before them
        (r'(\s-)', ' '),
        # Remove single quotes not followed by 's'
        (r"'(?!s)", ''),
        # Replace other special chars with a space
        (r'''[:,;+!@#$%^&*(){}.`~"\s\[\]/]''', ' '),
    )
    for expression, replacement in substitutions:
        title = re.compile(expression, re.IGNORECASE).sub(replacement, title)

    if decode_non_ascii:
        title = get_udc().decode(title)

    for word in title.split():
        word = word.strip()
        if not word:
            continue
        lowered = word.lower()
        if lowered not in ('a', 'the'):
            yield lowered
174 |
def identical_title_match(title, lang=None):
    '''
    Hash for an identical title match: the lower-cased title, prefixed
    with lang when one is given.
    '''
    key = title.lower()
    return lang + key if lang else key
179 |
def similar_title_match(title, lang=None):
    '''
    Hash for a similar title match: the title is passed through the
    get_udc() decoder and then fuzzy_it(), and prefixed with lang when
    one is given.
    '''
    key = fuzzy_it(get_udc().decode(title))
    return lang + key if lang else key
186 |
def soundex_title_match(title, lang=None):
    '''
    Hash for a soundex title match, prefixed with lang when one is given.
    '''
    # Reduce to the "similar" form first, then soundex it at the configured length
    key = soundex(similar_title_match(title), title_soundex_length)
    return lang + key if lang else key
194 |
def fuzzy_title_match(title, lang=None):
    '''
    Hash for a fuzzy title match: the title tokens joined without
    separators, prefixed with lang when one is given.
    '''
    tokens = list(get_title_tokens(title))
    # Everything from the first "and"/"or"/"aka" onwards is discarded, provided
    # that word is not the first token - this is very aggressive!
    stop_words = ['&', 'and', 'or', 'aka']
    cut_at = next((pos for pos, tok in enumerate(tokens)
                   if tok in stop_words and pos > 0), None)
    if cut_at is not None:
        tokens = tokens[:cut_at]
    key = ''.join(tokens)
    return lang + key if lang else key
206 |
207 |
208 | # --------------------------------------------------------------
209 | # Author Matching Algorithm Functions
210 | #
211 | # Note that these return two hashes
212 | # - first is based on the author name supplied
213 | # - second (if not None) is based on swapping name order
214 | # --------------------------------------------------------------
215 |
def get_author_tokens(author, decode_non_ascii=True, strip_initials=False):
    '''
    Generate lower-cased tokens from an author name for duplicate hash
    comparisons.

    If the name contains a comma it is assumed to be in "lastname, other
    names" form and the first word is moved to the end, so tokens come out
    in first name, middle names, last name order. Words listed in
    IGNORE_AUTHOR_WORDS_MAP are skipped, and single-character tokens are
    skipped too when strip_initials is True.
    '''
    if not author:
        return

    # Ensure Last,First is treated same as Last, First adding back space after comma.
    author = re.sub(r',([^\s])', ', \\1', author)
    cleaned = re.sub(r'[-+.:;]', ' ', author)
    if decode_non_ascii:
        cleaned = get_udc().decode(cleaned)

    words = cleaned.split()
    if ',' in cleaned:
        # Probably in ln, fn form: rotate the first word to the end
        words = words[1:] + words[:1]

    # Leave ' in there for Irish names
    strip_chars = re.compile(r'[,!@#$%^&*(){}`~"\s\[\]/]')
    # Tokens must be longer than this; 1 drops single-character initials
    shortest_excluded = 1 if strip_initials else 0
    for word in words:
        word = strip_chars.sub('', word).strip()
        if len(word) <= shortest_excluded:
            continue
        lowered = word.lower()
        if lowered not in IGNORE_AUTHOR_WORDS_MAP:
            yield lowered
244 |
def identical_authors_match(author):
    '''
    Hash pair for an identical author match: the lower-cased name and
    None (no reversed-order hash).
    '''
    return (author.lower(), None)
247 |
def similar_authors_match(author):
    '''
    Hash pair for a similar author match.

    The first hash is the author tokens (initials stripped) joined by
    spaces. The second is the same tokens with the first one moved to the
    end, or None when there are fewer than two tokens.
    '''
    tokens = list(get_author_tokens(author, strip_initials=True))
    ahash = ' '.join(tokens)
    if len(tokens) <= 1:
        return ahash, None
    rotated = tokens[1:] + tokens[:1]
    return ahash, ' '.join(rotated)
256 |
def soundex_authors_match(author):
    '''
    Hash pair for a soundex author match.

    With zero or one token the result is the soundex of that token at
    soundex()'s default length, and None. Otherwise the first hash is the
    soundex of the tokens with the last one moved to the front, and the
    second is the soundex of the tokens in their original order, both at
    author_soundex_length.
    '''
    tokens = list(get_author_tokens(author))
    if len(tokens) < 2:
        return soundex(''.join(tokens)), None
    # Put the last name at the front so the soundex focuses on the surname
    surname_first = tokens[-1:] + tokens[:-1]
    ahash = soundex(''.join(surname_first), author_soundex_length)
    rev_ahash = soundex(''.join(tokens), author_soundex_length)
    return ahash, rev_ahash
270 |
def fuzzy_authors_match(author):
    '''
    Hash pair for a fuzzy author match; the second element is always None.

    No tokens gives '', a single token gives that token, and anything
    longer gives the first character of the first token followed by the
    last token.
    '''
    tokens = list(get_author_tokens(author))
    if len(tokens) == 0:
        return '', None
    if len(tokens) == 1:
        return tokens[0], None
    # Initial plus surname only. Deliberately no reversed permutation,
    # so a name like "A. Bronte" does not also produce a "b a..." style hash.
    initial = tokens[0][0]
    surname = tokens[-1]
    return initial + surname, None
283 |
284 |
285 | # --------------------------------------------------------------
286 | # Series Matching Algorithm Functions
287 | # --------------------------------------------------------------
288 |
def get_series_tokens(series, decode_non_ascii=True):
    '''
    Generate lower-cased tokens from a series name for duplicate hash
    comparisons. The characters - + . : ; become word breaks, other
    punctuation is removed, and the words 'the', 'a' and 'and' are skipped.
    '''
    skipped_words = ['the', 'a', 'and',]
    if not series:
        return

    strip_chars = re.compile(r'[,!@#$%^&*(){}`~\'"\s\[\]/]')
    text = re.compile(r'[-+.:;]').sub(' ', series)
    if decode_non_ascii:
        text = get_udc().decode(text)
    for word in text.split():
        word = strip_chars.sub('', word).strip()
        if not word:
            continue
        lowered = word.lower()
        if lowered not in skipped_words:
            yield lowered
307 |
def similar_series_match(series):
    '''
    Hash for a similar series match: the series tokens joined by spaces.
    '''
    return ' '.join(get_series_tokens(series))
311 |
def soundex_series_match(series):
    '''
    Hash for a soundex series match: the soundex of the joined series
    tokens. series_soundex_length is used only when there is more than one
    token; otherwise soundex()'s default length applies.
    '''
    tokens = list(get_series_tokens(series))
    joined = ''.join(tokens)
    if len(tokens) > 1:
        return soundex(joined, series_soundex_length)
    return soundex(joined)
318 |
def fuzzy_series_match(series):
    '''
    Hash for a fuzzy series match: just the first series token, or ''
    when there are no tokens.
    '''
    tokens = list(get_series_tokens(series))
    return tokens[0] if tokens else ''
325 |
326 |
327 | # --------------------------------------------------------------
328 | # Publisher Matching Algorithm Functions
329 | # --------------------------------------------------------------
330 |
def get_publisher_tokens(publisher, decode_non_ascii=True):
    '''
    Generate lower-cased tokens from a publisher name for duplicate hash
    comparisons. The characters - + . : ; become word breaks, other
    punctuation is removed, and common company words (inc, ltd, ...) are
    skipped.
    '''
    skipped_words = ['the', 'inc', 'ltd', 'limited', 'llc', 'co', 'pty',
                     'usa', 'uk']
    if not publisher:
        return

    strip_chars = re.compile(r'[,!@#$%^&*(){}`~\'"\s\[\]/]')
    text = re.compile(r'[-+.:;]').sub(' ', publisher)
    if decode_non_ascii:
        text = get_udc().decode(text)
    for word in text.split():
        word = strip_chars.sub('', word).strip()
        if not word:
            continue
        lowered = word.lower()
        if lowered not in skipped_words:
            yield lowered
350 |
def similar_publisher_match(publisher):
    '''
    Hash for a similar publisher match: the publisher tokens joined by spaces.
    '''
    return ' '.join(get_publisher_tokens(publisher))
354 |
def soundex_publisher_match(publisher):
    '''
    Hash for a soundex publisher match: the soundex of the joined publisher
    tokens. publisher_soundex_length is used only when there is more than
    one token; otherwise soundex()'s default length applies.
    '''
    tokens = list(get_publisher_tokens(publisher))
    joined = ''.join(tokens)
    if len(tokens) > 1:
        return soundex(joined, publisher_soundex_length)
    return soundex(joined)
361 |
def fuzzy_publisher_match(publisher):
    '''
    Hash for a fuzzy publisher match: the first publisher token, unless
    that is a single letter and more tokens follow, in which case the first
    two tokens joined by a space. Returns '' when there are no tokens.
    '''
    tokens = list(get_publisher_tokens(publisher))
    if not tokens:
        return ''
    leading = tokens[0]
    if len(leading) <= 1 and len(tokens) != 1:
        return ' '.join(tokens[:2])
    return leading
372 |
373 |
374 | # --------------------------------------------------------------
375 | # Tag Matching Algorithm Functions
376 | # --------------------------------------------------------------
377 |
def get_tag_tokens(tag, decode_non_ascii=True):
    '''
    Generate lower-cased tokens from a tag for duplicate hash comparisons.
    The characters - + . : ; become word breaks, other punctuation is
    removed, and the words 'the', 'and' and 'a' are skipped.
    '''
    skipped_words = ['the', 'and', 'a']
    if not tag:
        return

    strip_chars = re.compile(r'[,!@#$%^&*(){}`~\'"\s\[\]/]')
    text = re.compile(r'[-+.:;]').sub(' ', tag)
    if decode_non_ascii:
        text = get_udc().decode(text)
    for word in text.split():
        word = strip_chars.sub('', word).strip()
        if not word:
            continue
        lowered = word.lower()
        if lowered not in skipped_words:
            yield lowered
396 |
def similar_tags_match(tag):
    '''
    Build the "similar" comparison hash for a tag: its normalised tokens
    (as produced by get_tag_tokens) joined by single spaces.
    '''
    return ' '.join(get_tag_tokens(tag))
400 |
def soundex_tags_match(tag):
    '''
    Build the "soundex" comparison hash for a tag.

    The tag is first reduced to its "similar" tokens, which are
    concatenated without separators and passed to soundex().
    '''
    tokens = list(get_tag_tokens(tag))
    squashed = ''.join(tokens)
    if len(tokens) > 1:
        # NOTE(review): the *publisher* soundex length is applied to tags
        # here - possibly a copy/paste from soundex_publisher_match;
        # confirm whether a tag-specific length was intended.
        return soundex(squashed, publisher_soundex_length)
    # Zero or one token: rely on soundex()'s own default length
    return soundex(squashed)
407 |
def fuzzy_tags_match(tag):
    '''
    Build the "fuzzy" comparison hash for a tag: simply its first
    normalised token, or the empty string when the tag has no tokens.
    '''
    tokens = list(get_tag_tokens(tag))
    return tokens[0] if tokens else ''
414 |
415 |
416 | # --------------------------------------------------------------
417 | # Find Duplicates Algorithm Factories
418 | # --------------------------------------------------------------
419 |
420 |
def get_title_algorithm_fn(title_match):
    '''
    Look up the title hashing function for the requested match type.

    title_match is one of 'identical', 'similar', 'soundex' or 'fuzzy';
    any other value gives None.
    '''
    candidates = (
        ('identical', identical_title_match),
        ('similar', similar_title_match),
        ('soundex', soundex_title_match),
        ('fuzzy', fuzzy_title_match),
    )
    for match_name, match_fn in candidates:
        if title_match == match_name:
            return match_fn
    return None
434 |
435 |
def get_author_algorithm_fn(author_match):
    '''
    Look up the author hashing function for the requested match type.

    author_match is one of 'identical', 'similar', 'soundex' or 'fuzzy';
    any other value gives None.
    '''
    candidates = (
        ('identical', identical_authors_match),
        ('similar', similar_authors_match),
        ('soundex', soundex_authors_match),
        ('fuzzy', fuzzy_authors_match),
    )
    for match_name, match_fn in candidates:
        if author_match == match_name:
            return match_fn
    return None
449 |
450 |
def get_variation_algorithm_fn(match_type, item_type):
    '''
    Return the appropriate function for the desired variation match where:
    match_type is 'similar', 'soundex' or 'fuzzy' ('identical' is also
               used for titles and authors by the self-tests below)
    item_type is 'title', 'authors', 'series', 'publisher' or 'tags'

    The function is resolved by name from this module's globals as
    '<match_type>_<item_type>_match', so item_type must be spelled exactly
    as it appears in the function names (e.g. 'tags', not 'tag').

    Raises KeyError if no module-level name matches the combination.
    '''
    fn_name = '%s_%s_match' % (match_type, item_type)
    return globals()[fn_name]
459 |
460 | # --------------------------------------------------------------
461 | # Test Code
462 | # --------------------------------------------------------------
463 |
def do_assert_tests():
    '''
    Self-test for the matching algorithms in this module.

    Each case hashes two values with the algorithm selected by
    (match_type, item_type) and checks whether the hashes agree.
    Failures are reported through prints(); nothing is raised and no
    value is returned. 'Tests completed' is printed at the end.
    '''

    def _assert(test_name, match_type, item_type, value1, value2, equal=True):
        # Hash both values with the same algorithm and report a failure
        # when the (in)equality of the two hashes is not what was expected.
        fn = get_variation_algorithm_fn(match_type, item_type)
        hash1 = fn(value1)
        hash2 = fn(value2)
        if (equal and hash1 != hash2) or (not equal and hash1 == hash2):
            prints('Failed: %s %s %s (\'%s\', \'%s\')'%(test_name,
                                match_type, item_type, value1, value2))
            prints(' hash1: %s'%hash1)
            prints(' hash2: %s'%hash2)

    def assert_match(match_type, item_type, value1, value2):
        # Expect both values to produce the same hash
        _assert('is matching', match_type, item_type, value1, value2, equal=True)

    def assert_nomatch(match_type, item_type, value1, value2):
        # Expect the two values to produce different hashes
        _assert('not matching', match_type, item_type, value1, value2, equal=False)

    def _assert_author(test_name, match_type, item_type, value1, value2, equal=True):
        # The algorithms exercised here return a pair (hash, rev_hash).
        # Two values are treated as equal when either hash of the first
        # equals either hash of the second; a rev_hash1 of None is not
        # compared.
        fn = get_variation_algorithm_fn(match_type, item_type)
        hash1, rev_hash1 = fn(value1)
        hash2, rev_hash2 = fn(value2)
        results_equal = hash1 in [hash2, rev_hash2] or \
                (rev_hash1 is not None and rev_hash1 in [hash2, rev_hash2])
        if (equal and not results_equal) or (not equal and results_equal):
            prints('Failed: %s %s %s (\'%s\', \'%s\')'% (test_name,
                                match_type, item_type, value1, value2))
            prints(' hash1: ', hash1, ' rev_hash1: ', rev_hash1)
            prints(' hash2: ', hash2, ' rev_hash2: ', rev_hash2)

    def assert_author_match(match_type, item_type, value1, value2):
        # Expect the two author values to be treated as equal
        _assert_author('is matching', match_type, item_type, value1, value2, equal=True)

    def assert_author_nomatch(match_type, item_type, value1, value2):
        # Expect the two author values to be treated as different
        _assert_author('not matching', match_type, item_type, value1, value2, equal=False)


    # Test our identical title algorithms
    assert_match('identical', 'title', 'The Martian Way', 'The Martian Way')
    assert_match('identical', 'title', 'The Martian Way', 'the martian way')
    assert_nomatch('identical', 'title', 'The Martian Way', 'Martian Way')
    assert_nomatch('identical', 'title', 'China Miéville', 'China Mieville')

    # Test our similar title algorithms
    # NOTE(review): in this and the following title groups the fourth case
    # is, as written, identical to the first - possibly a whitespace
    # variation was intended; confirm against the upstream source.
    assert_match('similar', 'title', 'The Martian Way', 'The Martian Way')
    assert_match('similar', 'title', 'The Martian Way', 'the martian way')
    assert_match('similar', 'title', 'The Martian Way', 'Martian Way')
    assert_match('similar', 'title', 'The Martian Way', 'The Martian Way')
    assert_match('similar', 'title', 'China Miéville', 'China Mieville')
    assert_nomatch('similar', 'title', 'The Martian Way', 'The Martain Way')
    assert_nomatch('similar', 'title', 'The Martian Way', 'The Martian Way (Foo)')
    assert_nomatch('similar', 'title', 'The Martian Way I', 'The Martian Way II')
    assert_nomatch('similar', 'title', 'The Martian Way', 'The Martian Way and other stories')
    assert_nomatch('similar', 'title', 'The Martian Way', 'The Martian Way, or, My New Title')
    assert_nomatch('similar', 'title', 'The Martian Way', 'The Martian Way aka My New Title')
    assert_nomatch('similar', 'title', 'Foundation and Earth - Foundation 5', 'Foundation and Earth')

    # Test our soundex title algorithms
    assert_match('soundex', 'title', 'The Martian Way', 'The Martian Way')
    assert_match('soundex', 'title', 'The Martian Way', 'the martian way')
    assert_match('soundex', 'title', 'The Martian Way', 'Martian Way')
    assert_match('soundex', 'title', 'The Martian Way', 'The Martian Way')
    assert_match('soundex', 'title', 'The Martian Way', 'The Martain Way')
    assert_match('soundex', 'title', 'The Martian Way I', 'The Martian Way II')
    assert_match('soundex', 'title', 'Angel', 'Angle')
    assert_match('soundex', 'title', 'Foundation and Earth - Foundation 5', 'Foundation and Earth')
    assert_match('soundex', 'title', 'China Miéville', 'China Mieville')
    assert_nomatch('soundex', 'title', 'The Martian Way', 'The Martian Way (Foo)')
    assert_nomatch('soundex', 'title', 'The Martian Way', 'The Martian Way and other stories')
    assert_nomatch('soundex', 'title', 'The Martian Way', 'The Martian Way, or, My New Title')
    assert_nomatch('soundex', 'title', 'The Martian Way', 'The Martian Way aka My New Title')
    assert_nomatch('soundex', 'title', 'Foundation 5 - Foundation and Earth', 'Foundation and Earth')

    # Test our fuzzy title algorithms
    assert_match('fuzzy', 'title', 'The Martian Way', 'The Martian Way')
    assert_match('fuzzy', 'title', 'The Martian Way', 'the martian way')
    assert_match('fuzzy', 'title', 'The Martian Way', 'Martian Way')
    assert_match('fuzzy', 'title', 'The Martian Way', 'The Martian Way')
    assert_match('fuzzy', 'title', 'The Martian Way', 'The Martian Way (Foo)')
    assert_match('fuzzy', 'title', 'The Martian Way', 'The Martian Way: Sequel')
    assert_match('fuzzy', 'title', 'The Martian Way', 'The Martian Way and other stories')
    assert_match('fuzzy', 'title', 'The Martian Way', 'The Martian Way, or, My New Title')
    assert_match('fuzzy', 'title', 'The Martian Way', 'The Martian Way aka My New Title')
    assert_match('fuzzy', 'title', 'Foundation and Earth - Foundation 5', 'Foundation and Earth')
    assert_match('fuzzy', 'title', 'China Miéville', 'China Mieville')
    assert_nomatch('fuzzy', 'title', 'The Martian Way', 'The Martain Way')
    assert_nomatch('fuzzy', 'title', 'The Martian Way I', 'The Martian Way II')
    assert_nomatch('fuzzy', 'title', 'Foundation 5 - Foundation and Earth', 'Foundation and Earth')

    # Test our identical author algorithms
    assert_author_match('identical', 'authors', 'Kevin J. Anderson', 'Kevin J. Anderson')
    assert_author_match('identical', 'authors', 'Kevin J. Anderson', 'Kevin j. Anderson')
    assert_author_nomatch('identical', 'authors', 'Kevin J. Anderson', 'Kevin J Anderson')
    assert_author_nomatch('identical', 'authors', 'China Miéville', 'China Mieville')
    assert_author_nomatch('identical', 'authors', 'Kevin Anderson', 'Anderson Kevin')
    assert_author_nomatch('identical', 'authors', 'Kevin, Anderson', 'Anderson, Kevin')

    # Test our similar author algorithms
    assert_author_match('similar', 'authors', 'Kevin J. Anderson', 'Kevin J. Anderson')
    assert_author_match('similar', 'authors', 'Kevin J. Anderson', 'Kevin j. Anderson')
    assert_author_match('similar', 'authors', 'Kevin J. Anderson', 'Kevin J Anderson')
    assert_author_match('similar', 'authors', 'Kevin J. Anderson', 'Anderson, Kevin J.')
    assert_author_match('similar', 'authors', 'Kevin Anderson', 'Kevin Anderson Jr')
    assert_author_match('similar', 'authors', 'China Miéville', 'China Mieville')
    assert_author_match('similar', 'authors', 'Kevin Anderson', 'Anderson Kevin')
    assert_author_match('similar', 'authors', 'Kevin, Anderson', 'Anderson, Kevin')
    assert_author_match('similar', 'authors', 'Kevin J. Anderson', 'Anderson,Kevin J.')
    assert_author_match('similar', 'authors', 'Kevin Anderson', 'Anderson,Kevin J.')
    assert_author_match('similar', 'authors', 'Kevin Anderson', 'Anderson,Kevin J')
    # NOTE(review): this case exercises 'identical' although it sits in the
    # 'similar' group - possibly 'similar' was intended; confirm.
    assert_author_nomatch('identical', 'authors', 'Kevin, Anderson', 'Anderson, Dr Kevin')

    # Test our soundex author algorithms
    assert_author_match('soundex', 'authors', 'Kevin J. Anderson', 'Kevin J. Anderson')
    assert_author_match('soundex', 'authors', 'Kevin J. Anderson', 'Kevin j. Anderson')
    assert_author_match('soundex', 'authors', 'Kevin J. Anderson', 'Kevin J Anderson')
    assert_author_match('soundex', 'authors', 'Kevin J. Anderson', 'Keven J. Andersan')
    assert_author_match('soundex', 'authors', 'Kevin J. Anderson', 'Anderson, Kevin J.')
    assert_author_match('soundex', 'authors', 'Kevin Anderson', 'Kevin Anderson Jr')
    assert_author_match('soundex', 'authors', 'Kevin J. Anderson', 'Kevin Anderson')
    assert_author_match('soundex', 'authors', 'China Miéville', 'China Mieville')
    assert_author_match('soundex', 'authors', 'Kevin Anderson', 'Anderson Kevin')
    assert_author_match('soundex', 'authors', 'Kevin, Anderson', 'Anderson, Kevin')
    assert_author_nomatch('soundex', 'authors', 'Kevin J. Anderson', 'S. Anderson')

    # Test our fuzzy author algorithms
    assert_author_match('fuzzy', 'authors', 'Kevin J. Anderson', 'Kevin J. Anderson')
    assert_author_match('fuzzy', 'authors', 'Kevin J. Anderson', 'Kevin j. Anderson')
    assert_author_match('fuzzy', 'authors', 'Kevin J. Anderson', 'Kevin J Anderson')
    assert_author_match('fuzzy', 'authors', 'Kevin J. Anderson', 'Kevin Anderson')
    assert_author_match('fuzzy', 'authors', 'Kevin J. Anderson', 'Anderson, Kevin J.')
    assert_author_match('fuzzy', 'authors', 'Kevin J. Anderson', 'Anderson, Kevin')
    assert_author_match('fuzzy', 'authors', 'Kevin J. Anderson', 'K. J. Anderson')
    assert_author_match('fuzzy', 'authors', 'Kevin J. Anderson', 'K. Anderson')
    assert_author_match('fuzzy', 'authors', 'Kevin Anderson', 'Kevin Anderson Jr')
    assert_author_match('fuzzy', 'authors', 'Kevin Anderson', 'Anderson Jr, K. S.')
    assert_author_match('fuzzy', 'authors', 'China Miéville', 'China Mieville')
    assert_author_nomatch('fuzzy', 'authors', 'Kevin Anderson', 'Anderson Kevin')
    assert_author_nomatch('fuzzy', 'authors', 'Kevin, Anderson', 'Anderson, Kevin')
    assert_author_nomatch('fuzzy', 'authors', 'Kevin J. Anderson', 'S. Anderson')
    assert_author_nomatch('fuzzy', 'authors', 'A. Brown', 'A. Bronte')

    # Test our similar series algorithms
    assert_match('similar', 'series', 'The Martian Way', 'The Martian Way')
    assert_match('similar', 'series', 'China Miéville', 'China Mieville')
    assert_nomatch('similar', 'series', 'China Miéville', 'China')

    # Test our soundex series algorithms
    assert_match('soundex', 'series', 'Angel', 'Angle')

    # Test our fuzzy series algorithms
    assert_match('fuzzy', 'series', 'China Miéville', 'China')


    # Test our similar publisher algorithms
    assert_match('similar', 'publisher', 'Random House', 'Random House Inc')
    assert_match('similar', 'publisher', 'Random House Inc', 'Random House Inc.')
    assert_nomatch('similar', 'publisher', 'Random House Inc', 'Random')

    # Test our soundex publisher algorithms
    assert_match('soundex', 'publisher', 'Angel', 'Angle')

    # Test our fuzzy publisher algorithms
    assert_match('fuzzy', 'publisher', 'Random House Inc', 'Random')

    prints('Tests completed')
629 |
630 |
# Running this module as a script executes the self-tests above; failing
# cases are printed. From a command line:
#   calibre-debug -e matching.py
if __name__ == '__main__':
    do_assert_tests()
635 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | GNU General Public License
2 | ==========================
3 |
4 | _Version 3, 29 June 2007_
5 | _Copyright © 2007 Free Software Foundation, Inc. <https://fsf.org/>_
6 |
7 | Everyone is permitted to copy and distribute verbatim copies of this license
8 | document, but changing it is not allowed.
9 |
10 | ## Preamble
11 |
12 | The GNU General Public License is a free, copyleft license for software and other
13 | kinds of works.
14 |
15 | The licenses for most software and other practical works are designed to take away
16 | your freedom to share and change the works. By contrast, the GNU General Public
17 | License is intended to guarantee your freedom to share and change all versions of a
18 | program--to make sure it remains free software for all its users. We, the Free
19 | Software Foundation, use the GNU General Public License for most of our software; it
20 | applies also to any other work released this way by its authors. You can apply it to
21 | your programs, too.
22 |
23 | When we speak of free software, we are referring to freedom, not price. Our General
24 | Public Licenses are designed to make sure that you have the freedom to distribute
25 | copies of free software (and charge for them if you wish), that you receive source
26 | code or can get it if you want it, that you can change the software or use pieces of
27 | it in new free programs, and that you know you can do these things.
28 |
29 | To protect your rights, we need to prevent others from denying you these rights or
30 | asking you to surrender the rights. Therefore, you have certain responsibilities if
31 | you distribute copies of the software, or if you modify it: responsibilities to
32 | respect the freedom of others.
33 |
34 | For example, if you distribute copies of such a program, whether gratis or for a fee,
35 | you must pass on to the recipients the same freedoms that you received. You must make
36 | sure that they, too, receive or can get the source code. And you must show them these
37 | terms so they know their rights.
38 |
39 | Developers that use the GNU GPL protect your rights with two steps: **(1)** assert
40 | copyright on the software, and **(2)** offer you this License giving you legal permission
41 | to copy, distribute and/or modify it.
42 |
43 | For the developers' and authors' protection, the GPL clearly explains that there is
44 | no warranty for this free software. For both users' and authors' sake, the GPL
45 | requires that modified versions be marked as changed, so that their problems will not
46 | be attributed erroneously to authors of previous versions.
47 |
48 | Some devices are designed to deny users access to install or run modified versions of
49 | the software inside them, although the manufacturer can do so. This is fundamentally
50 | incompatible with the aim of protecting users' freedom to change the software. The
51 | systematic pattern of such abuse occurs in the area of products for individuals to
52 | use, which is precisely where it is most unacceptable. Therefore, we have designed
53 | this version of the GPL to prohibit the practice for those products. If such problems
54 | arise substantially in other domains, we stand ready to extend this provision to
55 | those domains in future versions of the GPL, as needed to protect the freedom of
56 | users.
57 |
58 | Finally, every program is threatened constantly by software patents. States should
59 | not allow patents to restrict development and use of software on general-purpose
60 | computers, but in those that do, we wish to avoid the special danger that patents
61 | applied to a free program could make it effectively proprietary. To prevent this, the
62 | GPL assures that patents cannot be used to render the program non-free.
63 |
64 | The precise terms and conditions for copying, distribution and modification follow.
65 |
66 | ## TERMS AND CONDITIONS
67 |
68 | ### 0. Definitions
69 |
70 | “This License” refers to version 3 of the GNU General Public License.
71 |
72 | “Copyright” also means copyright-like laws that apply to other kinds of
73 | works, such as semiconductor masks.
74 |
75 | “The Program” refers to any copyrightable work licensed under this
76 | License. Each licensee is addressed as “you”. “Licensees” and
77 | “recipients” may be individuals or organizations.
78 |
79 | To “modify” a work means to copy from or adapt all or part of the work in
80 | a fashion requiring copyright permission, other than the making of an exact copy. The
81 | resulting work is called a “modified version” of the earlier work or a
82 | work “based on” the earlier work.
83 |
84 | A “covered work” means either the unmodified Program or a work based on
85 | the Program.
86 |
87 | To “propagate” a work means to do anything with it that, without
88 | permission, would make you directly or secondarily liable for infringement under
89 | applicable copyright law, except executing it on a computer or modifying a private
90 | copy. Propagation includes copying, distribution (with or without modification),
91 | making available to the public, and in some countries other activities as well.
92 |
93 | To “convey” a work means any kind of propagation that enables other
94 | parties to make or receive copies. Mere interaction with a user through a computer
95 | network, with no transfer of a copy, is not conveying.
96 |
97 | An interactive user interface displays “Appropriate Legal Notices” to the
98 | extent that it includes a convenient and prominently visible feature that **(1)**
99 | displays an appropriate copyright notice, and **(2)** tells the user that there is no
100 | warranty for the work (except to the extent that warranties are provided), that
101 | licensees may convey the work under this License, and how to view a copy of this
102 | License. If the interface presents a list of user commands or options, such as a
103 | menu, a prominent item in the list meets this criterion.
104 |
105 | ### 1. Source Code
106 |
107 | The “source code” for a work means the preferred form of the work for
108 | making modifications to it. “Object code” means any non-source form of a
109 | work.
110 |
111 | A “Standard Interface” means an interface that either is an official
112 | standard defined by a recognized standards body, or, in the case of interfaces
113 | specified for a particular programming language, one that is widely used among
114 | developers working in that language.
115 |
116 | The “System Libraries” of an executable work include anything, other than
117 | the work as a whole, that **(a)** is included in the normal form of packaging a Major
118 | Component, but which is not part of that Major Component, and **(b)** serves only to
119 | enable use of the work with that Major Component, or to implement a Standard
120 | Interface for which an implementation is available to the public in source code form.
121 | A “Major Component”, in this context, means a major essential component
122 | (kernel, window system, and so on) of the specific operating system (if any) on which
123 | the executable work runs, or a compiler used to produce the work, or an object code
124 | interpreter used to run it.
125 |
126 | The “Corresponding Source” for a work in object code form means all the
127 | source code needed to generate, install, and (for an executable work) run the object
128 | code and to modify the work, including scripts to control those activities. However,
129 | it does not include the work's System Libraries, or general-purpose tools or
130 | generally available free programs which are used unmodified in performing those
131 | activities but which are not part of the work. For example, Corresponding Source
132 | includes interface definition files associated with source files for the work, and
133 | the source code for shared libraries and dynamically linked subprograms that the work
134 | is specifically designed to require, such as by intimate data communication or
135 | control flow between those subprograms and other parts of the work.
136 |
137 | The Corresponding Source need not include anything that users can regenerate
138 | automatically from other parts of the Corresponding Source.
139 |
140 | The Corresponding Source for a work in source code form is that same work.
141 |
142 | ### 2. Basic Permissions
143 |
144 | All rights granted under this License are granted for the term of copyright on the
145 | Program, and are irrevocable provided the stated conditions are met. This License
146 | explicitly affirms your unlimited permission to run the unmodified Program. The
147 | output from running a covered work is covered by this License only if the output,
148 | given its content, constitutes a covered work. This License acknowledges your rights
149 | of fair use or other equivalent, as provided by copyright law.
150 |
151 | You may make, run and propagate covered works that you do not convey, without
152 | conditions so long as your license otherwise remains in force. You may convey covered
153 | works to others for the sole purpose of having them make modifications exclusively
154 | for you, or provide you with facilities for running those works, provided that you
155 | comply with the terms of this License in conveying all material for which you do not
156 | control copyright. Those thus making or running the covered works for you must do so
157 | exclusively on your behalf, under your direction and control, on terms that prohibit
158 | them from making any copies of your copyrighted material outside their relationship
159 | with you.
160 |
161 | Conveying under any other circumstances is permitted solely under the conditions
162 | stated below. Sublicensing is not allowed; section 10 makes it unnecessary.
163 |
164 | ### 3. Protecting Users' Legal Rights From Anti-Circumvention Law
165 |
166 | No covered work shall be deemed part of an effective technological measure under any
167 | applicable law fulfilling obligations under article 11 of the WIPO copyright treaty
168 | adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention
169 | of such measures.
170 |
171 | When you convey a covered work, you waive any legal power to forbid circumvention of
172 | technological measures to the extent such circumvention is effected by exercising
173 | rights under this License with respect to the covered work, and you disclaim any
174 | intention to limit operation or modification of the work as a means of enforcing,
175 | against the work's users, your or third parties' legal rights to forbid circumvention
176 | of technological measures.
177 |
178 | ### 4. Conveying Verbatim Copies
179 |
180 | You may convey verbatim copies of the Program's source code as you receive it, in any
181 | medium, provided that you conspicuously and appropriately publish on each copy an
182 | appropriate copyright notice; keep intact all notices stating that this License and
183 | any non-permissive terms added in accord with section 7 apply to the code; keep
184 | intact all notices of the absence of any warranty; and give all recipients a copy of
185 | this License along with the Program.
186 |
187 | You may charge any price or no price for each copy that you convey, and you may offer
188 | support or warranty protection for a fee.
189 |
190 | ### 5. Conveying Modified Source Versions
191 |
192 | You may convey a work based on the Program, or the modifications to produce it from
193 | the Program, in the form of source code under the terms of section 4, provided that
194 | you also meet all of these conditions:
195 |
196 | * **a)** The work must carry prominent notices stating that you modified it, and giving a
197 | relevant date.
198 | * **b)** The work must carry prominent notices stating that it is released under this
199 | License and any conditions added under section 7. This requirement modifies the
200 | requirement in section 4 to “keep intact all notices”.
201 | * **c)** You must license the entire work, as a whole, under this License to anyone who
202 | comes into possession of a copy. This License will therefore apply, along with any
203 | applicable section 7 additional terms, to the whole of the work, and all its parts,
204 | regardless of how they are packaged. This License gives no permission to license the
205 | work in any other way, but it does not invalidate such permission if you have
206 | separately received it.
207 | * **d)** If the work has interactive user interfaces, each must display Appropriate Legal
208 | Notices; however, if the Program has interactive interfaces that do not display
209 | Appropriate Legal Notices, your work need not make them do so.
210 |
211 | A compilation of a covered work with other separate and independent works, which are
212 | not by their nature extensions of the covered work, and which are not combined with
213 | it such as to form a larger program, in or on a volume of a storage or distribution
214 | medium, is called an “aggregate” if the compilation and its resulting
215 | copyright are not used to limit the access or legal rights of the compilation's users
216 | beyond what the individual works permit. Inclusion of a covered work in an aggregate
217 | does not cause this License to apply to the other parts of the aggregate.
218 |
219 | ### 6. Conveying Non-Source Forms
220 |
221 | You may convey a covered work in object code form under the terms of sections 4 and
222 | 5, provided that you also convey the machine-readable Corresponding Source under the
223 | terms of this License, in one of these ways:
224 |
225 | * **a)** Convey the object code in, or embodied in, a physical product (including a
226 | physical distribution medium), accompanied by the Corresponding Source fixed on a
227 | durable physical medium customarily used for software interchange.
228 | * **b)** Convey the object code in, or embodied in, a physical product (including a
229 | physical distribution medium), accompanied by a written offer, valid for at least
230 | three years and valid for as long as you offer spare parts or customer support for
231 | that product model, to give anyone who possesses the object code either **(1)** a copy of
232 | the Corresponding Source for all the software in the product that is covered by this
233 | License, on a durable physical medium customarily used for software interchange, for
234 | a price no more than your reasonable cost of physically performing this conveying of
235 | source, or **(2)** access to copy the Corresponding Source from a network server at no
236 | charge.
237 | * **c)** Convey individual copies of the object code with a copy of the written offer to
238 | provide the Corresponding Source. This alternative is allowed only occasionally and
239 | noncommercially, and only if you received the object code with such an offer, in
240 | accord with subsection 6b.
241 | * **d)** Convey the object code by offering access from a designated place (gratis or for
242 | a charge), and offer equivalent access to the Corresponding Source in the same way
243 | through the same place at no further charge. You need not require recipients to copy
244 | the Corresponding Source along with the object code. If the place to copy the object
245 | code is a network server, the Corresponding Source may be on a different server
246 | (operated by you or a third party) that supports equivalent copying facilities,
247 | provided you maintain clear directions next to the object code saying where to find
248 | the Corresponding Source. Regardless of what server hosts the Corresponding Source,
249 | you remain obligated to ensure that it is available for as long as needed to satisfy
250 | these requirements.
251 | * **e)** Convey the object code using peer-to-peer transmission, provided you inform
252 | other peers where the object code and Corresponding Source of the work are being
253 | offered to the general public at no charge under subsection 6d.
254 |
255 | A separable portion of the object code, whose source code is excluded from the
256 | Corresponding Source as a System Library, need not be included in conveying the
257 | object code work.
258 |
259 | A “User Product” is either **(1)** a “consumer product”, which
260 | means any tangible personal property which is normally used for personal, family, or
261 | household purposes, or **(2)** anything designed or sold for incorporation into a
262 | dwelling. In determining whether a product is a consumer product, doubtful cases
263 | shall be resolved in favor of coverage. For a particular product received by a
264 | particular user, “normally used” refers to a typical or common use of
265 | that class of product, regardless of the status of the particular user or of the way
266 | in which the particular user actually uses, or expects or is expected to use, the
267 | product. A product is a consumer product regardless of whether the product has
268 | substantial commercial, industrial or non-consumer uses, unless such uses represent
269 | the only significant mode of use of the product.
270 |
271 | “Installation Information” for a User Product means any methods,
272 | procedures, authorization keys, or other information required to install and execute
273 | modified versions of a covered work in that User Product from a modified version of
274 | its Corresponding Source. The information must suffice to ensure that the continued
275 | functioning of the modified object code is in no case prevented or interfered with
276 | solely because modification has been made.
277 |
278 | If you convey an object code work under this section in, or with, or specifically for
279 | use in, a User Product, and the conveying occurs as part of a transaction in which
280 | the right of possession and use of the User Product is transferred to the recipient
281 | in perpetuity or for a fixed term (regardless of how the transaction is
282 | characterized), the Corresponding Source conveyed under this section must be
283 | accompanied by the Installation Information. But this requirement does not apply if
284 | neither you nor any third party retains the ability to install modified object code
285 | on the User Product (for example, the work has been installed in ROM).
286 |
287 | The requirement to provide Installation Information does not include a requirement to
288 | continue to provide support service, warranty, or updates for a work that has been
289 | modified or installed by the recipient, or for the User Product in which it has been
290 | modified or installed. Access to a network may be denied when the modification itself
291 | materially and adversely affects the operation of the network or violates the rules
292 | and protocols for communication across the network.
293 |
294 | Corresponding Source conveyed, and Installation Information provided, in accord with
295 | this section must be in a format that is publicly documented (and with an
296 | implementation available to the public in source code form), and must require no
297 | special password or key for unpacking, reading or copying.
298 |
299 | ### 7. Additional Terms
300 |
301 | “Additional permissions” are terms that supplement the terms of this
302 | License by making exceptions from one or more of its conditions. Additional
303 | permissions that are applicable to the entire Program shall be treated as though they
304 | were included in this License, to the extent that they are valid under applicable
305 | law. If additional permissions apply only to part of the Program, that part may be
306 | used separately under those permissions, but the entire Program remains governed by
307 | this License without regard to the additional permissions.
308 |
309 | When you convey a copy of a covered work, you may at your option remove any
310 | additional permissions from that copy, or from any part of it. (Additional
311 | permissions may be written to require their own removal in certain cases when you
312 | modify the work.) You may place additional permissions on material, added by you to a
313 | covered work, for which you have or can give appropriate copyright permission.
314 |
315 | Notwithstanding any other provision of this License, for material you add to a
316 | covered work, you may (if authorized by the copyright holders of that material)
317 | supplement the terms of this License with terms:
318 |
319 | * **a)** Disclaiming warranty or limiting liability differently from the terms of
320 | sections 15 and 16 of this License; or
321 | * **b)** Requiring preservation of specified reasonable legal notices or author
322 | attributions in that material or in the Appropriate Legal Notices displayed by works
323 | containing it; or
324 | * **c)** Prohibiting misrepresentation of the origin of that material, or requiring that
325 | modified versions of such material be marked in reasonable ways as different from the
326 | original version; or
327 | * **d)** Limiting the use for publicity purposes of names of licensors or authors of the
328 | material; or
329 | * **e)** Declining to grant rights under trademark law for use of some trade names,
330 | trademarks, or service marks; or
331 | * **f)** Requiring indemnification of licensors and authors of that material by anyone
332 | who conveys the material (or modified versions of it) with contractual assumptions of
333 | liability to the recipient, for any liability that these contractual assumptions
334 | directly impose on those licensors and authors.
335 |
336 | All other non-permissive additional terms are considered “further
337 | restrictions” within the meaning of section 10. If the Program as you received
338 | it, or any part of it, contains a notice stating that it is governed by this License
339 | along with a term that is a further restriction, you may remove that term. If a
340 | license document contains a further restriction but permits relicensing or conveying
341 | under this License, you may add to a covered work material governed by the terms of
342 | that license document, provided that the further restriction does not survive such
343 | relicensing or conveying.
344 |
345 | If you add terms to a covered work in accord with this section, you must place, in
346 | the relevant source files, a statement of the additional terms that apply to those
347 | files, or a notice indicating where to find the applicable terms.
348 |
349 | Additional terms, permissive or non-permissive, may be stated in the form of a
350 | separately written license, or stated as exceptions; the above requirements apply
351 | either way.
352 |
353 | ### 8. Termination
354 |
355 | You may not propagate or modify a covered work except as expressly provided under
356 | this License. Any attempt otherwise to propagate or modify it is void, and will
357 | automatically terminate your rights under this License (including any patent licenses
358 | granted under the third paragraph of section 11).
359 |
360 | However, if you cease all violation of this License, then your license from a
361 | particular copyright holder is reinstated **(a)** provisionally, unless and until the
362 | copyright holder explicitly and finally terminates your license, and **(b)** permanently,
363 | if the copyright holder fails to notify you of the violation by some reasonable means
364 | prior to 60 days after the cessation.
365 |
366 | Moreover, your license from a particular copyright holder is reinstated permanently
367 | if the copyright holder notifies you of the violation by some reasonable means, this
368 | is the first time you have received notice of violation of this License (for any
369 | work) from that copyright holder, and you cure the violation prior to 30 days after
370 | your receipt of the notice.
371 |
372 | Termination of your rights under this section does not terminate the licenses of
373 | parties who have received copies or rights from you under this License. If your
374 | rights have been terminated and not permanently reinstated, you do not qualify to
375 | receive new licenses for the same material under section 10.
376 |
377 | ### 9. Acceptance Not Required for Having Copies
378 |
379 | You are not required to accept this License in order to receive or run a copy of the
380 | Program. Ancillary propagation of a covered work occurring solely as a consequence of
381 | using peer-to-peer transmission to receive a copy likewise does not require
382 | acceptance. However, nothing other than this License grants you permission to
383 | propagate or modify any covered work. These actions infringe copyright if you do not
384 | accept this License. Therefore, by modifying or propagating a covered work, you
385 | indicate your acceptance of this License to do so.
386 |
387 | ### 10. Automatic Licensing of Downstream Recipients
388 |
389 | Each time you convey a covered work, the recipient automatically receives a license
390 | from the original licensors, to run, modify and propagate that work, subject to this
391 | License. You are not responsible for enforcing compliance by third parties with this
392 | License.
393 |
394 | An “entity transaction” is a transaction transferring control of an
395 | organization, or substantially all assets of one, or subdividing an organization, or
396 | merging organizations. If propagation of a covered work results from an entity
397 | transaction, each party to that transaction who receives a copy of the work also
398 | receives whatever licenses to the work the party's predecessor in interest had or
399 | could give under the previous paragraph, plus a right to possession of the
400 | Corresponding Source of the work from the predecessor in interest, if the predecessor
401 | has it or can get it with reasonable efforts.
402 |
403 | You may not impose any further restrictions on the exercise of the rights granted or
404 | affirmed under this License. For example, you may not impose a license fee, royalty,
405 | or other charge for exercise of rights granted under this License, and you may not
406 | initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging
407 | that any patent claim is infringed by making, using, selling, offering for sale, or
408 | importing the Program or any portion of it.
409 |
410 | ### 11. Patents
411 |
412 | A “contributor” is a copyright holder who authorizes use under this
413 | License of the Program or a work on which the Program is based. The work thus
414 | licensed is called the contributor's “contributor version”.
415 |
416 | A contributor's “essential patent claims” are all patent claims owned or
417 | controlled by the contributor, whether already acquired or hereafter acquired, that
418 | would be infringed by some manner, permitted by this License, of making, using, or
419 | selling its contributor version, but do not include claims that would be infringed
420 | only as a consequence of further modification of the contributor version. For
421 | purposes of this definition, “control” includes the right to grant patent
422 | sublicenses in a manner consistent with the requirements of this License.
423 |
424 | Each contributor grants you a non-exclusive, worldwide, royalty-free patent license
425 | under the contributor's essential patent claims, to make, use, sell, offer for sale,
426 | import and otherwise run, modify and propagate the contents of its contributor
427 | version.
428 |
429 | In the following three paragraphs, a “patent license” is any express
430 | agreement or commitment, however denominated, not to enforce a patent (such as an
431 | express permission to practice a patent or covenant not to sue for patent
432 | infringement). To “grant” such a patent license to a party means to make
433 | such an agreement or commitment not to enforce a patent against the party.
434 |
435 | If you convey a covered work, knowingly relying on a patent license, and the
436 | Corresponding Source of the work is not available for anyone to copy, free of charge
437 | and under the terms of this License, through a publicly available network server or
438 | other readily accessible means, then you must either **(1)** cause the Corresponding
439 | Source to be so available, or **(2)** arrange to deprive yourself of the benefit of the
440 | patent license for this particular work, or **(3)** arrange, in a manner consistent with
441 | the requirements of this License, to extend the patent license to downstream
442 | recipients. “Knowingly relying” means you have actual knowledge that, but
443 | for the patent license, your conveying the covered work in a country, or your
444 | recipient's use of the covered work in a country, would infringe one or more
445 | identifiable patents in that country that you have reason to believe are valid.
446 |
447 | If, pursuant to or in connection with a single transaction or arrangement, you
448 | convey, or propagate by procuring conveyance of, a covered work, and grant a patent
449 | license to some of the parties receiving the covered work authorizing them to use,
450 | propagate, modify or convey a specific copy of the covered work, then the patent
451 | license you grant is automatically extended to all recipients of the covered work and
452 | works based on it.
453 |
454 | A patent license is “discriminatory” if it does not include within the
455 | scope of its coverage, prohibits the exercise of, or is conditioned on the
456 | non-exercise of one or more of the rights that are specifically granted under this
457 | License. You may not convey a covered work if you are a party to an arrangement with
458 | a third party that is in the business of distributing software, under which you make
459 | payment to the third party based on the extent of your activity of conveying the
460 | work, and under which the third party grants, to any of the parties who would receive
461 | the covered work from you, a discriminatory patent license **(a)** in connection with
462 | copies of the covered work conveyed by you (or copies made from those copies), or **(b)**
463 | primarily for and in connection with specific products or compilations that contain
464 | the covered work, unless you entered into that arrangement, or that patent license
465 | was granted, prior to 28 March 2007.
466 |
467 | Nothing in this License shall be construed as excluding or limiting any implied
468 | license or other defenses to infringement that may otherwise be available to you
469 | under applicable patent law.
470 |
471 | ### 12. No Surrender of Others' Freedom
472 |
473 | If conditions are imposed on you (whether by court order, agreement or otherwise)
474 | that contradict the conditions of this License, they do not excuse you from the
475 | conditions of this License. If you cannot convey a covered work so as to satisfy
476 | simultaneously your obligations under this License and any other pertinent
477 | obligations, then as a consequence you may not convey it at all. For example, if you
478 | agree to terms that obligate you to collect a royalty for further conveying from
479 | those to whom you convey the Program, the only way you could satisfy both those terms
480 | and this License would be to refrain entirely from conveying the Program.
481 |
482 | ### 13. Use with the GNU Affero General Public License
483 |
484 | Notwithstanding any other provision of this License, you have permission to link or
485 | combine any covered work with a work licensed under version 3 of the GNU Affero
486 | General Public License into a single combined work, and to convey the resulting work.
487 | The terms of this License will continue to apply to the part which is the covered
488 | work, but the special requirements of the GNU Affero General Public License, section
489 | 13, concerning interaction through a network will apply to the combination as such.
490 |
491 | ### 14. Revised Versions of this License
492 |
493 | The Free Software Foundation may publish revised and/or new versions of the GNU
494 | General Public License from time to time. Such new versions will be similar in spirit
495 | to the present version, but may differ in detail to address new problems or concerns.
496 |
497 | Each version is given a distinguishing version number. If the Program specifies that
498 | a certain numbered version of the GNU General Public License “or any later
499 | version” applies to it, you have the option of following the terms and
500 | conditions either of that numbered version or of any later version published by the
501 | Free Software Foundation. If the Program does not specify a version number of the GNU
502 | General Public License, you may choose any version ever published by the Free
503 | Software Foundation.
504 |
505 | If the Program specifies that a proxy can decide which future versions of the GNU
506 | General Public License can be used, that proxy's public statement of acceptance of a
507 | version permanently authorizes you to choose that version for the Program.
508 |
509 | Later license versions may give you additional or different permissions. However, no
510 | additional obligations are imposed on any author or copyright holder as a result of
511 | your choosing to follow a later version.
512 |
513 | ### 15. Disclaimer of Warranty
514 |
515 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
516 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
517 | PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER
518 | EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
519 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE
520 | QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE
521 | DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
522 |
523 | ### 16. Limitation of Liability
524 |
525 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY
526 | COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS
527 | PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL,
528 | INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
529 | PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE
530 | OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE
531 | WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
532 | POSSIBILITY OF SUCH DAMAGES.
533 |
534 | ### 17. Interpretation of Sections 15 and 16
535 |
536 | If the disclaimer of warranty and limitation of liability provided above cannot be
537 | given local legal effect according to their terms, reviewing courts shall apply local
538 | law that most closely approximates an absolute waiver of all civil liability in
539 | connection with the Program, unless a warranty or assumption of liability accompanies
540 | a copy of the Program in return for a fee.
541 |
542 | _END OF TERMS AND CONDITIONS_
543 |
--------------------------------------------------------------------------------