Implemented performant & more reliable SQL query + python hybrid system
This commit is contained in:
+92
-15
@@ -6,7 +6,7 @@
|
||||
|
||||
from flask import Blueprint, jsonify, request, abort
|
||||
from flask_babel import gettext as _
|
||||
from sqlalchemy import func, and_
|
||||
from sqlalchemy import func, and_, case
|
||||
from sqlalchemy.orm import joinedload
|
||||
from datetime import datetime
|
||||
from functools import wraps
|
||||
@@ -213,7 +213,7 @@ def show_duplicates():
|
||||
log.info("[cwa-duplicates] Loading duplicates page for user: %s", current_user.name)
|
||||
|
||||
try:
|
||||
# Use SQL to efficiently find duplicates with proper user filtering
|
||||
# Use SQL/Python detection to find duplicates with proper user filtering
|
||||
duplicate_groups = find_duplicate_books()
|
||||
|
||||
print(f"[cwa-duplicates] Found {len(duplicate_groups)} duplicate groups total", flush=True)
|
||||
@@ -274,7 +274,7 @@ def find_duplicate_books(include_dismissed=False, user_id=None):
|
||||
'duplicate_detection_publisher': 0,
|
||||
'duplicate_detection_format': 0,
|
||||
'duplicate_detection_use_sql': 1,
|
||||
'duplicate_scan_method': 'auto'
|
||||
'duplicate_scan_method': 'hybrid'
|
||||
}
|
||||
|
||||
# Extract duplicate detection criteria
|
||||
@@ -298,15 +298,17 @@ def find_duplicate_books(include_dismissed=False, user_id=None):
|
||||
|
||||
# Determine which method to use
|
||||
method_to_use = 'python' # Default fallback
|
||||
|
||||
|
||||
if scan_method == 'python':
|
||||
method_to_use = 'python'
|
||||
elif scan_method == 'sql':
|
||||
# SQL-only is available but still experimental
|
||||
method_to_use = 'sql' if not use_format else 'hybrid'
|
||||
elif scan_method == 'hybrid':
|
||||
method_to_use = 'hybrid'
|
||||
else: # 'auto'
|
||||
if use_sql and not use_format:
|
||||
method_to_use = 'sql'
|
||||
elif use_sql and use_format:
|
||||
if use_sql:
|
||||
# Prefer hybrid prefilter for safety unless SQL-only is explicitly chosen
|
||||
method_to_use = 'hybrid'
|
||||
else:
|
||||
method_to_use = 'python'
|
||||
@@ -321,13 +323,22 @@ def find_duplicate_books(include_dismissed=False, user_id=None):
|
||||
include_dismissed, user_id
|
||||
)
|
||||
elif method_to_use == 'hybrid':
|
||||
# Use SQL for metadata grouping, then Python for format filtering
|
||||
duplicate_groups = find_duplicate_books_sql(
|
||||
use_title, use_author, use_language, use_series, use_publisher,
|
||||
include_dismissed, user_id
|
||||
)
|
||||
# Additional format filtering would go here if needed
|
||||
print("[cwa-duplicates] Note: Format-based detection requires Python method, using hybrid approach", flush=True)
|
||||
# Use SQL as a prefilter to get candidate book IDs, then Python for robust grouping
|
||||
candidate_ids = find_duplicate_candidate_ids_sql(use_title, use_author)
|
||||
if candidate_ids is None:
|
||||
print("[cwa-duplicates] Hybrid prefilter unavailable, falling back to full Python scan", flush=True)
|
||||
duplicate_groups = find_duplicate_books_python(
|
||||
use_title, use_author, use_language, use_series, use_publisher, use_format,
|
||||
include_dismissed, user_id
|
||||
)
|
||||
elif not candidate_ids:
|
||||
duplicate_groups = []
|
||||
else:
|
||||
duplicate_groups = find_duplicate_books_python(
|
||||
use_title, use_author, use_language, use_series, use_publisher, use_format,
|
||||
include_dismissed, user_id, candidate_ids=candidate_ids
|
||||
)
|
||||
print("[cwa-duplicates] Hybrid prefilter applied (SQL candidates + Python validation)", flush=True)
|
||||
else:
|
||||
duplicate_groups = find_duplicate_books_python(
|
||||
use_title, use_author, use_language, use_series, use_publisher, use_format,
|
||||
@@ -361,6 +372,66 @@ def find_duplicate_books(include_dismissed=False, user_id=None):
|
||||
return duplicate_groups
|
||||
|
||||
|
||||
def find_duplicate_candidate_ids_sql(use_title, use_author):
|
||||
"""SQL-based candidate prefilter for hybrid mode.
|
||||
|
||||
Returns a set of book IDs that are likely part of duplicate groups.
|
||||
Uses only title/author prefiltering to remain a safe superset.
|
||||
|
||||
Args:
|
||||
use_title: Whether title criteria is enabled
|
||||
use_author: Whether author criteria is enabled
|
||||
|
||||
Returns:
|
||||
set of int book IDs, empty set if none, or None if prefilter should be skipped
|
||||
"""
|
||||
# If neither title nor author is enabled, prefilter is too risky -> skip
|
||||
if not use_title and not use_author:
|
||||
return None
|
||||
|
||||
print("[cwa-duplicates] Using SQL hybrid prefilter (candidate IDs)", flush=True)
|
||||
|
||||
group_by_fields = []
|
||||
|
||||
if use_title:
|
||||
norm_title = func.lower(func.trim(func.coalesce(db.Books.title, 'untitled')))
|
||||
group_by_fields.append(norm_title)
|
||||
|
||||
if use_author:
|
||||
norm_author_sort = func.lower(func.trim(func.coalesce(db.Books.author_sort, 'unknown')))
|
||||
primary_author = case(
|
||||
(func.instr(norm_author_sort, '&') > 0,
|
||||
func.substr(norm_author_sort, 1, func.instr(norm_author_sort, '&') - 1)),
|
||||
else_=norm_author_sort
|
||||
)
|
||||
group_by_fields.append(primary_author)
|
||||
|
||||
query = (calibre_db.session.query(
|
||||
func.count(func.distinct(db.Books.id)).label('book_count'),
|
||||
func.group_concat(func.distinct(db.Books.id)).label('book_ids_str')
|
||||
)
|
||||
.select_from(db.Books)
|
||||
.filter(calibre_db.common_filters())
|
||||
.group_by(*group_by_fields)
|
||||
.having(func.count(func.distinct(db.Books.id)) > 1))
|
||||
|
||||
try:
|
||||
results = query.all()
|
||||
except Exception as e:
|
||||
log.error("[cwa-duplicates] Hybrid prefilter SQL failed: %s", str(e))
|
||||
print(f"[cwa-duplicates] Hybrid prefilter SQL failed: {str(e)}", flush=True)
|
||||
return None
|
||||
|
||||
candidate_ids = set()
|
||||
for row in results:
|
||||
if not row.book_ids_str:
|
||||
continue
|
||||
candidate_ids.update(int(bid) for bid in row.book_ids_str.split(',') if bid)
|
||||
|
||||
print(f"[cwa-duplicates] Hybrid prefilter returned {len(candidate_ids)} candidate books", flush=True)
|
||||
return candidate_ids
|
||||
|
||||
|
||||
def find_duplicate_books_sql(use_title, use_author, use_language, use_series, use_publisher,
|
||||
include_dismissed=False, user_id=None):
|
||||
"""SQL-based duplicate detection using GROUP BY - experimental/WIP
|
||||
@@ -551,7 +622,7 @@ def find_duplicate_books_sql(use_title, use_author, use_language, use_series, us
|
||||
|
||||
|
||||
def find_duplicate_books_python(use_title, use_author, use_language, use_series, use_publisher, use_format,
|
||||
include_dismissed=False, user_id=None):
|
||||
include_dismissed=False, user_id=None, candidate_ids=None):
|
||||
"""Original Python-based duplicate detection - fallback for complex scenarios
|
||||
|
||||
Args:
|
||||
@@ -569,6 +640,12 @@ def find_duplicate_books_python(use_title, use_author, use_language, use_series,
|
||||
books_query = (calibre_db.session.query(db.Books)
|
||||
.filter(calibre_db.common_filters()) # Respect user permissions and library filtering
|
||||
.order_by(db.Books.title, db.Books.timestamp.desc()))
|
||||
|
||||
if candidate_ids is not None:
|
||||
if not candidate_ids:
|
||||
print("[cwa-duplicates] No candidate IDs provided, returning empty duplicate set", flush=True)
|
||||
return []
|
||||
books_query = books_query.filter(db.Books.id.in_(list(candidate_ids)))
|
||||
|
||||
all_books = books_query.all()
|
||||
print(f"[cwa-duplicates] Retrieved {len(all_books)} books with user filtering applied", flush=True)
|
||||
|
||||
@@ -606,6 +606,29 @@
|
||||
<label for="duplicate_detection_enabled">{{_('Enable Duplicate Detection')}}</label>
|
||||
<small class="settings-explanation">{{_('When enabled, CWA will scan for duplicate books after each import')}}</small>
|
||||
</div>
|
||||
|
||||
<!-- Detection Method (Hybrid default) -->
|
||||
{% set scan_method = cwa_settings.get('duplicate_scan_method')|default('hybrid', true) %}
|
||||
<div class="form-group" style="max-width: 520px;">
|
||||
<label for="duplicate_scan_method" class="settings-section-header" style="margin-top: 8px;">{{_('Duplicate Detection Method')}}</label>
|
||||
<select class="cwa-settings-select" name="duplicate_scan_method" id="duplicate_scan_method" style="width: fit-content;">
|
||||
<option value="hybrid" {% if scan_method == 'hybrid' %}selected{% endif %}>
|
||||
{{_('Hybrid (SQL prefilter + Python validation)')}}
|
||||
</option>
|
||||
<option value="python" {% if scan_method == 'python' %}selected{% endif %}>
|
||||
{{_('Python only (slowest, most robust)')}}
|
||||
</option>
|
||||
<option value="sql" {% if scan_method == 'sql' %}selected{% endif %}>
|
||||
{{_('Legacy SQL only (experimental)')}}
|
||||
</option>
|
||||
</select>
|
||||
<p class="cwa-settings-explanation settings-explanation" style="margin-top: 10px;">
|
||||
{{_('Hybrid mode uses a fast SQL prefilter to narrow candidates, then applies the robust Python logic for final results.') }}
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<!-- Keep SQL prefilter enabled by default for hybrid/auto modes -->
|
||||
<input type="hidden" id="duplicate_detection_use_sql" name="duplicate_detection_use_sql" value="1">
|
||||
</div>
|
||||
|
||||
<div class="settings-container">
|
||||
|
||||
@@ -77,8 +77,8 @@ CREATE TABLE IF NOT EXISTS cwa_settings(
|
||||
duplicate_auto_resolve_strategy TEXT DEFAULT 'newest' NOT NULL,
|
||||
duplicate_format_priority TEXT DEFAULT '{"EPUB":100,"KEPUB":95,"AZW3":90,"MOBI":80,"AZW":75,"PDF":60,"TXT":40,"CBZ":35,"CBR":35,"FB2":30,"DJVU":25,"HTML":20,"RTF":15,"DOC":10,"DOCX":10}' NOT NULL,
|
||||
-- Duplicate scanning performance settings
|
||||
duplicate_detection_use_sql SMALLINT DEFAULT 0 NOT NULL, -- Disabled by default, Python method is stable
|
||||
duplicate_scan_method TEXT DEFAULT 'python' NOT NULL, -- Use python by default
|
||||
duplicate_detection_use_sql SMALLINT DEFAULT 1 NOT NULL, -- Enable SQL prefilter for hybrid by default
|
||||
duplicate_scan_method TEXT DEFAULT 'hybrid' NOT NULL, -- Use hybrid prefilter by default
|
||||
duplicate_scan_enabled SMALLINT DEFAULT 0 NOT NULL,
|
||||
duplicate_scan_frequency TEXT DEFAULT 'manual' NOT NULL,
|
||||
duplicate_scan_hour INTEGER DEFAULT 3 NOT NULL,
|
||||
|
||||
Reference in New Issue
Block a user