Implemented a faster, more reliable hybrid duplicate-detection system (SQL prefilter + Python validation)

This commit is contained in:
crocodilestick
2026-01-15 11:51:33 +01:00
parent 1491589bbf
commit ce3c452dca
3 changed files with 117 additions and 17 deletions
+92 -15
View File
@@ -6,7 +6,7 @@
from flask import Blueprint, jsonify, request, abort
from flask_babel import gettext as _
from sqlalchemy import func, and_
from sqlalchemy import func, and_, case
from sqlalchemy.orm import joinedload
from datetime import datetime
from functools import wraps
@@ -213,7 +213,7 @@ def show_duplicates():
log.info("[cwa-duplicates] Loading duplicates page for user: %s", current_user.name)
try:
# Use SQL to efficiently find duplicates with proper user filtering
# Use SQL/Python detection to find duplicates with proper user filtering
duplicate_groups = find_duplicate_books()
print(f"[cwa-duplicates] Found {len(duplicate_groups)} duplicate groups total", flush=True)
@@ -274,7 +274,7 @@ def find_duplicate_books(include_dismissed=False, user_id=None):
'duplicate_detection_publisher': 0,
'duplicate_detection_format': 0,
'duplicate_detection_use_sql': 1,
'duplicate_scan_method': 'auto'
'duplicate_scan_method': 'hybrid'
}
# Extract duplicate detection criteria
@@ -298,15 +298,17 @@ def find_duplicate_books(include_dismissed=False, user_id=None):
# Determine which method to use
method_to_use = 'python' # Default fallback
if scan_method == 'python':
method_to_use = 'python'
elif scan_method == 'sql':
# SQL-only is available but still experimental
method_to_use = 'sql' if not use_format else 'hybrid'
elif scan_method == 'hybrid':
method_to_use = 'hybrid'
else: # 'auto'
if use_sql and not use_format:
method_to_use = 'sql'
elif use_sql and use_format:
if use_sql:
# Prefer hybrid prefilter for safety unless SQL-only is explicitly chosen
method_to_use = 'hybrid'
else:
method_to_use = 'python'
@@ -321,13 +323,22 @@ def find_duplicate_books(include_dismissed=False, user_id=None):
include_dismissed, user_id
)
elif method_to_use == 'hybrid':
# Use SQL for metadata grouping, then Python for format filtering
duplicate_groups = find_duplicate_books_sql(
use_title, use_author, use_language, use_series, use_publisher,
include_dismissed, user_id
)
# Additional format filtering would go here if needed
print("[cwa-duplicates] Note: Format-based detection requires Python method, using hybrid approach", flush=True)
# Use SQL as a prefilter to get candidate book IDs, then Python for robust grouping
candidate_ids = find_duplicate_candidate_ids_sql(use_title, use_author)
if candidate_ids is None:
print("[cwa-duplicates] Hybrid prefilter unavailable, falling back to full Python scan", flush=True)
duplicate_groups = find_duplicate_books_python(
use_title, use_author, use_language, use_series, use_publisher, use_format,
include_dismissed, user_id
)
elif not candidate_ids:
duplicate_groups = []
else:
duplicate_groups = find_duplicate_books_python(
use_title, use_author, use_language, use_series, use_publisher, use_format,
include_dismissed, user_id, candidate_ids=candidate_ids
)
print("[cwa-duplicates] Hybrid prefilter applied (SQL candidates + Python validation)", flush=True)
else:
duplicate_groups = find_duplicate_books_python(
use_title, use_author, use_language, use_series, use_publisher, use_format,
@@ -361,6 +372,66 @@ def find_duplicate_books(include_dismissed=False, user_id=None):
return duplicate_groups
def find_duplicate_candidate_ids_sql(use_title, use_author):
    """SQL-based candidate prefilter for hybrid mode.

    Groups books by a normalized title and/or primary author and collects the
    IDs of every book that shares its group with at least one other book.
    Only title/author are used so the result stays a superset of what the
    stricter Python pass can report; the caller is expected to run the Python
    detection on these IDs for the final grouping.

    Args:
        use_title: Whether title criteria is enabled
        use_author: Whether author criteria is enabled

    Returns:
        set of int book IDs, empty set if none, or None if prefilter should be
        skipped (neither criterion enabled, or the SQL query failed)
    """
    # If neither title nor author is enabled, prefilter is too risky -> skip
    if not use_title and not use_author:
        return None

    print("[cwa-duplicates] Using SQL hybrid prefilter (candidate IDs)", flush=True)

    # NOTE(review): if the backend is SQLite, lower() only folds ASCII letters;
    # confirm the Python normalization cannot merge non-ASCII case variants
    # that this prefilter would keep apart.
    group_by_fields = []
    if use_title:
        norm_title = func.lower(func.trim(func.coalesce(db.Books.title, 'untitled')))
        group_by_fields.append(norm_title)
    if use_author:
        norm_author_sort = func.lower(func.trim(func.coalesce(db.Books.author_sort, 'unknown')))
        ampersand_pos = func.instr(norm_author_sort, '&')
        # Keep only the text before the first '&' (the primary author). The
        # slice must be trimmed again: "smith, john & doe, jane" would otherwise
        # yield "smith, john " (trailing space) and never group with a book
        # whose author_sort is just "smith, john", silently dropping real
        # duplicates from the candidate set. Trimming can only merge groups,
        # so the prefilter remains a superset.
        primary_author = case(
            (ampersand_pos > 0,
             func.trim(func.substr(norm_author_sort, 1, ampersand_pos - 1))),
            else_=norm_author_sort
        )
        group_by_fields.append(primary_author)

    distinct_book_count = func.count(func.distinct(db.Books.id))
    query = (calibre_db.session.query(
                distinct_book_count.label('book_count'),
                func.group_concat(func.distinct(db.Books.id)).label('book_ids_str')
             )
             .select_from(db.Books)
             .filter(calibre_db.common_filters())  # same visibility filter as the Python scan
             .group_by(*group_by_fields)
             .having(distinct_book_count > 1))  # only groups with 2+ books are candidates

    try:
        results = query.all()
    except Exception as e:
        # Best-effort prefilter: on any failure signal "skip" so the caller
        # can fall back to the full Python scan instead of aborting.
        log.error("[cwa-duplicates] Hybrid prefilter SQL failed: %s", str(e))
        print(f"[cwa-duplicates] Hybrid prefilter SQL failed: {str(e)}", flush=True)
        return None

    candidate_ids = set()
    for row in results:
        if not row.book_ids_str:
            continue
        # group_concat yields a comma-separated string of IDs
        candidate_ids.update(int(bid) for bid in row.book_ids_str.split(',') if bid)

    print(f"[cwa-duplicates] Hybrid prefilter returned {len(candidate_ids)} candidate books", flush=True)
    return candidate_ids
def find_duplicate_books_sql(use_title, use_author, use_language, use_series, use_publisher,
include_dismissed=False, user_id=None):
"""SQL-based duplicate detection using GROUP BY - experimental/WIP
@@ -551,7 +622,7 @@ def find_duplicate_books_sql(use_title, use_author, use_language, use_series, us
def find_duplicate_books_python(use_title, use_author, use_language, use_series, use_publisher, use_format,
include_dismissed=False, user_id=None):
include_dismissed=False, user_id=None, candidate_ids=None):
"""Original Python-based duplicate detection - fallback for complex scenarios
Args:
@@ -569,6 +640,12 @@ def find_duplicate_books_python(use_title, use_author, use_language, use_series,
books_query = (calibre_db.session.query(db.Books)
.filter(calibre_db.common_filters()) # Respect user permissions and library filtering
.order_by(db.Books.title, db.Books.timestamp.desc()))
if candidate_ids is not None:
if not candidate_ids:
print("[cwa-duplicates] No candidate IDs provided, returning empty duplicate set", flush=True)
return []
books_query = books_query.filter(db.Books.id.in_(list(candidate_ids)))
all_books = books_query.all()
print(f"[cwa-duplicates] Retrieved {len(all_books)} books with user filtering applied", flush=True)
+23
View File
@@ -606,6 +606,29 @@
<label for="duplicate_detection_enabled">{{_('Enable Duplicate Detection')}}</label>
<small class="settings-explanation">{{_('When enabled, CWA will scan for duplicate books after each import')}}</small>
</div>
<!-- Detection Method (Hybrid default) -->
{% set scan_method = cwa_settings.get('duplicate_scan_method')|default('hybrid', true) %}
<div class="form-group" style="max-width: 520px;">
<label for="duplicate_scan_method" class="settings-section-header" style="margin-top: 8px;">{{_('Duplicate Detection Method')}}</label>
<select class="cwa-settings-select" name="duplicate_scan_method" id="duplicate_scan_method" style="width: fit-content;">
<option value="hybrid" {% if scan_method == 'hybrid' %}selected{% endif %}>
{{_('Hybrid (SQL prefilter + Python validation)')}}
</option>
<option value="python" {% if scan_method == 'python' %}selected{% endif %}>
{{_('Python only (slowest, most robust)')}}
</option>
<option value="sql" {% if scan_method == 'sql' %}selected{% endif %}>
{{_('Legacy SQL only (experimental)')}}
</option>
</select>
<p class="cwa-settings-explanation settings-explanation" style="margin-top: 10px;">
{{_('Hybrid mode uses a fast SQL prefilter to narrow candidates, then applies the robust Python logic for final results.') }}
</p>
</div>
<!-- Keep SQL prefilter enabled by default for hybrid/auto modes -->
<input type="hidden" id="duplicate_detection_use_sql" name="duplicate_detection_use_sql" value="1">
</div>
<div class="settings-container">
+2 -2
View File
@@ -77,8 +77,8 @@ CREATE TABLE IF NOT EXISTS cwa_settings(
duplicate_auto_resolve_strategy TEXT DEFAULT 'newest' NOT NULL,
duplicate_format_priority TEXT DEFAULT '{"EPUB":100,"KEPUB":95,"AZW3":90,"MOBI":80,"AZW":75,"PDF":60,"TXT":40,"CBZ":35,"CBR":35,"FB2":30,"DJVU":25,"HTML":20,"RTF":15,"DOC":10,"DOCX":10}' NOT NULL,
-- Duplicate scanning performance settings
duplicate_detection_use_sql SMALLINT DEFAULT 0 NOT NULL, -- Disabled by default, Python method is stable
duplicate_scan_method TEXT DEFAULT 'python' NOT NULL, -- Use python by default
duplicate_detection_use_sql SMALLINT DEFAULT 1 NOT NULL, -- Enable SQL prefilter for hybrid by default
duplicate_scan_method TEXT DEFAULT 'hybrid' NOT NULL, -- Use hybrid prefilter by default
duplicate_scan_enabled SMALLINT DEFAULT 0 NOT NULL,
duplicate_scan_frequency TEXT DEFAULT 'manual' NOT NULL,
duplicate_scan_hour INTEGER DEFAULT 3 NOT NULL,