Files
Calibre-Web-Automated/cps/clean_html.py
T

38 lines
1.3 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
# Calibre-Web Automated fork of Calibre-Web
# Copyright (C) 2018-2025 Calibre-Web contributors
# Copyright (C) 2024-2025 Calibre-Web Automated contributors
# SPDX-License-Identifier: GPL-3.0-or-later
# See CONTRIBUTORS for full list of authors.
from . import logger
from lxml.etree import ParserError
log = logger.create()
try:
# at least bleach 6.0 is needed -> incomplatible change from list arguments to set arguments
from bleach import clean as clean_html
from bleach.sanitizer import ALLOWED_TAGS
bleach = True
except ImportError:
from nh3 import clean as clean_html
bleach = False
def clean_string(unsafe_text, book_id=0):
try:
if bleach:
allowed_tags = list(ALLOWED_TAGS)
allowed_tags.extend(["p", "span", "div", "pre", "br", "h1", "h2", "h3", "h4", "h5", "h6"])
safe_text = clean_html(unsafe_text, tags=set(allowed_tags))
else:
safe_text = clean_html(unsafe_text)
except ParserError as e:
log.error("Comments of book {} are corrupted: {}".format(book_id, e))
safe_text = ""
except TypeError as e:
log.error("Comments can't be parsed, maybe 'lxml' is too new, try installing 'bleach': {}".format(e))
safe_text = ""
return safe_text