Files
readability/Readability-readerable.js
AlexTate a07e62c8ab Preserve MathJax tags (#958)
* Prevent MathJax nodes from being identified as 'unlikely candidates', and prevent <mjx-math> tags from being removed due to attribute aria-hidden="true"

* Revert changes to _isProbablyVisible() and isNodeVisible()

* Update test case to include the MathJax tags which are produced once client-side rendering is complete. The previous test case only used the static HTML received from the server.

Unfortunately, after htmltidy2 processes the page it is determined to be "unreaderable" though the appropriate JSDOM tests run and pass. Alternatively, if htmltidy2 is skipped, JSDOMParser produces a slew of errors.

Perhaps this will do for now...

* Adding support for file:// URLs. This is useful when the test case contains dynamic content as it allows the dev to save a copy of the rendered DOM to disk and use the resulting file as input to generate-testcase. Alternatively one could use JSDOM's {runScripts: "dangerously", resources: "usable"} options, but in my case these fell short and caused MathJax to crash due to missing localStorage implementation in JSDOM. Perhaps my approach will be useful to others...

* Use url fileURLToPath to handle file urls

---------

Co-authored-by: Gijs Kruitbosch <gijskruitbosch@gmail.com>
2025-03-25 22:16:28 +00:00

123 lines
4.2 KiB
JavaScript

/*
* Copyright (c) 2010 Arc90 Inc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This code is heavily based on Arc90's readability.js (1.7.1) script
* available at: http://code.google.com/p/arc90labs-readability
*/
// Keyword patterns tested (case-insensitively) against a node's
// `className + " " + id` by isProbablyReaderable().
// NOTE: Both regular expressions are duplicated in Readability.js. The
// alternations produced by the joins below must stay identical to those
// copies, so please update both files together.
var REGEXPS = {
  // A node matching this pattern is skipped unless it also matches
  // `okMaybeItsACandidate`.
  unlikelyCandidates: new RegExp(
    [
      "-ad-",
      "ai2html",
      "banner",
      "breadcrumbs",
      "combx",
      "comment",
      "community",
      "cover-wrap",
      "disqus",
      "extra",
      "footer",
      "gdpr",
      "header",
      "legends",
      "menu",
      "related",
      "remark",
      "replies",
      "rss",
      "shoutbox",
      "sidebar",
      "skyscraper",
      "social",
      "sponsor",
      "supplemental",
      "ad-break",
      "agegate",
      "pagination",
      "pager",
      "popup",
      "yom-remote",
    ].join("|"),
    "i"
  ),
  // Overrides a match of `unlikelyCandidates` for the same string.
  okMaybeItsACandidate: new RegExp(
    [
      "and",
      "article",
      "body",
      "column",
      "content",
      "main",
      "mathjax",
      "shadow",
    ].join("|"),
    "i"
  ),
};
/**
 * Default visibility check used by isProbablyReaderable().
 *
 * A node is treated as hidden when its inline style has `display: none`,
 * when it carries a `hidden` attribute, or when it has `aria-hidden="true"`
 * and its class name does not contain "fallback-image".
 *
 * @param {Element} node The node to test. Must provide `hasAttribute` and
 *   `getAttribute`; `style` and `className` may be missing or non-string.
 * @return {boolean} `true` if the node is considered visible, otherwise
 *   `false`. The result is always a strict boolean.
 */
function isNodeVisible(node) {
  // node.style has to be null-checked to deal with SVG and MathML nodes.
  if (node.style && node.style.display === "none") {
    return false;
  }
  if (node.hasAttribute("hidden")) {
    return false;
  }
  if (
    !node.hasAttribute("aria-hidden") ||
    node.getAttribute("aria-hidden") !== "true"
  ) {
    return true;
  }
  // The node is aria-hidden. Check for "fallback-image" so that wikimedia
  // math images are displayed. className and className.includes are
  // null-checked for SVG and MathML nodes; Boolean() keeps "" or undefined
  // from leaking out as the return value.
  return Boolean(
    node.className &&
      node.className.includes &&
      node.className.includes("fallback-image")
  );
}
/**
 * Decides whether or not the document is reader-able without parsing the whole thing.
 *
 * Candidate nodes are every `<p>`, `<pre>` and `<article>` element plus every
 * parent of a `<br>` matched by `div > br`. Candidates that are not visible,
 * look like unlikely content by class/id, match `li p`, or hold less than
 * `minContentLength` characters of trimmed text are ignored. Each remaining
 * candidate adds `sqrt(textLength - minContentLength)` to a running score, and
 * the document is reported as readerable as soon as that score exceeds `minScore`.
 *
 * @param {Document} doc The document to inspect. Only `querySelectorAll` is called on it.
 * @param {Object|Function} [options] Configuration object. For backward compatibility this may
 *   instead be the function used to determine if a node is visible. Properties that are
 *   missing or explicitly `undefined` fall back to their defaults.
 * @param {number} [options.minContentLength=140] The minimum node content length used to decide if the document is readerable.
 * @param {number} [options.minScore=20] The minimum cumulated 'score' used to determine if the document is readerable.
 * @param {Function} [options.visibilityChecker=isNodeVisible] The function used to determine if a node is visible.
 * @return {boolean} Whether or not we suspect Readability.parse() will succeed at returning an article object.
 */
function isProbablyReaderable(doc, options = {}) {
  const settings = {
    minScore: 20,
    minContentLength: 140,
    visibilityChecker: isNodeVisible,
  };

  // For backward compatibility reasons 'options' can either be a configuration
  // object or the function used to determine if a node is visible.
  // Object() turns a null 'options' into an empty object.
  const overrides =
    typeof options === "function"
      ? { visibilityChecker: options }
      : Object(options);

  // Copy only the overrides that carry a value. A plain Object.assign() would
  // let an explicit `undefined` replace a default, which turns the score
  // comparisons into NaN checks (always false) or makes the visibility
  // checker uncallable.
  Object.keys(overrides).forEach(function (key) {
    if (overrides[key] !== undefined) {
      settings[key] = overrides[key];
    }
  });

  let nodes = doc.querySelectorAll("p, pre, article");

  // Get <div> nodes which have <br> node(s) and append them into the `nodes` variable.
  // Some articles' DOM structures might look like
  // <div>
  //   Sentences<br>
  //   <br>
  //   Sentences<br>
  // </div>
  const brNodes = doc.querySelectorAll("div > br");
  if (brNodes.length) {
    // The Set keeps a <div> with several <br> children from being scored twice.
    const uniqueNodes = new Set(nodes);
    Array.prototype.forEach.call(brNodes, function (br) {
      uniqueNodes.add(br.parentNode);
    });
    nodes = Array.from(uniqueNodes);
  }

  let score = 0;
  // This is a little cheeky, we use the accumulator 'score' to decide what to
  // return from this callback: some() stops at the first node that pushes the
  // score over the threshold.
  return Array.prototype.some.call(nodes, function (node) {
    if (!settings.visibilityChecker(node)) {
      return false;
    }

    const matchString = node.className + " " + node.id;
    if (
      REGEXPS.unlikelyCandidates.test(matchString) &&
      !REGEXPS.okMaybeItsACandidate.test(matchString)
    ) {
      return false;
    }

    if (node.matches("li p")) {
      return false;
    }

    const textContentLength = node.textContent.trim().length;
    if (textContentLength < settings.minContentLength) {
      return false;
    }

    score += Math.sqrt(textContentLength - settings.minContentLength);
    return score > settings.minScore;
  });
}
// Export isProbablyReaderable through `module.exports` when a `module` object
// is present; where `module` is not an object, nothing is exported here.
// Keep the two directive comments below in this order: the first one applies
// to the line that immediately follows it.
if (typeof module === "object") {
/* eslint-disable-next-line no-redeclare */
/* global module */
module.exports = isProbablyReaderable;
}