aboutsummaryrefslogtreecommitdiff
path: root/plugins/readability
diff options
context:
space:
mode:
Diffstat (limited to 'plugins/readability')
-rw-r--r--plugins/readability/readability.js1149
1 files changed, 730 insertions, 419 deletions
diff --git a/plugins/readability/readability.js b/plugins/readability/readability.js
index 0a5bda7..3bef151 100644
--- a/plugins/readability/readability.js
+++ b/plugins/readability/readability.js
@@ -31,7 +31,9 @@ function Readability(doc, options) {
doc = options;
options = arguments[2];
} else if (!doc || !doc.documentElement) {
- throw new Error("First argument to Readability constructor should be a document object.");
+ throw new Error(
+ "First argument to Readability constructor should be a document object."
+ );
}
options = options || {};
@@ -45,41 +47,49 @@ function Readability(doc, options) {
// Configurable options
this._debug = !!options.debug;
- this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
- this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
+ this._maxElemsToParse =
+ options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
+ this._nbTopCandidates =
+ options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
- this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
+ this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(
+ options.classesToPreserve || []
+ );
this._keepClasses = !!options.keepClasses;
- this._serializer = options.serializer || function(el) {
- return el.innerHTML;
- };
+ this._serializer =
+ options.serializer ||
+ function (el) {
+ return el.innerHTML;
+ };
this._disableJSONLD = !!options.disableJSONLD;
// Start with all flags set
- this._flags = this.FLAG_STRIP_UNLIKELYS |
- this.FLAG_WEIGHT_CLASSES |
- this.FLAG_CLEAN_CONDITIONALLY;
-
+ this._flags =
+ this.FLAG_STRIP_UNLIKELYS |
+ this.FLAG_WEIGHT_CLASSES |
+ this.FLAG_CLEAN_CONDITIONALLY;
// Control whether log messages are sent to the console
if (this._debug) {
- let logNode = function(node) {
+ let logNode = function (node) {
if (node.nodeType == node.TEXT_NODE) {
return `${node.nodeName} ("${node.textContent}")`;
}
- let attrPairs = Array.from(node.attributes || [], function(attr) {
+ let attrPairs = Array.from(node.attributes || [], function (attr) {
return `${attr.name}="${attr.value}"`;
}).join(" ");
return `<${node.localName} ${attrPairs}>`;
};
this.log = function () {
if (typeof dump !== "undefined") {
- var msg = Array.prototype.map.call(arguments, function(x) {
- return (x && x.nodeName) ? logNode(x) : x;
- }).join(" ");
+ var msg = Array.prototype.map
+ .call(arguments, function (x) {
+ return x && x.nodeName ? logNode(x) : x;
+ })
+ .join(" ");
dump("Reader: (Readability) " + msg + "\n");
} else if (typeof console !== "undefined") {
- let args = Array.from(arguments, arg => {
+ let args = Array.from(arguments, (arg) => {
if (arg && arg.nodeType == this.ELEMENT_NODE) {
return logNode(arg);
}
@@ -111,7 +121,9 @@ Readability.prototype = {
DEFAULT_N_TOP_CANDIDATES: 5,
// Element tags to score by default.
- DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
+ DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre"
+ .toUpperCase()
+ .split(","),
// The default number of chars an article must have in order to return a result
DEFAULT_CHAR_THRESHOLD: 500,
@@ -121,16 +133,21 @@ Readability.prototype = {
REGEXPS: {
// NOTE: These two regular expressions are duplicated in
// Readability-readerable.js. Please keep both copies in sync.
- unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
+ unlikelyCandidates:
+ /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
- positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
- negative: /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
- extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
+ positive:
+ /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
+ negative:
+ /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
+ extraneous:
+ /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
byline: /byline|author|dateline|writtenby|p-author/i,
replaceFonts: /<(\/?)font[^>]*>/gi,
normalize: /\s{2,}/g,
- videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
+ videos:
+ /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i,
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
prevLink: /(prev|earl|old|new|<|«)/i,
@@ -141,40 +158,106 @@ Readability.prototype = {
srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,
b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i,
// See: https://schema.org/Article
- jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/
+ jsonLdArticleTypes:
+ /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/,
},
- UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ],
+ UNLIKELY_ROLES: [
+ "menu",
+ "menubar",
+ "complementary",
+ "navigation",
+ "alert",
+ "alertdialog",
+ "dialog",
+ ],
- DIV_TO_P_ELEMS: new Set([ "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL" ]),
+ DIV_TO_P_ELEMS: new Set([
+ "BLOCKQUOTE",
+ "DL",
+ "DIV",
+ "IMG",
+ "OL",
+ "P",
+ "PRE",
+ "TABLE",
+ "UL",
+ ]),
ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
- PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ],
+ PRESENTATIONAL_ATTRIBUTES: [
+ "align",
+ "background",
+ "bgcolor",
+ "border",
+ "cellpadding",
+ "cellspacing",
+ "frame",
+ "hspace",
+ "rules",
+ "style",
+ "valign",
+ "vspace",
+ ],
- DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
+ DEPRECATED_SIZE_ATTRIBUTE_ELEMS: ["TABLE", "TH", "TD", "HR", "PRE"],
// The commented out elements qualify as phrasing content but tend to be
// removed by readability when put into paragraphs, so we ignore them here.
PHRASING_ELEMS: [
// "CANVAS", "IFRAME", "SVG", "VIDEO",
- "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA",
- "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL",
- "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q",
- "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB",
- "SUP", "TEXTAREA", "TIME", "VAR", "WBR"
+ "ABBR",
+ "AUDIO",
+ "B",
+ "BDO",
+ "BR",
+ "BUTTON",
+ "CITE",
+ "CODE",
+ "DATA",
+ "DATALIST",
+ "DFN",
+ "EM",
+ "EMBED",
+ "I",
+ "IMG",
+ "INPUT",
+ "KBD",
+ "LABEL",
+ "MARK",
+ "MATH",
+ "METER",
+ "NOSCRIPT",
+ "OBJECT",
+ "OUTPUT",
+ "PROGRESS",
+ "Q",
+ "RUBY",
+ "SAMP",
+ "SCRIPT",
+ "SELECT",
+ "SMALL",
+ "SPAN",
+ "STRONG",
+ "SUB",
+ "SUP",
+ "TEXTAREA",
+ "TIME",
+ "VAR",
+ "WBR",
],
// These are the classes that readability sets itself.
- CLASSES_TO_PRESERVE: [ "page" ],
+ CLASSES_TO_PRESERVE: ["page"],
// These are the list of HTML entities that need to be escaped.
HTML_ESCAPE_MAP: {
- "lt": "<",
- "gt": ">",
- "amp": "&",
- "quot": '"',
- "apos": "'",
+ lt: "<",
+ gt: ">",
+ amp: "&",
+ quot: '"',
+ apos: "'",
},
/**
@@ -182,8 +265,8 @@ Readability.prototype = {
*
* @param Element
* @return void
- **/
- _postProcessContent: function(articleContent) {
+ **/
+ _postProcessContent: function (articleContent) {
// Readability cannot open relative uris so we convert them to absolute uris.
this._fixRelativeUris(articleContent);
@@ -205,7 +288,7 @@ Readability.prototype = {
* @param Function filterFn the function to use as a filter
* @return void
*/
- _removeNodes: function(nodeList, filterFn) {
+ _removeNodes: function (nodeList, filterFn) {
// Avoid ever operating on live node lists.
if (this._docJSDOMParser && nodeList._isLiveNodeList) {
throw new Error("Do not pass live node lists to _removeNodes");
@@ -228,7 +311,7 @@ Readability.prototype = {
* @param String newTagName the new tag name to use
* @return void
*/
- _replaceNodeTags: function(nodeList, newTagName) {
+ _replaceNodeTags: function (nodeList, newTagName) {
// Avoid ever operating on live node lists.
if (this._docJSDOMParser && nodeList._isLiveNodeList) {
throw new Error("Do not pass live node lists to _replaceNodeTags");
@@ -249,7 +332,7 @@ Readability.prototype = {
* @param Function fn The iterate function.
* @return void
*/
- _forEachNode: function(nodeList, fn) {
+ _forEachNode: function (nodeList, fn) {
Array.prototype.forEach.call(nodeList, fn, this);
},
@@ -264,7 +347,7 @@ Readability.prototype = {
* @param Function fn The test function.
* @return void
*/
- _findNode: function(nodeList, fn) {
+ _findNode: function (nodeList, fn) {
return Array.prototype.find.call(nodeList, fn, this);
},
@@ -279,7 +362,7 @@ Readability.prototype = {
* @param Function fn The iterate function.
* @return Boolean
*/
- _someNode: function(nodeList, fn) {
+ _someNode: function (nodeList, fn) {
return Array.prototype.some.call(nodeList, fn, this);
},
@@ -294,7 +377,7 @@ Readability.prototype = {
* @param Function fn The iterate function.
* @return Boolean
*/
- _everyNode: function(nodeList, fn) {
+ _everyNode: function (nodeList, fn) {
return Array.prototype.every.call(nodeList, fn, this);
},
@@ -304,23 +387,26 @@ Readability.prototype = {
* @return ...NodeList
* @return Array
*/
- _concatNodeLists: function() {
+ _concatNodeLists: function () {
var slice = Array.prototype.slice;
var args = slice.call(arguments);
- var nodeLists = args.map(function(list) {
+ var nodeLists = args.map(function (list) {
return slice.call(list);
});
return Array.prototype.concat.apply([], nodeLists);
},
- _getAllNodesWithTag: function(node, tagNames) {
+ _getAllNodesWithTag: function (node, tagNames) {
if (node.querySelectorAll) {
return node.querySelectorAll(tagNames.join(","));
}
- return [].concat.apply([], tagNames.map(function(tag) {
- var collection = node.getElementsByTagName(tag);
- return Array.isArray(collection) ? collection : Array.from(collection);
- }));
+ return [].concat.apply(
+ [],
+ tagNames.map(function (tag) {
+ var collection = node.getElementsByTagName(tag);
+ return Array.isArray(collection) ? collection : Array.from(collection);
+ })
+ );
},
/**
@@ -331,11 +417,11 @@ Readability.prototype = {
* @param Element
* @return void
*/
- _cleanClasses: function(node) {
+ _cleanClasses: function (node) {
var classesToPreserve = this._classesToPreserve;
var className = (node.getAttribute("class") || "")
.split(/\s+/)
- .filter(function(cls) {
+ .filter(function (cls) {
return classesToPreserve.indexOf(cls) != -1;
})
.join(" ");
@@ -358,7 +444,7 @@ Readability.prototype = {
* @param Element
* @return void
*/
- _fixRelativeUris: function(articleContent) {
+ _fixRelativeUris: function (articleContent) {
var baseURI = this._doc.baseURI;
var documentURI = this._doc.documentURI;
function toAbsoluteURI(uri) {
@@ -377,14 +463,17 @@ Readability.prototype = {
}
var links = this._getAllNodesWithTag(articleContent, ["a"]);
- this._forEachNode(links, function(link) {
+ this._forEachNode(links, function (link) {
var href = link.getAttribute("href");
if (href) {
// Remove links with javascript: URIs, since
// they won't work after scripts have been removed from the page.
if (href.indexOf("javascript:") === 0) {
// if the link only contains simple text content, it can be converted to a text node
- if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) {
+ if (
+ link.childNodes.length === 1 &&
+ link.childNodes[0].nodeType === this.TEXT_NODE
+ ) {
var text = this._doc.createTextNode(link.textContent);
link.parentNode.replaceChild(text, link);
} else {
@@ -402,10 +491,15 @@ Readability.prototype = {
});
var medias = this._getAllNodesWithTag(articleContent, [
- "img", "picture", "figure", "video", "audio", "source"
+ "img",
+ "picture",
+ "figure",
+ "video",
+ "audio",
+ "source",
]);
- this._forEachNode(medias, function(media) {
+ this._forEachNode(medias, function (media) {
var src = media.getAttribute("src");
var poster = media.getAttribute("poster");
var srcset = media.getAttribute("srcset");
@@ -419,27 +513,40 @@ Readability.prototype = {
}
if (srcset) {
- var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) {
- return toAbsoluteURI(p1) + (p2 || "") + p3;
- });
+ var newSrcset = srcset.replace(
+ this.REGEXPS.srcsetUrl,
+ function (_, p1, p2, p3) {
+ return toAbsoluteURI(p1) + (p2 || "") + p3;
+ }
+ );
media.setAttribute("srcset", newSrcset);
}
});
},
- _simplifyNestedElements: function(articleContent) {
+ _simplifyNestedElements: function (articleContent) {
var node = articleContent;
while (node) {
- if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) {
+ if (
+ node.parentNode &&
+ ["DIV", "SECTION"].includes(node.tagName) &&
+ !(node.id && node.id.startsWith("readability"))
+ ) {
if (this._isElementWithoutContent(node)) {
node = this._removeAndGetNext(node);
continue;
- } else if (this._hasSingleTagInsideElement(node, "DIV") || this._hasSingleTagInsideElement(node, "SECTION")) {
+ } else if (
+ this._hasSingleTagInsideElement(node, "DIV") ||
+ this._hasSingleTagInsideElement(node, "SECTION")
+ ) {
var child = node.children[0];
for (var i = 0; i < node.attributes.length; i++) {
- child.setAttribute(node.attributes[i].name, node.attributes[i].value);
+ child.setAttribute(
+ node.attributes[i].name,
+ node.attributes[i].value
+ );
}
node.parentNode.replaceChild(child, node);
node = child;
@@ -456,7 +563,7 @@ Readability.prototype = {
*
* @return string
**/
- _getArticleTitle: function() {
+ _getArticleTitle: function () {
var doc = this._doc;
var curTitle = "";
var origTitle = "";
@@ -466,8 +573,12 @@ Readability.prototype = {
// If they had an element with id "title" in their HTML
if (typeof curTitle !== "string")
- curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]);
- } catch (e) {/* ignore exceptions setting the title. */}
+ curTitle = origTitle = this._getInnerText(
+ doc.getElementsByTagName("title")[0]
+ );
+ } catch (e) {
+ /* ignore exceptions setting the title. */
+ }
var titleHadHierarchicalSeparators = false;
function wordCount(str) {
@@ -475,7 +586,7 @@ Readability.prototype = {
}
// If there's a separator in the title, first remove the final part
- if ((/ [\|\-\\\/>»] /).test(curTitle)) {
+ if (/ [\|\-\\\/>»] /.test(curTitle)) {
titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1");
@@ -491,7 +602,7 @@ Readability.prototype = {
doc.getElementsByTagName("h2")
);
var trimmedTitle = curTitle.trim();
- var match = this._someNode(headings, function(heading) {
+ var match = this._someNode(headings, function (heading) {
return heading.textContent.trim() === trimmedTitle;
});
@@ -511,8 +622,7 @@ Readability.prototype = {
} else if (curTitle.length > 150 || curTitle.length < 15) {
var hOnes = doc.getElementsByTagName("h1");
- if (hOnes.length === 1)
- curTitle = this._getInnerText(hOnes[0]);
+ if (hOnes.length === 1) curTitle = this._getInnerText(hOnes[0]);
}
curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " ");
@@ -521,9 +631,12 @@ Readability.prototype = {
// title or we decreased the number of words by more than 1 word, use
// the original title.
var curTitleWordCount = wordCount(curTitle);
- if (curTitleWordCount <= 4 &&
- (!titleHadHierarchicalSeparators ||
- curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) {
+ if (
+ curTitleWordCount <= 4 &&
+ (!titleHadHierarchicalSeparators ||
+ curTitleWordCount !=
+ wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)
+ ) {
curTitle = origTitle;
}
@@ -536,7 +649,7 @@ Readability.prototype = {
*
* @return void
**/
- _prepDocument: function() {
+ _prepDocument: function () {
var doc = this._doc;
// Remove all style tags in head
@@ -556,9 +669,11 @@ Readability.prototype = {
*/
_nextNode: function (node) {
var next = node;
- while (next
- && (next.nodeType != this.ELEMENT_NODE)
- && this.REGEXPS.whitespace.test(next.textContent)) {
+ while (
+ next &&
+ next.nodeType != this.ELEMENT_NODE &&
+ this.REGEXPS.whitespace.test(next.textContent)
+ ) {
next = next.nextSibling;
}
return next;
@@ -572,7 +687,7 @@ Readability.prototype = {
* <div>foo<br>bar<p>abc</p></div>
*/
_replaceBrs: function (elem) {
- this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) {
+ this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function (br) {
var next = br.nextSibling;
// Whether 2 or more <br> elements have been found and replaced with a
@@ -582,7 +697,7 @@ Readability.prototype = {
// If we find a <br> chain, remove the <br>s until we hit another node
// or non-whitespace. This leaves behind the first <br> in the chain
// (which will be replaced with a <p> later).
- while ((next = this._nextNode(next)) && (next.tagName == "BR")) {
+ while ((next = this._nextNode(next)) && next.tagName == "BR") {
replaced = true;
var brSibling = next.nextSibling;
next.parentNode.removeChild(next);
@@ -601,12 +716,10 @@ Readability.prototype = {
// If we've hit another <br><br>, we're done adding children to this <p>.
if (next.tagName == "BR") {
var nextElem = this._nextNode(next.nextSibling);
- if (nextElem && nextElem.tagName == "BR")
- break;
+ if (nextElem && nextElem.tagName == "BR") break;
}
- if (!this._isPhrasingContent(next))
- break;
+ if (!this._isPhrasingContent(next)) break;
// Otherwise, make this node a child of the new <p>.
var sibling = next.nextSibling;
@@ -618,8 +731,7 @@ Readability.prototype = {
p.removeChild(p.lastChild);
}
- if (p.parentNode.tagName === "P")
- this._setNodeTag(p.parentNode, "DIV");
+ if (p.parentNode.tagName === "P") this._setNodeTag(p.parentNode, "DIV");
}
});
},
@@ -637,12 +749,14 @@ Readability.prototype = {
replacement.appendChild(node.firstChild);
}
node.parentNode.replaceChild(replacement, node);
- if (node.readability)
- replacement.readability = node.readability;
+ if (node.readability) replacement.readability = node.readability;
for (var i = 0; i < node.attributes.length; i++) {
try {
- replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
+ replacement.setAttribute(
+ node.attributes[i].name,
+ node.attributes[i].value
+ );
} catch (ex) {
/* it's possible for setAttribute() to throw if the attribute name
* isn't a valid XML Name. Such attributes can however be parsed from
@@ -662,7 +776,7 @@ Readability.prototype = {
* @param Element
* @return void
**/
- _prepArticle: function(articleContent) {
+ _prepArticle: function (articleContent) {
this._cleanStyles(articleContent);
// Check for data tables before we continue, to avoid removing items in
@@ -688,7 +802,10 @@ Readability.prototype = {
this._forEachNode(articleContent.children, function (topCandidate) {
this._cleanMatchedNodes(topCandidate, function (node, matchString) {
- return this.REGEXPS.shareElements.test(matchString) && node.textContent.length < shareElementThreshold;
+ return (
+ this.REGEXPS.shareElements.test(matchString) &&
+ node.textContent.length < shareElementThreshold
+ );
});
});
@@ -706,38 +823,56 @@ Readability.prototype = {
this._cleanConditionally(articleContent, "div");
// replace H1 with H2 as H1 should be only title that is displayed separately
- this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h1"]), "h2");
+ this._replaceNodeTags(
+ this._getAllNodesWithTag(articleContent, ["h1"]),
+ "h2"
+ );
// Remove extra paragraphs
- this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) {
- var imgCount = paragraph.getElementsByTagName("img").length;
- var embedCount = paragraph.getElementsByTagName("embed").length;
- var objectCount = paragraph.getElementsByTagName("object").length;
- // At this point, nasty iframes have been removed, only remain embedded video ones.
- var iframeCount = paragraph.getElementsByTagName("iframe").length;
- var totalCount = imgCount + embedCount + objectCount + iframeCount;
-
- return totalCount === 0 && !this._getInnerText(paragraph, false);
- });
+ this._removeNodes(
+ this._getAllNodesWithTag(articleContent, ["p"]),
+ function (paragraph) {
+ var imgCount = paragraph.getElementsByTagName("img").length;
+ var embedCount = paragraph.getElementsByTagName("embed").length;
+ var objectCount = paragraph.getElementsByTagName("object").length;
+ // At this point, nasty iframes have been removed, only remain embedded video ones.
+ var iframeCount = paragraph.getElementsByTagName("iframe").length;
+ var totalCount = imgCount + embedCount + objectCount + iframeCount;
+
+ return totalCount === 0 && !this._getInnerText(paragraph, false);
+ }
+ );
- this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) {
- var next = this._nextNode(br.nextSibling);
- if (next && next.tagName == "P")
- br.parentNode.removeChild(br);
- });
+ this._forEachNode(
+ this._getAllNodesWithTag(articleContent, ["br"]),
+ function (br) {
+ var next = this._nextNode(br.nextSibling);
+ if (next && next.tagName == "P") br.parentNode.removeChild(br);
+ }
+ );
// Remove single-cell tables
- this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) {
- var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table;
- if (this._hasSingleTagInsideElement(tbody, "TR")) {
- var row = tbody.firstElementChild;
- if (this._hasSingleTagInsideElement(row, "TD")) {
- var cell = row.firstElementChild;
- cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV");
- table.parentNode.replaceChild(cell, table);
+ this._forEachNode(
+ this._getAllNodesWithTag(articleContent, ["table"]),
+ function (table) {
+ var tbody = this._hasSingleTagInsideElement(table, "TBODY")
+ ? table.firstElementChild
+ : table;
+ if (this._hasSingleTagInsideElement(tbody, "TR")) {
+ var row = tbody.firstElementChild;
+ if (this._hasSingleTagInsideElement(row, "TD")) {
+ var cell = row.firstElementChild;
+ cell = this._setNodeTag(
+ cell,
+ this._everyNode(cell.childNodes, this._isPhrasingContent)
+ ? "P"
+ : "DIV"
+ );
+ table.parentNode.replaceChild(cell, table);
+ }
}
}
- });
+ );
},
/**
@@ -746,9 +881,9 @@ Readability.prototype = {
*
* @param Element
* @return void
- **/
- _initializeNode: function(node) {
- node.readability = {"contentScore": 0};
+ **/
+ _initializeNode: function (node) {
+ node.readability = { contentScore: 0 };
switch (node.tagName) {
case "DIV":
@@ -786,7 +921,7 @@ Readability.prototype = {
node.readability.contentScore += this._getClassWeight(node);
},
- _removeAndGetNext: function(node) {
+ _removeAndGetNext: function (node) {
var nextNode = this._getNextNode(node, true);
node.parentNode.removeChild(node);
return nextNode;
@@ -799,7 +934,7 @@ Readability.prototype = {
*
* Calling this in a loop will traverse the DOM depth-first.
*/
- _getNextNode: function(node, ignoreSelfAndKids) {
+ _getNextNode: function (node, ignoreSelfAndKids) {
// First check for kids if those aren't being ignored
if (!ignoreSelfAndKids && node.firstElementChild) {
return node.firstElementChild;
@@ -821,18 +956,24 @@ Readability.prototype = {
// 1 = same text, 0 = completely different text
// works the way that it splits both texts into words and then finds words that are unique in second text
// the result is given by the lower length of unique parts
- _textSimilarity: function(textA, textB) {
- var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
- var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean);
+ _textSimilarity: function (textA, textB) {
+ var tokensA = textA
+ .toLowerCase()
+ .split(this.REGEXPS.tokenize)
+ .filter(Boolean);
+ var tokensB = textB
+ .toLowerCase()
+ .split(this.REGEXPS.tokenize)
+ .filter(Boolean);
if (!tokensA.length || !tokensB.length) {
return 0;
}
- var uniqTokensB = tokensB.filter(token => !tokensA.includes(token));
+ var uniqTokensB = tokensB.filter((token) => !tokensA.includes(token));
var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length;
return 1 - distanceB;
},
- _checkByline: function(node, matchString) {
+ _checkByline: function (node, matchString) {
if (this._articleByline) {
return false;
}
@@ -842,7 +983,12 @@ Readability.prototype = {
var itemprop = node.getAttribute("itemprop");
}
- if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
+ if (
+ (rel === "author" ||
+ (itemprop && itemprop.indexOf("author") !== -1) ||
+ this.REGEXPS.byline.test(matchString)) &&
+ this._isValidByline(node.textContent)
+ ) {
this._articleByline = node.textContent.trim();
return true;
}
@@ -850,13 +996,13 @@ Readability.prototype = {
return false;
},
- _getNodeAncestors: function(node, maxDepth) {
+ _getNodeAncestors: function (node, maxDepth) {
maxDepth = maxDepth || 0;
- var i = 0, ancestors = [];
+ var i = 0,
+ ancestors = [];
while (node.parentNode) {
ancestors.push(node.parentNode);
- if (maxDepth && ++i === maxDepth)
- break;
+ if (maxDepth && ++i === maxDepth) break;
node = node.parentNode;
}
return ancestors;
@@ -868,7 +1014,7 @@ Readability.prototype = {
*
* @param page a document to run upon. Needs to be a full document, complete with body.
* @return Element
- **/
+ **/
_grabArticle: function (page) {
this.log("**** grabArticle ****");
var doc = this._doc;
@@ -885,7 +1031,9 @@ Readability.prototype = {
while (true) {
this.log("Starting grabArticle loop");
- var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
+ var stripUnlikelyCandidates = this._flagIsActive(
+ this.FLAG_STRIP_UNLIKELYS
+ );
// First, node prepping. Trash nodes that look cruddy (like ones with the
// class name "comment", etc), and turn divs into P tags where they have been
@@ -896,7 +1044,6 @@ Readability.prototype = {
let shouldRemoveTitleHeader = true;
while (node) {
-
if (node.tagName === "HTML") {
this._articleLang = node.getAttribute("lang");
}
@@ -916,7 +1063,11 @@ Readability.prototype = {
}
if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) {
- this.log("Removing header: ", node.textContent.trim(), this._articleTitle.trim());
+ this.log(
+ "Removing header: ",
+ node.textContent.trim(),
+ this._articleTitle.trim()
+ );
shouldRemoveTitleHeader = false;
node = this._removeAndGetNext(node);
continue;
@@ -924,29 +1075,44 @@ Readability.prototype = {
// Remove unlikely candidates
if (stripUnlikelyCandidates) {
- if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
- !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
- !this._hasAncestorTag(node, "table") &&
- !this._hasAncestorTag(node, "code") &&
- node.tagName !== "BODY" &&
- node.tagName !== "A") {
+ if (
+ this.REGEXPS.unlikelyCandidates.test(matchString) &&
+ !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
+ !this._hasAncestorTag(node, "table") &&
+ !this._hasAncestorTag(node, "code") &&
+ node.tagName !== "BODY" &&
+ node.tagName !== "A"
+ ) {
this.log("Removing unlikely candidate - " + matchString);
node = this._removeAndGetNext(node);
continue;
}
if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) {
- this.log("Removing content with role " + node.getAttribute("role") + " - " + matchString);
+ this.log(
+ "Removing content with role " +
+ node.getAttribute("role") +
+ " - " +
+ matchString
+ );
node = this._removeAndGetNext(node);
continue;
}
}
// Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
- if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" ||
- node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" ||
- node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") &&
- this._isElementWithoutContent(node)) {
+ if (
+ (node.tagName === "DIV" ||
+ node.tagName === "SECTION" ||
+ node.tagName === "HEADER" ||
+ node.tagName === "H1" ||
+ node.tagName === "H2" ||
+ node.tagName === "H3" ||
+ node.tagName === "H4" ||
+ node.tagName === "H5" ||
+ node.tagName === "H6") &&
+ this._isElementWithoutContent(node)
+ ) {
node = this._removeAndGetNext(node);
continue;
}
@@ -983,7 +1149,10 @@ Readability.prototype = {
// element. DIVs with only a P element inside and no text content can be
// safely converted into plain P elements to avoid confusing the scoring
// algorithm with DIVs with are, in practice, paragraphs.
- if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) {
+ if (
+ this._hasSingleTagInsideElement(node, "P") &&
+ this._getLinkDensity(node) < 0.25
+ ) {
var newNode = node.children[0];
node.parentNode.replaceChild(newNode, node);
node = newNode;
@@ -1001,21 +1170,22 @@ Readability.prototype = {
* Then add their score to their parent node.
*
* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
- **/
+ **/
var candidates = [];
- this._forEachNode(elementsToScore, function(elementToScore) {
- if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined")
+ this._forEachNode(elementsToScore, function (elementToScore) {
+ if (
+ !elementToScore.parentNode ||
+ typeof elementToScore.parentNode.tagName === "undefined"
+ )
return;
// If this paragraph is less than 25 characters, don't even count it.
var innerText = this._getInnerText(elementToScore);
- if (innerText.length < 25)
- return;
+ if (innerText.length < 25) return;
// Exclude nodes with no ancestor.
var ancestors = this._getNodeAncestors(elementToScore, 5);
- if (ancestors.length === 0)
- return;
+ if (ancestors.length === 0) return;
var contentScore = 0;
@@ -1029,11 +1199,15 @@ Readability.prototype = {
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
// Initialize and score ancestors.
- this._forEachNode(ancestors, function(ancestor, level) {
- if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined")
+ this._forEachNode(ancestors, function (ancestor, level) {
+ if (
+ !ancestor.tagName ||
+ !ancestor.parentNode ||
+ typeof ancestor.parentNode.tagName === "undefined"
+ )
return;
- if (typeof(ancestor.readability) === "undefined") {
+ if (typeof ancestor.readability === "undefined") {
this._initializeNode(ancestor);
candidates.push(ancestor);
}
@@ -1042,12 +1216,9 @@ Readability.prototype = {
// - parent: 1 (no division)
// - grandparent: 2
// - great grandparent+: ancestor level * 3
- if (level === 0)
- var scoreDivider = 1;
- else if (level === 1)
- scoreDivider = 2;
- else
- scoreDivider = level * 3;
+ if (level === 0) var scoreDivider = 1;
+ else if (level === 1) scoreDivider = 2;
+ else scoreDivider = level * 3;
ancestor.readability.contentScore += contentScore / scoreDivider;
});
});
@@ -1061,7 +1232,9 @@ Readability.prototype = {
// Scale the final candidates score based on link density. Good content
// should have a relatively small link density (5% or less) and be mostly
// unaffected by this operation.
- var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
+ var candidateScore =
+ candidate.readability.contentScore *
+ (1 - this._getLinkDensity(candidate));
candidate.readability.contentScore = candidateScore;
this.log("Candidate:", candidate, "with score " + candidateScore);
@@ -1069,7 +1242,10 @@ Readability.prototype = {
for (var t = 0; t < this._nbTopCandid