From 58b522be65516578b812f767c1f37042d4ff6c5b Mon Sep 17 00:00:00 2001 From: NunoSempere Date: Fri, 15 Sep 2023 12:40:50 +0300 Subject: prettier pass. --- plugins/readability/readability.js | 1149 +++++++++++++++++++++++------------- 1 file changed, 730 insertions(+), 419 deletions(-) (limited to 'plugins/readability') diff --git a/plugins/readability/readability.js b/plugins/readability/readability.js index 0a5bda7..3bef151 100644 --- a/plugins/readability/readability.js +++ b/plugins/readability/readability.js @@ -31,7 +31,9 @@ function Readability(doc, options) { doc = options; options = arguments[2]; } else if (!doc || !doc.documentElement) { - throw new Error("First argument to Readability constructor should be a document object."); + throw new Error( + "First argument to Readability constructor should be a document object." + ); } options = options || {}; @@ -45,41 +47,49 @@ function Readability(doc, options) { // Configurable options this._debug = !!options.debug; - this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE; - this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; + this._maxElemsToParse = + options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE; + this._nbTopCandidates = + options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD; - this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []); + this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat( + options.classesToPreserve || [] + ); this._keepClasses = !!options.keepClasses; - this._serializer = options.serializer || function(el) { - return el.innerHTML; - }; + this._serializer = + options.serializer || + function (el) { + return el.innerHTML; + }; this._disableJSONLD = !!options.disableJSONLD; // Start with all flags set - this._flags = this.FLAG_STRIP_UNLIKELYS | - this.FLAG_WEIGHT_CLASSES | - this.FLAG_CLEAN_CONDITIONALLY; - + this._flags = + this.FLAG_STRIP_UNLIKELYS | + this.FLAG_WEIGHT_CLASSES | + this.FLAG_CLEAN_CONDITIONALLY; // Control whether log messages are sent to the console if (this._debug) { - let logNode = function(node) { + let logNode = function (node) { if (node.nodeType == node.TEXT_NODE) { return `${node.nodeName} ("${node.textContent}")`; } - let attrPairs = Array.from(node.attributes || [], function(attr) { + let attrPairs = Array.from(node.attributes || [], function (attr) { return `${attr.name}="${attr.value}"`; }).join(" "); return `<${node.localName} ${attrPairs}>`; }; this.log = function () { if (typeof dump !== "undefined") { - var msg = Array.prototype.map.call(arguments, function(x) { - return (x && x.nodeName) ? logNode(x) : x; - }).join(" "); + var msg = Array.prototype.map + .call(arguments, function (x) { + return x && x.nodeName ? logNode(x) : x; + }) + .join(" "); dump("Reader: (Readability) " + msg + "\n"); } else if (typeof console !== "undefined") { - let args = Array.from(arguments, arg => { + let args = Array.from(arguments, (arg) => { if (arg && arg.nodeType == this.ELEMENT_NODE) { return logNode(arg); } @@ -111,7 +121,9 @@ Readability.prototype = { DEFAULT_N_TOP_CANDIDATES: 5, // Element tags to score by default. - DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","), + DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre" + .toUpperCase() + .split(","), // The default number of chars an article must have in order to return a result DEFAULT_CHAR_THRESHOLD: 500, @@ -121,16 +133,21 @@ Readability.prototype = { REGEXPS: { // NOTE: These two regular expressions are duplicated in // Readability-readerable.js. Please keep both copies in sync. - unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, + unlikelyCandidates: + /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, - positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, - negative: /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, - extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, + positive: + /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, + negative: + /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, + extraneous: + /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, byline: /byline|author|dateline|writtenby|p-author/i, replaceFonts: /<(\/?)font[^>]*>/gi, normalize: /\s{2,}/g, - videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i, + videos: + /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i, shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i, nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, prevLink: /(prev|earl|old|new|<|«)/i, @@ -141,40 +158,106 @@ Readability.prototype = { srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g, b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i, // See: https://schema.org/Article - jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/ + jsonLdArticleTypes: + /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/, }, - UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ], + UNLIKELY_ROLES: [ + "menu", + "menubar", + "complementary", + "navigation", + "alert", + "alertdialog", + "dialog", + ], - DIV_TO_P_ELEMS: new Set([ "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL" ]), + DIV_TO_P_ELEMS: new Set([ + "BLOCKQUOTE", + "DL", + "DIV", + "IMG", + "OL", + "P", + "PRE", + "TABLE", + "UL", + ]), ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"], - PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ], + PRESENTATIONAL_ATTRIBUTES: [ + "align", + "background", + "bgcolor", + "border", + "cellpadding", + "cellspacing", + "frame", + "hspace", + "rules", + "style", + "valign", + "vspace", + ], - DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ], + DEPRECATED_SIZE_ATTRIBUTE_ELEMS: ["TABLE", "TH", "TD", "HR", "PRE"], // The commented out elements qualify as phrasing content but tend to be // removed by readability when put into paragraphs, so we ignore them here. PHRASING_ELEMS: [ // "CANVAS", "IFRAME", "SVG", "VIDEO", - "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", - "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL", - "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q", - "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB", - "SUP", "TEXTAREA", "TIME", "VAR", "WBR" + "ABBR", + "AUDIO", + "B", + "BDO", + "BR", + "BUTTON", + "CITE", + "CODE", + "DATA", + "DATALIST", + "DFN", + "EM", + "EMBED", + "I", + "IMG", + "INPUT", + "KBD", + "LABEL", + "MARK", + "MATH", + "METER", + "NOSCRIPT", + "OBJECT", + "OUTPUT", + "PROGRESS", + "Q", + "RUBY", + "SAMP", + "SCRIPT", + "SELECT", + "SMALL", + "SPAN", + "STRONG", + "SUB", + "SUP", + "TEXTAREA", + "TIME", + "VAR", + "WBR", ], // These are the classes that readability sets itself. - CLASSES_TO_PRESERVE: [ "page" ], + CLASSES_TO_PRESERVE: ["page"], // These are the list of HTML entities that need to be escaped. HTML_ESCAPE_MAP: { - "lt": "<", - "gt": ">", - "amp": "&", - "quot": '"', - "apos": "'", + lt: "<", + gt: ">", + amp: "&", + quot: '"', + apos: "'", }, /** @@ -182,8 +265,8 @@ Readability.prototype = { * * @param Element * @return void - **/ - _postProcessContent: function(articleContent) { + **/ + _postProcessContent: function (articleContent) { // Readability cannot open relative uris so we convert them to absolute uris. this._fixRelativeUris(articleContent); @@ -205,7 +288,7 @@ Readability.prototype = { * @param Function filterFn the function to use as a filter * @return void */ - _removeNodes: function(nodeList, filterFn) { + _removeNodes: function (nodeList, filterFn) { // Avoid ever operating on live node lists. if (this._docJSDOMParser && nodeList._isLiveNodeList) { throw new Error("Do not pass live node lists to _removeNodes"); @@ -228,7 +311,7 @@ Readability.prototype = { * @param String newTagName the new tag name to use * @return void */ - _replaceNodeTags: function(nodeList, newTagName) { + _replaceNodeTags: function (nodeList, newTagName) { // Avoid ever operating on live node lists. if (this._docJSDOMParser && nodeList._isLiveNodeList) { throw new Error("Do not pass live node lists to _replaceNodeTags"); @@ -249,7 +332,7 @@ Readability.prototype = { * @param Function fn The iterate function. * @return void */ - _forEachNode: function(nodeList, fn) { + _forEachNode: function (nodeList, fn) { Array.prototype.forEach.call(nodeList, fn, this); }, @@ -264,7 +347,7 @@ Readability.prototype = { * @param Function fn The test function. * @return void */ - _findNode: function(nodeList, fn) { + _findNode: function (nodeList, fn) { return Array.prototype.find.call(nodeList, fn, this); }, @@ -279,7 +362,7 @@ Readability.prototype = { * @param Function fn The iterate function. * @return Boolean */ - _someNode: function(nodeList, fn) { + _someNode: function (nodeList, fn) { return Array.prototype.some.call(nodeList, fn, this); }, @@ -294,7 +377,7 @@ Readability.prototype = { * @param Function fn The iterate function. * @return Boolean */ - _everyNode: function(nodeList, fn) { + _everyNode: function (nodeList, fn) { return Array.prototype.every.call(nodeList, fn, this); }, @@ -304,23 +387,26 @@ Readability.prototype = { * @return ...NodeList * @return Array */ - _concatNodeLists: function() { + _concatNodeLists: function () { var slice = Array.prototype.slice; var args = slice.call(arguments); - var nodeLists = args.map(function(list) { + var nodeLists = args.map(function (list) { return slice.call(list); }); return Array.prototype.concat.apply([], nodeLists); }, - _getAllNodesWithTag: function(node, tagNames) { + _getAllNodesWithTag: function (node, tagNames) { if (node.querySelectorAll) { return node.querySelectorAll(tagNames.join(",")); } - return [].concat.apply([], tagNames.map(function(tag) { - var collection = node.getElementsByTagName(tag); - return Array.isArray(collection) ? collection : Array.from(collection); - })); + return [].concat.apply( + [], + tagNames.map(function (tag) { + var collection = node.getElementsByTagName(tag); + return Array.isArray(collection) ? collection : Array.from(collection); + }) + ); }, /** @@ -331,11 +417,11 @@ Readability.prototype = { * @param Element * @return void */ - _cleanClasses: function(node) { + _cleanClasses: function (node) { var classesToPreserve = this._classesToPreserve; var className = (node.getAttribute("class") || "") .split(/\s+/) - .filter(function(cls) { + .filter(function (cls) { return classesToPreserve.indexOf(cls) != -1; }) .join(" "); @@ -358,7 +444,7 @@ Readability.prototype = { * @param Element * @return void */ - _fixRelativeUris: function(articleContent) { + _fixRelativeUris: function (articleContent) { var baseURI = this._doc.baseURI; var documentURI = this._doc.documentURI; function toAbsoluteURI(uri) { @@ -377,14 +463,17 @@ Readability.prototype = { } var links = this._getAllNodesWithTag(articleContent, ["a"]); - this._forEachNode(links, function(link) { + this._forEachNode(links, function (link) { var href = link.getAttribute("href"); if (href) { // Remove links with javascript: URIs, since // they won't work after scripts have been removed from the page. if (href.indexOf("javascript:") === 0) { // if the link only contains simple text content, it can be converted to a text node - if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) { + if ( + link.childNodes.length === 1 && + link.childNodes[0].nodeType === this.TEXT_NODE + ) { var text = this._doc.createTextNode(link.textContent); link.parentNode.replaceChild(text, link); } else { @@ -402,10 +491,15 @@ Readability.prototype = { }); var medias = this._getAllNodesWithTag(articleContent, [ - "img", "picture", "figure", "video", "audio", "source" + "img", + "picture", + "figure", + "video", + "audio", + "source", ]); - this._forEachNode(medias, function(media) { + this._forEachNode(medias, function (media) { var src = media.getAttribute("src"); var poster = media.getAttribute("poster"); var srcset = media.getAttribute("srcset"); @@ -419,27 +513,40 @@ Readability.prototype = { } if (srcset) { - var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) { - return toAbsoluteURI(p1) + (p2 || "") + p3; - }); + var newSrcset = srcset.replace( + this.REGEXPS.srcsetUrl, + function (_, p1, p2, p3) { + return toAbsoluteURI(p1) + (p2 || "") + p3; + } + ); media.setAttribute("srcset", newSrcset); } }); }, - _simplifyNestedElements: function(articleContent) { + _simplifyNestedElements: function (articleContent) { var node = articleContent; while (node) { - if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) { + if ( + node.parentNode && + ["DIV", "SECTION"].includes(node.tagName) && + !(node.id && node.id.startsWith("readability")) + ) { if (this._isElementWithoutContent(node)) { node = this._removeAndGetNext(node); continue; - } else if (this._hasSingleTagInsideElement(node, "DIV") || this._hasSingleTagInsideElement(node, "SECTION")) { + } else if ( + this._hasSingleTagInsideElement(node, "DIV") || + this._hasSingleTagInsideElement(node, "SECTION") + ) { var child = node.children[0]; for (var i = 0; i < node.attributes.length; i++) { - child.setAttribute(node.attributes[i].name, node.attributes[i].value); + child.setAttribute( + node.attributes[i].name, + node.attributes[i].value + ); } node.parentNode.replaceChild(child, node); node = child; @@ -456,7 +563,7 @@ Readability.prototype = { * * @return string **/ - _getArticleTitle: function() { + _getArticleTitle: function () { var doc = this._doc; var curTitle = ""; var origTitle = ""; @@ -466,8 +573,12 @@ Readability.prototype = { // If they had an element with id "title" in their HTML if (typeof curTitle !== "string") - curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]); - } catch (e) {/* ignore exceptions setting the title. */} + curTitle = origTitle = this._getInnerText( + doc.getElementsByTagName("title")[0] + ); + } catch (e) { + /* ignore exceptions setting the title. */ + } var titleHadHierarchicalSeparators = false; function wordCount(str) { @@ -475,7 +586,7 @@ Readability.prototype = { } // If there's a separator in the title, first remove the final part - if ((/ [\|\-\\\/>»] /).test(curTitle)) { + if (/ [\|\-\\\/>»] /.test(curTitle)) { titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle); curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1"); @@ -491,7 +602,7 @@ Readability.prototype = { doc.getElementsByTagName("h2") ); var trimmedTitle = curTitle.trim(); - var match = this._someNode(headings, function(heading) { + var match = this._someNode(headings, function (heading) { return heading.textContent.trim() === trimmedTitle; }); @@ -511,8 +622,7 @@ Readability.prototype = { } else if (curTitle.length > 150 || curTitle.length < 15) { var hOnes = doc.getElementsByTagName("h1"); - if (hOnes.length === 1) - curTitle = this._getInnerText(hOnes[0]); + if (hOnes.length === 1) curTitle = this._getInnerText(hOnes[0]); } curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " "); @@ -521,9 +631,12 @@ Readability.prototype = { // title or we decreased the number of words by more than 1 word, use // the original title. var curTitleWordCount = wordCount(curTitle); - if (curTitleWordCount <= 4 && - (!titleHadHierarchicalSeparators || - curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) { + if ( + curTitleWordCount <= 4 && + (!titleHadHierarchicalSeparators || + curTitleWordCount != + wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1) + ) { curTitle = origTitle; } @@ -536,7 +649,7 @@ Readability.prototype = { * * @return void **/ - _prepDocument: function() { + _prepDocument: function () { var doc = this._doc; // Remove all style tags in head @@ -556,9 +669,11 @@ Readability.prototype = { */ _nextNode: function (node) { var next = node; - while (next - && (next.nodeType != this.ELEMENT_NODE) - && this.REGEXPS.whitespace.test(next.textContent)) { + while ( + next && + next.nodeType != this.ELEMENT_NODE && + this.REGEXPS.whitespace.test(next.textContent) + ) { next = next.nextSibling; } return next; @@ -572,7 +687,7 @@ Readability.prototype = { *
foo
bar

abc

*/ _replaceBrs: function (elem) { - this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) { + this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function (br) { var next = br.nextSibling; // Whether 2 or more
elements have been found and replaced with a @@ -582,7 +697,7 @@ Readability.prototype = { // If we find a
chain, remove the
s until we hit another node // or non-whitespace. This leaves behind the first
in the chain // (which will be replaced with a

later). - while ((next = this._nextNode(next)) && (next.tagName == "BR")) { + while ((next = this._nextNode(next)) && next.tagName == "BR") { replaced = true; var brSibling = next.nextSibling; next.parentNode.removeChild(next); @@ -601,12 +716,10 @@ Readability.prototype = { // If we've hit another

, we're done adding children to this

. if (next.tagName == "BR") { var nextElem = this._nextNode(next.nextSibling); - if (nextElem && nextElem.tagName == "BR") - break; + if (nextElem && nextElem.tagName == "BR") break; } - if (!this._isPhrasingContent(next)) - break; + if (!this._isPhrasingContent(next)) break; // Otherwise, make this node a child of the new

. var sibling = next.nextSibling; @@ -618,8 +731,7 @@ Readability.prototype = { p.removeChild(p.lastChild); } - if (p.parentNode.tagName === "P") - this._setNodeTag(p.parentNode, "DIV"); + if (p.parentNode.tagName === "P") this._setNodeTag(p.parentNode, "DIV"); } }); }, @@ -637,12 +749,14 @@ Readability.prototype = { replacement.appendChild(node.firstChild); } node.parentNode.replaceChild(replacement, node); - if (node.readability) - replacement.readability = node.readability; + if (node.readability) replacement.readability = node.readability; for (var i = 0; i < node.attributes.length; i++) { try { - replacement.setAttribute(node.attributes[i].name, node.attributes[i].value); + replacement.setAttribute( + node.attributes[i].name, + node.attributes[i].value + ); } catch (ex) { /* it's possible for setAttribute() to throw if the attribute name * isn't a valid XML Name. Such attributes can however be parsed from @@ -662,7 +776,7 @@ Readability.prototype = { * @param Element * @return void **/ - _prepArticle: function(articleContent) { + _prepArticle: function (articleContent) { this._cleanStyles(articleContent); // Check for data tables before we continue, to avoid removing items in @@ -688,7 +802,10 @@ Readability.prototype = { this._forEachNode(articleContent.children, function (topCandidate) { this._cleanMatchedNodes(topCandidate, function (node, matchString) { - return this.REGEXPS.shareElements.test(matchString) && node.textContent.length < shareElementThreshold; + return ( + this.REGEXPS.shareElements.test(matchString) && + node.textContent.length < shareElementThreshold + ); }); }); @@ -706,38 +823,56 @@ Readability.prototype = { this._cleanConditionally(articleContent, "div"); // replace H1 with H2 as H1 should be only title that is displayed separately - this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h1"]), "h2"); + this._replaceNodeTags( + this._getAllNodesWithTag(articleContent, ["h1"]), + "h2" + ); // Remove extra paragraphs - this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) { - var imgCount = paragraph.getElementsByTagName("img").length; - var embedCount = paragraph.getElementsByTagName("embed").length; - var objectCount = paragraph.getElementsByTagName("object").length; - // At this point, nasty iframes have been removed, only remain embedded video ones. - var iframeCount = paragraph.getElementsByTagName("iframe").length; - var totalCount = imgCount + embedCount + objectCount + iframeCount; - - return totalCount === 0 && !this._getInnerText(paragraph, false); - }); + this._removeNodes( + this._getAllNodesWithTag(articleContent, ["p"]), + function (paragraph) { + var imgCount = paragraph.getElementsByTagName("img").length; + var embedCount = paragraph.getElementsByTagName("embed").length; + var objectCount = paragraph.getElementsByTagName("object").length; + // At this point, nasty iframes have been removed, only remain embedded video ones. + var iframeCount = paragraph.getElementsByTagName("iframe").length; + var totalCount = imgCount + embedCount + objectCount + iframeCount; + + return totalCount === 0 && !this._getInnerText(paragraph, false); + } + ); - this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) { - var next = this._nextNode(br.nextSibling); - if (next && next.tagName == "P") - br.parentNode.removeChild(br); - }); + this._forEachNode( + this._getAllNodesWithTag(articleContent, ["br"]), + function (br) { + var next = this._nextNode(br.nextSibling); + if (next && next.tagName == "P") br.parentNode.removeChild(br); + } + ); // Remove single-cell tables - this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) { - var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table; - if (this._hasSingleTagInsideElement(tbody, "TR")) { - var row = tbody.firstElementChild; - if (this._hasSingleTagInsideElement(row, "TD")) { - var cell = row.firstElementChild; - cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV"); - table.parentNode.replaceChild(cell, table); + this._forEachNode( + this._getAllNodesWithTag(articleContent, ["table"]), + function (table) { + var tbody = this._hasSingleTagInsideElement(table, "TBODY") + ? table.firstElementChild + : table; + if (this._hasSingleTagInsideElement(tbody, "TR")) { + var row = tbody.firstElementChild; + if (this._hasSingleTagInsideElement(row, "TD")) { + var cell = row.firstElementChild; + cell = this._setNodeTag( + cell, + this._everyNode(cell.childNodes, this._isPhrasingContent) + ? "P" + : "DIV" + ); + table.parentNode.replaceChild(cell, table); + } } } - }); + ); }, /** @@ -746,9 +881,9 @@ Readability.prototype = { * * @param Element * @return void - **/ - _initializeNode: function(node) { - node.readability = {"contentScore": 0}; + **/ + _initializeNode: function (node) { + node.readability = { contentScore: 0 }; switch (node.tagName) { case "DIV": @@ -786,7 +921,7 @@ Readability.prototype = { node.readability.contentScore += this._getClassWeight(node); }, - _removeAndGetNext: function(node) { + _removeAndGetNext: function (node) { var nextNode = this._getNextNode(node, true); node.parentNode.removeChild(node); return nextNode; @@ -799,7 +934,7 @@ Readability.prototype = { * * Calling this in a loop will traverse the DOM depth-first. */ - _getNextNode: function(node, ignoreSelfAndKids) { + _getNextNode: function (node, ignoreSelfAndKids) { // First check for kids if those aren't being ignored if (!ignoreSelfAndKids && node.firstElementChild) { return node.firstElementChild; @@ -821,18 +956,24 @@ Readability.prototype = { // 1 = same text, 0 = completely different text // works the way that it splits both texts into words and then finds words that are unique in second text // the result is given by the lower length of unique parts - _textSimilarity: function(textA, textB) { - var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean); - var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean); + _textSimilarity: function (textA, textB) { + var tokensA = textA + .toLowerCase() + .split(this.REGEXPS.tokenize) + .filter(Boolean); + var tokensB = textB + .toLowerCase() + .split(this.REGEXPS.tokenize) + .filter(Boolean); if (!tokensA.length || !tokensB.length) { return 0; } - var uniqTokensB = tokensB.filter(token => !tokensA.includes(token)); + var uniqTokensB = tokensB.filter((token) => !tokensA.includes(token)); var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length; return 1 - distanceB; }, - _checkByline: function(node, matchString) { + _checkByline: function (node, matchString) { if (this._articleByline) { return false; } @@ -842,7 +983,12 @@ Readability.prototype = { var itemprop = node.getAttribute("itemprop"); } - if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) { + if ( + (rel === "author" || + (itemprop && itemprop.indexOf("author") !== -1) || + this.REGEXPS.byline.test(matchString)) && + this._isValidByline(node.textContent) + ) { this._articleByline = node.textContent.trim(); return true; } @@ -850,13 +996,13 @@ Readability.prototype = { return false; }, - _getNodeAncestors: function(node, maxDepth) { + _getNodeAncestors: function (node, maxDepth) { maxDepth = maxDepth || 0; - var i = 0, ancestors = []; + var i = 0, + ancestors = []; while (node.parentNode) { ancestors.push(node.parentNode); - if (maxDepth && ++i === maxDepth) - break; + if (maxDepth && ++i === maxDepth) break; node = node.parentNode; } return ancestors; @@ -868,7 +1014,7 @@ Readability.prototype = { * * @param page a document to run upon. Needs to be a full document, complete with body. * @return Element - **/ + **/ _grabArticle: function (page) { this.log("**** grabArticle ****"); var doc = this._doc; @@ -885,7 +1031,9 @@ Readability.prototype = { while (true) { this.log("Starting grabArticle loop"); - var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); + var stripUnlikelyCandidates = this._flagIsActive( + this.FLAG_STRIP_UNLIKELYS + ); // First, node prepping. Trash nodes that look cruddy (like ones with the // class name "comment", etc), and turn divs into P tags where they have been @@ -896,7 +1044,6 @@ Readability.prototype = { let shouldRemoveTitleHeader = true; while (node) { - if (node.tagName === "HTML") { this._articleLang = node.getAttribute("lang"); } @@ -916,7 +1063,11 @@ Readability.prototype = { } if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) { - this.log("Removing header: ", node.textContent.trim(), this._articleTitle.trim()); + this.log( + "Removing header: ", + node.textContent.trim(), + this._articleTitle.trim() + ); shouldRemoveTitleHeader = false; node = this._removeAndGetNext(node); continue; @@ -924,29 +1075,44 @@ Readability.prototype = { // Remove unlikely candidates if (stripUnlikelyCandidates) { - if (this.REGEXPS.unlikelyCandidates.test(matchString) && - !this.REGEXPS.okMaybeItsACandidate.test(matchString) && - !this._hasAncestorTag(node, "table") && - !this._hasAncestorTag(node, "code") && - node.tagName !== "BODY" && - node.tagName !== "A") { + if ( + this.REGEXPS.unlikelyCandidates.test(matchString) && + !this.REGEXPS.okMaybeItsACandidate.test(matchString) && + !this._hasAncestorTag(node, "table") && + !this._hasAncestorTag(node, "code") && + node.tagName !== "BODY" && + node.tagName !== "A" + ) { this.log("Removing unlikely candidate - " + matchString); node = this._removeAndGetNext(node); continue; } if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) { - this.log("Removing content with role " + node.getAttribute("role") + " - " + matchString); + this.log( + "Removing content with role " + + node.getAttribute("role") + + " - " + + matchString + ); node = this._removeAndGetNext(node); continue; } } // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). - if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" || - node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" || - node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") && - this._isElementWithoutContent(node)) { + if ( + (node.tagName === "DIV" || + node.tagName === "SECTION" || + node.tagName === "HEADER" || + node.tagName === "H1" || + node.tagName === "H2" || + node.tagName === "H3" || + node.tagName === "H4" || + node.tagName === "H5" || + node.tagName === "H6") && + this._isElementWithoutContent(node) + ) { node = this._removeAndGetNext(node); continue; } @@ -983,7 +1149,10 @@ Readability.prototype = { // element. DIVs with only a P element inside and no text content can be // safely converted into plain P elements to avoid confusing the scoring // algorithm with DIVs with are, in practice, paragraphs. - if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) { + if ( + this._hasSingleTagInsideElement(node, "P") && + this._getLinkDensity(node) < 0.25 + ) { var newNode = node.children[0]; node.parentNode.replaceChild(newNode, node); node = newNode; @@ -1001,21 +1170,22 @@ Readability.prototype = { * Then add their score to their parent node. * * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. - **/ + **/ var candidates = []; - this._forEachNode(elementsToScore, function(elementToScore) { - if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined") + this._forEachNode(elementsToScore, function (elementToScore) { + if ( + !elementToScore.parentNode || + typeof elementToScore.parentNode.tagName === "undefined" + ) return; // If this paragraph is less than 25 characters, don't even count it. var innerText = this._getInnerText(elementToScore); - if (innerText.length < 25) - return; + if (innerText.length < 25) return; // Exclude nodes with no ancestor. var ancestors = this._getNodeAncestors(elementToScore, 5); - if (ancestors.length === 0) - return; + if (ancestors.length === 0) return; var contentScore = 0; @@ -1029,11 +1199,15 @@ Readability.prototype = { contentScore += Math.min(Math.floor(innerText.length / 100), 3); // Initialize and score ancestors. - this._forEachNode(ancestors, function(ancestor, level) { - if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined") + this._forEachNode(ancestors, function (ancestor, level) { + if ( + !ancestor.tagName || + !ancestor.parentNode || + typeof ancestor.parentNode.tagName === "undefined" + ) return; - if (typeof(ancestor.readability) === "undefined") { + if (typeof ancestor.readability === "undefined") { this._initializeNode(ancestor); candidates.push(ancestor); } @@ -1042,12 +1216,9 @@ Readability.prototype = { // - parent: 1 (no division) // - grandparent: 2 // - great grandparent+: ancestor level * 3 - if (level === 0) - var scoreDivider = 1; - else if (level === 1) - scoreDivider = 2; - else - scoreDivider = level * 3; + if (level === 0) var scoreDivider = 1; + else if (level === 1) scoreDivider = 2; + else scoreDivider = level * 3; ancestor.readability.contentScore += contentScore / scoreDivider; }); }); @@ -1061,7 +1232,9 @@ Readability.prototype = { // Scale the final candidates score based on link density. Good content // should have a relatively small link density (5% or less) and be mostly // unaffected by this operation. - var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)); + var candidateScore = + candidate.readability.contentScore * + (1 - this._getLinkDensity(candidate)); candidate.readability.contentScore = candidateScore; this.log("Candidate:", candidate, "with score " + candidateScore); @@ -1069,7 +1242,10 @@ Readability.prototype = { for (var t = 0; t < this._nbTopCandidates; t++) { var aTopCandidate = topCandidates[t]; - if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) { + if ( + !aTopCandidate || + candidateScore > aTopCandidate.readability.contentScore + ) { topCandidates.splice(t, 0, candidate); if (topCandidates.length > this._nbTopCandidates) topCandidates.pop(); @@ -1103,8 +1279,14 @@ Readability.prototype = { // and whose scores are quite closed with current `topCandidate` node. var alternativeCandidateAncestors = []; for (var i = 1; i < topCandidates.length; i++) { - if (topCandidates[i].readability.contentScore / topCandidate.readability.contentScore >= 0.75) { - alternativeCandidateAncestors.push(this._getNodeAncestors(topCandidates[i])); + if ( + topCandidates[i].readability.contentScore / + topCandidate.readability.contentScore >= + 0.75 + ) { + alternativeCandidateAncestors.push( + this._getNodeAncestors(topCandidates[i]) + ); } } var MINIMUM_TOPCANDIDATES = 3; @@ -1112,8 +1294,17 @@ Readability.prototype = { parentOfTopCandidate = topCandidate.parentNode; while (parentOfTopCandidate.tagName !== "BODY") { var listsContainingThisAncestor = 0; - for (var ancestorIndex = 0; ancestorIndex < alternativeCandidateAncestors.length && listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; ancestorIndex++) { - listsContainingThisAncestor += Number(alternativeCandidateAncestors[ancestorIndex].includes(parentOfTopCandidate)); + for ( + var ancestorIndex = 0; + ancestorIndex < alternativeCandidateAncestors.length && + listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; + ancestorIndex++ + ) { + listsContainingThisAncestor += Number( + alternativeCandidateAncestors[ancestorIndex].includes( + parentOfTopCandidate + ) + ); } if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) { topCandidate = parentOfTopCandidate; @@ -1143,8 +1334,7 @@ Readability.prototype = { continue; } var parentScore = parentOfTopCandidate.readability.contentScore; - if (parentScore < scoreThreshold) - break; + if (parentScore < scoreThreshold) break; if (parentScore > lastScore) { // Alright! We found a better parent to use. topCandidate = parentOfTopCandidate; @@ -1157,7 +1347,10 @@ Readability.prototype = { // If the top candidate is the only child, use parent instead. This will help sibling // joining logic when adjacent content is actually located in parent's sibling node. parentOfTopCandidate = topCandidate.parentNode; - while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) { + while ( + parentOfTopCandidate.tagName != "BODY" && + parentOfTopCandidate.children.length == 1 + ) { topCandidate = parentOfTopCandidate; parentOfTopCandidate = topCandidate.parentNode; } @@ -1170,10 +1363,12 @@ Readability.prototype = { // that might also be related. Things like preambles, content split by ads // that we removed, etc. var articleContent = doc.createElement("DIV"); - if (isPaging) - articleContent.id = "readability-content"; + if (isPaging) articleContent.id = "readability-content"; - var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); + var siblingScoreThreshold = Math.max( + 10, + topCandidate.readability.contentScore * 0.2 + ); // Keep potential top candidate's parent node to try to get text direction of it later. parentOfTopCandidate = topCandidate.parentNode; var siblings = parentOfTopCandidate.children; @@ -1182,8 +1377,17 @@ Readability.prototype = { var sibling = siblings[s]; var append = false; - this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : ""); - this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown"); + this.log( + "Looking at sibling node:", + sibling, + sibling.readability + ? "with score " + sibling.readability.contentScore + : "" + ); + this.log( + "Sibling has score", + sibling.readability ? sibling.readability.contentScore : "Unknown" + ); if (sibling === topCandidate) { append = true; @@ -1191,11 +1395,17 @@ Readability.prototype = { var contentBonus = 0; // Give a bonus if sibling nodes and top candidates have the example same classname - if (sibling.className === topCandidate.className && topCandidate.className !== "") + if ( + sibling.className === topCandidate.className && + topCandidate.className !== "" + ) contentBonus += topCandidate.readability.contentScore * 0.2; - if (sibling.readability && - ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) { + if ( + sibling.readability && + sibling.readability.contentScore + contentBonus >= + siblingScoreThreshold + ) { append = true; } else if (sibling.nodeName === "P") { var linkDensity = this._getLinkDensity(sibling); @@ -1204,8 +1414,12 @@ Readability.prototype = { if (nodeLength > 80 && linkDensity < 0.25) { append = true; - } else if (nodeLength < 80 && nodeLength > 0 && linkDensity === 0 && - nodeContent.search(/\.( |$)/) !== -1) { + } else if ( + nodeLength < 80 && + nodeLength > 0 && + linkDensity === 0 && + nodeContent.search(/\.( |$)/) !== -1 + ) { append = true; } } @@ -1276,15 +1490,27 @@ Readability.prototype = { if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { this._removeFlag(this.FLAG_STRIP_UNLIKELYS); - this._attempts.push({articleContent: articleContent, textLength: textLength}); + this._attempts.push({ + articleContent: articleContent, + textLength: textLength, + }); } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { this._removeFlag(this.FLAG_WEIGHT_CLASSES); - this._attempts.push({articleContent: articleContent, textLength: textLength}); + this._attempts.push({ + articleContent: articleContent, + textLength: textLength, + }); } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY); - this._attempts.push({articleContent: articleContent, textLength: textLength}); + this._attempts.push({ + articleContent: articleContent, + textLength: textLength, + }); } else { - this._attempts.push({articleContent: articleContent, textLength: textLength}); + this._attempts.push({ + articleContent: articleContent, + textLength: textLength, + }); // No luck after removing flags, just return the longest text we found during the different loops this._attempts.sort(function (a, b) { return b.textLength - a.textLength; @@ -1302,10 +1528,11 @@ Readability.prototype = { if (parseSuccessful) { // Find out text direction from ancestors of final top candidate. - var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate)); - this._someNode(ancestors, function(ancestor) { - if (!ancestor.tagName) - return false; + var ancestors = [parentOfTopCandidate, topCandidate].concat( + this._getNodeAncestors(parentOfTopCandidate) + ); + this._someNode(ancestors, function (ancestor) { + if (!ancestor.tagName) return false; var articleDir = ancestor.getAttribute("dir"); if (articleDir) { this._articleDir = articleDir; @@ -1326,10 +1553,10 @@ Readability.prototype = { * @param possibleByline {string} - a string to check whether its a byline. * @return Boolean - whether the input string is a byline. */ - _isValidByline: function(byline) { + _isValidByline: function (byline) { if (typeof byline == "string" || byline instanceof String) { byline = byline.trim(); - return (byline.length > 0) && (byline.length < 100); + return byline.length > 0 && byline.length < 100; } return false; }, @@ -1340,18 +1567,23 @@ Readability.prototype = { * @param str {string} - a string to unescape. * @return string without HTML entity. */ - _unescapeHtmlEntities: function(str) { + _unescapeHtmlEntities: function (str) { if (!str) { return str; } var htmlEscapeMap = this.HTML_ESCAPE_MAP; - return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) { - return htmlEscapeMap[tag]; - }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) { - var num = parseInt(hex || numStr, hex ? 16 : 10); - return String.fromCharCode(num); - }); + return str + .replace(/&(quot|amp|apos|lt|gt);/g, function (_, tag) { + return htmlEscapeMap[tag]; + }) + .replace( + /&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, + function (_, hex, numStr) { + var num = parseInt(hex || numStr, hex ? 16 : 10); + return String.fromCharCode(num); + } + ); }, /** @@ -1364,11 +1596,17 @@ Readability.prototype = { var metadata; - this._forEachNode(scripts, function(jsonLdElement) { - if (!metadata && jsonLdElement.getAttribute("type") === "application/ld+json") { + this._forEachNode(scripts, function (jsonLdElement) { + if ( + !metadata && + jsonLdElement.getAttribute("type") === "application/ld+json" + ) { try { // Strip CDATA markers if present - var content = jsonLdElement.textContent.replace(/^\s*\s*$/g, ""); + var content = jsonLdElement.textContent.replace( + /^\s*\s*$/g, + "" + ); var parsed = JSON.parse(content); if ( !parsed["@context"] || @@ -1378,10 +1616,8 @@ Readability.prototype = { } if (!parsed["@type"] && Array.isArray(parsed["@graph"])) { - parsed = parsed["@graph"].find(function(it) { - return (it["@type"] || "").match( - this.REGEXPS.jsonLdArticleTypes - ); + parsed = parsed["@graph"].find(function (it) { + return (it["@type"] || "").match(this.REGEXPS.jsonLdArticleTypes); }); } @@ -1395,14 +1631,19 @@ Readability.prototype = { metadata = {}; - if (typeof parsed.name === "string" && typeof parsed.headline === "string" && parsed.name !== parsed.headline) { + if ( + typeof parsed.name === "string" && + typeof parsed.headline === "string" && + parsed.name !== parsed.headline + ) { // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default. var title = this._getArticleTitle(); var nameMatches = this._textSimilarity(parsed.name, title) > 0.75; - var headlineMatches = this._textSimilarity(parsed.headline, title) > 0.75; + var headlineMatches = + this._textSimilarity(parsed.headline, title) > 0.75; if (headlineMatches && !nameMatches) { metadata.title = parsed.headline; @@ -1417,12 +1658,16 @@ Readability.prototype = { if (parsed.author) { if (typeof parsed.author.name === "string") { metadata.byline = parsed.author.name.trim(); - } else if (Array.isArray(parsed.author) && parsed.author[0] && typeof parsed.author[0].name === "string") { + } else if ( + Array.isArray(parsed.author) && + parsed.author[0] && + typeof parsed.author[0].name === "string" + ) { metadata.byline = parsed.author - .filter(function(author) { + .filter(function (author) { return author && typeof author.name === "string"; }) - .map(function(author) { + .map(function (author) { return author.name.trim(); }) .join(", "); @@ -1431,10 +1676,7 @@ Readability.prototype = { if (typeof parsed.description === "string") { metadata.excerpt = parsed.description.trim(); } - if ( - parsed.publisher && - typeof parsed.publisher.name === "string" - ) { + if (parsed.publisher && typeof parsed.publisher.name === "string") { metadata.siteName = parsed.publisher.name.trim(); } return; @@ -1454,19 +1696,21 @@ Readability.prototype = { * * @return Object with optional "excerpt" and "byline" properties */ - _getArticleMetadata: function(jsonld) { + _getArticleMetadata: function (jsonld) { var metadata = {}; var values = {}; var metaElements = this._doc.getElementsByTagName("meta"); // property is a space-separated list of values - var propertyPattern = /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*/gi; + var propertyPattern = + /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*/gi; // name is a single value - var namePattern = /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$/i; + var namePattern = + /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$/i; // Find description tags. - this._forEachNode(metaElements, function(element) { + this._forEachNode(metaElements, function (element) { var elementName = element.getAttribute("name"); var elementProperty = element.getAttribute("property"); var content = element.getAttribute("content"); @@ -1498,38 +1742,40 @@ Readability.prototype = { }); // get title - metadata.title = jsonld.title || - values["dc:title"] || - values["dcterm:title"] || - values["og:title"] || - values["weibo:article:title"] || - values["weibo:webpage:title"] || - values["title"] || - values["twitter:title"]; + metadata.title = + jsonld.title || + values["dc:title"] || + values["dcterm:title"] || + values["og:title"] || + values["weibo:article:title"] || + values["weibo:webpage:title"] || + values["title"] || + values["twitter:title"]; if (!metadata.title) { metadata.title = this._getArticleTitle(); } // get author - metadata.byline = jsonld.byline || - values["dc:creator"] || - values["dcterm:creator"] || - values["author"]; + metadata.byline = + jsonld.byline || + values["dc:creator"] || + values["dcterm:creator"] || + values["author"]; // get description - metadata.excerpt = jsonld.excerpt || - values["dc:description"] || - values["dcterm:description"] || - values["og:description"] || - values["weibo:article:description"] || - values["weibo:webpage:description"] || - values["description"] || - values["twitter:description"]; + metadata.excerpt = + jsonld.excerpt || + values["dc:description"] || + values["dcterm:description"] || + values["og:description"] || + values["weibo:article:description"] || + values["weibo:webpage:description"] || + values["description"] || + values["twitter:description"]; // get site name - metadata.siteName = jsonld.siteName || - values["og:site_name"]; + metadata.siteName = jsonld.siteName || values["og:site_name"]; // in many sites the meta value is escaped with HTML entities, // so here we need to unescape it @@ -1546,8 +1792,8 @@ Readability.prototype = { * whether as a direct child or as its descendants. * * @param Element - **/ - _isSingleImage: function(node) { + **/ + _isSingleImage: function (node) { if (node.tagName === "IMG") { return true; } @@ -1566,12 +1812,12 @@ Readability.prototype = { * some sites (e.g. Medium). * * @param Element - **/ - _unwrapNoscriptImages: function(doc) { + **/ + _unwrapNoscriptImages: function (doc) { // Find img without source or attributes that might contains image, and remove it. // This is done to prevent a placeholder img is replaced by img from noscript in next step. var imgs = Array.from(doc.getElementsByTagName("img")); - this._forEachNode(imgs, function(img) { + this._forEachNode(imgs, function (img) { for (var i = 0; i < img.attributes.length; i++) { var attr = img.attributes[i]; switch (attr.name) { @@ -1592,7 +1838,7 @@ Readability.prototype = { // Next find noscript and try to extract its image var noscripts = Array.from(doc.getElementsByTagName("noscript")); - this._forEachNode(noscripts, function(noscript) { + this._forEachNode(noscripts, function (noscript) { // Parse content of noscript and make sure it only contains image var tmp = doc.createElement("div"); tmp.innerHTML = noscript.innerHTML; @@ -1617,7 +1863,11 @@ Readability.prototype = { continue; } - if (attr.name === "src" || attr.name === "srcset" || /\.(jpg|jpeg|png|webp)/i.test(attr.value)) { + if ( + attr.name === "src" || + attr.name === "srcset" || + /\.(jpg|jpeg|png|webp)/i.test(attr.value) + ) { if (newImg.getAttribute(attr.name) === attr.value) { continue; } @@ -1640,13 +1890,16 @@ Readability.prototype = { * Removes script tags from the document. * * @param Element - **/ - _removeScripts: function(doc) { - this._removeNodes(this._getAllNodesWithTag(doc, ["script"]), function(scriptNode) { - scriptNode.nodeValue = ""; - scriptNode.removeAttribute("src"); - return true; - }); + **/ + _removeScripts: function (doc) { + this._removeNodes( + this._getAllNodesWithTag(doc, ["script"]), + function (scriptNode) { + scriptNode.nodeValue = ""; + scriptNode.removeAttribute("src"); + return true; + } + ); this._removeNodes(this._getAllNodesWithTag(doc, ["noscript"])); }, @@ -1657,25 +1910,31 @@ Readability.prototype = { * * @param Element * @param string tag of child element - **/ - _hasSingleTagInsideElement: function(element, tag) { + **/ + _hasSingleTagInsideElement: function (element, tag) { // There should be exactly 1 element child with given tag if (element.children.length != 1 || element.children[0].tagName !== tag) { return false; } // And there should be no text nodes with real content - return !this._someNode(element.childNodes, function(node) { - return node.nodeType === this.TEXT_NODE && - this.REGEXPS.hasContent.test(node.textContent); + return !this._someNode(element.childNodes, function (node) { + return ( + node.nodeType === this.TEXT_NODE && + this.REGEXPS.hasContent.test(node.textContent) + ); }); }, - _isElementWithoutContent: function(node) { - return node.nodeType === this.ELEMENT_NODE && + _isElementWithoutContent: function (node) { + return ( + node.nodeType === this.ELEMENT_NODE && node.textContent.trim().length == 0 && (node.children.length == 0 || - node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length); + node.children.length == + node.getElementsByTagName("br").length + + node.getElementsByTagName("hr").length) + ); }, /** @@ -1684,25 +1943,35 @@ Readability.prototype = { * @param Element */ _hasChildBlockElement: function (element) { - return this._someNode(element.childNodes, function(node) { - return this.DIV_TO_P_ELEMS.has(node.tagName) || - this._hasChildBlockElement(node); + return this._someNode(element.childNodes, function (node) { + return ( + this.DIV_TO_P_ELEMS.has(node.tagName) || + this._hasChildBlockElement(node) + ); }); }, /*** * Determine if a node qualifies as phrasing content. * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content - **/ - _isPhrasingContent: function(node) { - return node.nodeType === this.TEXT_NODE || this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 || - ((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") && - this._everyNode(node.childNodes, this._isPhrasingContent)); + **/ + _isPhrasingContent: function (node) { + return ( + node.nodeType === this.TEXT_NODE || + this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 || + ((node.tagName === "A" || + node.tagName === "DEL" || + node.tagName === "INS") && + this._everyNode(node.childNodes, this._isPhrasingContent)) + ); }, - _isWhitespace: function(node) { - return (node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0) || - (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR"); + _isWhitespace: function (node) { + return ( + (node.nodeType === this.TEXT_NODE && + node.textContent.trim().length === 0) || + (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR") + ); }, /** @@ -1712,9 +1981,10 @@ Readability.prototype = { * @param Element * @param Boolean normalizeSpaces (default: true) * @return string - **/ - _getInnerText: function(e, normalizeSpaces) { - normalizeSpaces = (typeof normalizeSpaces === "undefined") ? true : normalizeSpaces; + **/ + _getInnerText: function (e, normalizeSpaces) { + normalizeSpaces = + typeof normalizeSpaces === "undefined" ? true : normalizeSpaces; var textContent = e.textContent.trim(); if (normalizeSpaces) { @@ -1729,8 +1999,8 @@ Readability.prototype = { * @param Element * @param string - what to split on. Default is "," * @return number (integer) - **/ - _getCharCount: function(e, s) { + **/ + _getCharCount: function (e, s) { s = s || ","; return this._getInnerText(e).split(s).length - 1; }, @@ -1741,10 +2011,9 @@ Readability.prototype = { * * @param Element * @return void - **/ - _cleanStyles: function(e) { - if (!e || e.tagName.toLowerCase() === "svg") - return; + **/ + _cleanStyles: function (e) { + if (!e || e.tagName.toLowerCase() === "svg") return; // Remove `style` and deprecated presentational attributes for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) { @@ -1769,16 +2038,15 @@ Readability.prototype = { * * @param Element * @return number (float) - **/ - _getLinkDensity: function(element) { + **/ + _getLinkDensity: function (element) { var textLength = this._getInnerText(element).length; - if (textLength === 0) - return 0; + if (textLength === 0) return 0; var linkLength = 0; // XXX implement _reduceNodeList? - this._forEachNode(element.getElementsByTagName("a"), function(linkNode) { + this._forEachNode(element.getElementsByTagName("a"), function (linkNode) { var href = linkNode.getAttribute("href"); var coefficient = href && this.REGEXPS.hashUrl.test(href) ? 0.3 : 1; linkLength += this._getInnerText(linkNode).length * coefficient; @@ -1793,29 +2061,24 @@ Readability.prototype = { * * @param Element * @return number (Integer) - **/ - _getClassWeight: function(e) { - if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) - return 0; + **/ + _getClassWeight: function (e) { + if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) return 0; var weight = 0; // Look for a special classname - if (typeof(e.className) === "string" && e.className !== "") { - if (this.REGEXPS.negative.test(e.className)) - weight -= 25; + if (typeof e.className === "string" && e.className !== "") { + if (this.REGEXPS.negative.test(e.className)) weight -= 25; - if (this.REGEXPS.positive.test(e.className)) - weight += 25; + if (this.REGEXPS.positive.test(e.className)) weight += 25; } // Look for a special ID - if (typeof(e.id) === "string" && e.id !== "") { - if (this.REGEXPS.negative.test(e.id)) - weight -= 25; + if (typeof e.id === "string" && e.id !== "") { + if (this.REGEXPS.negative.test(e.id)) weight -= 25; - if (this.REGEXPS.positive.test(e.id)) - weight += 25; + if (this.REGEXPS.positive.test(e.id)) weight += 25; } return weight; @@ -1829,10 +2092,10 @@ Readability.prototype = { * @param string tag to clean * @return void **/ - _clean: function(e, tag) { + _clean: function (e, tag) { var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1; -