diff options
author | NunoSempere <nuno.sempere@protonmail.com> | 2023-09-15 12:40:50 +0300 |
---|---|---|
committer | NunoSempere <nuno.sempere@protonmail.com> | 2023-09-15 12:40:50 +0300 |
commit | 58b522be65516578b812f767c1f37042d4ff6c5b (patch) | |
tree | 61725207c1525a3743127bcd60d8633fd95de707 /plugins/readability | |
parent | 2ce28303518c42cd17096145beecc1c89110429f (diff) |
prettier pass.
Diffstat (limited to 'plugins/readability')
-rw-r--r-- | plugins/readability/readability.js | 1149 |
1 files changed, 730 insertions, 419 deletions
diff --git a/plugins/readability/readability.js b/plugins/readability/readability.js index 0a5bda7..3bef151 100644 --- a/plugins/readability/readability.js +++ b/plugins/readability/readability.js @@ -31,7 +31,9 @@ function Readability(doc, options) { doc = options; options = arguments[2]; } else if (!doc || !doc.documentElement) { - throw new Error("First argument to Readability constructor should be a document object."); + throw new Error( + "First argument to Readability constructor should be a document object." + ); } options = options || {}; @@ -45,41 +47,49 @@ function Readability(doc, options) { // Configurable options this._debug = !!options.debug; - this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE; - this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; + this._maxElemsToParse = + options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE; + this._nbTopCandidates = + options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD; - this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []); + this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat( + options.classesToPreserve || [] + ); this._keepClasses = !!options.keepClasses; - this._serializer = options.serializer || function(el) { - return el.innerHTML; - }; + this._serializer = + options.serializer || + function (el) { + return el.innerHTML; + }; this._disableJSONLD = !!options.disableJSONLD; // Start with all flags set - this._flags = this.FLAG_STRIP_UNLIKELYS | - this.FLAG_WEIGHT_CLASSES | - this.FLAG_CLEAN_CONDITIONALLY; - + this._flags = + this.FLAG_STRIP_UNLIKELYS | + this.FLAG_WEIGHT_CLASSES | + this.FLAG_CLEAN_CONDITIONALLY; // Control whether log messages are sent to the console if (this._debug) { - let logNode = function(node) { + let logNode = function (node) { if (node.nodeType == node.TEXT_NODE) { return `${node.nodeName} ("${node.textContent}")`; } - let attrPairs = Array.from(node.attributes || [], function(attr) { + let attrPairs = Array.from(node.attributes || [], function (attr) { return `${attr.name}="${attr.value}"`; }).join(" "); return `<${node.localName} ${attrPairs}>`; }; this.log = function () { if (typeof dump !== "undefined") { - var msg = Array.prototype.map.call(arguments, function(x) { - return (x && x.nodeName) ? logNode(x) : x; - }).join(" "); + var msg = Array.prototype.map + .call(arguments, function (x) { + return x && x.nodeName ? logNode(x) : x; + }) + .join(" "); dump("Reader: (Readability) " + msg + "\n"); } else if (typeof console !== "undefined") { - let args = Array.from(arguments, arg => { + let args = Array.from(arguments, (arg) => { if (arg && arg.nodeType == this.ELEMENT_NODE) { return logNode(arg); } @@ -111,7 +121,9 @@ Readability.prototype = { DEFAULT_N_TOP_CANDIDATES: 5, // Element tags to score by default. - DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","), + DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre" + .toUpperCase() + .split(","), // The default number of chars an article must have in order to return a result DEFAULT_CHAR_THRESHOLD: 500, @@ -121,16 +133,21 @@ Readability.prototype = { REGEXPS: { // NOTE: These two regular expressions are duplicated in // Readability-readerable.js. Please keep both copies in sync. - unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, + unlikelyCandidates: + /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, - positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, - negative: /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, - extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, + positive: + /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, + negative: + /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, + extraneous: + /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, byline: /byline|author|dateline|writtenby|p-author/i, replaceFonts: /<(\/?)font[^>]*>/gi, normalize: /\s{2,}/g, - videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i, + videos: + /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i, shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i, nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, prevLink: /(prev|earl|old|new|<|«)/i, @@ -141,40 +158,106 @@ Readability.prototype = { srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g, b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i, // See: https://schema.org/Article - jsonLdArticleTypes: /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/ + jsonLdArticleTypes: + /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/, }, - UNLIKELY_ROLES: [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ], + UNLIKELY_ROLES: [ + "menu", + "menubar", + "complementary", + "navigation", + "alert", + "alertdialog", + "dialog", + ], - DIV_TO_P_ELEMS: new Set([ "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL" ]), + DIV_TO_P_ELEMS: new Set([ + "BLOCKQUOTE", + "DL", + "DIV", + "IMG", + "OL", + "P", + "PRE", + "TABLE", + "UL", + ]), ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"], - PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ], + PRESENTATIONAL_ATTRIBUTES: [ + "align", + "background", + "bgcolor", + "border", + "cellpadding", + "cellspacing", + "frame", + "hspace", + "rules", + "style", + "valign", + "vspace", + ], - DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ], + DEPRECATED_SIZE_ATTRIBUTE_ELEMS: ["TABLE", "TH", "TD", "HR", "PRE"], // The commented out elements qualify as phrasing content but tend to be // removed by readability when put into paragraphs, so we ignore them here. PHRASING_ELEMS: [ // "CANVAS", "IFRAME", "SVG", "VIDEO", - "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", - "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL", - "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q", - "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB", - "SUP", "TEXTAREA", "TIME", "VAR", "WBR" + "ABBR", + "AUDIO", + "B", + "BDO", + "BR", + "BUTTON", + "CITE", + "CODE", + "DATA", + "DATALIST", + "DFN", + "EM", + "EMBED", + "I", + "IMG", + "INPUT", + "KBD", + "LABEL", + "MARK", + "MATH", + "METER", + "NOSCRIPT", + "OBJECT", + "OUTPUT", + "PROGRESS", + "Q", + "RUBY", + "SAMP", + "SCRIPT", + "SELECT", + "SMALL", + "SPAN", + "STRONG", + "SUB", + "SUP", + "TEXTAREA", + "TIME", + "VAR", + "WBR", ], // These are the classes that readability sets itself. - CLASSES_TO_PRESERVE: [ "page" ], + CLASSES_TO_PRESERVE: ["page"], // These are the list of HTML entities that need to be escaped. HTML_ESCAPE_MAP: { - "lt": "<", - "gt": ">", - "amp": "&", - "quot": '"', - "apos": "'", + lt: "<", + gt: ">", + amp: "&", + quot: '"', + apos: "'", }, /** @@ -182,8 +265,8 @@ Readability.prototype = { * * @param Element * @return void - **/ - _postProcessContent: function(articleContent) { + **/ + _postProcessContent: function (articleContent) { // Readability cannot open relative uris so we convert them to absolute uris. this._fixRelativeUris(articleContent); @@ -205,7 +288,7 @@ Readability.prototype = { * @param Function filterFn the function to use as a filter * @return void */ - _removeNodes: function(nodeList, filterFn) { + _removeNodes: function (nodeList, filterFn) { // Avoid ever operating on live node lists. if (this._docJSDOMParser && nodeList._isLiveNodeList) { throw new Error("Do not pass live node lists to _removeNodes"); @@ -228,7 +311,7 @@ Readability.prototype = { * @param String newTagName the new tag name to use * @return void */ - _replaceNodeTags: function(nodeList, newTagName) { + _replaceNodeTags: function (nodeList, newTagName) { // Avoid ever operating on live node lists. if (this._docJSDOMParser && nodeList._isLiveNodeList) { throw new Error("Do not pass live node lists to _replaceNodeTags"); @@ -249,7 +332,7 @@ Readability.prototype = { * @param Function fn The iterate function. * @return void */ - _forEachNode: function(nodeList, fn) { + _forEachNode: function (nodeList, fn) { Array.prototype.forEach.call(nodeList, fn, this); }, @@ -264,7 +347,7 @@ Readability.prototype = { * @param Function fn The test function. * @return void */ - _findNode: function(nodeList, fn) { + _findNode: function (nodeList, fn) { return Array.prototype.find.call(nodeList, fn, this); }, @@ -279,7 +362,7 @@ Readability.prototype = { * @param Function fn The iterate function. * @return Boolean */ - _someNode: function(nodeList, fn) { + _someNode: function (nodeList, fn) { return Array.prototype.some.call(nodeList, fn, this); }, @@ -294,7 +377,7 @@ Readability.prototype = { * @param Function fn The iterate function. * @return Boolean */ - _everyNode: function(nodeList, fn) { + _everyNode: function (nodeList, fn) { return Array.prototype.every.call(nodeList, fn, this); }, @@ -304,23 +387,26 @@ Readability.prototype = { * @return ...NodeList * @return Array */ - _concatNodeLists: function() { + _concatNodeLists: function () { var slice = Array.prototype.slice; var args = slice.call(arguments); - var nodeLists = args.map(function(list) { + var nodeLists = args.map(function (list) { return slice.call(list); }); return Array.prototype.concat.apply([], nodeLists); }, - _getAllNodesWithTag: function(node, tagNames) { + _getAllNodesWithTag: function (node, tagNames) { if (node.querySelectorAll) { return node.querySelectorAll(tagNames.join(",")); } - return [].concat.apply([], tagNames.map(function(tag) { - var collection = node.getElementsByTagName(tag); - return Array.isArray(collection) ? collection : Array.from(collection); - })); + return [].concat.apply( + [], + tagNames.map(function (tag) { + var collection = node.getElementsByTagName(tag); + return Array.isArray(collection) ? collection : Array.from(collection); + }) + ); }, /** @@ -331,11 +417,11 @@ Readability.prototype = { * @param Element * @return void */ - _cleanClasses: function(node) { + _cleanClasses: function (node) { var classesToPreserve = this._classesToPreserve; var className = (node.getAttribute("class") || "") .split(/\s+/) - .filter(function(cls) { + .filter(function (cls) { return classesToPreserve.indexOf(cls) != -1; }) .join(" "); @@ -358,7 +444,7 @@ Readability.prototype = { * @param Element * @return void */ - _fixRelativeUris: function(articleContent) { + _fixRelativeUris: function (articleContent) { var baseURI = this._doc.baseURI; var documentURI = this._doc.documentURI; function toAbsoluteURI(uri) { @@ -377,14 +463,17 @@ Readability.prototype = { } var links = this._getAllNodesWithTag(articleContent, ["a"]); - this._forEachNode(links, function(link) { + this._forEachNode(links, function (link) { var href = link.getAttribute("href"); if (href) { // Remove links with javascript: URIs, since // they won't work after scripts have been removed from the page. if (href.indexOf("javascript:") === 0) { // if the link only contains simple text content, it can be converted to a text node - if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) { + if ( + link.childNodes.length === 1 && + link.childNodes[0].nodeType === this.TEXT_NODE + ) { var text = this._doc.createTextNode(link.textContent); link.parentNode.replaceChild(text, link); } else { @@ -402,10 +491,15 @@ Readability.prototype = { }); var medias = this._getAllNodesWithTag(articleContent, [ - "img", "picture", "figure", "video", "audio", "source" + "img", + "picture", + "figure", + "video", + "audio", + "source", ]); - this._forEachNode(medias, function(media) { + this._forEachNode(medias, function (media) { var src = media.getAttribute("src"); var poster = media.getAttribute("poster"); var srcset = media.getAttribute("srcset"); @@ -419,27 +513,40 @@ Readability.prototype = { } if (srcset) { - var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) { - return toAbsoluteURI(p1) + (p2 || "") + p3; - }); + var newSrcset = srcset.replace( + this.REGEXPS.srcsetUrl, + function (_, p1, p2, p3) { + return toAbsoluteURI(p1) + (p2 || "") + p3; + } + ); media.setAttribute("srcset", newSrcset); } }); }, - _simplifyNestedElements: function(articleContent) { + _simplifyNestedElements: function (articleContent) { var node = articleContent; while (node) { - if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) { + if ( + node.parentNode && + ["DIV", "SECTION"].includes(node.tagName) && + !(node.id && node.id.startsWith("readability")) + ) { if (this._isElementWithoutContent(node)) { node = this._removeAndGetNext(node); continue; - } else if (this._hasSingleTagInsideElement(node, "DIV") || this._hasSingleTagInsideElement(node, "SECTION")) { + } else if ( + this._hasSingleTagInsideElement(node, "DIV") || + this._hasSingleTagInsideElement(node, "SECTION") + ) { var child = node.children[0]; for (var i = 0; i < node.attributes.length; i++) { - child.setAttribute(node.attributes[i].name, node.attributes[i].value); + child.setAttribute( + node.attributes[i].name, + node.attributes[i].value + ); } node.parentNode.replaceChild(child, node); node = child; @@ -456,7 +563,7 @@ Readability.prototype = { * * @return string **/ - _getArticleTitle: function() { + _getArticleTitle: function () { var doc = this._doc; var curTitle = ""; var origTitle = ""; @@ -466,8 +573,12 @@ Readability.prototype = { // If they had an element with id "title" in their HTML if (typeof curTitle !== "string") - curTitle = origTitle = this._getInnerText(doc.getElementsByTagName("title")[0]); - } catch (e) {/* ignore exceptions setting the title. */} + curTitle = origTitle = this._getInnerText( + doc.getElementsByTagName("title")[0] + ); + } catch (e) { + /* ignore exceptions setting the title. */ + } var titleHadHierarchicalSeparators = false; function wordCount(str) { @@ -475,7 +586,7 @@ Readability.prototype = { } // If there's a separator in the title, first remove the final part - if ((/ [\|\-\\\/>»] /).test(curTitle)) { + if (/ [\|\-\\\/>»] /.test(curTitle)) { titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle); curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1"); @@ -491,7 +602,7 @@ Readability.prototype = { doc.getElementsByTagName("h2") ); var trimmedTitle = curTitle.trim(); - var match = this._someNode(headings, function(heading) { + var match = this._someNode(headings, function (heading) { return heading.textContent.trim() === trimmedTitle; }); @@ -511,8 +622,7 @@ Readability.prototype = { } else if (curTitle.length > 150 || curTitle.length < 15) { var hOnes = doc.getElementsByTagName("h1"); - if (hOnes.length === 1) - curTitle = this._getInnerText(hOnes[0]); + if (hOnes.length === 1) curTitle = this._getInnerText(hOnes[0]); } curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " "); @@ -521,9 +631,12 @@ Readability.prototype = { // title or we decreased the number of words by more than 1 word, use // the original title. var curTitleWordCount = wordCount(curTitle); - if (curTitleWordCount <= 4 && - (!titleHadHierarchicalSeparators || - curTitleWordCount != wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)) { + if ( + curTitleWordCount <= 4 && + (!titleHadHierarchicalSeparators || + curTitleWordCount != + wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1) + ) { curTitle = origTitle; } @@ -536,7 +649,7 @@ Readability.prototype = { * * @return void **/ - _prepDocument: function() { + _prepDocument: function () { var doc = this._doc; // Remove all style tags in head @@ -556,9 +669,11 @@ Readability.prototype = { */ _nextNode: function (node) { var next = node; - while (next - && (next.nodeType != this.ELEMENT_NODE) - && this.REGEXPS.whitespace.test(next.textContent)) { + while ( + next && + next.nodeType != this.ELEMENT_NODE && + this.REGEXPS.whitespace.test(next.textContent) + ) { next = next.nextSibling; } return next; @@ -572,7 +687,7 @@ Readability.prototype = { * <div>foo<br>bar<p>abc</p></div> */ _replaceBrs: function (elem) { - this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) { + this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function (br) { var next = br.nextSibling; // Whether 2 or more <br> elements have been found and replaced with a @@ -582,7 +697,7 @@ Readability.prototype = { // If we find a <br> chain, remove the <br>s until we hit another node // or non-whitespace. This leaves behind the first <br> in the chain // (which will be replaced with a <p> later). - while ((next = this._nextNode(next)) && (next.tagName == "BR")) { + while ((next = this._nextNode(next)) && next.tagName == "BR") { replaced = true; var brSibling = next.nextSibling; next.parentNode.removeChild(next); @@ -601,12 +716,10 @@ Readability.prototype = { // If we've hit another <br><br>, we're done adding children to this <p>. if (next.tagName == "BR") { var nextElem = this._nextNode(next.nextSibling); - if (nextElem && nextElem.tagName == "BR") - break; + if (nextElem && nextElem.tagName == "BR") break; } - if (!this._isPhrasingContent(next)) - break; + if (!this._isPhrasingContent(next)) break; // Otherwise, make this node a child of the new <p>. var sibling = next.nextSibling; @@ -618,8 +731,7 @@ Readability.prototype = { p.removeChild(p.lastChild); } - if (p.parentNode.tagName === "P") - this._setNodeTag(p.parentNode, "DIV"); + if (p.parentNode.tagName === "P") this._setNodeTag(p.parentNode, "DIV"); } }); }, @@ -637,12 +749,14 @@ Readability.prototype = { replacement.appendChild(node.firstChild); } node.parentNode.replaceChild(replacement, node); - if (node.readability) - replacement.readability = node.readability; + if (node.readability) replacement.readability = node.readability; for (var i = 0; i < node.attributes.length; i++) { try { - replacement.setAttribute(node.attributes[i].name, node.attributes[i].value); + replacement.setAttribute( + node.attributes[i].name, + node.attributes[i].value + ); } catch (ex) { /* it's possible for setAttribute() to throw if the attribute name * isn't a valid XML Name. Such attributes can however be parsed from @@ -662,7 +776,7 @@ Readability.prototype = { * @param Element * @return void **/ - _prepArticle: function(articleContent) { + _prepArticle: function (articleContent) { this._cleanStyles(articleContent); // Check for data tables before we continue, to avoid removing items in @@ -688,7 +802,10 @@ Readability.prototype = { this._forEachNode(articleContent.children, function (topCandidate) { this._cleanMatchedNodes(topCandidate, function (node, matchString) { - return this.REGEXPS.shareElements.test(matchString) && node.textContent.length < shareElementThreshold; + return ( + this.REGEXPS.shareElements.test(matchString) && + node.textContent.length < shareElementThreshold + ); }); }); @@ -706,38 +823,56 @@ Readability.prototype = { this._cleanConditionally(articleContent, "div"); // replace H1 with H2 as H1 should be only title that is displayed separately - this._replaceNodeTags(this._getAllNodesWithTag(articleContent, ["h1"]), "h2"); + this._replaceNodeTags( + this._getAllNodesWithTag(articleContent, ["h1"]), + "h2" + ); // Remove extra paragraphs - this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) { - var imgCount = paragraph.getElementsByTagName("img").length; - var embedCount = paragraph.getElementsByTagName("embed").length; - var objectCount = paragraph.getElementsByTagName("object").length; - // At this point, nasty iframes have been removed, only remain embedded video ones. - var iframeCount = paragraph.getElementsByTagName("iframe").length; - var totalCount = imgCount + embedCount + objectCount + iframeCount; - - return totalCount === 0 && !this._getInnerText(paragraph, false); - }); + this._removeNodes( + this._getAllNodesWithTag(articleContent, ["p"]), + function (paragraph) { + var imgCount = paragraph.getElementsByTagName("img").length; + var embedCount = paragraph.getElementsByTagName("embed").length; + var objectCount = paragraph.getElementsByTagName("object").length; + // At this point, nasty iframes have been removed, only remain embedded video ones. + var iframeCount = paragraph.getElementsByTagName("iframe").length; + var totalCount = imgCount + embedCount + objectCount + iframeCount; + + return totalCount === 0 && !this._getInnerText(paragraph, false); + } + ); - this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) { - var next = this._nextNode(br.nextSibling); - if (next && next.tagName == "P") - br.parentNode.removeChild(br); - }); + this._forEachNode( + this._getAllNodesWithTag(articleContent, ["br"]), + function (br) { + var next = this._nextNode(br.nextSibling); + if (next && next.tagName == "P") br.parentNode.removeChild(br); + } + ); // Remove single-cell tables - this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) { - var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table; - if (this._hasSingleTagInsideElement(tbody, "TR")) { - var row = tbody.firstElementChild; - if (this._hasSingleTagInsideElement(row, "TD")) { - var cell = row.firstElementChild; - cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV"); - table.parentNode.replaceChild(cell, table); + this._forEachNode( + this._getAllNodesWithTag(articleContent, ["table"]), + function (table) { + var tbody = this._hasSingleTagInsideElement(table, "TBODY") + ? table.firstElementChild + : table; + if (this._hasSingleTagInsideElement(tbody, "TR")) { + var row = tbody.firstElementChild; + if (this._hasSingleTagInsideElement(row, "TD")) { + var cell = row.firstElementChild; + cell = this._setNodeTag( + cell, + this._everyNode(cell.childNodes, this._isPhrasingContent) + ? "P" + : "DIV" + ); + table.parentNode.replaceChild(cell, table); + } } } - }); + ); }, /** @@ -746,9 +881,9 @@ Readability.prototype = { * * @param Element * @return void - **/ - _initializeNode: function(node) { - node.readability = {"contentScore": 0}; + **/ + _initializeNode: function (node) { + node.readability = { contentScore: 0 }; switch (node.tagName) { case "DIV": @@ -786,7 +921,7 @@ Readability.prototype = { node.readability.contentScore += this._getClassWeight(node); }, - _removeAndGetNext: function(node) { + _removeAndGetNext: function (node) { var nextNode = this._getNextNode(node, true); node.parentNode.removeChild(node); return nextNode; @@ -799,7 +934,7 @@ Readability.prototype = { * * Calling this in a loop will traverse the DOM depth-first. */ - _getNextNode: function(node, ignoreSelfAndKids) { + _getNextNode: function (node, ignoreSelfAndKids) { // First check for kids if those aren't being ignored if (!ignoreSelfAndKids && node.firstElementChild) { return node.firstElementChild; @@ -821,18 +956,24 @@ Readability.prototype = { // 1 = same text, 0 = completely different text // works the way that it splits both texts into words and then finds words that are unique in second text // the result is given by the lower length of unique parts - _textSimilarity: function(textA, textB) { - var tokensA = textA.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean); - var tokensB = textB.toLowerCase().split(this.REGEXPS.tokenize).filter(Boolean); + _textSimilarity: function (textA, textB) { + var tokensA = textA + .toLowerCase() + .split(this.REGEXPS.tokenize) + .filter(Boolean); + var tokensB = textB + .toLowerCase() + .split(this.REGEXPS.tokenize) + .filter(Boolean); if (!tokensA.length || !tokensB.length) { return 0; } - var uniqTokensB = tokensB.filter(token => !tokensA.includes(token)); + var uniqTokensB = tokensB.filter((token) => !tokensA.includes(token)); var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length; return 1 - distanceB; }, - _checkByline: function(node, matchString) { + _checkByline: function (node, matchString) { if (this._articleByline) { return false; } @@ -842,7 +983,12 @@ Readability.prototype = { var itemprop = node.getAttribute("itemprop"); } - if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) { + if ( + (rel === "author" || + (itemprop && itemprop.indexOf("author") !== -1) || + this.REGEXPS.byline.test(matchString)) && + this._isValidByline(node.textContent) + ) { this._articleByline = node.textContent.trim(); return true; } @@ -850,13 +996,13 @@ Readability.prototype = { return false; }, - _getNodeAncestors: function(node, maxDepth) { + _getNodeAncestors: function (node, maxDepth) { maxDepth = maxDepth || 0; - var i = 0, ancestors = []; + var i = 0, + ancestors = []; while (node.parentNode) { ancestors.push(node.parentNode); - if (maxDepth && ++i === maxDepth) - break; + if (maxDepth && ++i === maxDepth) break; node = node.parentNode; } return ancestors; @@ -868,7 +1014,7 @@ Readability.prototype = { * * @param page a document to run upon. Needs to be a full document, complete with body. * @return Element - **/ + **/ _grabArticle: function (page) { this.log("**** grabArticle ****"); var doc = this._doc; @@ -885,7 +1031,9 @@ Readability.prototype = { while (true) { this.log("Starting grabArticle loop"); - var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS); + var stripUnlikelyCandidates = this._flagIsActive( + this.FLAG_STRIP_UNLIKELYS + ); // First, node prepping. Trash nodes that look cruddy (like ones with the // class name "comment", etc), and turn divs into P tags where they have been @@ -896,7 +1044,6 @@ Readability.prototype = { let shouldRemoveTitleHeader = true; while (node) { - if (node.tagName === "HTML") { this._articleLang = node.getAttribute("lang"); } @@ -916,7 +1063,11 @@ Readability.prototype = { } if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) { - this.log("Removing header: ", node.textContent.trim(), this._articleTitle.trim()); + this.log( + "Removing header: ", + node.textContent.trim(), + this._articleTitle.trim() + ); shouldRemoveTitleHeader = false; node = this._removeAndGetNext(node); continue; @@ -924,29 +1075,44 @@ Readability.prototype = { // Remove unlikely candidates if (stripUnlikelyCandidates) { - if (this.REGEXPS.unlikelyCandidates.test(matchString) && - !this.REGEXPS.okMaybeItsACandidate.test(matchString) && - !this._hasAncestorTag(node, "table") && - !this._hasAncestorTag(node, "code") && - node.tagName !== "BODY" && - node.tagName !== "A") { + if ( + this.REGEXPS.unlikelyCandidates.test(matchString) && + !this.REGEXPS.okMaybeItsACandidate.test(matchString) && + !this._hasAncestorTag(node, "table") && + !this._hasAncestorTag(node, "code") && + node.tagName !== "BODY" && + node.tagName !== "A" + ) { this.log("Removing unlikely candidate - " + matchString); node = this._removeAndGetNext(node); continue; } if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) { - this.log("Removing content with role " + node.getAttribute("role") + " - " + matchString); + this.log( + "Removing content with role " + + node.getAttribute("role") + + " - " + + matchString + ); node = this._removeAndGetNext(node); continue; } } // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). - if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" || - node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" || - node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") && - this._isElementWithoutContent(node)) { + if ( + (node.tagName === "DIV" || + node.tagName === "SECTION" || + node.tagName === "HEADER" || + node.tagName === "H1" || + node.tagName === "H2" || + node.tagName === "H3" || + node.tagName === "H4" || + node.tagName === "H5" || + node.tagName === "H6") && + this._isElementWithoutContent(node) + ) { node = this._removeAndGetNext(node); continue; } @@ -983,7 +1149,10 @@ Readability.prototype = { // element. DIVs with only a P element inside and no text content can be // safely converted into plain P elements to avoid confusing the scoring // algorithm with DIVs with are, in practice, paragraphs. - if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) { + if ( + this._hasSingleTagInsideElement(node, "P") && + this._getLinkDensity(node) < 0.25 + ) { var newNode = node.children[0]; node.parentNode.replaceChild(newNode, node); node = newNode; @@ -1001,21 +1170,22 @@ Readability.prototype = { * Then add their score to their parent node. * * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. - **/ + **/ var candidates = []; - this._forEachNode(elementsToScore, function(elementToScore) { - if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === "undefined") + this._forEachNode(elementsToScore, function (elementToScore) { + if ( + !elementToScore.parentNode || + typeof elementToScore.parentNode.tagName === "undefined" + ) return; // If this paragraph is less than 25 characters, don't even count it. var innerText = this._getInnerText(elementToScore); - if (innerText.length < 25) - return; + if (innerText.length < 25) return; // Exclude nodes with no ancestor. var ancestors = this._getNodeAncestors(elementToScore, 5); - if (ancestors.length === 0) - return; + if (ancestors.length === 0) return; var contentScore = 0; @@ -1029,11 +1199,15 @@ Readability.prototype = { contentScore += Math.min(Math.floor(innerText.length / 100), 3); // Initialize and score ancestors. - this._forEachNode(ancestors, function(ancestor, level) { - if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === "undefined") + this._forEachNode(ancestors, function (ancestor, level) { + if ( + !ancestor.tagName || + !ancestor.parentNode || + typeof ancestor.parentNode.tagName === "undefined" + ) return; - if (typeof(ancestor.readability) === "undefined") { + if (typeof ancestor.readability === "undefined") { this._initializeNode(ancestor); candidates.push(ancestor); } @@ -1042,12 +1216,9 @@ Readability.prototype = { // - parent: 1 (no division) // - grandparent: 2 // - great grandparent+: ancestor level * 3 - if (level === 0) - var scoreDivider = 1; - else if (level === 1) - scoreDivider = 2; - else - scoreDivider = level * 3; + |