diff --git a/toolkit/components/reader/JSDOMParser.js b/toolkit/components/reader/JSDOMParser.js index ab2f503e1..2d3d6f156 100644 --- a/toolkit/components/reader/JSDOMParser.js +++ b/toolkit/components/reader/JSDOMParser.js @@ -315,6 +315,7 @@ } } getElems(this); + elems._isLiveNodeList = true; return elems; } @@ -503,17 +504,9 @@ }, setValue: function(newValue) { this._value = newValue; - delete this._decodedValue; }, - setDecodedValue: function(newValue) { - this._value = encodeHTML(newValue); - this._decodedValue = newValue; - }, - getDecodedValue: function() { - if (typeof this._decodedValue === "undefined") { - this._decodedValue = (this._value && decodeHTML(this._value)) || ""; - } - return this._decodedValue; + getEncodedValue: function() { + return encodeHTML(this._value); }, }; @@ -673,6 +666,14 @@ this.setAttribute("src", str); }, + get srcset() { + return this.getAttribute("srcset") || ""; + }, + + set srcset(str) { + this.setAttribute("srcset", str); + }, + get nodeName() { return this.tagName; }, @@ -689,7 +690,7 @@ for (var j = 0; j < child.attributes.length; j++) { var attr = child.attributes[j]; // the attribute value will be HTML escaped. - var val = attr.value; + var val = attr.getEncodedValue(); var quote = (val.indexOf('"') === -1 ? '"' : "'"); arr.push(" " + attr.name + "=" + quote + val + quote); } @@ -767,8 +768,9 @@ getAttribute: function (name) { for (var i = this.attributes.length; --i >= 0;) { var attr = this.attributes[i]; - if (attr.name === name) - return attr.getDecodedValue(); + if (attr.name === name) { + return attr.value; + } } return undefined; }, @@ -777,11 +779,11 @@ for (var i = this.attributes.length; --i >= 0;) { var attr = this.attributes[i]; if (attr.name === name) { - attr.setDecodedValue(value); + attr.setValue(value); return; } } - this.attributes.push(new Attribute(name, encodeHTML(value))); + this.attributes.push(new Attribute(name, value)); }, removeAttribute: function (name) { @@ -945,7 +947,7 @@ // Read the attribute value (and consume the matching quote) var value = this.readString(c); - node.attributes.push(new Attribute(name, value)); + node.attributes.push(new Attribute(name, decodeHTML(value))); return; }, diff --git a/toolkit/components/reader/Readability-readerable.js b/toolkit/components/reader/Readability-readerable.js index d0e1b8164..839d9fbf7 100644 --- a/toolkit/components/reader/Readability-readerable.js +++ b/toolkit/components/reader/Readability-readerable.js @@ -31,13 +31,16 @@ var REGEXPS = { // NOTE: These two regular expressions are duplicated in // Readability.js. Please keep both copies in sync. - unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, - okMaybeItsACandidate: /and|article|body|column|main|shadow/i, + unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, + okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, }; function isNodeVisible(node) { - // Have to null-check node.style to deal with SVG and MathML nodes. - return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden"); + // Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes. + return (!node.style || node.style.display != "none") + && !node.hasAttribute("hidden") + //check for "fallback-image" so that wikimedia math images are displayed + && (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1)); } /** diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js index 69fb53f86..4a3689885 100644 --- a/toolkit/components/reader/Readability.js +++ b/toolkit/components/reader/Readability.js @@ -43,6 +43,7 @@ function Readability(doc, options) { options = options || {}; this._doc = doc; + this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__; this._articleTitle = null; this._articleByline = null; this._articleDir = null; @@ -55,6 +56,7 @@ function Readability(doc, options) { this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD; this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []); + this._keepClasses = !!options.keepClasses; // Start with all flags set this._flags = this.FLAG_STRIP_UNLIKELYS | @@ -121,20 +123,23 @@ Readability.prototype = { REGEXPS: { // NOTE: These two regular expressions are duplicated in // Readability-readerable.js. Please keep both copies in sync. - unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, - okMaybeItsACandidate: /and|article|body|column|main|shadow/i, + unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, + okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, - negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, + negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, byline: /byline|author|dateline|writtenby|p-author/i, replaceFonts: /<(\/?)font[^>]*>/gi, normalize: /\s{2,}/g, videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i, + shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i, nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, prevLink: /(prev|earl|old|new|<|«)/i, whitespace: /^\s*$/, hasContent: /\S$/, + srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g, + b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i }, DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ], @@ -159,6 +164,15 @@ Readability.prototype = { // These are the classes that readability sets itself. CLASSES_TO_PRESERVE: [ "page" ], + // These are the list of HTML entities that need to be escaped. + HTML_ESCAPE_MAP: { + "lt": "<", + "gt": ">", + "amp": "&", + "quot": '"', + "apos": "'", + }, + /** * Run any post-process modifications to article content as necessary. * @@ -169,8 +183,10 @@ Readability.prototype = { // Readability cannot open relative uris so we convert them to absolute uris. this._fixRelativeUris(articleContent); - // Remove classes. - this._cleanClasses(articleContent); + if (!this._keepClasses) { + // Remove classes. + this._cleanClasses(articleContent); + } }, /** @@ -184,6 +200,10 @@ Readability.prototype = { * @return void */ _removeNodes: function(nodeList, filterFn) { + // Avoid ever operating on live node lists. + if (this._docJSDOMParser && nodeList._isLiveNodeList) { + throw new Error("Do not pass live node lists to _removeNodes"); + } for (var i = nodeList.length - 1; i >= 0; i--) { var node = nodeList[i]; var parentNode = node.parentNode; @@ -203,6 +223,10 @@ Readability.prototype = { * @return void */ _replaceNodeTags: function(nodeList, newTagName) { + // Avoid ever operating on live node lists. + if (this._docJSDOMParser && nodeList._isLiveNodeList) { + throw new Error("Do not pass live node lists to _replaceNodeTags"); + } for (var i = nodeList.length - 1; i >= 0; i--) { var node = nodeList[i]; this._setNodeTag(node, newTagName); @@ -322,6 +346,7 @@ Readability.prototype = { if (baseURI == documentURI && uri.charAt(0) == "#") { return uri; } + // Otherwise, resolve against base URI: try { return new URL(uri, baseURI).href; @@ -335,22 +360,50 @@ Readability.prototype = { this._forEachNode(links, function(link) { var href = link.getAttribute("href"); if (href) { - // Replace links with javascript: URIs with text content, since + // Remove links with javascript: URIs, since // they won't work after scripts have been removed from the page. if (href.indexOf("javascript:") === 0) { - var text = this._doc.createTextNode(link.textContent); - link.parentNode.replaceChild(text, link); + // if the link only contains simple text content, it can be converted to a text node + if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) { + var text = this._doc.createTextNode(link.textContent); + link.parentNode.replaceChild(text, link); + } else { + // if the link has multiple children, they should all be preserved + var container = this._doc.createElement("span"); + while (link.childNodes.length > 0) { + container.appendChild(link.childNodes[0]); + } + link.parentNode.replaceChild(container, link); + } } else { link.setAttribute("href", toAbsoluteURI(href)); } } }); - var imgs = this._getAllNodesWithTag(articleContent, ["img"]); - this._forEachNode(imgs, function(img) { - var src = img.getAttribute("src"); + var medias = this._getAllNodesWithTag(articleContent, [ + "img", "picture", "figure", "video", "audio", "source" + ]); + + this._forEachNode(medias, function(media) { + var src = media.getAttribute("src"); + var poster = media.getAttribute("poster"); + var srcset = media.getAttribute("srcset"); + if (src) { - img.setAttribute("src", toAbsoluteURI(src)); + media.setAttribute("src", toAbsoluteURI(src)); + } + + if (poster) { + media.setAttribute("poster", toAbsoluteURI(poster)); + } + + if (srcset) { + var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) { + return toAbsoluteURI(p1) + (p2 || "") + p3; + }); + + media.setAttribute("srcset", newSrcset); } }); }, @@ -444,13 +497,13 @@ Readability.prototype = { var doc = this._doc; // Remove all style tags in head - this._removeNodes(doc.getElementsByTagName("style")); + this._removeNodes(this._getAllNodesWithTag(doc, ["style"])); if (doc.body) { this._replaceBrs(doc.body); } - this._replaceNodeTags(doc.getElementsByTagName("font"), "SPAN"); + this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN"); }, /** @@ -530,7 +583,7 @@ Readability.prototype = { _setNodeTag: function (node, tag) { this.log("_setNodeTag", node, tag); - if (node.__JSDOMParser__) { + if (this._docJSDOMParser) { node.localName = tag.toLowerCase(); node.tagName = tag.toUpperCase(); return node; @@ -545,7 +598,16 @@ Readability.prototype = { replacement.readability = node.readability; for (var i = 0; i < node.attributes.length; i++) { - replacement.setAttribute(node.attributes[i].name, node.attributes[i].value); + try { + replacement.setAttribute(node.attributes[i].name, node.attributes[i].value); + } catch (ex) { + /* it's possible for setAttribute() to throw if the attribute name + * isn't a valid XML Name. Such attributes can however be parsed from + * source in HTML docs, see https://github.com/whatwg/html/issues/4275, + * so we can hit them here and then throw. We don't care about such + * attributes so we ignore them. + */ + } } return replacement; }, @@ -565,6 +627,8 @@ Readability.prototype = { // visually linked to other content-ful elements (text, images, etc.). this._markDataTables(articleContent); + this._fixLazyImages(articleContent); + // Clean out junk from the article content this._cleanConditionally(articleContent, "form"); this._cleanConditionally(articleContent, "fieldset"); @@ -575,10 +639,15 @@ Readability.prototype = { this._clean(articleContent, "link"); this._clean(articleContent, "aside"); - // Clean out elements have "share" in their id/class combinations from final top candidates, + // Clean out elements with little content that have "share" in their id/class combinations from final top candidates, // which means we don't remove the top candidates even they have "share". - this._forEachNode(articleContent.children, function(topCandidate) { - this._cleanMatchedNodes(topCandidate, /share/); + + var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD; + + this._forEachNode(articleContent.children, function (topCandidate) { + this._cleanMatchedNodes(topCandidate, function (node, matchString) { + return this.REGEXPS.shareElements.test(matchString) && node.textContent.length < shareElementThreshold; + }); }); // If there is only one h2 and its text content substantially equals article title, @@ -614,7 +683,7 @@ Readability.prototype = { this._cleanConditionally(articleContent, "div"); // Remove extra paragraphs - this._removeNodes(articleContent.getElementsByTagName("p"), function (paragraph) { + this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) { var imgCount = paragraph.getElementsByTagName("img").length; var embedCount = paragraph.getElementsByTagName("embed").length; var objectCount = paragraph.getElementsByTagName("object").length; @@ -729,9 +798,10 @@ Readability.prototype = { if (node.getAttribute !== undefined) { var rel = node.getAttribute("rel"); + var itemprop = node.getAttribute("itemprop"); } - if ((rel === "author" || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) { + if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) { this._articleByline = node.textContent.trim(); return true; } @@ -800,12 +870,19 @@ Readability.prototype = { if (stripUnlikelyCandidates) { if (this.REGEXPS.unlikelyCandidates.test(matchString) && !this.REGEXPS.okMaybeItsACandidate.test(matchString) && + !this._hasAncestorTag(node, "table") && node.tagName !== "BODY" && node.tagName !== "A") { this.log("Removing unlikely candidate - " + matchString); node = this._removeAndGetNext(node); continue; } + + if (node.getAttribute("role") == "complementary") { + this.log("Removing complementary content - " + matchString); + node = this._removeAndGetNext(node); + continue; + } } // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). @@ -1199,6 +1276,26 @@ Readability.prototype = { return false; }, + /** + * Converts some of the common HTML entities in string to their corresponding characters. + * + * @param str {string} - a string to unescape. + * @return string without HTML entity. + */ + _unescapeHtmlEntities: function(str) { + if (!str) { + return str; + } + + var htmlEscapeMap = this.HTML_ESCAPE_MAP; + return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) { + return htmlEscapeMap[tag]; + }).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) { + var num = parseInt(hex || numStr, hex ? 16 : 10); + return String.fromCharCode(num); + }); + }, + /** * Attempts to get excerpt and byline metadata for the article. * @@ -1220,6 +1317,9 @@ Readability.prototype = { var elementName = element.getAttribute("name"); var elementProperty = element.getAttribute("property"); var content = element.getAttribute("content"); + if (!content) { + return; + } var matches = null; var name = null; @@ -1276,21 +1376,123 @@ Readability.prototype = { // get site name metadata.siteName = values["og:site_name"]; + // in many sites the meta value is escaped with HTML entities, + // so here we need to unescape it + metadata.title = this._unescapeHtmlEntities(metadata.title); + metadata.byline = this._unescapeHtmlEntities(metadata.byline); + metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt); + metadata.siteName = this._unescapeHtmlEntities(metadata.siteName); + return metadata; }, + /** + * Check if node is image, or if node contains exactly only one image + * whether as a direct child or as its descendants. + * + * @param Element + **/ + _isSingleImage: function(node) { + if (node.tagName === "IMG") { + return true; + } + + if (node.children.length !== 1 || node.textContent.trim() !== "") { + return false; + } + + return this._isSingleImage(node.children[0]); + }, + + /** + * Find all