Update Readability from upstream.
parent
14c4ffa708
commit
4562043d2f
|
@ -315,6 +315,7 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
getElems(this);
|
getElems(this);
|
||||||
|
elems._isLiveNodeList = true;
|
||||||
return elems;
|
return elems;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -503,17 +504,9 @@
|
||||||
},
|
},
|
||||||
setValue: function(newValue) {
|
setValue: function(newValue) {
|
||||||
this._value = newValue;
|
this._value = newValue;
|
||||||
delete this._decodedValue;
|
|
||||||
},
|
},
|
||||||
setDecodedValue: function(newValue) {
|
getEncodedValue: function() {
|
||||||
this._value = encodeHTML(newValue);
|
return encodeHTML(this._value);
|
||||||
this._decodedValue = newValue;
|
|
||||||
},
|
|
||||||
getDecodedValue: function() {
|
|
||||||
if (typeof this._decodedValue === "undefined") {
|
|
||||||
this._decodedValue = (this._value && decodeHTML(this._value)) || "";
|
|
||||||
}
|
|
||||||
return this._decodedValue;
|
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -673,6 +666,14 @@
|
||||||
this.setAttribute("src", str);
|
this.setAttribute("src", str);
|
||||||
},
|
},
|
||||||
|
|
||||||
|
get srcset() {
|
||||||
|
return this.getAttribute("srcset") || "";
|
||||||
|
},
|
||||||
|
|
||||||
|
set srcset(str) {
|
||||||
|
this.setAttribute("srcset", str);
|
||||||
|
},
|
||||||
|
|
||||||
get nodeName() {
|
get nodeName() {
|
||||||
return this.tagName;
|
return this.tagName;
|
||||||
},
|
},
|
||||||
|
@ -689,7 +690,7 @@
|
||||||
for (var j = 0; j < child.attributes.length; j++) {
|
for (var j = 0; j < child.attributes.length; j++) {
|
||||||
var attr = child.attributes[j];
|
var attr = child.attributes[j];
|
||||||
// the attribute value will be HTML escaped.
|
// the attribute value will be HTML escaped.
|
||||||
var val = attr.value;
|
var val = attr.getEncodedValue();
|
||||||
var quote = (val.indexOf('"') === -1 ? '"' : "'");
|
var quote = (val.indexOf('"') === -1 ? '"' : "'");
|
||||||
arr.push(" " + attr.name + "=" + quote + val + quote);
|
arr.push(" " + attr.name + "=" + quote + val + quote);
|
||||||
}
|
}
|
||||||
|
@ -767,8 +768,9 @@
|
||||||
getAttribute: function (name) {
|
getAttribute: function (name) {
|
||||||
for (var i = this.attributes.length; --i >= 0;) {
|
for (var i = this.attributes.length; --i >= 0;) {
|
||||||
var attr = this.attributes[i];
|
var attr = this.attributes[i];
|
||||||
if (attr.name === name)
|
if (attr.name === name) {
|
||||||
return attr.getDecodedValue();
|
return attr.value;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return undefined;
|
return undefined;
|
||||||
},
|
},
|
||||||
|
@ -777,11 +779,11 @@
|
||||||
for (var i = this.attributes.length; --i >= 0;) {
|
for (var i = this.attributes.length; --i >= 0;) {
|
||||||
var attr = this.attributes[i];
|
var attr = this.attributes[i];
|
||||||
if (attr.name === name) {
|
if (attr.name === name) {
|
||||||
attr.setDecodedValue(value);
|
attr.setValue(value);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
this.attributes.push(new Attribute(name, encodeHTML(value)));
|
this.attributes.push(new Attribute(name, value));
|
||||||
},
|
},
|
||||||
|
|
||||||
removeAttribute: function (name) {
|
removeAttribute: function (name) {
|
||||||
|
@ -945,7 +947,7 @@
|
||||||
// Read the attribute value (and consume the matching quote)
|
// Read the attribute value (and consume the matching quote)
|
||||||
var value = this.readString(c);
|
var value = this.readString(c);
|
||||||
|
|
||||||
node.attributes.push(new Attribute(name, value));
|
node.attributes.push(new Attribute(name, decodeHTML(value)));
|
||||||
|
|
||||||
return;
|
return;
|
||||||
},
|
},
|
||||||
|
|
|
@ -31,13 +31,16 @@
|
||||||
var REGEXPS = {
|
var REGEXPS = {
|
||||||
// NOTE: These two regular expressions are duplicated in
|
// NOTE: These two regular expressions are duplicated in
|
||||||
// Readability.js. Please keep both copies in sync.
|
// Readability.js. Please keep both copies in sync.
|
||||||
unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
|
unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
|
||||||
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
|
okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
|
||||||
};
|
};
|
||||||
|
|
||||||
function isNodeVisible(node) {
|
function isNodeVisible(node) {
|
||||||
// Have to null-check node.style to deal with SVG and MathML nodes.
|
// Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes.
|
||||||
return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden");
|
return (!node.style || node.style.display != "none")
|
||||||
|
&& !node.hasAttribute("hidden")
|
||||||
|
//check for "fallback-image" so that wikimedia math images are displayed
|
||||||
|
&& (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -43,6 +43,7 @@ function Readability(doc, options) {
|
||||||
options = options || {};
|
options = options || {};
|
||||||
|
|
||||||
this._doc = doc;
|
this._doc = doc;
|
||||||
|
this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__;
|
||||||
this._articleTitle = null;
|
this._articleTitle = null;
|
||||||
this._articleByline = null;
|
this._articleByline = null;
|
||||||
this._articleDir = null;
|
this._articleDir = null;
|
||||||
|
@ -55,6 +56,7 @@ function Readability(doc, options) {
|
||||||
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
|
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
|
||||||
this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
|
this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
|
||||||
this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
|
this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
|
||||||
|
this._keepClasses = !!options.keepClasses;
|
||||||
|
|
||||||
// Start with all flags set
|
// Start with all flags set
|
||||||
this._flags = this.FLAG_STRIP_UNLIKELYS |
|
this._flags = this.FLAG_STRIP_UNLIKELYS |
|
||||||
|
@ -121,20 +123,23 @@ Readability.prototype = {
|
||||||
REGEXPS: {
|
REGEXPS: {
|
||||||
// NOTE: These two regular expressions are duplicated in
|
// NOTE: These two regular expressions are duplicated in
|
||||||
// Readability-readerable.js. Please keep both copies in sync.
|
// Readability-readerable.js. Please keep both copies in sync.
|
||||||
unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
|
unlikelyCandidates: /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
|
||||||
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
|
okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
|
||||||
|
|
||||||
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
|
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
|
||||||
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
|
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
|
||||||
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
|
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
|
||||||
byline: /byline|author|dateline|writtenby|p-author/i,
|
byline: /byline|author|dateline|writtenby|p-author/i,
|
||||||
replaceFonts: /<(\/?)font[^>]*>/gi,
|
replaceFonts: /<(\/?)font[^>]*>/gi,
|
||||||
normalize: /\s{2,}/g,
|
normalize: /\s{2,}/g,
|
||||||
videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
|
videos: /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
|
||||||
|
shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i,
|
||||||
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
|
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
|
||||||
prevLink: /(prev|earl|old|new|<|«)/i,
|
prevLink: /(prev|earl|old|new|<|«)/i,
|
||||||
whitespace: /^\s*$/,
|
whitespace: /^\s*$/,
|
||||||
hasContent: /\S$/,
|
hasContent: /\S$/,
|
||||||
|
srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,
|
||||||
|
b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i
|
||||||
},
|
},
|
||||||
|
|
||||||
DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
|
DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
|
||||||
|
@ -159,6 +164,15 @@ Readability.prototype = {
|
||||||
// These are the classes that readability sets itself.
|
// These are the classes that readability sets itself.
|
||||||
CLASSES_TO_PRESERVE: [ "page" ],
|
CLASSES_TO_PRESERVE: [ "page" ],
|
||||||
|
|
||||||
|
// Map of HTML entity names to the characters they represent, used when unescaping.
|
||||||
|
HTML_ESCAPE_MAP: {
|
||||||
|
"lt": "<",
|
||||||
|
"gt": ">",
|
||||||
|
"amp": "&",
|
||||||
|
"quot": '"',
|
||||||
|
"apos": "'",
|
||||||
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Run any post-process modifications to article content as necessary.
|
* Run any post-process modifications to article content as necessary.
|
||||||
*
|
*
|
||||||
|
@ -169,8 +183,10 @@ Readability.prototype = {
|
||||||
// Readability cannot open relative uris so we convert them to absolute uris.
|
// Readability cannot open relative uris so we convert them to absolute uris.
|
||||||
this._fixRelativeUris(articleContent);
|
this._fixRelativeUris(articleContent);
|
||||||
|
|
||||||
|
if (!this._keepClasses) {
|
||||||
// Remove classes.
|
// Remove classes.
|
||||||
this._cleanClasses(articleContent);
|
this._cleanClasses(articleContent);
|
||||||
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -184,6 +200,10 @@ Readability.prototype = {
|
||||||
* @return void
|
* @return void
|
||||||
*/
|
*/
|
||||||
_removeNodes: function(nodeList, filterFn) {
|
_removeNodes: function(nodeList, filterFn) {
|
||||||
|
// Avoid ever operating on live node lists.
|
||||||
|
if (this._docJSDOMParser && nodeList._isLiveNodeList) {
|
||||||
|
throw new Error("Do not pass live node lists to _removeNodes");
|
||||||
|
}
|
||||||
for (var i = nodeList.length - 1; i >= 0; i--) {
|
for (var i = nodeList.length - 1; i >= 0; i--) {
|
||||||
var node = nodeList[i];
|
var node = nodeList[i];
|
||||||
var parentNode = node.parentNode;
|
var parentNode = node.parentNode;
|
||||||
|
@ -203,6 +223,10 @@ Readability.prototype = {
|
||||||
* @return void
|
* @return void
|
||||||
*/
|
*/
|
||||||
_replaceNodeTags: function(nodeList, newTagName) {
|
_replaceNodeTags: function(nodeList, newTagName) {
|
||||||
|
// Avoid ever operating on live node lists.
|
||||||
|
if (this._docJSDOMParser && nodeList._isLiveNodeList) {
|
||||||
|
throw new Error("Do not pass live node lists to _replaceNodeTags");
|
||||||
|
}
|
||||||
for (var i = nodeList.length - 1; i >= 0; i--) {
|
for (var i = nodeList.length - 1; i >= 0; i--) {
|
||||||
var node = nodeList[i];
|
var node = nodeList[i];
|
||||||
this._setNodeTag(node, newTagName);
|
this._setNodeTag(node, newTagName);
|
||||||
|
@ -322,6 +346,7 @@ Readability.prototype = {
|
||||||
if (baseURI == documentURI && uri.charAt(0) == "#") {
|
if (baseURI == documentURI && uri.charAt(0) == "#") {
|
||||||
return uri;
|
return uri;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Otherwise, resolve against base URI:
|
// Otherwise, resolve against base URI:
|
||||||
try {
|
try {
|
||||||
return new URL(uri, baseURI).href;
|
return new URL(uri, baseURI).href;
|
||||||
|
@ -335,22 +360,50 @@ Readability.prototype = {
|
||||||
this._forEachNode(links, function(link) {
|
this._forEachNode(links, function(link) {
|
||||||
var href = link.getAttribute("href");
|
var href = link.getAttribute("href");
|
||||||
if (href) {
|
if (href) {
|
||||||
// Replace links with javascript: URIs with text content, since
|
// Remove links with javascript: URIs, since
|
||||||
// they won't work after scripts have been removed from the page.
|
// they won't work after scripts have been removed from the page.
|
||||||
if (href.indexOf("javascript:") === 0) {
|
if (href.indexOf("javascript:") === 0) {
|
||||||
|
// if the link only contains simple text content, it can be converted to a text node
|
||||||
|
if (link.childNodes.length === 1 && link.childNodes[0].nodeType === this.TEXT_NODE) {
|
||||||
var text = this._doc.createTextNode(link.textContent);
|
var text = this._doc.createTextNode(link.textContent);
|
||||||
link.parentNode.replaceChild(text, link);
|
link.parentNode.replaceChild(text, link);
|
||||||
|
} else {
|
||||||
|
// if the link has multiple children, they should all be preserved
|
||||||
|
var container = this._doc.createElement("span");
|
||||||
|
while (link.childNodes.length > 0) {
|
||||||
|
container.appendChild(link.childNodes[0]);
|
||||||
|
}
|
||||||
|
link.parentNode.replaceChild(container, link);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
link.setAttribute("href", toAbsoluteURI(href));
|
link.setAttribute("href", toAbsoluteURI(href));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
var imgs = this._getAllNodesWithTag(articleContent, ["img"]);
|
var medias = this._getAllNodesWithTag(articleContent, [
|
||||||
this._forEachNode(imgs, function(img) {
|
"img", "picture", "figure", "video", "audio", "source"
|
||||||
var src = img.getAttribute("src");
|
]);
|
||||||
|
|
||||||
|
this._forEachNode(medias, function(media) {
|
||||||
|
var src = media.getAttribute("src");
|
||||||
|
var poster = media.getAttribute("poster");
|
||||||
|
var srcset = media.getAttribute("srcset");
|
||||||
|
|
||||||
if (src) {
|
if (src) {
|
||||||
img.setAttribute("src", toAbsoluteURI(src));
|
media.setAttribute("src", toAbsoluteURI(src));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (poster) {
|
||||||
|
media.setAttribute("poster", toAbsoluteURI(poster));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (srcset) {
|
||||||
|
var newSrcset = srcset.replace(this.REGEXPS.srcsetUrl, function(_, p1, p2, p3) {
|
||||||
|
return toAbsoluteURI(p1) + (p2 || "") + p3;
|
||||||
|
});
|
||||||
|
|
||||||
|
media.setAttribute("srcset", newSrcset);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
|
@ -444,13 +497,13 @@ Readability.prototype = {
|
||||||
var doc = this._doc;
|
var doc = this._doc;
|
||||||
|
|
||||||
// Remove all style tags in head
|
// Remove all style tags in head
|
||||||
this._removeNodes(doc.getElementsByTagName("style"));
|
this._removeNodes(this._getAllNodesWithTag(doc, ["style"]));
|
||||||
|
|
||||||
if (doc.body) {
|
if (doc.body) {
|
||||||
this._replaceBrs(doc.body);
|
this._replaceBrs(doc.body);
|
||||||
}
|
}
|
||||||
|
|
||||||
this._replaceNodeTags(doc.getElementsByTagName("font"), "SPAN");
|
this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN");
|
||||||
},
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -530,7 +583,7 @@ Readability.prototype = {
|
||||||
|
|
||||||
_setNodeTag: function (node, tag) {
|
_setNodeTag: function (node, tag) {
|
||||||
this.log("_setNodeTag", node, tag);
|
this.log("_setNodeTag", node, tag);
|
||||||
if (node.__JSDOMParser__) {
|
if (this._docJSDOMParser) {
|
||||||
node.localName = tag.toLowerCase();
|
node.localName = tag.toLowerCase();
|
||||||
node.tagName = tag.toUpperCase();
|
node.tagName = tag.toUpperCase();
|
||||||
return node;
|
return node;
|
||||||
|
@ -545,7 +598,16 @@ Readability.prototype = {
|
||||||
replacement.readability = node.readability;
|
replacement.readability = node.readability;
|
||||||
|
|
||||||
for (var i = 0; i < node.attributes.length; i++) {
|
for (var i = 0; i < node.attributes.length; i++) {
|
||||||
|
try {
|
||||||
replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
|
replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
|
||||||
|
} catch (ex) {
|
||||||
|
/* it's possible for setAttribute() to throw if the attribute name
|
||||||
|
* isn't a valid XML Name. Such attributes can however be parsed from
|
||||||
|
* source in HTML docs, see https://github.com/whatwg/html/issues/4275,
|
||||||
|
* so we can hit them here and then throw. We don't care about such
|
||||||
|
* attributes so we ignore them.
|
||||||
|
*/
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return replacement;
|
return replacement;
|
||||||
},
|
},
|
||||||
|
@ -565,6 +627,8 @@ Readability.prototype = {
|
||||||
// visually linked to other content-ful elements (text, images, etc.).
|
// visually linked to other content-ful elements (text, images, etc.).
|
||||||
this._markDataTables(articleContent);
|
this._markDataTables(articleContent);
|
||||||
|
|
||||||
|
this._fixLazyImages(articleContent);
|
||||||
|
|
||||||
// Clean out junk from the article content
|
// Clean out junk from the article content
|
||||||
this._cleanConditionally(articleContent, "form");
|
this._cleanConditionally(articleContent, "form");
|
||||||
this._cleanConditionally(articleContent, "fieldset");
|
this._cleanConditionally(articleContent, "fieldset");
|
||||||
|
@ -575,10 +639,15 @@ Readability.prototype = {
|
||||||
this._clean(articleContent, "link");
|
this._clean(articleContent, "link");
|
||||||
this._clean(articleContent, "aside");
|
this._clean(articleContent, "aside");
|
||||||
|
|
||||||
// Clean out elements have "share" in their id/class combinations from final top candidates,
|
// Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
|
||||||
// which means we don't remove the top candidates even they have "share".
|
// which means we don't remove the top candidates even they have "share".
|
||||||
|
|
||||||
|
var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD;
|
||||||
|
|
||||||
this._forEachNode(articleContent.children, function (topCandidate) {
|
this._forEachNode(articleContent.children, function (topCandidate) {
|
||||||
this._cleanMatchedNodes(topCandidate, /share/);
|
this._cleanMatchedNodes(topCandidate, function (node, matchString) {
|
||||||
|
return this.REGEXPS.shareElements.test(matchString) && node.textContent.length < shareElementThreshold;
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
// If there is only one h2 and its text content substantially equals article title,
|
// If there is only one h2 and its text content substantially equals article title,
|
||||||
|
@ -614,7 +683,7 @@ Readability.prototype = {
|
||||||
this._cleanConditionally(articleContent, "div");
|
this._cleanConditionally(articleContent, "div");
|
||||||
|
|
||||||
// Remove extra paragraphs
|
// Remove extra paragraphs
|
||||||
this._removeNodes(articleContent.getElementsByTagName("p"), function (paragraph) {
|
this._removeNodes(this._getAllNodesWithTag(articleContent, ["p"]), function (paragraph) {
|
||||||
var imgCount = paragraph.getElementsByTagName("img").length;
|
var imgCount = paragraph.getElementsByTagName("img").length;
|
||||||
var embedCount = paragraph.getElementsByTagName("embed").length;
|
var embedCount = paragraph.getElementsByTagName("embed").length;
|
||||||
var objectCount = paragraph.getElementsByTagName("object").length;
|
var objectCount = paragraph.getElementsByTagName("object").length;
|
||||||
|
@ -729,9 +798,10 @@ Readability.prototype = {
|
||||||
|
|
||||||
if (node.getAttribute !== undefined) {
|
if (node.getAttribute !== undefined) {
|
||||||
var rel = node.getAttribute("rel");
|
var rel = node.getAttribute("rel");
|
||||||
|
var itemprop = node.getAttribute("itemprop");
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((rel === "author" || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
|
if ((rel === "author" || (itemprop && itemprop.indexOf("author") !== -1) || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
|
||||||
this._articleByline = node.textContent.trim();
|
this._articleByline = node.textContent.trim();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -800,12 +870,19 @@ Readability.prototype = {
|
||||||
if (stripUnlikelyCandidates) {
|
if (stripUnlikelyCandidates) {
|
||||||
if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
|
if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
|
||||||
!this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
|
!this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
|
||||||
|
!this._hasAncestorTag(node, "table") &&
|
||||||
node.tagName !== "BODY" &&
|
node.tagName !== "BODY" &&
|
||||||
node.tagName !== "A") {
|
node.tagName !== "A") {
|
||||||
this.log("Removing unlikely candidate - " + matchString);
|
this.log("Removing unlikely candidate - " + matchString);
|
||||||
node = this._removeAndGetNext(node);
|
node = this._removeAndGetNext(node);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (node.getAttribute("role") == "complementary") {
|
||||||
|
this.log("Removing complementary content - " + matchString);
|
||||||
|
node = this._removeAndGetNext(node);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
|
// Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
|
||||||
|
@ -1199,6 +1276,26 @@ Readability.prototype = {
|
||||||
return false;
|
return false;
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts some of the common HTML entities in string to their corresponding characters.
|
||||||
|
*
|
||||||
|
* @param str {string} - a string to unescape.
|
||||||
|
* @return string without HTML entity.
|
||||||
|
*/
|
||||||
|
_unescapeHtmlEntities: function(str) {
|
||||||
|
if (!str) {
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
|
var htmlEscapeMap = this.HTML_ESCAPE_MAP;
|
||||||
|
return str.replace(/&(quot|amp|apos|lt|gt);/g, function(_, tag) {
|
||||||
|
return htmlEscapeMap[tag];
|
||||||
|
}).replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, function(_, hex, numStr) {
|
||||||
|
var num = parseInt(hex || numStr, hex ? 16 : 10);
|
||||||
|
return String.fromCharCode(num);
|
||||||
|
});
|
||||||
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Attempts to get excerpt and byline metadata for the article.
|
* Attempts to get excerpt and byline metadata for the article.
|
||||||
*
|
*
|
||||||
|
@ -1220,6 +1317,9 @@ Readability.prototype = {
|
||||||
var elementName = element.getAttribute("name");
|
var elementName = element.getAttribute("name");
|
||||||
var elementProperty = element.getAttribute("property");
|
var elementProperty = element.getAttribute("property");
|
||||||
var content = element.getAttribute("content");
|
var content = element.getAttribute("content");
|
||||||
|
if (!content) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
var matches = null;
|
var matches = null;
|
||||||
var name = null;
|
var name = null;
|
||||||
|
|
||||||
|
@ -1276,21 +1376,123 @@ Readability.prototype = {
|
||||||
// get site name
|
// get site name
|
||||||
metadata.siteName = values["og:site_name"];
|
metadata.siteName = values["og:site_name"];
|
||||||
|
|
||||||
|
// in many sites the meta value is escaped with HTML entities,
|
||||||
|
// so here we need to unescape it
|
||||||
|
metadata.title = this._unescapeHtmlEntities(metadata.title);
|
||||||
|
metadata.byline = this._unescapeHtmlEntities(metadata.byline);
|
||||||
|
metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
|
||||||
|
metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
|
||||||
|
|
||||||
return metadata;
|
return metadata;
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if node is an image, or if node contains exactly one image
|
||||||
|
* whether as a direct child or as its descendants.
|
||||||
|
*
|
||||||
|
* @param Element
|
||||||
|
**/
|
||||||
|
_isSingleImage: function(node) {
|
||||||
|
if (node.tagName === "IMG") {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (node.children.length !== 1 || node.textContent.trim() !== "") {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return this._isSingleImage(node.children[0]);
|
||||||
|
},
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find all <noscript> that are located after <img> nodes, and which contain only one
|
||||||
|
* <img> element. Replace the first image with the image from inside the <noscript> tag,
|
||||||
|
* and remove the <noscript> tag. This improves the quality of the images we use on
|
||||||
|
* some sites (e.g. Medium).
|
||||||
|
*
|
||||||
|
* @param Element
|
||||||
|
**/
|
||||||
|
_unwrapNoscriptImages: function(doc) {
|
||||||
|
// Find imgs without a source or any attribute that might contain an image, and remove them.
|
||||||
|
// This is done to prevent a placeholder img from being replaced by the img from noscript in the next step.
|
||||||
|
var imgs = Array.from(doc.getElementsByTagName("img"));
|
||||||
|
this._forEachNode(imgs, function(img) {
|
||||||
|
for (var i = 0; i < img.attributes.length; i++) {
|
||||||
|
var attr = img.attributes[i];
|
||||||
|
switch (attr.name) {
|
||||||
|
case "src":
|
||||||
|
case "srcset":
|
||||||
|
case "data-src":
|
||||||
|
case "data-srcset":
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
img.parentNode.removeChild(img);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Next find noscript and try to extract its image
|
||||||
|
var noscripts = Array.from(doc.getElementsByTagName("noscript"));
|
||||||
|
this._forEachNode(noscripts, function(noscript) {
|
||||||
|
// Parse content of noscript and make sure it only contains image
|
||||||
|
var tmp = doc.createElement("div");
|
||||||
|
tmp.innerHTML = noscript.innerHTML;
|
||||||
|
if (!this._isSingleImage(tmp)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If noscript has previous sibling and it only contains image,
|
||||||
|
// replace it with noscript content. However we also keep old
|
||||||
|
// attributes that might contain an image.
|
||||||
|
var prevElement = noscript.previousElementSibling;
|
||||||
|
if (prevElement && this._isSingleImage(prevElement)) {
|
||||||
|
var prevImg = prevElement;
|
||||||
|
if (prevImg.tagName !== "IMG") {
|
||||||
|
prevImg = prevElement.getElementsByTagName("img")[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
var newImg = tmp.getElementsByTagName("img")[0];
|
||||||
|
for (var i = 0; i < prevImg.attributes.length; i++) {
|
||||||
|
var attr = prevImg.attributes[i];
|
||||||
|
if (attr.value === "") {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (attr.name === "src" || attr.name === "srcset" || /\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
|
||||||
|
if (newImg.getAttribute(attr.name) === attr.value) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
var attrName = attr.name;
|
||||||
|
if (newImg.hasAttribute(attrName)) {
|
||||||
|
attrName = "data-old-" + attrName;
|
||||||
|
}
|
||||||
|
|
||||||
|
newImg.setAttribute(attrName, attr.value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Removes script tags from the document.
|
* Removes script tags from the document.
|
||||||
*
|
*
|
||||||
* @param Element
|
* @param Element
|
||||||
**/
|
**/
|
||||||
_removeScripts: function(doc) {
|
_removeScripts: function(doc) {
|
||||||
this._removeNodes(doc.getElementsByTagName("script"), function(scriptNode) {
|
this._removeNodes(this._getAllNodesWithTag(doc, ["script"]), function(scriptNode) {
|
||||||
scriptNode.nodeValue = "";
|
scriptNode.nodeValue = "";
|
||||||
scriptNode.removeAttribute("src");
|
scriptNode.removeAttribute("src");
|
||||||
return true;
|
return true;
|
||||||
});
|
});
|
||||||
this._removeNodes(doc.getElementsByTagName("noscript"));
|
this._removeNodes(this._getAllNodesWithTag(doc, ["noscript"]));
|
||||||
},
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1473,21 +1675,21 @@ Readability.prototype = {
|
||||||
_clean: function(e, tag) {
|
_clean: function(e, tag) {
|
||||||
var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
|
var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
|
||||||
|
|
||||||
this._removeNodes(e.getElementsByTagName(tag), function(element) {
|
this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(element) {
|
||||||
// Allow youtube and vimeo videos through as people usually want to see those.
|
// Allow youtube and vimeo videos through as people usually want to see those.
|
||||||
if (isEmbed) {
|
if (isEmbed) {
|
||||||
var attributeValues = [].map.call(element.attributes, function(attr) {
|
|
||||||
return attr.value;
|
|
||||||
}).join("|");
|
|
||||||
|
|
||||||
// First, check the elements attributes to see if any of them contain youtube or vimeo
|
// First, check the elements attributes to see if any of them contain youtube or vimeo
|
||||||
if (this.REGEXPS.videos.test(attributeValues))
|
for (var i = 0; i < element.attributes.length; i++) {
|
||||||
|
if (this.REGEXPS.videos.test(element.attributes[i].value)) {
|
||||||
return false;
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Then check the elements inside this element for the same.
|
// For embed with <object> tag, check inner HTML as well.
|
||||||
if (this.REGEXPS.videos.test(element.innerHTML))
|
if (element.tagName === "object" && this.REGEXPS.videos.test(element.innerHTML)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
});
|
});
|
||||||
|
@ -1604,6 +1806,76 @@ Readability.prototype = {
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/* convert images and figures that have properties like data-src into images that can be loaded without JS */
|
||||||
|
_fixLazyImages: function (root) {
|
||||||
|
this._forEachNode(this._getAllNodesWithTag(root, ["img", "picture", "figure"]), function (elem) {
|
||||||
|
// In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute.
|
||||||
|
// So here we check whether the data URI is too short; if it is, we might as well remove it.
|
||||||
|
if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) {
|
||||||
|
// Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes.
|
||||||
|
var parts = this.REGEXPS.b64DataUrl.exec(elem.src);
|
||||||
|
if (parts[1] === "image/svg+xml") {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Make sure this element has other attributes which contains image.
|
||||||
|
// If it doesn't, then this src is important and shouldn't be removed.
|
||||||
|
var srcCouldBeRemoved = false;
|
||||||
|
for (var i = 0; i < elem.attributes.length; i++) {
|
||||||
|
var attr = elem.attributes[i];
|
||||||
|
if (attr.name === "src") {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
|
||||||
|
srcCouldBeRemoved = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Here we assume if image is less than 100 bytes (or 133B after encoded to base64)
|
||||||
|
// it will be too small, therefore it might be placeholder image.
|
||||||
|
if (srcCouldBeRemoved) {
|
||||||
|
var b64starts = elem.src.search(/base64\s*/i) + 7;
|
||||||
|
var b64length = elem.src.length - b64starts;
|
||||||
|
if (b64length < 133) {
|
||||||
|
elem.removeAttribute("src");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
|
||||||
|
if ((elem.src || (elem.srcset && elem.srcset != "null")) && elem.className.toLowerCase().indexOf("lazy") === -1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (var j = 0; j < elem.attributes.length; j++) {
|
||||||
|
attr = elem.attributes[j];
|
||||||
|
if (attr.name === "src" || attr.name === "srcset") {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
var copyTo = null;
|
||||||
|
if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
|
||||||
|
copyTo = "srcset";
|
||||||
|
} else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
|
||||||
|
copyTo = "src";
|
||||||
|
}
|
||||||
|
if (copyTo) {
|
||||||
|
//if this is an img or picture, set the attribute directly
|
||||||
|
if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
|
||||||
|
elem.setAttribute(copyTo, attr.value);
|
||||||
|
} else if (elem.tagName === "FIGURE" && !this._getAllNodesWithTag(elem, ["img", "picture"]).length) {
|
||||||
|
//if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
|
||||||
|
//see the nytimes-3 testcase for an example
|
||||||
|
var img = this._doc.createElement("img");
|
||||||
|
img.setAttribute(copyTo, attr.value);
|
||||||
|
elem.appendChild(img);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Clean an element of all tags of type "tag" if they look fishy.
|
* Clean an element of all tags of type "tag" if they look fishy.
|
||||||
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
|
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
|
||||||
|
@ -1621,12 +1893,17 @@ Readability.prototype = {
|
||||||
// without effecting the traversal.
|
// without effecting the traversal.
|
||||||
//
|
//
|
||||||
// TODO: Consider taking into account original contentScore here.
|
// TODO: Consider taking into account original contentScore here.
|
||||||
this._removeNodes(e.getElementsByTagName(tag), function(node) {
|
this._removeNodes(this._getAllNodesWithTag(e, [tag]), function(node) {
|
||||||
// First check if we're in a data table, in which case don't remove us.
|
// First check if this node IS data table, in which case don't remove it.
|
||||||
var isDataTable = function(t) {
|
var isDataTable = function(t) {
|
||||||
return t._readabilityDataTable;
|
return t._readabilityDataTable;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if (tag === "table" && isDataTable(node)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Next check if we're inside a data table, in which case don't remove it as well.
|
||||||
if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
|
if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -1650,10 +1927,22 @@ Readability.prototype = {
|
||||||
var input = node.getElementsByTagName("input").length;
|
var input = node.getElementsByTagName("input").length;
|
||||||
|
|
||||||
var embedCount = 0;
|
var embedCount = 0;
|
||||||
var embeds = node.getElementsByTagName("embed");
|
var embeds = this._getAllNodesWithTag(node, ["object", "embed", "iframe"]);
|
||||||
for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
|
|
||||||
if (!this.REGEXPS.videos.test(embeds[ei].src))
|
for (var i = 0; i < embeds.length; i++) {
|
||||||
embedCount += 1;
|
// If this embed has attribute that matches video regex, don't delete it.
|
||||||
|
for (var j = 0; j < embeds[i].attributes.length; j++) {
|
||||||
|
if (this.REGEXPS.videos.test(embeds[i].attributes[j].value)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// For embed with <object> tag, check inner HTML as well.
|
||||||
|
if (embeds[i].tagName === "object" && this.REGEXPS.videos.test(embeds[i].innerHTML)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
embedCount++;
|
||||||
}
|
}
|
||||||
|
|
||||||
var linkDensity = this._getLinkDensity(node);
|
var linkDensity = this._getLinkDensity(node);
|
||||||
|
@ -1674,17 +1963,17 @@ Readability.prototype = {
|
||||||
},
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Clean out elements whose id/class combinations match specific string.
|
* Clean out elements that match the specified conditions
|
||||||
*
|
*
|
||||||
* @param Element
|
* @param Element
|
||||||
* @param RegExp match id/class combination.
|
* @param Function determines whether a node should be removed
|
||||||
* @return void
|
* @return void
|
||||||
**/
|
**/
|
||||||
_cleanMatchedNodes: function(e, regex) {
|
_cleanMatchedNodes: function(e, filter) {
|
||||||
var endOfSearchMarkerNode = this._getNextNode(e, true);
|
var endOfSearchMarkerNode = this._getNextNode(e, true);
|
||||||
var next = this._getNextNode(e);
|
var next = this._getNextNode(e);
|
||||||
while (next && next != endOfSearchMarkerNode) {
|
while (next && next != endOfSearchMarkerNode) {
|
||||||
if (regex.test(next.className + " " + next.id)) {
|
if (filter.call(this, next, next.className + " " + next.id)) {
|
||||||
next = this._removeAndGetNext(next);
|
next = this._removeAndGetNext(next);
|
||||||
} else {
|
} else {
|
||||||
next = this._getNextNode(next);
|
next = this._getNextNode(next);
|
||||||
|
@ -1699,11 +1988,9 @@ Readability.prototype = {
|
||||||
* @return void
|
* @return void
|
||||||
**/
|
**/
|
||||||
_cleanHeaders: function(e) {
|
_cleanHeaders: function(e) {
|
||||||
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
|
this._removeNodes(this._getAllNodesWithTag(e, ["h1", "h2"]), function (header) {
|
||||||
this._removeNodes(e.getElementsByTagName("h" + headerIndex), function (header) {
|
|
||||||
return this._getClassWeight(header) < 0;
|
return this._getClassWeight(header) < 0;
|
||||||
});
|
});
|
||||||
}
|
|
||||||
},
|
},
|
||||||
|
|
||||||
_flagIsActive: function(flag) {
|
_flagIsActive: function(flag) {
|
||||||
|
@ -1715,7 +2002,11 @@ Readability.prototype = {
|
||||||
},
|
},
|
||||||
|
|
||||||
_isProbablyVisible: function(node) {
|
_isProbablyVisible: function(node) {
|
||||||
return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden");
|
// Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes.
|
||||||
|
return (!node.style || node.style.display != "none")
|
||||||
|
&& !node.hasAttribute("hidden")
|
||||||
|
//check for "fallback-image" so that wikimedia math images are displayed
|
||||||
|
&& (!node.hasAttribute("aria-hidden") || node.getAttribute("aria-hidden") != "true" || (node.className && node.className.indexOf && node.className.indexOf("fallback-image") !== -1));
|
||||||
},
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1739,6 +2030,9 @@ Readability.prototype = {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Unwrap image from noscript
|
||||||
|
this._unwrapNoscriptImages(this._doc);
|
||||||
|
|
||||||
// Remove script tags from the document.
|
// Remove script tags from the document.
|
||||||
this._removeScripts(this._doc);
|
this._removeScripts(this._doc);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue