Bug 1158228 - Merge github's readability code into m-c. a=sledru
This commit is contained in:
@@ -108,6 +108,9 @@ Readability.prototype = {
|
|||||||
// it quits and just show a link.
|
// it quits and just show a link.
|
||||||
DEFAULT_MAX_PAGES: 5,
|
DEFAULT_MAX_PAGES: 5,
|
||||||
|
|
||||||
|
// Element tags to score by default.
|
||||||
|
DEFAULT_TAGS_TO_SCORE: ["SECTION", "P", "TD", "PRE"],
|
||||||
|
|
||||||
// All of the regular expressions in use within readability.
|
// All of the regular expressions in use within readability.
|
||||||
// Defined up here so we don't instantiate them repeatedly in loops.
|
// Defined up here so we don't instantiate them repeatedly in loops.
|
||||||
REGEXPS: {
|
REGEXPS: {
|
||||||
@@ -119,7 +122,7 @@ Readability.prototype = {
|
|||||||
byline: /byline|author|dateline|writtenby/i,
|
byline: /byline|author|dateline|writtenby/i,
|
||||||
replaceFonts: /<(\/?)font[^>]*>/gi,
|
replaceFonts: /<(\/?)font[^>]*>/gi,
|
||||||
normalize: /\s{2,}/g,
|
normalize: /\s{2,}/g,
|
||||||
videos: /https?:\/\/(www\.)?(youtube|youtube-nocookie|player\.vimeo)\.com/i,
|
videos: /https?:\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
|
||||||
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
|
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
|
||||||
prevLink: /(prev|earl|old|new|<|«)/i,
|
prevLink: /(prev|earl|old|new|<|«)/i,
|
||||||
whitespace: /^\s*$/,
|
whitespace: /^\s*$/,
|
||||||
@@ -186,6 +189,15 @@ Readability.prototype = {
|
|||||||
return Array.prototype.concat.apply([], nodeLists);
|
return Array.prototype.concat.apply([], nodeLists);
|
||||||
},
|
},
|
||||||
|
|
||||||
|
_getAllNodesWithTag: function(node, tagNames) {
|
||||||
|
if (node.querySelectorAll) {
|
||||||
|
return node.querySelectorAll(tagNames.join(','));
|
||||||
|
}
|
||||||
|
return [].concat.apply([], tagNames.map(function(tag) {
|
||||||
|
return node.getElementsByTagName(tag);
|
||||||
|
}));
|
||||||
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Converts each <a> and <img> uri in the given element to an absolute URI.
|
* Converts each <a> and <img> uri in the given element to an absolute URI.
|
||||||
*
|
*
|
||||||
@@ -586,6 +598,18 @@ Readability.prototype = {
|
|||||||
return false;
|
return false;
|
||||||
},
|
},
|
||||||
|
|
||||||
|
_getNodeAncestors: function(node, maxDepth) {
|
||||||
|
maxDepth = maxDepth || 0;
|
||||||
|
var i = 0, ancestors = [];
|
||||||
|
while (node.parentNode) {
|
||||||
|
ancestors.push(node.parentNode)
|
||||||
|
if (maxDepth && ++i === maxDepth)
|
||||||
|
break;
|
||||||
|
node = node.parentNode;
|
||||||
|
}
|
||||||
|
return ancestors;
|
||||||
|
},
|
||||||
|
|
||||||
/***
|
/***
|
||||||
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
|
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
|
||||||
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
|
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
|
||||||
@@ -640,8 +664,9 @@ Readability.prototype = {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE")
|
if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) {
|
||||||
elementsToScore.push(node);
|
elementsToScore.push(node);
|
||||||
|
}
|
||||||
|
|
||||||
// Turn all divs that don't have children block level elements into p's
|
// Turn all divs that don't have children block level elements into p's
|
||||||
if (node.tagName === "DIV") {
|
if (node.tagName === "DIV") {
|
||||||
@@ -680,30 +705,18 @@ Readability.prototype = {
|
|||||||
**/
|
**/
|
||||||
var candidates = [];
|
var candidates = [];
|
||||||
this._forEachNode(elementsToScore, function(elementToScore) {
|
this._forEachNode(elementsToScore, function(elementToScore) {
|
||||||
var parentNode = elementToScore.parentNode;
|
if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === 'undefined')
|
||||||
var grandParentNode = parentNode ? parentNode.parentNode : null;
|
|
||||||
var innerText = this._getInnerText(elementToScore);
|
|
||||||
|
|
||||||
if (!parentNode || typeof(parentNode.tagName) === 'undefined')
|
|
||||||
return;
|
return;
|
||||||
|
|
||||||
// If this paragraph is less than 25 characters, don't even count it.
|
// If this paragraph is less than 25 characters, don't even count it.
|
||||||
|
var innerText = this._getInnerText(elementToScore);
|
||||||
if (innerText.length < 25)
|
if (innerText.length < 25)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
// Initialize readability data for the parent.
|
// Exclude nodes with no ancestor.
|
||||||
if (typeof parentNode.readability === 'undefined') {
|
var ancestors = this._getNodeAncestors(elementToScore, 3);
|
||||||
this._initializeNode(parentNode);
|
if (ancestors.length === 0)
|
||||||
candidates.push(parentNode);
|
return;
|
||||||
}
|
|
||||||
|
|
||||||
// Initialize readability data for the grandparent.
|
|
||||||
if (grandParentNode &&
|
|
||||||
typeof(grandParentNode.readability) === 'undefined' &&
|
|
||||||
typeof(grandParentNode.tagName) !== 'undefined') {
|
|
||||||
this._initializeNode(grandParentNode);
|
|
||||||
candidates.push(grandParentNode);
|
|
||||||
}
|
|
||||||
|
|
||||||
var contentScore = 0;
|
var contentScore = 0;
|
||||||
|
|
||||||
@@ -716,11 +729,18 @@ Readability.prototype = {
|
|||||||
// For every 100 characters in this paragraph, add another point. Up to 3 points.
|
// For every 100 characters in this paragraph, add another point. Up to 3 points.
|
||||||
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
|
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
|
||||||
|
|
||||||
// Add the score to the parent. The grandparent gets half.
|
// Initialize and score ancestors.
|
||||||
parentNode.readability.contentScore += contentScore;
|
this._forEachNode(ancestors, function(ancestor, level) {
|
||||||
|
if (!ancestor.tagName)
|
||||||
|
return;
|
||||||
|
|
||||||
if (grandParentNode)
|
if (typeof(ancestor.readability) === 'undefined') {
|
||||||
grandParentNode.readability.contentScore += contentScore / 2;
|
this._initializeNode(ancestor);
|
||||||
|
candidates.push(ancestor);
|
||||||
|
}
|
||||||
|
|
||||||
|
ancestor.readability.contentScore += contentScore / (level === 0 ? 1 : level * 2);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
// After we've calculated scores, loop through all of the possible
|
// After we've calculated scores, loop through all of the possible
|
||||||
@@ -848,10 +868,6 @@ Readability.prototype = {
|
|||||||
sibling = this._setNodeTag(sibling, "DIV");
|
sibling = this._setNodeTag(sibling, "DIV");
|
||||||
}
|
}
|
||||||
|
|
||||||
// To ensure a node does not interfere with readability styles,
|
|
||||||
// remove its classnames.
|
|
||||||
sibling.removeAttribute("class");
|
|
||||||
|
|
||||||
articleContent.appendChild(sibling);
|
articleContent.appendChild(sibling);
|
||||||
// siblings is a reference to the children array, and
|
// siblings is a reference to the children array, and
|
||||||
// sibling is removed from the array when we call appendChild().
|
// sibling is removed from the array when we call appendChild().
|
||||||
@@ -953,7 +969,7 @@ Readability.prototype = {
|
|||||||
var elementName = element.getAttribute("name");
|
var elementName = element.getAttribute("name");
|
||||||
var elementProperty = element.getAttribute("property");
|
var elementProperty = element.getAttribute("property");
|
||||||
|
|
||||||
if (elementName === "author") {
|
if ([elementName, elementProperty].indexOf("author") !== -1) {
|
||||||
metadata.byline = element.getAttribute("content");
|
metadata.byline = element.getAttribute("content");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -1597,6 +1613,7 @@ Readability.prototype = {
|
|||||||
|
|
||||||
var tagsList = e.getElementsByTagName(tag);
|
var tagsList = e.getElementsByTagName(tag);
|
||||||
var curTagsLength = tagsList.length;
|
var curTagsLength = tagsList.length;
|
||||||
|
var isList = tag === "ul" || tag === "ol";
|
||||||
|
|
||||||
// Gather counts for other typical elements embedded within.
|
// Gather counts for other typical elements embedded within.
|
||||||
// Traverse backwards so we can remove nodes at the same time
|
// Traverse backwards so we can remove nodes at the same time
|
||||||
@@ -1632,13 +1649,13 @@ Readability.prototype = {
|
|||||||
var toRemove = false;
|
var toRemove = false;
|
||||||
if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
|
if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
|
||||||
toRemove = true;
|
toRemove = true;
|
||||||
} else if (li > p && tag !== "ul" && tag !== "ol") {
|
} else if (!isList && li > p) {
|
||||||
toRemove = true;
|
toRemove = true;
|
||||||
} else if ( input > Math.floor(p/3) ) {
|
} else if (input > Math.floor(p/3)) {
|
||||||
toRemove = true;
|
toRemove = true;
|
||||||
} else if (contentLength < 25 && (img === 0 || img > 2) ) {
|
} else if (!isList && contentLength < 25 && (img === 0 || img > 2)) {
|
||||||
toRemove = true;
|
toRemove = true;
|
||||||
} else if (weight < 25 && linkDensity > 0.2) {
|
} else if (!isList && weight < 25 && linkDensity > 0.2) {
|
||||||
toRemove = true;
|
toRemove = true;
|
||||||
} else if (weight >= 25 && linkDensity > 0.5) {
|
} else if (weight >= 25 && linkDensity > 0.5) {
|
||||||
toRemove = true;
|
toRemove = true;
|
||||||
@@ -1663,7 +1680,7 @@ Readability.prototype = {
|
|||||||
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
|
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
|
||||||
var headers = e.getElementsByTagName('h' + headerIndex);
|
var headers = e.getElementsByTagName('h' + headerIndex);
|
||||||
for (var i = headers.length - 1; i >= 0; i -= 1) {
|
for (var i = headers.length - 1; i >= 0; i -= 1) {
|
||||||
if (this._getClassWeight(headers[i]) < 0 || this._getLinkDensity(headers[i]) > 0.33)
|
if (this._getClassWeight(headers[i]) < 0)
|
||||||
headers[i].parentNode.removeChild(headers[i]);
|
headers[i].parentNode.removeChild(headers[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1686,32 +1703,42 @@ Readability.prototype = {
|
|||||||
*
|
*
|
||||||
* @return boolean Whether or not we suspect parse() will suceeed at returning an article object.
|
* @return boolean Whether or not we suspect parse() will suceeed at returning an article object.
|
||||||
*/
|
*/
|
||||||
isProbablyReaderable: function() {
|
isProbablyReaderable: function(helperIsVisible) {
|
||||||
var nodes = this._doc.getElementsByTagName("p");
|
var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);
|
||||||
if (nodes.length < 5) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
var possibleParagraphs = 0;
|
// FIXME we should have a fallback for helperIsVisible, but this is
|
||||||
for (var i = 0; i < nodes.length; i++) {
|
// problematic because of jsdom's elem.style handling - see
|
||||||
var node = nodes[i];
|
// https://github.com/mozilla/readability/pull/186 for context.
|
||||||
|
|
||||||
|
var score = 0;
|
||||||
|
// This is a little cheeky, we use the accumulator 'score' to decide what to return from
|
||||||
|
// this callback:
|
||||||
|
return this._someNode(nodes, function(node) {
|
||||||
|
if (helperIsVisible && !helperIsVisible(node))
|
||||||
|
return false;
|
||||||
var matchString = node.className + " " + node.id;
|
var matchString = node.className + " " + node.id;
|
||||||
|
|
||||||
if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
|
if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
|
||||||
!this.REGEXPS.okMaybeItsACandidate.test(matchString)) {
|
!this.REGEXPS.okMaybeItsACandidate.test(matchString)) {
|
||||||
continue;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (node.textContent.trim().length < 100) {
|
if (node.matches && node.matches("li p")) {
|
||||||
continue;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
possibleParagraphs++;
|
var textContentLength = node.textContent.trim().length;
|
||||||
if (possibleParagraphs >= 5) {
|
if (textContentLength < 140) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
score += Math.sqrt(textContentLength - 140);
|
||||||
|
|
||||||
|
if (score > 20) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
return false;
|
return false;
|
||||||
|
});
|
||||||
},
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
Reference in New Issue
Block a user