No bug, update Readability.js to the version in github, rs=margaret,me per discussion earlier today a=readinglist

This commit is contained in:
Gijs Kruitbosch
2015-03-17 20:58:58 -07:00
parent 8bbfbd81c1
commit eb14a0bf25

View File

@@ -25,9 +25,9 @@
* This code is heavily based on Arc90's readability.js (1.7.1) script * This code is heavily based on Arc90's readability.js (1.7.1) script
* available at: http://code.google.com/p/arc90labs-readability * available at: http://code.google.com/p/arc90labs-readability
*/ */
var root = this;
var Readability = function(uri, doc) { var Readability = function(uri, doc) {
const ENABLE_LOGGING = false; var ENABLE_LOGGING = false;
this._uri = uri; this._uri = uri;
this._doc = doc; this._doc = doc;
@@ -53,8 +53,26 @@ var Readability = function(uri, doc) {
// Control whether log messages are sent to the console // Control whether log messages are sent to the console
if (ENABLE_LOGGING) { if (ENABLE_LOGGING) {
this.log = function (msg) { function logEl(e) {
dump("Reader: (Readability) " + msg); var rv = e.nodeName + " ";
if (e.nodeType == e.TEXT_NODE) {
return rv + '("' + e.textContent + '")';
}
var classDesc = e.className && ("." + e.className.replace(/ /g, "."));
var elDesc = e.id ? "(#" + e.id + classDesc + ")" :
(classDesc ? "(" + classDesc + ")" : "");
return rv + elDesc;
}
this.log = function () {
if ("dump" in root) {
var msg = Array.prototype.map.call(arguments, function(x) {
return (x && x.nodeName) ? logEl(x) : x;
}).join(" ");
dump("Reader: (Readability) " + msg + "\n");
} else if ("console" in root) {
var args = ["Reader: (Readability) "].concat(arguments);
console.log.apply(console, args);
}
}; };
} else { } else {
this.log = function () {}; this.log = function () {};
@@ -203,27 +221,15 @@ Readability.prototype = {
_prepDocument: function() { _prepDocument: function() {
var doc = this._doc; var doc = this._doc;
// In some cases a body element can't be found (if the HTML is
// totally hosed for example) so we create a new body node and
// append it to the document.
if (!doc.body) {
var body = doc.createElement("body");
try {
doc.body = body;
} catch(e) {
doc.documentElement.appendChild(body);
this.log(e);
}
}
// Remove all style tags in head // Remove all style tags in head
var styleTags = doc.getElementsByTagName("style"); var styleTags = doc.getElementsByTagName("style");
for (var st = 0; st < styleTags.length; st += 1) { for (var st = 0; st < styleTags.length; st += 1) {
styleTags[st].textContent = ""; styleTags[st].textContent = "";
} }
this._replaceBrs(doc.body); if (doc.body) {
this._replaceBrs(doc.body);
}
var fonts = doc.getElementsByTagName("FONT"); var fonts = doc.getElementsByTagName("FONT");
for (var i = fonts.length; --i >=0;) { for (var i = fonts.length; --i >=0;) {
@@ -412,6 +418,13 @@ Readability.prototype = {
var doc = this._doc; var doc = this._doc;
var isPaging = (page !== null ? true: false); var isPaging = (page !== null ? true: false);
page = page ? page : this._doc.body; page = page ? page : this._doc.body;
// We can't grab an article if we don't have a page!
if (!page) {
this.log("No body found in document. Abort.");
return null;
}
var pageCacheHtml = page.innerHTML; var pageCacheHtml = page.innerHTML;
// Check if any "dir" is set on the toplevel document element // Check if any "dir" is set on the toplevel document element
@@ -576,8 +589,7 @@ Readability.prototype = {
var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)); var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
candidate.readability.contentScore = candidateScore; candidate.readability.contentScore = candidateScore;
this.log('Candidate: ' + candidate + " (" + candidate.className + ":" + this.log('Candidate:', candidate, "with score " + candidateScore);
candidate.id + ") with score " + candidateScore);
for (var t = 0; t < this.N_TOP_CANDIDATES; t++) { for (var t = 0; t < this.N_TOP_CANDIDATES; t++) {
var aTopCandidate = topCandidates[t]; var aTopCandidate = topCandidates[t];
@@ -592,15 +604,18 @@ Readability.prototype = {
} }
var topCandidate = topCandidates[0] || null; var topCandidate = topCandidates[0] || null;
var neededToCreateTopCandidate = false;
// If we still have no top candidate, just use the body as a last resort. // If we still have no top candidate, just use the body as a last resort.
// We also have to copy the body node so it is something we can modify. // We also have to copy the body node so it is something we can modify.
if (topCandidate === null || topCandidate.tagName === "BODY") { if (topCandidate === null || topCandidate.tagName === "BODY") {
// Move all of the page's children into topCandidate // Move all of the page's children into topCandidate
topCandidate = doc.createElement("DIV"); topCandidate = doc.createElement("DIV");
neededToCreateTopCandidate = true;
var children = page.childNodes; var children = page.childNodes;
for (var i = 0; i < children.length; ++i) { while (children.length) {
topCandidate.appendChild(children[i]); this.log("Moving child out:", children[0]);
topCandidate.appendChild(children[0]);
} }
page.appendChild(topCandidate); page.appendChild(topCandidate);
@@ -622,7 +637,7 @@ Readability.prototype = {
var siblingNode = siblingNodes[s]; var siblingNode = siblingNodes[s];
var append = false; var append = false;
this.log("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : '')); this.log("Looking at sibling node:", siblingNode, ((typeof siblingNode.readability !== 'undefined') ? ("with score " + siblingNode.readability.contentScore) : ''));
this.log("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown')); this.log("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
if (siblingNode === topCandidate) if (siblingNode === topCandidate)
@@ -651,7 +666,7 @@ Readability.prototype = {
} }
if (append) { if (append) {
this.log("Appending node: " + siblingNode); this.log("Appending node:", siblingNode);
// siblingNodes is a reference to the childNodes array, and // siblingNodes is a reference to the childNodes array, and
// siblingNode is removed from the array when we call appendChild() // siblingNode is removed from the array when we call appendChild()
@@ -663,14 +678,14 @@ Readability.prototype = {
if (siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") { if (siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
// We have a node that isn't a common block level element, like a form or td tag. // We have a node that isn't a common block level element, like a form or td tag.
// Turn it into a div so it doesn't get filtered out later by accident. */ // Turn it into a div so it doesn't get filtered out later by accident. */
this.log("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); this.log("Altering siblingNode:", siblingNode, 'to div.');
this._setNodeTag(siblingNode, "DIV"); this._setNodeTag(siblingNode, "DIV");
} }
// To ensure a node does not interfere with readability styles, // To ensure a node does not interfere with readability styles,
// remove its classnames. // remove its classnames.
siblingNode.className = ""; siblingNode.removeAttribute("class");
// Append sibling and subtract from our list because it removes // Append sibling and subtract from our list because it removes
// the node when you append to another node. // the node when you append to another node.
@@ -678,20 +693,33 @@ Readability.prototype = {
} }
} }
this.log("Article content pre-prep: " + articleContent.innerHTML);
// So we have all of the content that we need. Now we clean it up for presentation. // So we have all of the content that we need. Now we clean it up for presentation.
this._prepArticle(articleContent); this._prepArticle(articleContent);
this.log("Article content post-prep: " + articleContent.innerHTML);
if (this._curPageNum === 1) { if (this._curPageNum === 1) {
var div = doc.createElement("DIV"); if (neededToCreateTopCandidate) {
div.id = "readability-page-1"; // We already created a fake div thing, and there wouldn't have been any siblings left
div.className = "page"; // for the previous loop, so there's no point trying to create a new div, and then
var children = articleContent.childNodes; // move all the children over. Just assign IDs and class names here. No need to append
for (var i = 0; i < children.length; ++i) { // because that already happened anyway.
div.appendChild(children[i]); topCandidate.id = "readability-page-1";
topCandidate.className = "page";
} else {
var div = doc.createElement("DIV");
div.id = "readability-page-1";
div.className = "page";
var children = articleContent.childNodes;
while (children.length) {
div.appendChild(children[0]);
}
articleContent.appendChild(div);
} }
articleContent.appendChild(div);
} }
this.log("Article content after paging: " + articleContent.innerHTML);
// Now that we've gone through the full algorithm, check to see if // Now that we've gone through the full algorithm, check to see if
// we got any meaningful content. If we didn't, we may need to re-run // we got any meaningful content. If we didn't, we may need to re-run
// grabArticle with different flags set. This gives us a higher likelihood of // grabArticle with different flags set. This gives us a higher likelihood of
@@ -1401,7 +1429,7 @@ Readability.prototype = {
var weight = this._getClassWeight(tagsList[i]); var weight = this._getClassWeight(tagsList[i]);
var contentScore = 0; var contentScore = 0;
this.log("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")"); this.log("Cleaning Conditionally", tagsList[i]);
if (weight + contentScore < 0) { if (weight + contentScore < 0) {
tagsList[i].parentNode.removeChild(tagsList[i]); tagsList[i].parentNode.removeChild(tagsList[i]);
@@ -1508,6 +1536,8 @@ Readability.prototype = {
if (!articleContent) if (!articleContent)
return null; return null;
this.log("Grabbed: " + articleContent.innerHTML);
this._postProcessContent(articleContent); this._postProcessContent(articleContent);
// if (nextPageLink) { // if (nextPageLink) {