1 /*jslint undef: true, nomen: true, eqeqeq: true, plusplus: true, newcap: true, immed: true, browser: true, devel: true, passfail: false */
2 /*global window: false, readConvertLinksToFootnotes: false, readStyle: false, readSize: false, readMargin: false, Typekit: false, ActiveXObject: false */
/**
 * Write a debug message to the browser console, prefixed with "Readability: ".
 * Silently does nothing when no console is available, so a debug call can
 * never abort the code that issued it.
 *
 * @param s the message; it is string-concatenated onto the prefix
 * @return void
 */
var dbg = function(s) {
    var sink = (typeof window !== "undefined") ? window.console : undefined;

    /* Truthiness check only: some hosts expose console.log as a non-"function" host object. */
    if (sink && sink.log) {
        sink.log("Readability: " + s);
    }
};
9 * Readability. An Arc90 Lab Experiment.
10 * Website: http://lab.arc90.com/experiments/readability
11 * Source: http://code.google.com/p/arc90labs-readability
13 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.
15 * Copyright (c) 2010 Arc90 Inc
16 * Readability is licensed under the Apache License, Version 2.0.
20 emailSrc: 'http://lab.arc90.com/experiments/readability/email.php',
22 convertLinksToFootnotes: false,
23 reversePageScroll: false, /* If they hold shift and hit space, scroll up */
25 * The frame hack is to work around a Firefox bug where if you
26 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.
27 * So we fake a scrollbar in the wrapping div.
30 bodyCache: null, /* Cache the body HTML in case we need to re-use it later */
31 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */
34 FLAG_STRIP_UNLIKELYS: 0x1,
35 FLAG_WEIGHT_CLASSES: 0x2,
36 FLAG_CLEAN_CONDITIONALLY: 0x4,
38 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */
39 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */
40 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */
43 * All of the regular expressions in use within readability.
44 * Defined up here so we don't instantiate them repeatedly in loops.
47 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
48 okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
49 positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
50 negative: /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
51 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,
52 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
53 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi,
54 replaceFonts: /<(\/?)font[^>]*>/gi,
57 killBreaks: /(<br\s*\/?>(\s| ?)*){1,}/g,
58 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
59 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
60 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.
61 prevLink: /(prev|earl|old|new|<|«)/i
68 * 1. Prep the document by removing script tags, css, etc.
69 * 2. Build readability's DOM tree.
70 * 3. Grab the article content from the current dom tree.
71 * 4. Replace the current DOM tree with the new one.
77 /* Before we do anything, remove all scripts that are not readability. */
78 window.onload = window.onunload = function() {};
80 readability.removeScripts(document);
82 if(document.body && !readability.bodyCache) {
83 readability.bodyCache = document.body.innerHTML;
86 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */
87 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;
89 /* Pull out any possible next page link first */
90 var nextPageLink = readability.findNextPageLink(document.body);
92 readability.prepDocument();
94 /* Build readability's DOM tree */
95 var overlay = document.createElement("DIV");
96 var innerDiv = document.createElement("DIV");
97 var articleTools = readability.getArticleTools();
98 var articleTitle = readability.getArticleTitle();
99 var articleContent = readability.grabArticle();
100 var articleFooter = readability.getArticleFooter();
102 if(!articleContent) {
103 articleContent = document.createElement("DIV");
104 articleContent.id = "readability-content";
105 articleContent.innerHTML = [
106 "<p>Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please <a href='http://code.google.com/p/arc90labs-readability/issues/entry'>let us know by submitting an issue.</a></p>",
107 (readability.frameHack ? "<p><strong>It appears this page uses frames.</strong> Unfortunately, browser security properties often cause Readability to fail on pages that include frames. You may want to try running readability itself on this source page: <a href='" + readability.biggestFrame.src + "'>" + readability.biggestFrame.src + "</a></p>" : ""),
108 "<p>Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.</p>"
114 overlay.id = "readOverlay";
115 innerDiv.id = "readInner";
117 /* Apply user-selected styling */
118 document.body.className = readStyle;
119 document.dir = readability.getSuggestedDirection(articleTitle.innerHTML);
121 if (readStyle === "style-athelas" || readStyle === "style-apertura"){
122 overlay.className = readStyle + " rdbTypekit";
125 overlay.className = readStyle;
127 innerDiv.className = readMargin + " " + readSize;
129 if(typeof(readConvertLinksToFootnotes) !== 'undefined' && readConvertLinksToFootnotes === true) {
130 readability.convertLinksToFootnotes = true;
133 /* Glue the structure of our document together. */
134 innerDiv.appendChild( articleTitle );
135 innerDiv.appendChild( articleContent );
136 innerDiv.appendChild( articleFooter );
137 overlay.appendChild( articleTools );
138 overlay.appendChild( innerDiv );
140 /* Clear the old HTML, insert the new content. */
141 document.body.innerHTML = "";
142 document.body.insertBefore(overlay, document.body.firstChild);
143 document.body.removeAttribute('style');
145 if(readability.frameHack)
147 var readOverlay = document.getElementById('readOverlay');
148 readOverlay.style.height = '100%';
149 readOverlay.style.overflow = 'auto';
153 * If someone tries to use Readability on a site's root page, give them a warning about usage.
155 if((window.location.protocol + "//" + window.location.host + "/") === window.location.href)
157 articleContent.style.display = "none";
158 var rootWarning = document.createElement('p');
159 rootWarning.id = "readability-warning";
160 rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " +
161 "If you'd like to try rendering this page anyway, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue.";
163 innerDiv.insertBefore( rootWarning, articleContent );
166 readability.postProcessContent(articleContent);
168 window.scrollTo(0, 0);
170 /* If we're using the Typekit library, select the font */
171 if (readStyle === "style-athelas" || readStyle === "style-apertura") {
172 readability.useRdbTypekit();
177 * Append any additional pages after a small timeout so that people
178 * can start reading without having to wait for this to finish processing.
180 window.setTimeout(function() {
181 readability.appendNextPage(nextPageLink);
185 /** Smooth scrolling **/
186 document.onkeydown = function(e) {
187 var code = (window.event) ? event.keyCode : e.keyCode;
189 readability.reversePageScroll = true;
194 readability.curScrollStep = 0;
195 var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight);
197 if(readability.reversePageScroll) {
198 readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10);
201 readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10);
208 document.onkeyup = function(e) {
209 var code = (window.event) ? event.keyCode : e.keyCode;
211 readability.reversePageScroll = false;
218 * Run any post-process modifications to article content as necessary.
223 postProcessContent: function(articleContent) {
224 if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) {
225 readability.addFootnotes(articleContent);
228 readability.fixImageFloats(articleContent);
232 * Some content ends up looking ugly if the image is too large to be floated.
233 * If the image is wider than a threshold (currently 55%), no longer float it,
239 fixImageFloats: function (articleContent) {
240 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55,
241 images = articleContent.getElementsByTagName('img');
243 for(var i=0, il = images.length; i < il; i+=1) {
244 var image = images[i];
246 if(image.offsetWidth > imageWidthThreshold) {
247 image.className += " blockImage";
253 * Get the article tools Element that has buttons like reload, print, email.
257 getArticleTools: function () {
258 var articleTools = document.createElement("DIV");
260 articleTools.id = "readTools";
261 articleTools.innerHTML =
262 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +
263 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +
264 "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>";
270 * returns the suggested direction of the string
272 * @return "rtl" || "ltr"
274 getSuggestedDirection: function(text) {
275 function sanitizeText() {
276 return text.replace(/@\w+/, "");
279 function countMatches(match) {
280 var matches = text.match(new RegExp(match, "g"));
281 return matches !== null ? matches.length : 0;
285 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");
286 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");
288 // if 20% of chars are Hebrew or Arbic then direction is rtl
289 return (count_heb + count_arb) * 100 / text.length > 20;
292 text = sanitizeText(text);
293 return isRTL() ? "rtl" : "ltr";
298 * Get the article title as an H1.
302 getArticleTitle: function () {
307 curTitle = origTitle = document.title;
309 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */
310 curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]);
315 if(curTitle.match(/ [\|\-] /))
317 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
319 if(curTitle.split(' ').length < 3) {
320 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
323 else if(curTitle.indexOf(': ') !== -1)
325 curTitle = origTitle.replace(/.*:(.*)/gi, '$1');
327 if(curTitle.split(' ').length < 3) {
328 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');
331 else if(curTitle.length > 150 || curTitle.length < 15)
333 var hOnes = document.getElementsByTagName('h1');
334 if(hOnes.length === 1)
336 curTitle = readability.getInnerText(hOnes[0]);
340 curTitle = curTitle.replace( readability.regexps.trim, "" );
342 if(curTitle.split(' ').length <= 4) {
343 curTitle = origTitle;
346 var articleTitle = document.createElement("H1");
347 articleTitle.innerHTML = curTitle;
353 * Get the footer with the readability mark etc.
357 getArticleFooter: function () {
358 var articleFooter = document.createElement("DIV");
361 * For research purposes, generate an img src that contains the chosen readstyle etc,
362 * so we can generate aggregate stats and change styles based on them in the future
364 // var statsQueryParams = "?readStyle=" + encodeURIComponent(readStyle) + "&readMargin=" + encodeURIComponent(readMargin) + "&readSize=" + encodeURIComponent(readSize);
365 /* TODO: attach this to an image */
367 articleFooter.id = "readFooter";
368 articleFooter.innerHTML = [
369 "<div id='rdb-footer-print'>Excerpted from <cite>" + document.title + "</cite><br />" + window.location.href + "</div>",
370 "<div id='rdb-footer-wrapper'>",
371 "<div id='rdb-footer-left'>",
372 "<a href='http://lab.arc90.com/experiments/readability' id='readability-logo'>Readability — </a>",
373 "<a href='http://www.arc90.com/' id='arc90-logo'> An Arc90 Laboratory Experiment </a>",
374 " <span id='readability-url'> http://lab.arc90.com/experiments/readability</span>",
376 "<div id='rdb-footer-right'>",
377 "<a href='http://www.twitter.com/arc90' class='footer-twitterLink'>Follow us on Twitter »</a>",
378 "<span class='version'>Readability version " + readability.version + "</span>",
382 return articleFooter;
386 * Prepare the HTML document for readability to scrape it.
387 * This includes things like stripping javascript, CSS, and handling terrible markup.
391 prepDocument: function () {
393 * In some cases a body element can't be found (if the HTML is totally hosed for example)
394 * so we create a new body node and append it to the document.
396 if(document.body === null)
398 var body = document.createElement("body");
400 document.body = body;
403 document.documentElement.appendChild(body);
408 document.body.id = "readabilityBody";
410 var frames = document.getElementsByTagName('frame');
411 if(frames.length > 0)
413 var bestFrame = null;
414 var bestFrameSize = 0; /* The frame to try to run readability upon. Must be on same domain. */
415 var biggestFrameSize = 0; /* Used for the error message. Can be on any domain. */
416 for(var frameIndex = 0; frameIndex < frames.length; frameIndex+=1)
418 var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight;
419 var canAccessFrame = false;
421 var frameBody = frames[frameIndex].contentWindow.document.body;
422 canAccessFrame = true;
428 if(frameSize > biggestFrameSize) {
429 biggestFrameSize = frameSize;
430 readability.biggestFrame = frames[frameIndex];
433 if(canAccessFrame && frameSize > bestFrameSize)
435 readability.frameHack = true;
437 bestFrame = frames[frameIndex];
438 bestFrameSize = frameSize;
444 var newBody = document.createElement('body');
445 newBody.innerHTML = bestFrame.contentWindow.document.body.innerHTML;
446 newBody.style.overflow = 'scroll';
447 document.body = newBody;
449 var frameset = document.getElementsByTagName('frameset')[0];
451 frameset.parentNode.removeChild(frameset); }
455 /* Remove all stylesheets */
456 for (var k=0;k < document.styleSheets.length; k+=1) {
457 if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") === -1) {
458 document.styleSheets[k].disabled = true;
462 /* Remove all style tags in head (not doing this on IE) - TODO: Why not? */
463 var styleTags = document.getElementsByTagName("style");
464 for (var st=0;st < styleTags.length; st+=1) {
465 styleTags[st].textContent = "";
468 /* Turn all double br's into p's */
469 /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
470 document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '</p><p>').replace(readability.regexps.replaceFonts, '<$1span>');
474 * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
475 * @see http://www.roughtype.com/archives/2010/05/experiments_in.php
479 addFootnotes: function(articleContent) {
480 var footnotesWrapper = document.getElementById('readability-footnotes'),
481 articleFootnotes = document.getElementById('readability-footnotes-list');
483 if(!footnotesWrapper) {
484 footnotesWrapper = document.createElement("DIV");
485 footnotesWrapper.id = 'readability-footnotes';
486 footnotesWrapper.innerHTML = '<h3>References</h3>';
487 footnotesWrapper.style.display = 'none'; /* Until we know we have footnotes, don't show the references block. */
489 articleFootnotes = document.createElement('ol');
490 articleFootnotes.id = 'readability-footnotes-list';
492 footnotesWrapper.appendChild(articleFootnotes);
494 var readFooter = document.getElementById('readFooter');
497 readFooter.parentNode.insertBefore(footnotesWrapper, readFooter);
501 var articleLinks = articleContent.getElementsByTagName('a');
502 var linkCount = articleFootnotes.getElementsByTagName('li').length;
503 for (var i = 0; i < articleLinks.length; i+=1)
505 var articleLink = articleLinks[i],
506 footnoteLink = articleLink.cloneNode(true),
507 refLink = document.createElement('a'),
508 footnote = document.createElement('li'),
509 linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host,
510 linkText = readability.getInnerText(articleLink);
512 if(articleLink.className && articleLink.className.indexOf('readability-DoNotFootnote') !== -1 || linkText.match(readability.regexps.skipFootnoteLink)) {
518 /** Add a superscript reference after the article link */
519 refLink.href = '#readabilityFootnoteLink-' + linkCount;
520 refLink.innerHTML = '<small><sup>[' + linkCount + ']</sup></small>';
521 refLink.className = 'readability-DoNotFootnote';
522 try { refLink.style.color = 'inherit'; } catch(e) {} /* IE7 doesn't like inherit. */
524 if(articleLink.parentNode.lastChild === articleLink) {
525 articleLink.parentNode.appendChild(refLink);
527 articleLink.parentNode.insertBefore(refLink, articleLink.nextSibling);
530 articleLink.name = 'readabilityLink-' + linkCount;
531 try { articleLink.style.color = 'inherit'; } catch(err) {} /* IE7 doesn't like inherit. */
533 footnote.innerHTML = "<small><sup><a href='#readabilityLink-" + linkCount + "' title='Jump to Link in Article'>^</a></sup></small> ";
535 footnoteLink.innerHTML = (footnoteLink.title ? footnoteLink.title : linkText);
536 footnoteLink.name = 'readabilityFootnoteLink-' + linkCount;
538 footnote.appendChild(footnoteLink);
539 footnote.innerHTML = footnote.innerHTML + "<small> (" + linkDomain + ")</small>";
541 articleFootnotes.appendChild(footnote);
545 footnotesWrapper.style.display = 'block';
549 useRdbTypekit: function () {
550 var rdbHead = document.getElementsByTagName('head')[0];
551 var rdbTKScript = document.createElement('script');
552 var rdbTKCode = null;
554 var rdbTKLink = document.createElement('a');
555 rdbTKLink.setAttribute('class','rdbTK-powered');
556 rdbTKLink.setAttribute('title','Fonts by Typekit');
557 rdbTKLink.innerHTML = "Fonts by <span class='rdbTK'>Typekit</span>";
559 if (readStyle === "style-athelas") {
560 rdbTKCode = "sxt6vzy";
561 dbg("Using Athelas Theme");
563 rdbTKLink.setAttribute('href','http://typekit.com/?utm_source=readability&utm_medium=affiliate&utm_campaign=athelas');
564 rdbTKLink.setAttribute('id','rdb-athelas');
565 document.getElementById("rdb-footer-right").appendChild(rdbTKLink);
567 if (readStyle === "style-apertura") {
568 rdbTKCode = "bae8ybu";
569 dbg("Using Inverse Theme");
571 rdbTKLink.setAttribute('href','http://typekit.com/?utm_source=readability&utm_medium=affiliate&utm_campaign=inverse');
572 rdbTKLink.setAttribute('id','rdb-inverse');
573 document.getElementById("rdb-footer-right").appendChild(rdbTKLink);
577 * Setting new script tag attributes to pull Typekits libraries
579 rdbTKScript.setAttribute('type','text/javascript');
580 rdbTKScript.setAttribute('src',"http://use.typekit.com/" + rdbTKCode + ".js");
581 rdbTKScript.setAttribute('charset','UTF-8');
582 rdbHead.appendChild(rdbTKScript);
585 * In the future, maybe try using the following experimental Callback function?:
586 * http://gist.github.com/192350
588 * http://getsatisfaction.com/typekit/topics/support_a_pre_and_post_load_callback_function
590 var typekitLoader = function() {
591 dbg("Looking for Typekit.");
592 if(typeof Typekit !== "undefined") {
594 dbg("Caught typekit");
596 clearInterval(window.typekitInterval);
598 dbg("Typekit error: " + e);
603 window.typekitInterval = window.setInterval(typekitLoader, 100);
607 * Prepare the article node for display. Clean out any inline styles,
608 * iframes, forms, strip extraneous <p> tags, etc.
613 prepArticle: function (articleContent) {
614 readability.cleanStyles(articleContent);
615 readability.killBreaks(articleContent);
617 /* Clean out junk from the article content */
618 readability.cleanConditionally(articleContent, "form");
619 readability.clean(articleContent, "object");
620 readability.clean(articleContent, "h1");
623 * If there is only one h2, they are probably using it
624 * as a header and not a subheader, so remove it since we already have a header.
626 if(articleContent.getElementsByTagName('h2').length === 1) {
627 readability.clean(articleContent, "h2");
629 readability.clean(articleContent, "iframe");
631 readability.cleanHeaders(articleContent);
633 /* Do these last as the previous stuff may have removed junk that will affect these */
634 readability.cleanConditionally(articleContent, "table");
635 readability.cleanConditionally(articleContent, "ul");
636 readability.cleanConditionally(articleContent, "div");
638 /* Remove extra paragraphs */
639 var articleParagraphs = articleContent.getElementsByTagName('p');
640 for(var i = articleParagraphs.length-1; i >= 0; i-=1) {
641 var imgCount = articleParagraphs[i].getElementsByTagName('img').length;
642 var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
643 var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
645 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') {
646 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
651 articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p');
654 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e);
659 * Initialize a node with the readability object. Also checks the
660 * className/id for special names to add to its score.
665 initializeNode: function (node) {
666 node.readability = {"contentScore": 0};
668 switch(node.tagName) {
670 node.readability.contentScore += 5;
676 node.readability.contentScore += 3;
687 node.readability.contentScore -= 3;
697 node.readability.contentScore -= 5;
701 node.readability.contentScore += readability.getClassWeight(node);
705 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
706 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
708 * @param page a document to run upon. Needs to be a full document, complete with body.
711 grabArticle: function (page) {
712 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS),
713 isPaging = (page !== null) ? true: false;
715 page = page ? page : document.body;
717 var pageCacheHtml = page.innerHTML;
719 var allElements = page.getElementsByTagName('*');
722 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
723 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
725 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
726 * TODO: Shouldn't this be a reverse traversal?
729 var nodesToScore = [];
730 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {
731 /* Remove unlikely candidates */
732 if (stripUnlikelyCandidates) {
733 var unlikelyMatchString = node.className + node.id;
736 unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 &&
737 unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 &&
738 node.tagName !== "BODY"
742 dbg("Removing unlikely candidate - " + unlikelyMatchString);
743 node.parentNode.removeChild(node);
749 if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") {
750 nodesToScore[nodesToScore.length] = node;
753 /* Turn all divs that don't have children block level elements into p's */
754 if (node.tagName === "DIV") {
755 if (node.innerHTML.search(readability.regexps.divToPElements) === -1) {
756 var newNode = document.createElement('p');
758 newNode.innerHTML = node.innerHTML;
759 node.parentNode.replaceChild(newNode, node);
762 nodesToScore[nodesToScore.length] = node;
765 dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e);
771 for(var i = 0, il = node.childNodes.length; i < il; i+=1) {
772 var childNode = node.childNodes[i];
773 if(childNode.nodeType === 3) { // Node.TEXT_NODE
774 var p = document.createElement('p');
775 p.innerHTML = childNode.nodeValue;
776 p.style.display = 'inline';
777 p.className = 'readability-styled';
778 childNode.parentNode.replaceChild(p, childNode);
786 * Loop through all paragraphs, and assign a score to them based on how content-y they look.
787 * Then add their score to their parent node.
789 * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
792 for (var pt=0; pt < nodesToScore.length; pt+=1) {
793 var parentNode = nodesToScore[pt].parentNode;
794 var grandParentNode = parentNode ? parentNode.parentNode : null;
795 var innerText = readability.getInnerText(nodesToScore[pt]);
797 if(!parentNode || typeof(parentNode.tagName) === 'undefined') {
801 /* If this paragraph is less than 25 characters, don't even count it. */
802 if(innerText.length < 25) {
805 /* Initialize readability data for the parent. */
806 if(typeof parentNode.readability === 'undefined') {
807 readability.initializeNode(parentNode);
808 candidates.push(parentNode);
811 /* Initialize readability data for the grandparent. */
812 if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') {
813 readability.initializeNode(grandParentNode);
814 candidates.push(grandParentNode);
817 var contentScore = 0;
819 /* Add a point for the paragraph itself as a base. */
822 /* Add points for any commas within this paragraph */
823 contentScore += innerText.split(',').length;
825 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
826 contentScore += Math.min(Math.floor(innerText.length / 100), 3);
828 /* Add the score to the parent. The grandparent gets half. */
829 parentNode.readability.contentScore += contentScore;
831 if(grandParentNode) {
832 grandParentNode.readability.contentScore += contentScore/2;
837 * After we've calculated scores, loop through all of the possible candidate nodes we found
838 * and find the one with the highest score.
840 var topCandidate = null;
841 for(var c=0, cl=candidates.length; c < cl; c+=1)
844 * Scale the final candidates score based on link density. Good content should have a
845 * relatively small link density (5% or less) and be mostly unaffected by this operation.
847 candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c]));
849 dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore);
851 if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) {
852 topCandidate = candidates[c]; }
856 * If we still have no top candidate, just use the body as a last resort.
857 * We also have to copy the body node so it is something we can modify.
859 if (topCandidate === null || topCandidate.tagName === "BODY")
861 topCandidate = document.createElement("DIV");
862 topCandidate.innerHTML = page.innerHTML;
864 page.appendChild(topCandidate);
865 readability.initializeNode(topCandidate);
869 * Now that we have the top candidate, look through its siblings for content that might also be related.
870 * Things like preambles, content split by ads that we removed, etc.
872 var articleContent = document.createElement("DIV");
874 articleContent.id = "readability-content";
876 var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
877 var siblingNodes = topCandidate.parentNode.childNodes;
880 for(var s=0, sl=siblingNodes.length; s < sl; s+=1) {
881 var siblingNode = siblingNodes[s];
885 * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.
886 * Example of error visible here: http://www.esquire.com/features/honesty0707
892 dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
893 dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
895 if(siblingNode === topCandidate)
900 var contentBonus = 0;
901 /* Give a bonus if sibling nodes and top candidates have the exact same classname */
902 if(siblingNode.className === topCandidate.className && topCandidate.className !== "") {
903 contentBonus += topCandidate.readability.contentScore * 0.2;
906 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
911 if(siblingNode.nodeName === "P") {
912 var linkDensity = readability.getLinkDensity(siblingNode);
913 var nodeContent = readability.getInnerText(siblingNode);
914 var nodeLength = nodeContent.length;
916 if(nodeLength > 80 && linkDensity < 0.25)
920 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
927 dbg("Appending node: " + siblingNode);
929 var nodeToAppend = null;
930 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
931 /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
933 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
934 nodeToAppend = document.createElement("DIV");
936 nodeToAppend.id = siblingNode.id;
937 nodeToAppend.innerHTML = siblingNode.innerHTML;
940 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
941 nodeToAppend = siblingNode;
946 nodeToAppend = siblingNode;
951 /* To ensure a node does not interfere with readability styles, remove its classnames */
952 nodeToAppend.className = "";
954 /* Append sibling and subtract from our list because it removes the node when you append to another node */
955 articleContent.appendChild(nodeToAppend);
960 * So we have all of the content that we need. Now we clean it up for presentation.
962 readability.prepArticle(articleContent);
964 if (readability.curPageNum === 1) {
965 articleContent.innerHTML = '<div id="readability-page-1" class="page">' + articleContent.innerHTML + '</div>';
969 * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
970 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
971 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
972 * finding the -right- content.
974 if(readability.getInnerText(articleContent, false).length < 250) {
975 page.innerHTML = pageCacheHtml;
977 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
978 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
979 return readability.grabArticle(page);
981 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
982 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
983 return readability.grabArticle(page);
985 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
986 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);
987 return readability.grabArticle(page);
993 return articleContent;
997 * Removes script tags from the document.
1001 removeScripts: function (doc) {
1002 var scripts = doc.getElementsByTagName('script');
1003 for(var i = scripts.length-1; i >= 0; i-=1)
1005 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))
1007 scripts[i].nodeValue="";
1008 scripts[i].removeAttribute('src');
1009 if (scripts[i].parentNode) {
1010 scripts[i].parentNode.removeChild(scripts[i]);
1017 * Get the inner text of a node - cross browser compatibly.
1018 * This also strips out any excess whitespace to be found.
1023 getInnerText: function (e, normalizeSpaces) {
1024 var textContent = "";
1026 if(typeof(e.textContent) === "undefined" && typeof(e.innerText) === "undefined") {
1030 normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
1032 if (navigator.appName === "Microsoft Internet Explorer") {
1033 textContent = e.innerText.replace( readability.regexps.trim, "" ); }
1035 textContent = e.textContent.replace( readability.regexps.trim, "" ); }
1037 if(normalizeSpaces) {
1038 return textContent.replace( readability.regexps.normalize, " "); }
1040 return textContent; }
1044 * Get the number of times a string s appears in the node e.
1047 * @param string - what to split on. Default is ","
1048 * @return number (integer)
1050 getCharCount: function (e,s) {
1052 return readability.getInnerText(e).split(s).length-1;
1056 * Remove the style attribute on every e and under.
1057 * TODO: Test if getElementsByTagName(*) is faster.
1062 cleanStyles: function (e) {
1064 var cur = e.firstChild;
1069 // Remove any root styles, if we're able.
1070 if(typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') {
1071 e.removeAttribute('style'); }
1073 // Go until there are no more child nodes
1074 while ( cur !== null ) {
1075 if ( cur.nodeType === 1 ) {
1076 // Remove style attribute(s) :
1077 if(cur.className !== "readability-styled") {
1078 cur.removeAttribute("style");
1080 readability.cleanStyles( cur );
1082 cur = cur.nextSibling;
1087 * Get the density of links as a percentage of the content
1088 * This is the amount of text that is inside a link divided by the total text in the node.
1091 * @return number (float)
1093 getLinkDensity: function (e) {
1094 var links = e.getElementsByTagName("a");
1095 var textLength = readability.getInnerText(e).length;
1097 for(var i=0, il=links.length; i<il;i+=1)
1099 linkLength += readability.getInnerText(links[i]).length;
1102 return linkLength / textLength;
1106 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
1109 * @return string the base url
1111 findBaseUrl: function () {
1112 var noUrlParams = window.location.pathname.split("?")[0],
1113 urlSlashes = noUrlParams.split("/").reverse(),
1114 cleanedSegments = [],
1117 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {
1118 var segment = urlSlashes[i];
1120 // Split off and save anything that looks like a file type.
1121 if (segment.indexOf(".") !== -1) {
1122 possibleType = segment.split(".")[1];
1124 /* If the type isn't alpha-only, it's probably not actually a file extension. */
1125 if(!possibleType.match(/[^a-zA-Z]/)) {
1126 segment = segment.split(".")[0];
1131 * EW-CMS specific segment replacement. Ugly.
1132 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
1134 if(segment.indexOf(',00') !== -1) {
1135 segment = segment.replace(',00', '');
1138 // If our first or second segment has anything looking like a page number, remove it.
1139 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) {
1140 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
1146 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */
1147 if (i < 2 && segment.match(/^\d{1,2}$/)) {
1151 /* If this is the first segment and it's just "index", remove it. */
1152 if(i === 0 && segment.toLowerCase() === "index") {
1156 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */
1157 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {
1161 /* If it's not marked for deletion, push it to cleanedSegments. */
1163 cleanedSegments.push(segment);
1167 // This is our final, cleaned, base article URL.
1168 return window.location.protocol + "//" + window.location.host + cleanedSegments.reverse().join("/");
1172 * Look for any paging links that may occur within the document.
1175 * @return object (array)
1177 findNextPageLink: function (elem) {
1178 var possiblePages = {},
1179 allLinks = elem.getElementsByTagName('a'),
1180 articleBaseUrl = readability.findBaseUrl();
1183 * Loop through all links, looking for hints that they may be next-page links.
1184 * Things like having "page" in their textContent, className or id, or being a child
1185 * of a node with a page-y className or id.
1187 * Also possible: levenshtein distance? longest common subsequence?
1189 * After we do that, assign each page a score, and
1191 for(var i = 0, il = allLinks.length; i < il; i+=1) {
1192 var link = allLinks[i],
1193 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');
1195 /* If we've already seen this page, ignore it */
1196 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) {
1200 /* If it's on a different domain, skip it. */
1201 if(window.location.host !== linkHref.split(/\/+/g)[1]) {
1205 var linkText = readability.getInnerText(link);
1207 /* If the linkText looks like it's not the next page, skip it. */
1208 if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) {
1212 /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */
1213 var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
1214 if(!linkHrefLeftover.match(/\d/)) {
1218 if(!(linkHref in possiblePages)) {
1219 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
1221 possiblePages[linkHref].linkText += ' | ' + linkText;
1224 var linkObj = possiblePages[linkHref];
1227 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.
1228 * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
1230 if(linkHref.indexOf(articleBaseUrl) !== 0) {
1231 linkObj.score -= 25;
1234 var linkData = linkText + ' ' + link.className + ' ' + link.id;
1235 if(linkData.match(readability.regexps.nextLink)) {
1236 linkObj.score += 50;
1238 if(linkData.match(/pag(e|ing|inat)/i)) {
1239 linkObj.score += 25;
1241 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,
1242 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */
1243 if(!linkObj.linkText.match(readability.regexps.nextLink)) {
1244 linkObj.score -= 65;
1247 if(linkData.match(readability.regexps.negative) || linkData.match(readability.regexps.extraneous)) {
1248 linkObj.score -= 50;
1250 if(linkData.match(readability.regexps.prevLink)) {
1251 linkObj.score -= 200;
1254 /* If a parentNode contains page or paging or paginat */
1255 var parentNode = link.parentNode,
1256 positiveNodeMatch = false,
1257 negativeNodeMatch = false;
1259 var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
1260 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
1261 positiveNodeMatch = true;
1262 linkObj.score += 25;
1264 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(readability.regexps.negative)) {
1265 /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */
1266 if(!parentNodeClassAndId.match(readability.regexps.positive)) {
1267 linkObj.score -= 25;
1268 negativeNodeMatch = true;
1272 parentNode = parentNode.parentNode;
1276 * If the URL looks like it has paging in it, add to the score.
1277 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
1279 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) {
1280 linkObj.score += 25;
1283 /* If the URL contains negative values, give a slight decrease. */
1284 if (linkHref.match(readability.regexps.extraneous)) {
1285 linkObj.score -= 15;
1289 * Minor punishment to anything that doesn't match our current URL.
1290 * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
1291 * Dan, can you show me a counterexample where this is necessary?
1292 * if (linkHref.indexOf(window.location.href) !== 0) {
1293 * linkObj.score -= 1;
1298 * If the link text can be parsed as a number, give it a minor bonus, with a slight
1299 * bias towards lower numbered pages. This is so that pages that might not have 'next'
1300 * in their text can still get scored, and sorted properly by score.
1302 var linkTextAsNumber = parseInt(linkText, 10);
1303 if(linkTextAsNumber) {
1304 // Punish 1 since we're either already there, or it's probably before what we want anyways.
1305 if (linkTextAsNumber === 1) {
1306 linkObj.score -= 10;
1309 // Todo: Describe this better
1310 linkObj.score += Math.max(0, 10 - linkTextAsNumber);
1316 * Loop thrugh all of our possible pages from above and find our top candidate for the next page URL.
1317 * Require at least a score of 50, which is a relatively high confidence that this page is the next link.
1320 for(var page in possiblePages) {
1321 if(possiblePages.hasOwnProperty(page)) {
1322 if(possiblePages[page].score >= 50 && (!topPage || topPage.score < possiblePages[page].score)) {
1323 topPage = possiblePages[page];
1329 var nextHref = topPage.href.replace(/\/$/,'');
1331 dbg('NEXT PAGE IS ' + nextHref);
1332 readability.parsedPages[nextHref] = true;
1341 * Build a simple cross browser compatible XHR.
1343 * TODO: This could likely be simplified beyond what we have here right now. There's still a bit of excess junk.
1346 if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) {
1347 return new XMLHttpRequest();
1350 try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { }
1351 try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { }
1352 try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { }
1358 successfulRequest: function (request) {
1359 return (request.status >= 200 && request.status < 300) || request.status === 304 || (request.status === 0 && request.responseText);
1362 ajax: function (url, options) {
1363 var request = readability.xhr();
1365 function respondToReadyState(readyState) {
1366 if (request.readyState === 4) {
1367 if (readability.successfulRequest(request)) {
1368 if (options.success) { options.success(request); }
1371 if (options.error) { options.error(request); }
1376 if (typeof options === 'undefined') { options = {}; }
1378 request.onreadystatechange = respondToReadyState;
1380 request.open('get', url, true);
1381 request.setRequestHeader('Accept', 'text/html');
1384 request.send(options.postBody);
1387 if (options.error) { options.error(); }
1394 * Make an AJAX request for each page and append it to the document.
1398 appendNextPage: function (nextPageLink) {
1399 readability.curPageNum+=1;
1401 var articlePage = document.createElement("DIV");
1402 articlePage.id = 'readability-page-' + readability.curPageNum;
1403 articlePage.className = 'page';
1404 articlePage.innerHTML = '<p class="page-separator" title="Page ' + readability.curPageNum + '">§</p>';
1406 document.getElementById("readability-content").appendChild(articlePage);
1408 if(readability.curPageNum > readability.maxPages) {
1409 var nextPageMarkup = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>";
1411 articlePage.innerHTML = articlePage.innerHTML + nextPageMarkup;
1416 * Now that we've built the article page DOM element, get the page content
1417 * asynchronously and load the cleaned content into the div we created for it.
1419 var replaceContent = function(pageUrl, thisPage) {
1420 readability.ajax(pageUrl, {
1421 success: function(r) {
1423 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */
1424 var eTag = r.getResponseHeader('ETag');
1426 if(eTag in readability.pageETags) {
1427 dbg("Exact duplicate page found via ETag. Aborting.");
1428 articlePage.style.display = 'none';
1431 readability.pageETags[eTag] = 1;
1435 // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
1436 var page = document.createElement("DIV");
1439 * Do some preprocessing to our HTML to make it ready for appending.
1440 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.
1441 * • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript.
1442 * • Turn all double br's into p's - was handled by prepDocument in the original view.
1443 * Maybe in the future abstract out prepDocument to work for both the original document and AJAX-added pages.
1445 var responseHtml = r.responseText.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
1446 responseHtml = responseHtml.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
1447 responseHtml = responseHtml.replace(/\uffff/g,'\n').replace(/<(\/?)noscript/gi, '<$1div');
1448 responseHtml = responseHtml.replace(readability.regexps.replaceBrs, '</p><p>');
1449 responseHtml = responseHtml.replace(readability.regexps.replaceFonts, '<$1span>');
1451 page.innerHTML = responseHtml;
1454 * Reset all flags for the next page, as they will search through it and disable as necessary at the end of grabArticle.
1456 readability.flags = 0x1 | 0x2 | 0x4;
1458 var nextPageLink = readability.findNextPageLink(page),
1459 content = readability.grabArticle(page);
1462 dbg("No content found in page to append. Aborting.");
1467 * Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
1468 * Compare it against all of the the previous document's we've gotten. If the previous
1469 * document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
1471 var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
1472 if(firstP && firstP.innerHTML.length > 100) {
1473 for(var i=1; i <= readability.curPageNum; i+=1) {
1474 var rPage = document.getElementById('readability-page-' + i);
1475 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
1476 dbg('Duplicate of page ' + i + ' - skipping.');
1477 articlePage.style.display = 'none';
1478 readability.parsedPages[pageUrl] = true;
1484 readability.removeScripts(content);
1486 thisPage.innerHTML = thisPage.innerHTML + content.innerHTML;
1489 * After the page has rendered, post process the content. This delay is necessary because,
1490 * in webkit at least, offsetWidth is not set in time to determine image width. We have to
1491 * wait a little bit for reflow to finish before we can fix floating images.
1494 function() { readability.postProcessContent(thisPage); },
1499 readability.appendNextPage(nextPageLink);
1503 }(nextPageLink, articlePage);
1507 * Get an element's class/id weight. Uses regular expressions to tell if this
1508 * element looks good or bad.
1511 * @return number (Integer)
1513 getClassWeight: function (e) {
1514 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
1520 /* Look for a special classname */
1521 if (typeof(e.className) === 'string' && e.className !== '')
1523 if(e.className.search(readability.regexps.negative) !== -1) {
1526 if(e.className.search(readability.regexps.positive) !== -1) {
1530 /* Look for a special ID */
1531 if (typeof(e.id) === 'string' && e.id !== '')
1533 if(e.id.search(readability.regexps.negative) !== -1) {
1536 if(e.id.search(readability.regexps.positive) !== -1) {
1543 nodeIsVisible: function (node) {
1544 return (node.offsetWidth !== 0 || node.offsetHeight !== 0) && node.style.display.toLowerCase() !== 'none';
1548 * Remove extraneous break tags from a node.
1553 killBreaks: function (e) {
1555 e.innerHTML = e.innerHTML.replace(readability.regexps.killBreaks,'<br />');
1558 dbg("KillBreaks failed - this is an IE bug. Ignoring.: " + eBreaks);
1563 * Clean a node of all elements of type "tag".
1564 * (Unless it's a youtube/vimeo video. People love movies.)
1567 * @param string tag to clean
1570 clean: function (e, tag) {
1571 var targetList = e.getElementsByTagName( tag );
1572 var isEmbed = (tag === 'object' || tag === 'embed');
1574 for (var y=targetList.length-1; y >= 0; y-=1) {
1575 /* Allow youtube and vimeo videos through as people usually want to see those. */
1577 var attributeValues = "";
1578 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {
1579 attributeValues += targetList[y].attributes[i].value + '|';
1582 /* First, check the elements attributes to see if any of them contain youtube or vimeo */
1583 if (attributeValues.search(readability.regexps.videos) !== -1) {
1587 /* Then check the elements inside this element for the same. */
1588 if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) {
1594 targetList[y].parentNode.removeChild(targetList[y]);
1599 * Clean an element of all tags of type "tag" if they look fishy.
1600 * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
1604 cleanConditionally: function (e, tag) {
1606 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
1610 var tagsList = e.getElementsByTagName(tag);
1611 var curTagsLength = tagsList.length;
1614 * Gather counts for other typical elements embedded within.
1615 * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
1617 * TODO: Consider taking into account original contentScore here.
1619 for (var i=curTagsLength-1; i >= 0; i-=1) {
1620 var weight = readability.getClassWeight(tagsList[i]);
1621 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;
1623 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));
1625 if(weight+contentScore < 0)
1627 tagsList[i].parentNode.removeChild(tagsList[i]);
1629 else if ( readability.getCharCount(tagsList[i],',') < 10) {
1631 * If there are not very many commas, and the number of
1632 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
1634 var p = tagsList[i].getElementsByTagName("p").length;
1635 var img = tagsList[i].getElementsByTagName("img").length;
1636 var li = tagsList[i].getElementsByTagName("li").length-100;
1637 var input = tagsList[i].getElementsByTagName("input").length;
1640 var embeds = tagsList[i].getElementsByTagName("embed");
1641 for(var ei=0,il=embeds.length; ei < il; ei+=1) {
1642 if (embeds[ei].src.search(readability.regexps.videos) === -1) {
1647 var linkDensity = readability.getLinkDensity(tagsList[i]);
1648 var contentLength = readability.getInnerText(tagsList[i]).length;
1649 var toRemove = false;
1653 } else if(li > p && tag !== "ul" && tag !== "ol") {
1655 } else if( input > Math.floor(p/3) ) {
1657 } else if(contentLength < 25 && (img === 0 || img > 2) ) {
1659 } else if(weight < 25 && linkDensity > 0.2) {
1661 } else if(weight >= 25 && linkDensity > 0.5) {
1663 } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) {
1668 tagsList[i].parentNode.removeChild(tagsList[i]);
1675 * Clean out spurious headers from an Element. Checks things like classnames and link density.
1680 cleanHeaders: function (e) {
1681 for (var headerIndex = 1; headerIndex < 3; headerIndex+=1) {
1682 var headers = e.getElementsByTagName('h' + headerIndex);
1683 for (var i=headers.length-1; i >=0; i-=1) {
1684 if (readability.getClassWeight(headers[i]) < 0 || readability.getLinkDensity(headers[i]) > 0.33) {
1685 headers[i].parentNode.removeChild(headers[i]);
1691 /*** Smooth scrolling logic ***/
1694 * easeInOut animation algorithm - returns an integer that says how far to move at this point in the animation.
1695 * Borrowed from jQuery's easing library.
1698 easeInOut: function(start,end,totalSteps,actualStep) {
1699 var delta = end - start;
1701 if ((actualStep/=totalSteps/2) < 1) {
1702 return delta/2*actualStep*actualStep + start;
1705 return -delta/2 * ((actualStep)*(actualStep-2) - 1) + start;
1709 * Helper function to get or set the current scroll offset of the document in a cross-browser compatible way.
1710 * @return mixed integer on get, the result of window.scrollTo on set
1712 scrollTop: function(scroll){
1713 var setScroll = typeof scroll !== 'undefined';
1716 return window.scrollTo(0, scroll);
1718 if(typeof window.pageYOffset !== 'undefined') {
1719 return window.pageYOffset;
1721 else if(document.documentElement.clientHeight) {
1722 return document.documentElement.scrollTop;
1725 return document.body.scrollTop;
1730 * scrollTo - Smooth scroll to the point of scrollEnd in the document.
1734 scrollTo: function (scrollStart, scrollEnd, steps, interval) {
1736 (scrollStart < scrollEnd && readability.scrollTop() < scrollEnd) ||
1737 (scrollStart > scrollEnd && readability.scrollTop() > scrollEnd)
1739 readability.curScrollStep+=1;
1740 if(readability.curScrollStep > steps) {
1744 var oldScrollTop = readability.scrollTop();
1746 readability.scrollTop(readability.easeInOut(scrollStart, scrollEnd, steps, readability.curScrollStep));
1748 // We're at the end of the window.
1749 if(oldScrollTop === readability.scrollTop()) {
1753 window.setTimeout(function() {
1754 readability.scrollTo(scrollStart, scrollEnd, steps, interval);
1761 * Show the email popup.
1765 emailBox: function () {
1766 var emailContainerExists = document.getElementById('email-container');
1767 if(null !== emailContainerExists)
1772 var emailContainer = document.createElement("DIV");
1773 emailContainer.setAttribute('id', 'email-container');
1774 emailContainer.innerHTML = '<iframe src="'+readability.emailSrc + '?pageUrl='+encodeURIComponent(window.location)+'&pageTitle='+encodeURIComponent(document.title)+'" scrolling="no" onload="readability.removeFrame()" style="width:500px; height: 490px; border: 0;"></iframe>';
1776 document.body.appendChild(emailContainer);
1780 * Close the email popup. This is a hacktackular way to check if we're in a "close loop".
1781 * Since we don't have crossdomain access to the frame, we can only know when it has
1782 * loaded again. If it's loaded over 3 times, we know to close the frame.
1786 removeFrame: function () {
1787 readability.iframeLoads+=1;
1788 if (readability.iframeLoads > 3)
1790 var emailContainer = document.getElementById('email-container');
1791 if (null !== emailContainer) {
1792 emailContainer.parentNode.removeChild(emailContainer);
1795 readability.iframeLoads = 0;
1799 htmlspecialchars: function (s) {
1800 if (typeof(s) === "string") {
1801 s = s.replace(/&/g, "&");
1802 s = s.replace(/"/g, """);
1803 s = s.replace(/'/g, "'");
1804 s = s.replace(/</g, "<");
1805 s = s.replace(/>/g, ">");
1811 flagIsActive: function(flag) {
1812 return (readability.flags & flag) > 0;
1815 addFlag: function(flag) {
1816 readability.flags = readability.flags | flag;
1819 removeFlag: function(flag) {
1820 readability.flags = readability.flags & ~flag;