| OLD | NEW |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 // Local modifications to this file are described in the README.chromium |
| 6 // file. |
| 1 | 7 |
| 2 var dbg = (typeof console !== 'undefined') ? function(s) { | 8 var dbg = (typeof console !== 'undefined') ? function(s) { |
| 3 console.log("Readability: " + s); | 9 console.log("Readability: " + s); |
| 4 } : function() {}; | 10 } : function() {}; |
| 5 | 11 |
| 6 /* | 12 /* |
| 7 * Readability. An Arc90 Lab Experiment. | 13 * Readability. An Arc90 Lab Experiment. |
| 8 * Website: http://lab.arc90.com/experiments/readability | 14 * Website: http://lab.arc90.com/experiments/readability |
| 9 * Source: http://code.google.com/p/arc90labs-readability | 15 * Source: http://code.google.com/p/arc90labs-readability |
| 10 * | 16 * |
| 11 * "Readability" is a trademark of Arc90 Inc and may not be used without explici
t permission. | 17 * "Readability" is a trademark of Arc90 Inc and may not be used without explici
t permission. |
| 12 * | 18 * |
| 13 * Copyright (c) 2010 Arc90 Inc | 19 * Copyright (c) 2010 Arc90 Inc |
| 14 * Readability is licensed under the Apache License, Version 2.0. | 20 * Readability is licensed under the Apache License, Version 2.0. |
| 15 **/ | 21 **/ |
| 16 var readability = { | 22 var readability = { |
| 17 readStyle: "style-newspaper", | 23 readStyle: "style-newspaper", |
| 18 readSize: "size-medium", | 24 readSize: "size-medium", |
| 19 readMargin: "margin-wide", | 25 readMargin: "margin-wide", |
| 20 | 26 |
| 21 distilledHTML: '', | 27 distilledHTML: '', |
| 22 distilledArticleContent: null, | 28 distilledArticleContent: null, |
| 29 nextPageLink: '', |
| 23 | 30 |
| 24 version: '1.7.1', | 31 version: '1.7.1', |
| 25 iframeLoads: 0, | 32 iframeLoads: 0, |
| 26 convertLinksToFootnotes: false, | 33 convertLinksToFootnotes: false, |
| 27 reversePageScroll: false, /* If they hold shift and hit space, scroll
up */ | 34 reversePageScroll: false, /* If they hold shift and hit space, scroll
up */ |
| 28 frameHack: false, /** | 35 frameHack: false, /** |
| 29 * The frame hack is to workaround a firefo
x bug where if you | 36 * The frame hack is to workaround a firefo
x bug where if you |
| 30 * pull content out of a frame and stick it
into the parent element, the scrollbar won't appear. | 37 * pull content out of a frame and stick it
into the parent element, the scrollbar won't appear. |
| 31 * So we fake a scrollbar in the wrapping d
iv. | 38 * So we fake a scrollbar in the wrapping d
iv. |
| 32 **/ | 39 **/ |
| 33 biggestFrame: false, | 40 biggestFrame: false, |
| 34 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */ | 41 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */ |
| 35 | 42 |
| 36 /* constants */ | 43 /* constants */ |
| 37 FLAG_STRIP_UNLIKELYS: 0x1, | 44 FLAG_STRIP_UNLIKELYS: 0x1, |
| 38 FLAG_WEIGHT_CLASSES: 0x2, | 45 FLAG_WEIGHT_CLASSES: 0x2, |
| 39 FLAG_CLEAN_CONDITIONALLY: 0x4, | 46 FLAG_CLEAN_CONDITIONALLY: 0x4, |
| 40 | 47 |
| 41 maxPages: 30, /* The maximum number of pages to loop through before we ca
ll it quits and just show a link. */ | 48 maxPages: 30, /* The maximum number of pages to loop through before we ca
ll it quits and just show a link. */ |
| 42 parsedPages: {}, /* The list of pages we've parsed in this call of readabili
ty, for autopaging. As a key store for easier searching. */ | 49 parsedPages: {}, /* The list of pages we've parsed in this call of readabili
ty, for autopaging. As a key store for easier searching. */ |
| 43 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in cas
e they happen to match, we'll know it's a duplicate. */ | 50 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in cas
e they happen to match, we'll know it's a duplicate. */ |
| 44 | 51 |
| 45 /** | 52 /** |
| 46 * All of the regular expressions in use within readability. | 53 * All of the regular expressions in use within readability. |
| 47 * Defined up here so we don't instantiate them repeatedly in loops. | 54 * Defined up here so we don't instantiate them repeatedly in loops. |
| 48 **/ | 55 **/ |
| 49 regexps: { | 56 regexps: { |
| 50 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header
|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popu
p|tweet|twitter/i, | 57 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header
|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popu
p|tweet|twitter/i, |
| 51 okMaybeItsACandidate: /and|article|body|column|main|shadow/i, | 58 okMaybeItsACandidate: /and|article|body|column|main|shadow/i, |
| 52 positive: /article|body|content|entry|hentry|main|page|pagi
nation|post|text|blog|story/i, | 59 positive: /article|body|content|entry|hentry|main|page|pagi
nation|post|text|blog|story/i, |
| 53 negative: /combx|comment|com-|contact|foot|footer|footnote|
masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopp
ing|tags|tool|widget/i, | 60 negative: /combx|comment|com-|contact|foot|footer|footnote|
masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopp
ing|tags|tool|widget/i, |
| 54 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|r
eply|all|login|sign|single/i, | 61 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|r
eply|all|login|sign|single/i, |
| 55 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, | 62 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, |
| 56 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi, | 63 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi, |
| 57 replaceFonts: /<(\/?)font[^>]*>/gi, | 64 replaceFonts: /<(\/?)font[^>]*>/gi, |
| 58 trim: /^\s+|\s+$/g, | 65 trim: /^\s+|\s+$/g, |
| 59 normalize: /\s{2,}/g, | 66 normalize: /\s{2,}/g, |
| 60 killBreaks: /(<br\s*\/?>(\s| ?)*){1,}/g, | 67 killBreaks: /(<br\s*\/?>(\s| ?)*){1,}/g, |
| 61 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, | 68 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, |
| 62 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)
\s*$/i, | 69 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)
\s*$/i, |
| 63 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
// Match: next, continue, >, >>, » but not >|, »| as those usually mean last. | 70 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
// Match: next, continue, >, >>, » but not >|, »| as those usually mean last. |
| 64 prevLink: /(prev|earl|old|new|<|«)/i | 71 prevLink: /(prev|earl|old|new|<|«)/i |
| 65 }, | 72 }, |
| 66 | 73 |
| 67 /** | 74 /** |
| 68 * Runs readability. | 75 * Runs readability. |
| 69 * | 76 * |
| 70 * Workflow: | 77 * Workflow: |
| 71 * 1. Prep the document by removing script tags, css, etc. | 78 * 1. Prep the document by removing script tags, css, etc. |
| 72 * 2. Build readability's DOM tree. | 79 * 2. Build readability's DOM tree. |
| 73 * 3. Grab the article content from the current dom tree. | 80 * 3. Grab the article content from the current dom tree. |
| 74 * 4. Replace the current DOM tree with the new one. | 81 * 4. Replace the current DOM tree with the new one. |
| 75 * 5. Read peacefully. | 82 * 5. Read peacefully. |
| 76 * | 83 * |
| 77 * @return void | 84 * @return void |
| 78 **/ | 85 **/ |
| 79 init: function() { | 86 init: function() { |
| 80 /* Before we do anything, remove all scripts that are not readability. *
/ | 87 /* Before we do anything, remove all scripts that are not readability. *
/ |
| 81 window.onload = window.onunload = function() {}; | 88 window.onload = window.onunload = function() {}; |
| 82 | 89 |
| 83 readability.removeScripts(document); | 90 readability.removeScripts(document); |
| 84 | 91 |
| 85 /* Make sure this document is added to the list of parsed pages first, s
o we don't double up on the first page */ | 92 /* Make sure this document is added to the list of parsed pages first, s
o we don't double up on the first page */ |
| 86 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; | 93 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; |
| 87 | 94 |
| 88 /* Pull out any possible next page link first */ | 95 /* Pull out any possible next page link first */ |
| 89 var nextPageLink = readability.findNextPageLink(document.body); | 96 readability.nextPageLink = readability.findNextPageLink(document.body); |
| 90 | 97 |
| 98 /* Next-page processing is handled from C++, so set the local nextPageLink to null. */ |
| 99 var nextPageLink = null; |
| 100 |
| 91 readability.prepDocument(); | 101 readability.prepDocument(); |
| 92 | 102 |
| 93 /* Build readability's DOM tree */ | 103 /* Build readability's DOM tree */ |
| 94 var overlay = document.createElement("DIV"); | 104 var overlay = document.createElement("DIV"); |
| 95 var innerDiv = document.createElement("DIV"); | 105 var innerDiv = document.createElement("DIV"); |
| 96 var articleTools = readability.getArticleTools(); | 106 var articleTools = readability.getArticleTools(); |
| 97 var articleTitleText = readability.getArticleTitle(); | 107 var articleTitleText = readability.getArticleTitle(); |
| 98 var articleContent = readability.grabArticle(); | 108 var articleContent = readability.grabArticle(); |
| 99 | 109 |
| 100 if(!articleContent) { | 110 if(!articleContent) { |
| (...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 145 rootWarning.innerHTML = "<em>Readability</em> was intended for u
se on individual articles and not home pages. " + | 155 rootWarning.innerHTML = "<em>Readability</em> was intended for u
se on individual articles and not home pages. " + |
| 146 "If you'd like to try rendering this page anyway, <a onClick='ja
vascript:document.getElementById(\"readability-warning\").style.display=\"none\"
;document.getElementById(\"readability-content\").style.display=\"block\";'>clic
k here</a> to continue."; | 156 "If you'd like to try rendering this page anyway, <a onClick='ja
vascript:document.getElementById(\"readability-warning\").style.display=\"none\"
;document.getElementById(\"readability-content\").style.display=\"block\";'>clic
k here</a> to continue."; |
| 147 | 157 |
| 148 innerDiv.insertBefore( rootWarning, articleContent ); | 158 innerDiv.insertBefore( rootWarning, articleContent ); |
| 149 } | 159 } |
| 150 | 160 |
| 151 readability.postProcessContent(articleContent); | 161 readability.postProcessContent(articleContent); |
| 152 | 162 |
| 153 window.scrollTo(0, 0); | 163 window.scrollTo(0, 0); |
| 154 | 164 |
| 155 // TODO(bengr): Remove this assignment of null to nextPageLink when | |
| 156 // the processing of the next page link is safe. | |
| 157 nextPageLink = null; | |
| 158 | |
| 159 if (nextPageLink) { | 165 if (nextPageLink) { |
| 160 /** | 166 /** |
| 161 * Append any additional pages after a small timeout so that people | 167 * Append any additional pages after a small timeout so that people |
| 162 * can start reading without having to wait for this to finish proce
ssing. | 168 * can start reading without having to wait for this to finish proce
ssing. |
| 163 **/ | 169 **/ |
| 164 window.setTimeout(function() { | 170 window.setTimeout(function() { |
| 165 readability.appendNextPage(nextPageLink); | 171 readability.appendNextPage(nextPageLink); |
| 166 }, 500); | 172 }, 500); |
| 167 } | 173 } |
| 168 | 174 |
| 169 /** Smooth scrolling **/ | 175 /** Smooth scrolling **/ |
| 170 document.onkeydown = function(e) { | 176 document.onkeydown = function(e) { |
| 171 var code = (window.event) ? event.keyCode : e.keyCode; | 177 var code = (window.event) ? event.keyCode : e.keyCode; |
| 172 if (code === 16) { | 178 if (code === 16) { |
| 173 readability.reversePageScroll = true; | 179 readability.reversePageScroll = true; |
| 174 return; | 180 return; |
| 175 } | 181 } |
| 176 | 182 |
| 177 if (code === 32) { | 183 if (code === 32) { |
| 178 readability.curScrollStep = 0; | 184 readability.curScrollStep = 0; |
| 179 var windowHeight = window.innerHeight ? window.innerHeight : (do
cument.documentElement.clientHeight ? document.documentElement.clientHeight : do
cument.body.clientHeight); | 185 var windowHeight = window.innerHeight ? window.innerHeight : (do
cument.documentElement.clientHeight ? document.documentElement.clientHeight : do
cument.body.clientHeight); |
| 180 | 186 |
| 181 if(readability.reversePageScroll) { | 187 if(readability.reversePageScroll) { |
| 182 readability.scrollTo(readability.scrollTop(), readability.sc
rollTop() - (windowHeight - 50), 20, 10); | 188 readability.scrollTo(readability.scrollTop(), readability.sc
rollTop() - (windowHeight - 50), 20, 10); |
| 183 } | 189 } |
| 184 else { | 190 else { |
| 185 readability.scrollTo(readability.scrollTop(), readability.sc
rollTop() + (windowHeight - 50), 20, 10); | 191 readability.scrollTo(readability.scrollTop(), readability.sc
rollTop() + (windowHeight - 50), 20, 10); |
| 186 } | 192 } |
| 187 | 193 |
| 188 return false; | 194 return false; |
| 189 } | 195 } |
| 190 }; | 196 }; |
| 191 | 197 |
| 192 document.onkeyup = function(e) { | 198 document.onkeyup = function(e) { |
| 193 var code = (window.event) ? event.keyCode : e.keyCode; | 199 var code = (window.event) ? event.keyCode : e.keyCode; |
| 194 if (code === 16) { | 200 if (code === 16) { |
| 195 readability.reversePageScroll = false; | 201 readability.reversePageScroll = false; |
| 196 return; | 202 return; |
| 197 } | 203 } |
| 198 }; | 204 }; |
| 199 }, | 205 }, |
| 200 | 206 |
| 201 /** | 207 /** |
| 202 * Run any post-process modifications to article content as necessary. | 208 * Run any post-process modifications to article content as necessary. |
| 203 * | 209 * |
| 204 * @param Element | 210 * @param Element |
| 205 * @return void | 211 * @return void |
| 206 **/ | 212 **/ |
| 207 postProcessContent: function(articleContent) { | 213 postProcessContent: function(articleContent) { |
| 208 if(readability.convertLinksToFootnotes && !window.location.href.match(/w
ikipedia\.org/g)) { | 214 if(readability.convertLinksToFootnotes && !window.location.href.match(/w
ikipedia\.org/g)) { |
| 209 readability.addFootnotes(articleContent); | 215 readability.addFootnotes(articleContent); |
| 210 } | 216 } |
| 211 | 217 |
| 212 readability.fixImageFloats(articleContent); | 218 readability.fixImageFloats(articleContent); |
| 213 }, | 219 }, |
| 214 | 220 |
| 215 /** | 221 /** |
| 216 * Some content ends up looking ugly if the image is too large to be floated
. | 222 * Some content ends up looking ugly if the image is too large to be floated
. |
| 217 * If the image is wider than a threshold (currently 55%), no longer float i
t, | 223 * If the image is wider than a threshold (currently 55%), no longer float i
t, |
| 218 * center it instead. | 224 * center it instead. |
| 219 * | 225 * |
| 220 * @param Element | 226 * @param Element |
| 221 * @return void | 227 * @return void |
| 222 **/ | 228 **/ |
| 223 fixImageFloats: function (articleContent) { | 229 fixImageFloats: function (articleContent) { |
| 224 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.
55, | 230 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.
55, |
| 225 images = articleContent.getElementsByTagName('img'); | 231 images = articleContent.getElementsByTagName('img'); |
| 226 | 232 |
| 227 for(var i=0, il = images.length; i < il; i+=1) { | 233 for(var i=0, il = images.length; i < il; i+=1) { |
| 228 var image = images[i]; | 234 var image = images[i]; |
| 229 | 235 |
| 230 if(image.offsetWidth > imageWidthThreshold) { | 236 if(image.offsetWidth > imageWidthThreshold) { |
| 231 image.className += " blockImage"; | 237 image.className += " blockImage"; |
| 232 } | 238 } |
| 233 } | 239 } |
| 234 }, | 240 }, |
| 235 | 241 |
| 236 /** | 242 /** |
| 237 * Get the article tools Element that has buttons like reload, print. | 243 * Get the article tools Element that has buttons like reload, print. |
| 238 * | 244 * |
| 239 * @return Element | 245 * @return Element |
| 240 **/ | 246 **/ |
| 241 getArticleTools: function () { | 247 getArticleTools: function () { |
| 242 var articleTools = document.createElement("DIV"); | 248 var articleTools = document.createElement("DIV"); |
| 243 | 249 |
| 244 articleTools.id = "readTools"; | 250 articleTools.id = "readTools"; |
| 245 articleTools.innerHTML = | 251 articleTools.innerHTML = |
| 246 "<a href='#' onclick='return window.location.reload()' title='Reload
original page' id='reload-page'>Reload Original Page</a>" + | 252 "<a href='#' onclick='return window.location.reload()' title='Reload
original page' id='reload-page'>Reload Original Page</a>" + |
| 247 "<a href='#' onclick='javascript:window.print();' title='Print page'
id='print-page'>Print Page</a>" + | 253 "<a href='#' onclick='javascript:window.print();' title='Print page'
id='print-page'>Print Page</a>" + |
| 248 "<a href='#' onclick='readability.emailBox(); return false;' title='
Email page' id='email-page'>Email Page</a>"; | 254 "<a href='#' onclick='readability.emailBox(); return false;' title='
Email page' id='email-page'>Email Page</a>"; |
| 249 | 255 |
| 250 return articleTools; | 256 return articleTools; |
| 251 }, | 257 }, |
| 252 | 258 |
| 253 /** | 259 /** |
| 254 * Returns the suggested direction of the string | 260 * Returns the suggested direction of the string |
| 255 * | 261 * |
| 256 * @return "rtl" || "ltr" | 262 * @return "rtl" || "ltr" |
| 257 **/ | 263 **/ |
| 258 getSuggestedDirection: function(text) { | 264 getSuggestedDirection: function(text) { |
| 259 function sanitizeText() { | 265 function sanitizeText() { |
| 260 return text.replace(/@\w+/, ""); | 266 return text.replace(/@\w+/, ""); |
| 261 } | 267 } |
| 262 | 268 |
| 263 function countMatches(match) { | 269 function countMatches(match) { |
| 264 var matches = text.match(new RegExp(match, "g")); | 270 var matches = text.match(new RegExp(match, "g")); |
| 265 return matches !== null ? matches.length : 0; | 271 return matches !== null ? matches.length : 0; |
| 266 } | 272 } |
| 267 | 273 |
| 268 function isRTL() { | 274 function isRTL() { |
| 269 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); | 275 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); |
| 270 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); | 276 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); |
| 271 | 277 |
| 272 // if more than 20% of chars are Hebrew or Arabic then direction is rtl | 278 // if more than 20% of chars are Hebrew or Arabic then direction is rtl |
| 273 return (count_heb + count_arb) * 100 / text.length > 20; | 279 return (count_heb + count_arb) * 100 / text.length > 20; |
| 274 } | 280 } |
| 275 | 281 |
| 276 text = sanitizeText(text); | 282 text = sanitizeText(text); |
| 277 return isRTL() ? "rtl" : "ltr"; | 283 return isRTL() ? "rtl" : "ltr"; |
| 278 }, | 284 }, |
| 279 | 285 |
| 280 /** | 286 /** |
| 281 * Get the article title as a string. | 287 * Get the article title as a string. |
| 282 * | 288 * |
| 283 * @return string | 289 * @return string |
| 284 **/ | 290 **/ |
| 285 getArticleTitle: function () { | 291 getArticleTitle: function () { |
| 286 var curTitle = "", | 292 var curTitle = "", |
| 287 origTitle = ""; | 293 origTitle = ""; |
| 288 | 294 |
| 289 try { | 295 try { |
| 290 curTitle = origTitle = document.title; | 296 curTitle = origTitle = document.title; |
| 291 if(typeof curTitle !== "string") { /* If they had an element with id
"title" in their HTML */ | 297 if(typeof curTitle !== "string") { /* If they had an element with id
"title" in their HTML */ |
| 292 curTitle = origTitle = readability.getInnerText(document.getElem
entsByTagName('title')[0]); | 298 curTitle = origTitle = readability.getInnerText(document.getElem
entsByTagName('title')[0]); |
| 293 } | 299 } |
| 294 } | 300 } |
| 295 catch(e) {} | 301 catch(e) {} |
| 296 | 302 |
| 297 if(curTitle.match(/ [\|\-] /)) | 303 if(curTitle.match(/ [\|\-] /)) |
| 298 { | 304 { |
| 299 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); | 305 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); |
| 300 | 306 |
| 301 if(curTitle.split(' ').length < 3) { | 307 if(curTitle.split(' ').length < 3) { |
| 302 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); | 308 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); |
| 303 } | 309 } |
| 304 } | 310 } |
| 305 else if(curTitle.indexOf(': ') !== -1) | 311 else if(curTitle.indexOf(': ') !== -1) |
| 306 { | 312 { |
| 307 curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); | 313 curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); |
| 308 | 314 |
| 309 if(curTitle.split(' ').length < 3) { | 315 if(curTitle.split(' ').length < 3) { |
| 310 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); | 316 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); |
| (...skipping 12 matching lines...) Expand all Loading... |
| 323 | 329 |
| 324 if(curTitle.split(' ').length <= 4) { | 330 if(curTitle.split(' ').length <= 4) { |
| 325 curTitle = origTitle; | 331 curTitle = origTitle; |
| 326 } | 332 } |
| 327 return curTitle; | 333 return curTitle; |
| 328 }, | 334 }, |
| 329 | 335 |
| 330 /** | 336 /** |
| 331 * Prepare the HTML document for readability to scrape it. | 337 * Prepare the HTML document for readability to scrape it. |
| 332 * This includes things like stripping javascript, CSS, and handling terribl
e markup. | 338 * This includes things like stripping javascript, CSS, and handling terribl
e markup. |
| 333 * | 339 * |
| 334 * @return void | 340 * @return void |
| 335 **/ | 341 **/ |
| 336 prepDocument: function () { | 342 prepDocument: function () { |
| 337 /** | 343 /** |
| 338 * In some cases a body element can't be found (if the HTML is totally h
osed for example) | 344 * In some cases a body element can't be found (if the HTML is totally h
osed for example) |
| 339 * so we create a new body node and append it to the document. | 345 * so we create a new body node and append it to the document. |
| 340 */ | 346 */ |
| 341 if(document.body === null) | 347 if(document.body === null) |
| 342 { | 348 { |
| 343 var body = document.createElement("body"); | 349 var body = document.createElement("body"); |
| 344 try { | 350 try { |
| 345 document.body = body; | 351 document.body = body; |
| 346 } | 352 } |
| 347 catch(e) { | 353 catch(e) { |
| 348 document.documentElement.appendChild(body); | 354 document.documentElement.appendChild(body); |
| 349 dbg(e); | 355 dbg(e); |
| 350 } | 356 } |
| 351 } | 357 } |
| 352 | 358 |
| 353 document.body.id = "readabilityBody"; | 359 document.body.id = "readabilityBody"; |
| 354 | 360 |
| 355 var frames = document.getElementsByTagName('frame'); | 361 var frames = document.getElementsByTagName('frame'); |
| (...skipping 11 matching lines...) Expand all Loading... |
| 367 canAccessFrame = true; | 373 canAccessFrame = true; |
| 368 } | 374 } |
| 369 catch(eFrames) { | 375 catch(eFrames) { |
| 370 dbg(eFrames); | 376 dbg(eFrames); |
| 371 } | 377 } |
| 372 | 378 |
| 373 if(frameSize > biggestFrameSize) { | 379 if(frameSize > biggestFrameSize) { |
| 374 biggestFrameSize = frameSize; | 380 biggestFrameSize = frameSize; |
| 375 readability.biggestFrame = frames[frameIndex]; | 381 readability.biggestFrame = frames[frameIndex]; |
| 376 } | 382 } |
| 377 | 383 |
| 378 if(canAccessFrame && frameSize > bestFrameSize) | 384 if(canAccessFrame && frameSize > bestFrameSize) |
| 379 { | 385 { |
| 380 readability.frameHack = true; | 386 readability.frameHack = true; |
| 381 | 387 |
| 382 bestFrame = frames[frameIndex]; | 388 bestFrame = frames[frameIndex]; |
| 383 bestFrameSize = frameSize; | 389 bestFrameSize = frameSize; |
| 384 } | 390 } |
| 385 } | 391 } |
| 386 | 392 |
| 387 if(bestFrame) | 393 if(bestFrame) |
| 388 { | 394 { |
| 389 var newBody = document.createElement('body'); | 395 var newBody = document.createElement('body'); |
| 390 readability.moveNodeInnards(bestFrame.contentWindow.document.bod
y, newBody); | 396 readability.moveNodeInnards(bestFrame.contentWindow.document.bod
y, newBody); |
| 391 newBody.style.overflow = 'scroll'; | 397 newBody.style.overflow = 'scroll'; |
| 392 document.body = newBody; | 398 document.body = newBody; |
| 393 | 399 |
| 394 var frameset = document.getElementsByTagName('frameset')[0]; | 400 var frameset = document.getElementsByTagName('frameset')[0]; |
| 395 if(frameset) { | 401 if(frameset) { |
| 396 frameset.parentNode.removeChild(frameset); } | 402 frameset.parentNode.removeChild(frameset); } |
| 397 } | 403 } |
| 398 } | 404 } |
| 399 | 405 |
| 400 /* Remove all stylesheets */ | 406 /* Remove all stylesheets */ |
| 401 for (var k=0;k < document.styleSheets.length; k+=1) { | 407 for (var k=0;k < document.styleSheets.length; k+=1) { |
| 402 if (document.styleSheets[k].href !== null && document.styleSheets[k]
.href.lastIndexOf("readability") === -1) { | 408 if (document.styleSheets[k].href !== null && document.styleSheets[k]
.href.lastIndexOf("readability") === -1) { |
| 403 document.styleSheets[k].disabled = true; | 409 document.styleSheets[k].disabled = true; |
| (...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 448 readability.cleanConditionally(articleContent, "table"); | 454 readability.cleanConditionally(articleContent, "table"); |
| 449 readability.cleanConditionally(articleContent, "ul"); | 455 readability.cleanConditionally(articleContent, "ul"); |
| 450 readability.cleanConditionally(articleContent, "div"); | 456 readability.cleanConditionally(articleContent, "div"); |
| 451 | 457 |
| 452 /* Remove extra paragraphs */ | 458 /* Remove extra paragraphs */ |
| 453 var articleParagraphs = articleContent.getElementsByTagName('p'); | 459 var articleParagraphs = articleContent.getElementsByTagName('p'); |
| 454 for(var i = articleParagraphs.length-1; i >= 0; i-=1) { | 460 for(var i = articleParagraphs.length-1; i >= 0; i-=1) { |
| 455 var imgCount = articleParagraphs[i].getElementsByTagName('img').l
ength; | 461 var imgCount = articleParagraphs[i].getElementsByTagName('img').l
ength; |
| 456 var embedCount = articleParagraphs[i].getElementsByTagName('embed')
.length; | 462 var embedCount = articleParagraphs[i].getElementsByTagName('embed')
.length; |
| 457 var objectCount = articleParagraphs[i].getElementsByTagName('object'
).length; | 463 var objectCount = articleParagraphs[i].getElementsByTagName('object'
).length; |
| 458 | 464 |
| 459 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab
ility.getInnerText(articleParagraphs[i], false) === '') { | 465 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readab
ility.getInnerText(articleParagraphs[i], false) === '') { |
| 460 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]
); | 466 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]
); |
| 461 } | 467 } |
| 462 } | 468 } |
| 463 | 469 |
| 464 try { | 470 try { |
| 465 readability.replaceBrsWithPs(articleContent); | 471 readability.replaceBrsWithPs(articleContent); |
| 466 } | 472 } |
| 467 catch (e) { | 473 catch (e) { |
| 468 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block
-elements bug. Ignoring.: " + e); | 474 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block
-elements bug. Ignoring.: " + e); |
| 469 } | 475 } |
| 470 }, | 476 }, |
| 471 | 477 |
| 472 /** | 478 /** |
| 473 * Initialize a node with the readability object. Also checks the | 479 * Initialize a node with the readability object. Also checks the |
| 474 * className/id for special names to add to its score. | 480 * className/id for special names to add to its score. |
| 475 * | 481 * |
| 476 * @param Element | 482 * @param Element |
| 477 * @return void | 483 * @return void |
| 478 **/ | 484 **/ |
| 479 initializeNode: function (node) { | 485 initializeNode: function (node) { |
| 480 node.readability = {"contentScore": 0}; | 486 node.readability = {"contentScore": 0}; |
| 481 | 487 |
| 482 switch(node.tagName) { | 488 switch(node.tagName) { |
| 483 case 'DIV': | 489 case 'DIV': |
| 484 node.readability.contentScore += 5; | 490 node.readability.contentScore += 5; |
| 485 break; | 491 break; |
| 486 | 492 |
| 487 case 'PRE': | 493 case 'PRE': |
| 488 case 'TD': | 494 case 'TD': |
| 489 case 'BLOCKQUOTE': | 495 case 'BLOCKQUOTE': |
| 490 node.readability.contentScore += 3; | 496 node.readability.contentScore += 3; |
| 491 break; | 497 break; |
| 492 | 498 |
| 493 case 'ADDRESS': | 499 case 'ADDRESS': |
| 494 case 'OL': | 500 case 'OL': |
| 495 case 'UL': | 501 case 'UL': |
| 496 case 'DL': | 502 case 'DL': |
| 497 case 'DD': | 503 case 'DD': |
| 498 case 'DT': | 504 case 'DT': |
| 499 case 'LI': | 505 case 'LI': |
| 500 case 'FORM': | 506 case 'FORM': |
| 501 node.readability.contentScore -= 3; | 507 node.readability.contentScore -= 3; |
| 502 break; | 508 break; |
| 503 | 509 |
| 504 case 'H1': | 510 case 'H1': |
| 505 case 'H2': | 511 case 'H2': |
| 506 case 'H3': | 512 case 'H3': |
| 507 case 'H4': | 513 case 'H4': |
| 508 case 'H5': | 514 case 'H5': |
| 509 case 'H6': | 515 case 'H6': |
| 510 case 'TH': | 516 case 'TH': |
| 511 node.readability.contentScore -= 5; | 517 node.readability.contentScore -= 5; |
| 512 break; | 518 break; |
| 513 } | 519 } |
| 514 | 520 |
| 515 node.readability.contentScore += readability.getClassWeight(node); | 521 node.readability.contentScore += readability.getClassWeight(node); |
| 516 }, | 522 }, |
| 517 | 523 |
| 518 /*** | 524 /*** |
| 519 * grabArticle - Using a variety of metrics (content score, classname, eleme
nt types), find the content that is | 525 * grabArticle - Using a variety of metrics (content score, classname, eleme
nt types), find the content that is |
| 520 * most likely to be the stuff a user wants to read. Then retu
rn it wrapped up in a div. | 526 * most likely to be the stuff a user wants to read. Then retu
rn it wrapped up in a div. |
| 521 * | 527 * |
| 522 * @param page a document to run upon. Needs to be a full document, complete
with body. | 528 * @param page a document to run upon. Needs to be a full document, complete
with body. |
| 523 * @return Element | 529 * @return Element |
| 524 **/ | 530 **/ |
| 525 grabArticle: function (pageToClone) { | 531 grabArticle: function (pageToClone) { |
| 526 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_
STRIP_UNLIKELYS), | 532 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_
STRIP_UNLIKELYS), |
| 527 isPaging = (page !== null) ? true: false; | 533 isPaging = (page !== null) ? true: false; |
| 528 | 534 |
| 529 var page = null; | 535 var page = null; |
| 530 // Never work on the actual page. | 536 // Never work on the actual page. |
| 531 if (isPaging) { | 537 if (isPaging) { |
| 532 page = document.body.cloneNode(true); | 538 page = document.body.cloneNode(true); |
| 533 } else { | 539 } else { |
| 534 page = pageToClone.cloneNode(true); | 540 page = pageToClone.cloneNode(true); |
| 535 } | 541 } |
| 536 | 542 |
| 537 var allElements = page.getElementsByTagName('*'); | 543 var allElements = page.getElementsByTagName('*'); |
| 538 | 544 |
| 539 /** | 545 /** |
| 540 * First, node prepping. Trash nodes that look cruddy (like ones with th
e class name "comment", etc), and turn divs | 546 * First, node prepping. Trash nodes that look cruddy (like ones with th
e class name "comment", etc), and turn divs |
| 541 * into P tags where they have been used inappropriately (as in, where t
hey contain no other block level elements.) | 547 * into P tags where they have been used inappropriately (as in, where t
hey contain no other block level elements.) |
| 542 * | 548 * |
| 543 * Note: Assignment from index for performance. See http://www.peachpit.
com/articles/article.aspx?p=31567&seqNum=5 | 549 * Note: Assignment from index for performance. See http://www.peachpit.
com/articles/article.aspx?p=31567&seqNum=5 |
| 544 * TODO: Shouldn't this be a reverse traversal? | 550 * TODO: Shouldn't this be a reverse traversal? |
| 545 **/ | 551 **/ |
| 546 var node = null; | 552 var node = null; |
| 547 var nodesToScore = []; | 553 var nodesToScore = []; |
| 548 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) { | 554 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) { |
| 549 /* Remove unlikely candidates */ | 555 /* Remove unlikely candidates */ |
| 550 if (stripUnlikelyCandidates) { | 556 if (stripUnlikelyCandidates) { |
| 551 var unlikelyMatchString = node.className + node.id; | 557 var unlikelyMatchString = node.className + node.id; |
| 552 if ( | 558 if ( |
| 553 ( | 559 ( |
| 554 unlikelyMatchString.search(readability.regexps.unlikelyC
andidates) !== -1 && | 560 unlikelyMatchString.search(readability.regexps.unlikelyC
andidates) !== -1 && |
| 555 unlikelyMatchString.search(readability.regexps.okMaybeIt
sACandidate) === -1 && | 561 unlikelyMatchString.search(readability.regexps.okMaybeIt
sACandidate) === -1 && |
| 556 node.tagName !== "BODY" | 562 node.tagName !== "BODY" |
| 557 ) | 563 ) |
| 558 ) | 564 ) |
| 559 { | 565 { |
| 560 dbg("Removing unlikely candidate - " + unlikelyMatchString); | 566 dbg("Removing unlikely candidate - " + unlikelyMatchString); |
| 561 node.parentNode.removeChild(node); | 567 node.parentNode.removeChild(node); |
| 562 nodeIndex-=1; | 568 nodeIndex-=1; |
| 563 continue; | 569 continue; |
| 564 } | 570 } |
| 565 } | 571 } |
| 566 | 572 |
| 567 if (node.tagName === "P" || node.tagName === "TD" || node.tagName ==
= "PRE") { | 573 if (node.tagName === "P" || node.tagName === "TD" || node.tagName ==
= "PRE") { |
| 568 nodesToScore[nodesToScore.length] = node; | 574 nodesToScore[nodesToScore.length] = node; |
| 569 } | 575 } |
| 570 | 576 |
| 571 /* Turn all divs that don't have children block level elements into
p's */ | 577 /* Turn all divs that don't have children block level elements into
p's */ |
| 572 if (node.tagName === "DIV") { | 578 if (node.tagName === "DIV") { |
| 573 if (node.innerHTML.search(readability.regexps.divToPElements) ==
= -1) { | 579 if (node.innerHTML.search(readability.regexps.divToPElements) ==
= -1) { |
| 574 var newNode = document.createElement('p'); | 580 var newNode = document.createElement('p'); |
| (...skipping 16 matching lines...) Expand all Loading... |
| 591 if(childNode.nodeType === 3) { // Node.TEXT_NODE | 597 if(childNode.nodeType === 3) { // Node.TEXT_NODE |
| 592 var p = document.createElement('p'); | 598 var p = document.createElement('p'); |
| 593 var t = document.createTextNode(childNode.nodeValue)
; | 599 var t = document.createTextNode(childNode.nodeValue)
; |
| 594 p.appendChild(t); | 600 p.appendChild(t); |
| 595 p.style.display = 'inline'; | 601 p.style.display = 'inline'; |
| 596 p.className = 'readability-styled'; | 602 p.className = 'readability-styled'; |
| 597 childNode.parentNode.replaceChild(p, childNode); | 603 childNode.parentNode.replaceChild(p, childNode); |
| 598 } | 604 } |
| 599 } | 605 } |
| 600 } | 606 } |
| 601 } | 607 } |
| 602 } | 608 } |
| 603 | 609 |
| 604 /** | 610 /** |
| 605 * Loop through all paragraphs, and assign a score to them based on how
content-y they look. | 611 * Loop through all paragraphs, and assign a score to them based on how
content-y they look. |
| 606 * Then add their score to their parent node. | 612 * Then add their score to their parent node. |
| 607 * | 613 * |
| 608 * A score is determined by things like number of commas, class names, e
tc. Maybe eventually link density. | 614 * A score is determined by things like number of commas, class names, e
tc. Maybe eventually link density. |
| 609 **/ | 615 **/ |
| 610 var candidates = []; | 616 var candidates = []; |
| 611 for (var pt=0; pt < nodesToScore.length; pt+=1) { | 617 for (var pt=0; pt < nodesToScore.length; pt+=1) { |
| (...skipping 21 matching lines...) Expand all Loading... |
| 633 candidates.push(grandParentNode); | 639 candidates.push(grandParentNode); |
| 634 } | 640 } |
| 635 | 641 |
| 636 var contentScore = 0; | 642 var contentScore = 0; |
| 637 | 643 |
| 638 /* Add a point for the paragraph itself as a base. */ | 644 /* Add a point for the paragraph itself as a base. */ |
| 639 contentScore+=1; | 645 contentScore+=1; |
| 640 | 646 |
| 641 /* Add points for any commas within this paragraph */ | 647 /* Add points for any commas within this paragraph */ |
| 642 contentScore += innerText.split(',').length; | 648 contentScore += innerText.split(',').length; |
| 643 | 649 |
| 644 /* For every 100 characters in this paragraph, add another point. Up
to 3 points. */ | 650 /* For every 100 characters in this paragraph, add another point. Up
to 3 points. */ |
| 645 contentScore += Math.min(Math.floor(innerText.length / 100), 3); | 651 contentScore += Math.min(Math.floor(innerText.length / 100), 3); |
| 646 | 652 |
| 647 /* Add the score to the parent. The grandparent gets half. */ | 653 /* Add the score to the parent. The grandparent gets half. */ |
| 648 parentNode.readability.contentScore += contentScore; | 654 parentNode.readability.contentScore += contentScore; |
| 649 | 655 |
| 650 if(grandParentNode) { | 656 if(grandParentNode) { |
| 651 grandParentNode.readability.contentScore += contentScore/2;
| 657 grandParentNode.readability.contentScore += contentScore/2; |
| 652 } | 658 } |
| 653 } | 659 } |
| 654 | 660 |
| 655 /** | 661 /** |
| 656 * After we've calculated scores, loop through all of the possible candi
date nodes we found | 662 * After we've calculated scores, loop through all of the possible candi
date nodes we found |
| 657 * and find the one with the highest score. | 663 * and find the one with the highest score. |
| 658 **/ | 664 **/ |
| 659 var topCandidate = null; | 665 var topCandidate = null; |
| 660 for(var c=0, cl=candidates.length; c < cl; c+=1) | 666 for(var c=0, cl=candidates.length; c < cl; c+=1) |
| 661 { | 667 { |
| (...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 718 var contentBonus = 0; | 724 var contentBonus = 0; |
| 719 /* Give a bonus if sibling nodes and top candidates have the example
same classname */ | 725 /* Give a bonus if sibling nodes and top candidates have the example
same classname */ |
| 720 if(siblingNode.className === topCandidate.className && topCandidate.
className !== "") { | 726 if(siblingNode.className === topCandidate.className && topCandidate.
className !== "") { |
| 721 contentBonus += topCandidate.readability.contentScore * 0.2; | 727 contentBonus += topCandidate.readability.contentScore * 0.2; |
| 722 } | 728 } |
| 723 | 729 |
| 724 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re
adability.contentScore+contentBonus) >= siblingScoreThreshold) | 730 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.re
adability.contentScore+contentBonus) >= siblingScoreThreshold) |
| 725 { | 731 { |
| 726 append = true; | 732 append = true; |
| 727 } | 733 } |
| 728 | 734 |
| 729 if(siblingNode.nodeName === "P") { | 735 if(siblingNode.nodeName === "P") { |
| 730 var linkDensity = readability.getLinkDensity(siblingNode); | 736 var linkDensity = readability.getLinkDensity(siblingNode); |
| 731 var nodeContent = readability.getInnerText(siblingNode); | 737 var nodeContent = readability.getInnerText(siblingNode); |
| 732 var nodeLength = nodeContent.length; | 738 var nodeLength = nodeContent.length; |
| 733 | 739 |
| 734 if(nodeLength > 80 && linkDensity < 0.25) | 740 if(nodeLength > 80 && linkDensity < 0.25) |
| 735 { | 741 { |
| 736 append = true; | 742 append = true; |
| 737 } | 743 } |
| 738 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear
ch(/\.( |$)/) !== -1) | 744 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.sear
ch(/\.( |$)/) !== -1) |
| 739 { | 745 { |
| 740 append = true; | 746 append = true; |
| 741 } | 747 } |
| 742 } | 748 } |
| 743 | 749 |
| 744 if(append) { | 750 if(append) { |
| 745 dbg("Appending node: " + siblingNode); | 751 dbg("Appending node: " + siblingNode); |
| 746 | 752 |
| 747 var nodeToAppend = null; | 753 var nodeToAppend = null; |
| 748 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P
") { | 754 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P
") { |
| 749 /* We have a node that isn't a common block level element, l
ike a form or td tag. Turn it into a div so it doesn't get filtered out later by
accident. */ | 755 /* We have a node that isn't a common block level element, l
ike a form or td tag. Turn it into a div so it doesn't get filtered out later by
accident. */ |
| 750 | 756 |
| 751 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to
div.'); | 757 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to
div.'); |
| 752 nodeToAppend = document.createElement("DIV"); | 758 nodeToAppend = document.createElement("DIV"); |
| 753 try { | 759 try { |
| 754 nodeToAppend.id = siblingNode.id; | 760 nodeToAppend.id = siblingNode.id; |
| 755 readability.moveNodeInnards(siblingNode, nodeToAppend); | 761 readability.moveNodeInnards(siblingNode, nodeToAppend); |
| 756 } | 762 } |
| 757 catch(er) { | 763 catch(er) { |
| 758 dbg("Could not alter siblingNode to div, probably an IE
restriction, reverting back to original."); | 764 dbg("Could not alter siblingNode to div, probably an IE
restriction, reverting back to original."); |
| 759 nodeToAppend = siblingNode; | 765 nodeToAppend = siblingNode; |
| 760 s-=1; | 766 s-=1; |
| 761 sl-=1; | 767 sl-=1; |
| 762 } | 768 } |
| 763 } else { | 769 } else { |
| 764 nodeToAppend = siblingNode; | 770 nodeToAppend = siblingNode; |
| 765 s-=1; | 771 s-=1; |
| 766 sl-=1; | 772 sl-=1; |
| 767 } | 773 } |
| 768 | 774 |
| 769 /* To ensure a node does not interfere with readability styles,
remove its classnames */ | 775 /* To ensure a node does not interfere with readability styles,
remove its classnames */ |
| 770 nodeToAppend.className = ""; | 776 nodeToAppend.className = ""; |
| 771 | 777 |
| 772 /* Append sibling and subtract from our list because it removes
the node when you append to another node */ | 778 /* Append sibling and subtract from our list because it removes
the node when you append to another node */ |
| 773 articleContent.appendChild(nodeToAppend); | 779 articleContent.appendChild(nodeToAppend); |
| 774 } | 780 } |
| 775 } | 781 } |
| 776 | 782 |
| 777 /** | 783 /** |
| 778 * So we have all of the content that we need. Now we clean it up for pr
esentation. | 784 * So we have all of the content that we need. Now we clean it up for pr
esentation. |
| 779 **/ | 785 **/ |
| 780 readability.distilledArticleContent = articleContent.cloneNode(true); | 786 readability.distilledArticleContent = articleContent.cloneNode(true); |
| 781 //readability.prepArticle(articleContent); | 787 //readability.prepArticle(articleContent); |
| 782 | 788 |
| 783 if (readability.curPageNum === 1) { | 789 if (readability.curPageNum === 1) { |
| 784 var newNode = document.createElement('div'); | 790 var newNode = document.createElement('div'); |
| 785 newNode.id = "readability-page-1"; | 791 newNode.id = "readability-page-1"; |
| 786 newNode.setAttribute("class", "page"); | 792 newNode.setAttribute("class", "page"); |
| 787 readability.moveNodeInnards(articleContent, newNode); | 793 readability.moveNodeInnards(articleContent, newNode); |
| 788 articleContent.appendChild(newNode); | 794 articleContent.appendChild(newNode); |
| 789 } | 795 } |
| 790 | 796 |
| 791 /** | 797 /** |
| 792 * Now that we've gone through the full algorithm, check to see if we go
t any meaningful content. | 798 * Now that we've gone through the full algorithm, check to see if we go
t any meaningful content. |
| 793 * If we didn't, we may need to re-run grabArticle with different flags
set. This gives us a higher | 799 * If we didn't, we may need to re-run grabArticle with different flags
set. This gives us a higher |
| 794 * likelihood of finding the content, and the sieve approach gives us a
higher likelihood of | 800 * likelihood of finding the content, and the sieve approach gives us a
higher likelihood of |
| 795 * finding the -right- content. | 801 * finding the -right- content. |
| 796 **/ | 802 **/ |
| 797 if(readability.getInnerText(articleContent, false).length < 250) { | 803 if(readability.getInnerText(articleContent, false).length < 250) { |
| 798 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { | 804 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { |
| 799 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); | 805 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); |
| 800 return readability.grabArticle(document.body); | 806 return readability.grabArticle(document.body); |
| 801 } | 807 } |
| 802 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES))
{ | 808 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES))
{ |
| 803 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); | 809 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); |
| 804 return readability.grabArticle(document.body); | 810 return readability.grabArticle(document.body); |
| 805 } | 811 } |
| 806 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL
LY)) { | 812 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONAL
LY)) { |
| 807 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); | 813 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); |
| 808 return readability.grabArticle(document.body); | 814 return readability.grabArticle(document.body); |
| 809 } else { | 815 } else { |
| 810 return null; | 816 return null; |
| 811 } | 817 } |
| 812 } | 818 } |
| 813 | 819 |
| 814 return articleContent; | 820 return articleContent; |
| 815 }, | 821 }, |
| 816 | 822 |
| 817 /** | 823 /** |
| 818 * Removes script tags from the document. | 824 * Removes script tags from the document. |
| 819 * | 825 * |
| 820 * @param Element | 826 * @param Element |
| 821 **/ | 827 **/ |
| 822 removeScripts: function (doc) { | 828 removeScripts: function (doc) { |
| 823 var scripts = doc.getElementsByTagName('script'); | 829 var scripts = doc.getElementsByTagName('script'); |
| 824 for(var i = scripts.length-1; i >= 0; i-=1) | 830 for(var i = scripts.length-1; i >= 0; i-=1) |
| 825 { | 831 { |
| 826 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf
('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) | 832 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf
('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) |
| 827 { | 833 { |
| 828 scripts[i].nodeValue=""; | 834 scripts[i].nodeValue=""; |
| 829 scripts[i].removeAttribute('src'); | 835 scripts[i].removeAttribute('src'); |
| 830 if (scripts[i].parentNode) { | 836 if (scripts[i].parentNode) { |
| 831 scripts[i].parentNode.removeChild(scripts[i]); | 837 scripts[i].parentNode.removeChild(scripts[i]); |
| 832 } | 838 } |
| 833 } | 839 } |
| 834 } | 840 } |
| 835 }, | 841 }, |
| 836 | 842 |
| 837 /** | 843 /** |
| 838 * Get the inner text of a node - cross browser compatibly. | 844 * Get the inner text of a node - cross browser compatibly. |
| 839 * This also strips out any excess whitespace to be found. | 845 * This also strips out any excess whitespace to be found. |
| 840 * | 846 * |
| 841 * @param Element | 847 * @param Element |
| 842 * @return string | 848 * @return string |
| 843 **/ | 849 **/ |
| 844 getInnerText: function (e, normalizeSpaces) { | 850 getInnerText: function (e, normalizeSpaces) { |
| 845 var textContent = ""; | 851 var textContent = ""; |
| 846 | 852 |
| (...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 889 | 895 |
| 890 // Remove any root styles, if we're able. | 896 // Remove any root styles, if we're able. |
| 891 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili
ty-styled') { | 897 if(typeof e.removeAttribute === 'function' && e.className !== 'readabili
ty-styled') { |
| 892 e.removeAttribute('style'); } | 898 e.removeAttribute('style'); } |
| 893 | 899 |
| 894 // Go until there are no more child nodes | 900 // Go until there are no more child nodes |
| 895 while ( cur !== null ) { | 901 while ( cur !== null ) { |
| 896 if ( cur.nodeType === 1 ) { | 902 if ( cur.nodeType === 1 ) { |
| 897 // Remove style attribute(s) : | 903 // Remove style attribute(s) : |
| 898 if(cur.className !== "readability-styled") { | 904 if(cur.className !== "readability-styled") { |
| 899 cur.removeAttribute("style"); | 905 cur.removeAttribute("style"); |
| 900 } | 906 } |
| 901 readability.cleanStyles( cur ); | 907 readability.cleanStyles( cur ); |
| 902 } | 908 } |
| 903 cur = cur.nextSibling; | 909 cur = cur.nextSibling; |
| 904 } | 910 } |
| 905 }, | 911 }, |
| 906 | 912 |
| 907 /** | 913 /** |
| 908 * Get the density of links as a percentage of the content | 914 * Get the density of links as a percentage of the content |
| 909 * This is the amount of text that is inside a link divided by the total tex
t in the node. | 915 * This is the amount of text that is inside a link divided by the total tex
t in the node. |
| 910 * | 916 * |
| 911 * @param Element | 917 * @param Element |
| 912 * @return number (float) | 918 * @return number (float) |
| 913 **/ | 919 **/ |
| 914 getLinkDensity: function (e) { | 920 getLinkDensity: function (e) { |
| 915 var links = e.getElementsByTagName("a"); | 921 var links = e.getElementsByTagName("a"); |
| 916 var textLength = readability.getInnerText(e).length; | 922 var textLength = readability.getInnerText(e).length; |
| 917 var linkLength = 0; | 923 var linkLength = 0; |
| 918 for(var i=0, il=links.length; i<il;i+=1) | 924 for(var i=0, il=links.length; i<il;i+=1) |
| 919 { | 925 { |
| 920 linkLength += readability.getInnerText(links[i]).length; | 926 linkLength += readability.getInnerText(links[i]).length; |
| 921 } | 927 } |
| 922 | 928 |
| 923 return linkLength / textLength; | 929 return linkLength / textLength; |
| 924 }, | 930 }, |
| 925 | 931 |
| 926 /** | 932 /** |
| 927 * Find a cleaned up version of the current URL, to use for comparing links
for possible next-pageyness. | 933 * Find a cleaned up version of the current URL, to use for comparing links
for possible next-pageyness. |
| 928 * | 934 * |
| 929 * @author Dan Lacy | 935 * @author Dan Lacy |
| 930 * @return string the base url | 936 * @return string the base url |
| 931 **/ | 937 **/ |
| 932 findBaseUrl: function () { | 938 findBaseUrl: function () { |
| 933 var noUrlParams = window.location.pathname.split("?")[0], | 939 var noUrlParams = window.location.pathname.split("?")[0], |
| 934 urlSlashes = noUrlParams.split("/").reverse(), | 940 urlSlashes = noUrlParams.split("/").reverse(), |
| 935 cleanedSegments = [], | 941 cleanedSegments = [], |
| 936 possibleType = ""; | 942 possibleType = ""; |
| 937 | 943 |
| 938 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) { | 944 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) { |
| 939 var segment = urlSlashes[i]; | 945 var segment = urlSlashes[i]; |
| 940 | 946 |
| 941 // Split off and save anything that looks like a file type. | 947 // Split off and save anything that looks like a file type. |
| 942 if (segment.indexOf(".") !== -1) { | 948 if (segment.indexOf(".") !== -1) { |
| 943 possibleType = segment.split(".")[1]; | 949 possibleType = segment.split(".")[1]; |
| 944 | 950 |
| 945 /* If the type isn't alpha-only, it's probably not actually a fi
le extension. */ | 951 /* If the type isn't alpha-only, it's probably not actually a fi
le extension. */ |
| 946 if(!possibleType.match(/[^a-zA-Z]/)) { | 952 if(!possibleType.match(/[^a-zA-Z]/)) { |
| 947 segment = segment.split(".")[0]; | 953 segment = segment.split(".")[0]; |
| 948 } | 954 } |
| 949 } | 955 } |
| 950 | 956 |
| 951 /** | 957 /** |
| 952 * EW-CMS specific segment replacement. Ugly. | 958 * EW-CMS specific segment replacement. Ugly. |
| 953 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm
l | 959 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.htm
l |
| 954 **/ | 960 **/ |
| 955 if(segment.indexOf(',00') !== -1) { | 961 if(segment.indexOf(',00') !== -1) { |
| 956 segment = segment.replace(',00', ''); | 962 segment = segment.replace(',00', ''); |
| 957 } | 963 } |
| 958 | 964 |
| 959 // If our first or second segment has anything looking like a page n
umber, remove it. | 965 // If our first or second segment has anything looking like a page n
umber, remove it. |
| 960 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1)
|| (i === 0))) { | 966 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1)
|| (i === 0))) { |
| 961 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "
"); | 967 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "
"); |
| 962 } | 968 } |
| 963 | 969 |
| 964 | 970 |
| 965 var del = false; | 971 var del = false; |
| 966 | 972 |
| 967 /* If this is purely a number, and it's the first or second segment,
it's probably a page number. Remove it. */ | 973 /* If this is purely a number, and it's the first or second segment,
it's probably a page number. Remove it. */ |
| 968 if (i < 2 && segment.match(/^\d{1,2}$/)) { | 974 if (i < 2 && segment.match(/^\d{1,2}$/)) { |
| 969 del = true; | 975 del = true; |
| 970 } | 976 } |
| 971 | 977 |
| 972 /* If this is the first segment and it's just "index", remove it. */ | 978 /* If this is the first segment and it's just "index", remove it. */ |
| 973 if(i === 0 && segment.toLowerCase() === "index") { | 979 if(i === 0 && segment.toLowerCase() === "index") { |
| 974 del = true; | 980 del = true; |
| 975 } | 981 } |
| 976 | 982 |
| 977 | 983 |
| 978 /* If our first or second segment is smaller than 3 characters, and
the first segment was purely alphas, remove it. */ | 984 /* If our first or second segment is smaller than 3 characters, and
the first segment was purely alphas, remove it. */ |
| 979 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) { | 985 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) { |
| 980 del = true; | 986 del = true; |
| 981 } | 987 } |
| 982 | 988 |
| 983 /* If it's not marked for deletion, push it to cleanedSegments. */ | 989 /* If it's not marked for deletion, push it to cleanedSegments. */ |
| 984 if (!del) { | 990 if (!del) { |
| 985 cleanedSegments.push(segment); | 991 cleanedSegments.push(segment); |
| 986 } | 992 } |
| 987 } | 993 } |
| 988 | 994 |
| 989 // This is our final, cleaned, base article URL. | 995 // This is our final, cleaned, base article URL. |
| 990 return window.location.protocol + "//" + window.location.host + cleanedS
egments.reverse().join("/"); | 996 return window.location.protocol + "//" + window.location.host + cleanedS
egments.reverse().join("/"); |
| 991 }, | 997 }, |
| 992 | 998 |
| 993 /** | 999 /** |
| 994 * Look for any paging links that may occur within the document. | 1000 * Look for any paging links that may occur within the document. |
| 995 * | 1001 * |
| 996 * @param body | 1002 * @param body |
| 997 * @return object (array) | 1003 * @return object (array) |
| 998 **/ | 1004 **/ |
| 999 findNextPageLink: function (elem) { | 1005 findNextPageLink: function (elem) { |
| 1000 var possiblePages = {}, | 1006 var possiblePages = {}, |
| 1001 allLinks = elem.getElementsByTagName('a'), | 1007 allLinks = elem.getElementsByTagName('a'), |
| 1002 articleBaseUrl = readability.findBaseUrl(); | 1008 articleBaseUrl = readability.findBaseUrl(); |
| 1003 | 1009 |
| 1004 /** | 1010 /** |
| 1005 * Loop through all links, looking for hints that they may be next-page
links. | 1011 * Loop through all links, looking for hints that they may be next-page
links. |
| 1006 * Things like having "page" in their textContent, className or id, or b
eing a child | 1012 * Things like having "page" in their textContent, className or id, or b
eing a child |
| 1007 * of a node with a page-y className or id. | 1013 * of a node with a page-y className or id. |
| 1008 * | 1014 * |
| 1009 * Also possible: levenshtein distance? longest common subsequence? | 1015 * Also possible: levenshtein distance? longest common subsequence? |
| 1010 * | 1016 * |
| 1011 * After we do that, assign each page a score, and | 1017 * After we do that, assign each page a score, and |
| 1012 **/ | 1018 **/ |
| 1013 for(var i = 0, il = allLinks.length; i < il; i+=1) { | 1019 for(var i = 0, il = allLinks.length; i < il; i+=1) { |
| 1014 var link = allLinks[i], | 1020 var link = allLinks[i], |
| 1015 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '
'); | 1021 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '
'); |
| 1016 | 1022 |
| 1017 /* If we've already seen this page, ignore it */ | 1023 /* If we've already seen this page, ignore it */ |
| 1018 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === wi
ndow.location.href || linkHref in readability.parsedPages) { | 1024 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === wi
ndow.location.href || linkHref in readability.parsedPages) { |
| 1019 continue; | 1025 continue; |
| 1020 } | 1026 } |
| 1021 | 1027 |
| 1022 /* If it's on a different domain, skip it. */ | 1028 /* If it's on a different domain, skip it. */ |
| 1023 if(window.location.host !== linkHref.split(/\/+/g)[1]) { | 1029 if(window.location.host !== linkHref.split(/\/+/g)[1]) { |
| 1024 continue; | 1030 continue; |
| 1025 } | 1031 } |
| 1026 | 1032 |
| 1027 var linkText = readability.getInnerText(link); | 1033 var linkText = readability.getInnerText(link); |
| 1028 | 1034 |
| 1029 /* If the linkText looks like it's not the next page, skip it. */ | 1035 /* If the linkText looks like it's not the next page, skip it. */ |
| 1030 if(linkText.match(readability.regexps.extraneous) || linkText.length
> 25) { | 1036 if(linkText.match(readability.regexps.extraneous) || linkText.length
> 25) { |
| 1031 continue; | 1037 continue; |
| 1032 } | 1038 } |
| 1033 | 1039 |
| 1034 /* If the leftovers of the URL after removing the base URL don't con
tain any digits, it's certainly not a next page link. */ | 1040 /* If the leftovers of the URL after removing the base URL don't con
tain any digits, it's certainly not a next page link. */ |
| 1035 var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); | 1041 var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); |
| 1036 if(!linkHrefLeftover.match(/\d/)) { | 1042 if(!linkHrefLeftover.match(/\d/)) { |
| 1037 continue; | 1043 continue; |
| 1038 } | 1044 } |
| 1039 | 1045 |
| 1040 if(!(linkHref in possiblePages)) { | 1046 if(!(linkHref in possiblePages)) { |
| 1041 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr
ef": linkHref}; | 1047 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "hr
ef": linkHref}; |
| 1042 } else { | 1048 } else { |
| 1043 possiblePages[linkHref].linkText += ' | ' + linkText; | 1049 possiblePages[linkHref].linkText += ' | ' + linkText; |
| 1044 } | 1050 } |
| 1045 | 1051 |
| 1046 var linkObj = possiblePages[linkHref]; | 1052 var linkObj = possiblePages[linkHref]; |
| 1047 | 1053 |
| 1048 /** | 1054 /** |
| 1049 * If the articleBaseUrl isn't part of this URL, penalize this link.
It could still be the link, but the odds are lower. | 1055 * If the articleBaseUrl isn't part of this URL, penalize this link.
It could still be the link, but the odds are lower. |
| 1050 * Example: http://www.actionscript.org/resources/articles/745/1/Jav
aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html | 1056 * Example: http://www.actionscript.org/resources/articles/745/1/Jav
aScript-and-VBScript-Injection-in-ActionScript-3/Page1.html |
| 1051 **/ | 1057 **/ |
| 1052 if(linkHref.indexOf(articleBaseUrl) !== 0) { | 1058 if(linkHref.indexOf(articleBaseUrl) !== 0) { |
| 1053 linkObj.score -= 25; | 1059 linkObj.score -= 25; |
| 1054 } | 1060 } |
| 1055 | 1061 |
| 1056 var linkData = linkText + ' ' + link.className + ' ' + link.id; | 1062 var linkData = linkText + ' ' + link.className + ' ' + link.id; |
| 1057 if(linkData.match(readability.regexps.nextLink)) { | 1063 if(linkData.match(readability.regexps.nextLink)) { |
| 1058 linkObj.score += 50; | 1064 linkObj.score += 50; |
| 1059 } | 1065 } |
| 1060 if(linkData.match(/pag(e|ing|inat)/i)) { | 1066 if(linkData.match(/pag(e|ing|inat)/i)) { |
| 1061 linkObj.score += 25; | 1067 linkObj.score += 25; |
| 1062 } | 1068 } |
| 1063 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any
bonuses gotten from a > or » in the text, | 1069 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any
bonuses gotten from a > or » in the text, |
| 1064 /* If we already matched on "next", last is probably fine. If we
didn't, then it's bad. Penalize. */ | 1070 /* If we already matched on "next", last is probably fine. If we
didn't, then it's bad. Penalize. */ |
| 1065 if(!linkObj.linkText.match(readability.regexps.nextLink)) { | 1071 if(!linkObj.linkText.match(readability.regexps.nextLink)) { |
| 1066 linkObj.score -= 65; | 1072 linkObj.score -= 65; |
| 1067 } | 1073 } |
| 1068 } | 1074 } |
| 1069 if(linkData.match(readability.regexps.negative) || linkData.match(re
adability.regexps.extraneous)) { | 1075 if(linkData.match(readability.regexps.negative) || linkData.match(re
adability.regexps.extraneous)) { |
| 1070 linkObj.score -= 50; | 1076 linkObj.score -= 50; |
| 1071 } | 1077 } |
| 1072 if(linkData.match(readability.regexps.prevLink)) { | 1078 if(linkData.match(readability.regexps.prevLink)) { |
| 1073 linkObj.score -= 200; | 1079 linkObj.score -= 200; |
| 1074 } | 1080 } |
| 1075 | 1081 |
| 1076 /* If a parentNode contains page or paging or paginat */ | 1082 /* If a parentNode contains page or paging or paginat */ |
| 1077 var parentNode = link.parentNode, | 1083 var parentNode = link.parentNode, |
| 1078 positiveNodeMatch = false, | 1084 positiveNodeMatch = false, |
| 1079 negativeNodeMatch = false; | 1085 negativeNodeMatch = false; |
| 1080 while(parentNode) { | 1086 while(parentNode) { |
| 1081 var parentNodeClassAndId = parentNode.className + ' ' + parentNo
de.id; | 1087 var parentNodeClassAndId = parentNode.className + ' ' + parentNo
de.id; |
| 1082 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass
AndId.match(/pag(e|ing|inat)/i)) { | 1088 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClass
AndId.match(/pag(e|ing|inat)/i)) { |
| 1083 positiveNodeMatch = true; | 1089 positiveNodeMatch = true; |
| 1084 linkObj.score += 25; | 1090 linkObj.score += 25; |
| 1085 } | 1091 } |
| 1086 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass
AndId.match(readability.regexps.negative)) { | 1092 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClass
AndId.match(readability.regexps.negative)) { |
| 1087 /* If this is just something like "footer", give it a negati
ve. If it's something like "body-and-footer", leave it be. */ | 1093 /* If this is just something like "footer", give it a negati
ve. If it's something like "body-and-footer", leave it be. */ |
| 1088 if(!parentNodeClassAndId.match(readability.regexps.positive)
) { | 1094 if(!parentNodeClassAndId.match(readability.regexps.positive)
) { |
| 1089 linkObj.score -= 25; | 1095 linkObj.score -= 25; |
| 1090 negativeNodeMatch = true; | 1096 negativeNodeMatch = true; |
| 1091 } | 1097 } |
| 1092 } | 1098 } |
| 1093 | 1099 |
| 1094 parentNode = parentNode.parentNode; | 1100 parentNode = parentNode.parentNode; |
| 1095 } | 1101 } |
| 1096 | 1102 |
| 1097 /** | 1103 /** |
| 1098 * If the URL looks like it has paging in it, add to the score. | 1104 * If the URL looks like it has paging in it, add to the score. |
| 1099 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 | 1105 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 |
| 1100 **/ | 1106 **/ |
| 1101 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) ||
linkHref.match(/(page|paging)/i)) { | 1107 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) ||
linkHref.match(/(page|paging)/i)) { |
| 1102 linkObj.score += 25; | 1108 linkObj.score += 25; |
| 1103 } | 1109 } |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1145 topPage = possiblePages[page]; | 1151 topPage = possiblePages[page]; |
| 1146 } | 1152 } |
| 1147 } | 1153 } |
| 1148 } | 1154 } |
| 1149 | 1155 |
| 1150 if(topPage) { | 1156 if(topPage) { |
| 1151 var nextHref = topPage.href.replace(/\/$/,''); | 1157 var nextHref = topPage.href.replace(/\/$/,''); |
| 1152 | 1158 |
| 1153 dbg('NEXT PAGE IS ' + nextHref); | 1159 dbg('NEXT PAGE IS ' + nextHref); |
| 1154 readability.parsedPages[nextHref] = true; | 1160 readability.parsedPages[nextHref] = true; |
| 1155 return nextHref; | 1161 return nextHref; |
| 1156 } | 1162 } |
| 1157 else { | 1163 else { |
| 1158 return null; | 1164 return null; |
| 1159 } | 1165 } |
| 1160 }, | 1166 }, |
| 1161 | 1167 |
| 1162 createLinkDiv: function(link) { | 1168 createLinkDiv: function(link) { |
| 1163 var divNode = document.createElement('div'); | 1169 var divNode = document.createElement('div'); |
| 1164 var aNode = document.createElement('a'); | 1170 var aNode = document.createElement('a'); |
| 1165 var tNode = document.createTextNode('View Next Page'); | 1171 var tNode = document.createTextNode('View Next Page'); |
| (...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1197 } | 1203 } |
| 1198 else { | 1204 else { |
| 1199 if (options.error) { options.error(request); } | 1205 if (options.error) { options.error(request); } |
| 1200 } | 1206 } |
| 1201 } | 1207 } |
| 1202 } | 1208 } |
| 1203 | 1209 |
| 1204 if (typeof options === 'undefined') { options = {}; } | 1210 if (typeof options === 'undefined') { options = {}; } |
| 1205 | 1211 |
| 1206 request.onreadystatechange = respondToReadyState; | 1212 request.onreadystatechange = respondToReadyState; |
| 1207 | 1213 |
| 1208 request.open('get', url, true); | 1214 request.open('get', url, true); |
| 1209 request.setRequestHeader('Accept', 'text/html'); | 1215 request.setRequestHeader('Accept', 'text/html'); |
| 1210 | 1216 |
| 1211 try { | 1217 try { |
| 1212 request.send(options.postBody); | 1218 request.send(options.postBody); |
| 1213 } | 1219 } |
| 1214 catch (e) { | 1220 catch (e) { |
| 1215 if (options.error) { options.error(); } | 1221 if (options.error) { options.error(); } |
| 1216 } | 1222 } |
| 1217 | 1223 |
| (...skipping 14 matching lines...) Expand all Loading... |
| 1232 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada
bility.curPageNum + '">§</p>'; | 1238 articlePage.innerHTML = '<p class="page-separator" title="Page ' + reada
bility.curPageNum + '">§</p>'; |
| 1233 | 1239 |
| 1234 document.getElementById("readability-content").appendChild(articlePage); | 1240 document.getElementById("readability-content").appendChild(articlePage); |
| 1235 | 1241 |
| 1236 if(readability.curPageNum > readability.maxPages) { | 1242 if(readability.curPageNum > readability.maxPages) { |
| 1237 var linkDiv = readability.createLinkDiv(nextPageLink); | 1243 var linkDiv = readability.createLinkDiv(nextPageLink); |
| 1238 | 1244 |
| 1239 articlePage.appendChild(linkDiv); | 1245 articlePage.appendChild(linkDiv); |
| 1240 return; | 1246 return; |
| 1241 } | 1247 } |
| 1242 | 1248 |
| 1243 /** | 1249 /** |
| 1244 * Now that we've built the article page DOM element, get the page conte
nt | 1250 * Now that we've built the article page DOM element, get the page conte
nt |
| 1245 * asynchronously and load the cleaned content into the div we created f
or it. | 1251 * asynchronously and load the cleaned content into the div we created f
or it. |
| 1246 **/ | 1252 **/ |
| 1247 (function(pageUrl, thisPage) { | 1253 (function(pageUrl, thisPage) { |
| 1248 readability.ajax(pageUrl, { | 1254 readability.ajax(pageUrl, { |
| 1249 success: function(r) { | 1255 success: function(r) { |
| 1250 | 1256 |
| 1251 /* First, check to see if we have a matching ETag in headers
- if we do, this is a duplicate page. */ | 1257 /* First, check to see if we have a matching ETag in headers
- if we do, this is a duplicate page. */ |
| 1252 var eTag = r.getResponseHeader('ETag'); | 1258 var eTag = r.getResponseHeader('ETag'); |
| 1253 if(eTag) { | 1259 if(eTag) { |
| 1254 if(eTag in readability.pageETags) { | 1260 if(eTag in readability.pageETags) { |
| 1255 dbg("Exact duplicate page found via ETag. Aborting."
); | 1261 dbg("Exact duplicate page found via ETag. Aborting."
); |
| 1256 articlePage.style.display = 'none'; | 1262 articlePage.style.display = 'none'; |
| 1257 return; | 1263 return; |
| 1258 } else { | 1264 } else { |
| 1259 readability.pageETags[eTag] = 1; | 1265 readability.pageETags[eTag] = 1; |
| 1260 } | 1266 } |
| 1261 } | 1267 } |
| 1262 | 1268 |
| 1263 // TODO: this ends up doubling up page numbers on NYTimes ar
ticles. Need to generically parse those away. | 1269 // TODO: this ends up doubling up page numbers on NYTimes ar
ticles. Need to generically parse those away. |
| 1264 var page = document.createElement("DIV"); | 1270 var page = document.createElement("DIV"); |
| 1265 | 1271 |
| 1266 /** | 1272 /** |
| 1267 * Do some preprocessing to our HTML to make it ready for ap
pending. | 1273 * Do some preprocessing to our HTML to make it ready for ap
pending. |
| 1268 * • Remove any script tags. Swap and reswap newlines with a
unicode character because multiline regex doesn't work in javascript. | 1274 * • Remove any script tags. Swap and reswap newlines with a
unicode character because multiline regex doesn't work in javascript. |
| 1269 * • Turn any noscript tags into divs so that we can parse t
hem. This allows us to find any next page links hidden via javascript. | 1275 * • Turn any noscript tags into divs so that we can parse t
hem. This allows us to find any next page links hidden via javascript. |
| 1270 * • Turn all double br's into p's - was handled by prepDocu
ment in the original view. | 1276 * • Turn all double br's into p's - was handled by prepDocu
ment in the original view. |
| (...skipping 30 matching lines...) Expand all Loading... |
| 1301 for(var i=1; i <= readability.curPageNum; i+=1) { | 1307 for(var i=1; i <= readability.curPageNum; i+=1) { |
| 1302 var rPage = document.getElementById('readability-pag
e-' + i); | 1308 var rPage = document.getElementById('readability-pag
e-' + i); |
| 1303 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML
) !== -1) { | 1309 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML
) !== -1) { |
| 1304 dbg('Duplicate of page ' + i + ' - skipping.'); | 1310 dbg('Duplicate of page ' + i + ' - skipping.'); |
| 1305 articlePage.style.display = 'none'; | 1311 articlePage.style.display = 'none'; |
| 1306 readability.parsedPages[pageUrl] = true; | 1312 readability.parsedPages[pageUrl] = true; |
| 1307 return; | 1313 return; |
| 1308 } | 1314 } |
| 1309 } | 1315 } |
| 1310 } | 1316 } |
| 1311 | 1317 |
| 1312 readability.removeScripts(content); | 1318 readability.removeScripts(content); |
| 1313 | 1319 |
| 1314 readability.moveNodeInnards(content, thisPage); | 1320 readability.moveNodeInnards(content, thisPage); |
| 1315 | 1321 |
| 1316 /** | 1322 /** |
| 1317 * After the page has rendered, post process the content. Th
is delay is necessary because, | 1323 * After the page has rendered, post process the content. Th
is delay is necessary because, |
| 1318 * in webkit at least, offsetWidth is not set in time to det
ermine image width. We have to | 1324 * in webkit at least, offsetWidth is not set in time to det
ermine image width. We have to |
| 1319 * wait a little bit for reflow to finish before we can fix
floating images. | 1325 * wait a little bit for reflow to finish before we can fix
floating images. |
| 1320 **/ | 1326 **/ |
| 1321 window.setTimeout( | 1327 window.setTimeout( |
| 1322 function() { readability.postProcessContent(thisPage); }
, | 1328 function() { readability.postProcessContent(thisPage); }
, |
| 1323 500 | 1329 500 |
| 1324 ); | 1330 ); |
| 1325 | 1331 |
| 1326 if(nextPageLink) { | 1332 if(nextPageLink) { |
| 1327 readability.appendNextPage(nextPageLink); | 1333 readability.appendNextPage(nextPageLink); |
| 1328 } | 1334 } |
| 1329 } | 1335 } |
| 1330 }); | 1336 }); |
| 1331 }(nextPageLink, articlePage)); | 1337 }(nextPageLink, articlePage)); |
| 1332 }, | 1338 }, |
| 1333 | 1339 |
| 1334 /** | 1340 /** |
| 1335 * Get an elements class/id weight. Uses regular expressions to tell if this
| 1341 * Get an elements class/id weight. Uses regular expressions to tell if this |
| 1336 * element looks good or bad. | 1342 * element looks good or bad. |
| 1337 * | 1343 * |
| 1338 * @param Element | 1344 * @param Element |
| 1339 * @return number (Integer) | 1345 * @return number (Integer) |
| 1340 **/ | 1346 **/ |
| 1341 getClassWeight: function (e) { | 1347 getClassWeight: function (e) { |
| 1342 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { | 1348 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { |
| 1343 return 0; | 1349 return 0; |
| 1344 } | 1350 } |
| 1345 | 1351 |
| (...skipping 29 matching lines...) Expand all Loading... |
| 1375 /** | 1381 /** |
| 1376 * Remove extraneous break tags from a node. | 1382 * Remove extraneous break tags from a node. |
| 1377 * | 1383 * |
| 1378 * @param Element | 1384 * @param Element |
| 1379 * @return void | 1385 * @return void |
| 1380 **/ | 1386 **/ |
| 1381 killBreaks: function (e) { | 1387 killBreaks: function (e) { |
| 1382 var allElements = e.getElementsByTagName('*'); | 1388 var allElements = e.getElementsByTagName('*'); |
| 1383 while (i < allElements.length) { | 1389 while (i < allElements.length) { |
| 1384 readability.deleteExtraBreaks(allElements[i]); | 1390 readability.deleteExtraBreaks(allElements[i]); |
| 1385 i++; | 1391 i++; |
| 1386 } | 1392 } |
| 1387 }, | 1393 }, |
| 1388 | 1394 |
| 1389 /** | 1395 /** |
| 1390 * Clean a node of all elements of type "tag". | 1396 * Clean a node of all elements of type "tag". |
| 1391 * (Unless it's a youtube/vimeo video. People love movies.) | 1397 * (Unless it's a youtube/vimeo video. People love movies.) |
| 1392 * | 1398 * |
| 1393 * @param Element | 1399 * @param Element |
| 1394 * @param string tag to clean | 1400 * @param string tag to clean |
| 1395 * @return void | 1401 * @return void |
| 1396 **/ | 1402 **/ |
| 1397 clean: function (e, tag) { | 1403 clean: function (e, tag) { |
| 1398 var targetList = e.getElementsByTagName( tag ); | 1404 var targetList = e.getElementsByTagName( tag ); |
| 1399 var isEmbed = (tag === 'object' || tag === 'embed'); | 1405 var isEmbed = (tag === 'object' || tag === 'embed'); |
| 1400 | 1406 |
| 1401 for (var y=targetList.length-1; y >= 0; y-=1) { | 1407 for (var y=targetList.length-1; y >= 0; y-=1) { |
| 1402 /* Allow youtube and vimeo videos through as people usually want to
see those. */ | 1408 /* Allow youtube and vimeo videos through as people usually want to
see those. */ |
| 1403 if(isEmbed) { | 1409 if(isEmbed) { |
| 1404 var attributeValues = ""; | 1410 var attributeValues = ""; |
| 1405 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1)
{ | 1411 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1)
{ |
| 1406 attributeValues += targetList[y].attributes[i].value + '|'; | 1412 attributeValues += targetList[y].attributes[i].value + '|'; |
| 1407 } | 1413 } |
| 1408 | 1414 |
| 1409 /* First, check the elements attributes to see if any of them co
ntain youtube or vimeo */ | 1415 /* First, check the elements attributes to see if any of them co
ntain youtube or vimeo */ |
| 1410 if (attributeValues.search(readability.regexps.videos) !== -1) { | 1416 if (attributeValues.search(readability.regexps.videos) !== -1) { |
| 1411 continue; | 1417 continue; |
| 1412 } | 1418 } |
| 1413 | 1419 |
| 1414 /* Then check the elements inside this element for the same. */ | 1420 /* Then check the elements inside this element for the same. */ |
| 1415 if (targetList[y].innerHTML.search(readability.regexps.videos) !
== -1) { | 1421 if (targetList[y].innerHTML.search(readability.regexps.videos) !
== -1) { |
| 1416 continue; | 1422 continue; |
| 1417 } | 1423 } |
| 1418 | 1424 |
| 1419 } | 1425 } |
| 1420 | 1426 |
| 1421 targetList[y].parentNode.removeChild(targetList[y]); | 1427 targetList[y].parentNode.removeChild(targetList[y]); |
| 1422 } | 1428 } |
| 1423 }, | 1429 }, |
| 1424 | 1430 |
| 1425 /** | 1431 /** |
| 1426 * Clean an element of all tags of type "tag" if they look fishy. | 1432 * Clean an element of all tags of type "tag" if they look fishy. |
| 1427 * "Fishy" is an algorithm based on content length, classnames, link density
, number of images & embeds, etc. | 1433 * "Fishy" is an algorithm based on content length, classnames, link density
, number of images & embeds, etc. |
| 1428 * | 1434 * |
| 1429 * @return void | 1435 * @return void |
| 1430 **/ | 1436 **/ |
| 1431 cleanConditionally: function (e, tag) { | 1437 cleanConditionally: function (e, tag) { |
| 1432 | 1438 |
| 1433 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { | 1439 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { |
| 1434 return; | 1440 return; |
| 1435 } | 1441 } |
| 1436 | 1442 |
| 1437 var tagsList = e.getElementsByTagName(tag); | 1443 var tagsList = e.getElementsByTagName(tag); |
| 1438 var curTagsLength = tagsList.length; | 1444 var curTagsLength = tagsList.length; |
| 1439 | 1445 |
| 1440 /** | 1446 /** |
| 1441 * Gather counts for other typical elements embedded within. | 1447 * Gather counts for other typical elements embedded within. |
| 1442 * Traverse backwards so we can remove nodes at the same time without ef
fecting the traversal. | 1448 * Traverse backwards so we can remove nodes at the same time without ef
fecting the traversal. |
| 1443 * | 1449 * |
| 1444 * TODO: Consider taking into account original contentScore here. | 1450 * TODO: Consider taking into account original contentScore here. |
| 1445 **/ | 1451 **/ |
| 1446 for (var i=curTagsLength-1; i >= 0; i-=1) { | 1452 for (var i=curTagsLength-1; i >= 0; i-=1) { |
| 1447 var weight = readability.getClassWeight(tagsList[i]); | 1453 var weight = readability.getClassWeight(tagsList[i]); |
| 1448 var contentScore = (typeof tagsList[i].readability !== 'undefined')
? tagsList[i].readability.contentScore : 0; | 1454 var contentScore = (typeof tagsList[i].readability !== 'undefined')
? tagsList[i].readability.contentScore : 0; |
| 1449 | 1455 |
| 1450 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla
ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde
fined') ? (" with score " + tagsList[i].readability.contentScore) : '')); | 1456 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].cla
ssName + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'unde
fined') ? (" with score " + tagsList[i].readability.contentScore) : '')); |
| 1451 | 1457 |
| 1452 if(weight+contentScore < 0) | 1458 if(weight+contentScore < 0) |
| 1453 { | 1459 { |
| 1454 tagsList[i].parentNode.removeChild(tagsList[i]); | 1460 tagsList[i].parentNode.removeChild(tagsList[i]); |
| 1455 } | 1461 } |
| 1456 else if ( readability.getCharCount(tagsList[i],',') < 10) { | 1462 else if ( readability.getCharCount(tagsList[i],',') < 10) { |
| 1457 /** | 1463 /** |
| 1458 * If there are not very many commas, and the number of | 1464 * If there are not very many commas, and the number of |
| 1459 * non-paragraph elements is more than paragraphs or other omino
us signs, remove the element. | 1465 * non-paragraph elements is more than paragraphs or other omino
us signs, remove the element. |
| 1460 **/ | 1466 **/ |
| 1461 var p = tagsList[i].getElementsByTagName("p").length; | 1467 var p = tagsList[i].getElementsByTagName("p").length; |
| 1462 var img = tagsList[i].getElementsByTagName("img").length; | 1468 var img = tagsList[i].getElementsByTagName("img").length; |
| 1463 var li = tagsList[i].getElementsByTagName("li").length-100; | 1469 var li = tagsList[i].getElementsByTagName("li").length-100; |
| 1464 var input = tagsList[i].getElementsByTagName("input").length; | 1470 var input = tagsList[i].getElementsByTagName("input").length; |
| 1465 | 1471 |
| 1466 var embedCount = 0; | 1472 var embedCount = 0; |
| 1467 var embeds = tagsList[i].getElementsByTagName("embed"); | 1473 var embeds = tagsList[i].getElementsByTagName("embed"); |
| 1468 for(var ei=0,il=embeds.length; ei < il; ei+=1) { | 1474 for(var ei=0,il=embeds.length; ei < il; ei+=1) { |
| 1469 if (embeds[ei].src.search(readability.regexps.videos) === -1
) { | 1475 if (embeds[ei].src.search(readability.regexps.videos) === -1
) { |
| 1470 embedCount+=1; | 1476 embedCount+=1; |
| 1471 } | 1477 } |
| 1472 } | 1478 } |
| 1473 | 1479 |
| 1474 var linkDensity = readability.getLinkDensity(tagsList[i]); | 1480 var linkDensity = readability.getLinkDensity(tagsList[i]); |
| 1475 var contentLength = readability.getInnerText(tagsList[i]).length
; | 1481 var contentLength = readability.getInnerText(tagsList[i]).length
; |
| 1476 var toRemove = false; | 1482 var toRemove = false; |
| 1477 | 1483 |
| 1478 if ( img > p ) { | 1484 if ( img > p ) { |
| 1479 toRemove = true; | 1485 toRemove = true; |
| 1480 } else if(li > p && tag !== "ul" && tag !== "ol") { | 1486 } else if(li > p && tag !== "ul" && tag !== "ol") { |
| 1481 toRemove = true; | 1487 toRemove = true; |
| 1482 } else if( input > Math.floor(p/3) ) { | 1488 } else if( input > Math.floor(p/3) ) { |
| 1483 toRemove = true; | 1489 toRemove = true; |
| 1484 } else if(contentLength < 25 && (img === 0 || img > 2) ) { | 1490 } else if(contentLength < 25 && (img === 0 || img > 2) ) { |
| 1485 toRemove = true; | 1491 toRemove = true; |
| 1486 } else if(weight < 25 && linkDensity > 0.2) { | 1492 } else if(weight < 25 && linkDensity > 0.2) { |
| 1487 toRemove = true; | 1493 toRemove = true; |
| 1488 } else if(weight >= 25 && linkDensity > 0.5) { | 1494 } else if(weight >= 25 && linkDensity > 0.5) { |
| 1489 toRemove = true; | 1495 toRemove = true; |
| 1490 } else if((embedCount === 1 && contentLength < 75) || embedCount
> 1) { | 1496 } else if((embedCount === 1 && contentLength < 75) || embedCount
> 1) { |
| 1491 toRemove = true; | 1497 toRemove = true; |
| 1492 } | 1498 } |
| 1493 | 1499 |
| (...skipping 21 matching lines...) Expand all Loading... |
| 1515 } | 1521 } |
| 1516 }, | 1522 }, |
| 1517 | 1523 |
| 1518 flagIsActive: function(flag) { | 1524 flagIsActive: function(flag) { |
| 1519 return (readability.flags & flag) > 0; | 1525 return (readability.flags & flag) > 0; |
| 1520 }, | 1526 }, |
| 1521 | 1527 |
| 1522 addFlag: function(flag) { | 1528 addFlag: function(flag) { |
| 1523 readability.flags = readability.flags | flag; | 1529 readability.flags = readability.flags | flag; |
| 1524 }, | 1530 }, |
| 1525 | 1531 |
| 1526 removeFlag: function(flag) { | 1532 removeFlag: function(flag) { |
| 1527 readability.flags = readability.flags & ~flag; | 1533 readability.flags = readability.flags & ~flag; |
| 1528 }, | 1534 }, |
| 1529 | 1535 |
| 1530 // Removes the children of |src| and appends them to |dest|. | 1536 // Removes the children of |src| and appends them to |dest|. |
| 1531 moveNodeInnards: function(src, dest) { | 1537 moveNodeInnards: function(src, dest) { |
| 1532 try { | 1538 try { |
| 1533 while (src.firstChild) { | 1539 while (src.firstChild) { |
| 1534 dest.appendChild(src.removeChild(src.firstChild)); | 1540 dest.appendChild(src.removeChild(src.firstChild)); |
| 1535 } | 1541 } |
| (...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1584 var lastBr = readability.isMultipleBr(node, false); | 1590 var lastBr = readability.isMultipleBr(node, false); |
| 1585 var ret = false; | 1591 var ret = false; |
| 1586 while (lastBr && lastBr != node) { | 1592 while (lastBr && lastBr != node) { |
| 1587 var toRemove = lastBr; | 1593 var toRemove = lastBr; |
| 1588 lastBr = lastBr.previousSibling; | 1594 lastBr = lastBr.previousSibling; |
| 1589 toRemove.parentNode.removeChild(toRemove); | 1595 toRemove.parentNode.removeChild(toRemove); |
| 1590 ret = true; | 1596 ret = true; |
| 1591 } | 1597 } |
| 1592 return ret; | 1598 return ret; |
| 1593 }, | 1599 }, |
| 1594 | 1600 |
| 1595 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a | 1601 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a |
| 1596 // <P> node, and makes all next siblings of that pair children of <P>, up | 1602 // <P> node, and makes all next siblings of that pair children of <P>, up |
| 1597 // until the next pair of <BR> nodes is reached. | 1603 // until the next pair of <BR> nodes is reached. |
| 1598 replaceDoubleBrWithP: function(node) { | 1604 replaceDoubleBrWithP: function(node) { |
| 1599 // Check that we are starting with a BR. | 1605 // Check that we are starting with a BR. |
| 1600 var second = readability.isMultipleBr(node, true); | 1606 var second = readability.isMultipleBr(node, true); |
| 1601 if (!second) { | 1607 if (!second) { |
| 1602 return; | 1608 return; |
| 1603 } | 1609 } |
| 1604 // Make all next siblings of the second BR into children of a P. | 1610 // Make all next siblings of the second BR into children of a P. |
| 1605 var p = document.createElement('p'); | 1611 var p = document.createElement('p'); |
| 1606 var curr = second.nextSibling; | 1612 var curr = second.nextSibling; |
| 1607 while (curr) { | 1613 while (curr) { |
| 1608 if (readability.isMultipleBr(curr, true)) { | 1614 if (readability.isMultipleBr(curr, true)) { |
| 1609 break; | 1615 break; |
| 1610 } | 1616 } |
| 1611 var next = curr.nextSibling; | 1617 var next = curr.nextSibling; |
| 1612 p.appendChild(curr.parentNode.removeChild(curr)); | 1618 p.appendChild(curr.parentNode.removeChild(curr)); |
| 1613 curr = next; | 1619 curr = next; |
| 1614 } | 1620 } |
| 1615 var ret = curr; | 1621 var ret = curr; |
| 1616 | 1622 |
| 1617 // Remove all nodes between the first and second BR. | 1623 // Remove all nodes between the first and second BR. |
| 1618 curr = node.nextSibling; | 1624 curr = node.nextSibling; |
| 1619 while (curr && curr != second) { | 1625 while (curr && curr != second) { |
| 1620 var next = curr.nextSibling; | 1626 var next = curr.nextSibling; |
| 1621 curr.parentNode.removeChild(curr); | 1627 curr.parentNode.removeChild(curr); |
| 1622 curr = next; | 1628 curr = next; |
| 1623 } | 1629 } |
| 1624 // Remove the second BR. | 1630 // Remove the second BR. |
| 1625 second.parentNode.removeChild(second); | 1631 second.parentNode.removeChild(second); |
| 1626 // Replace the first BR with the P. | 1632 // Replace the first BR with the P. |
| 1627 node.parentNode.replaceChild(p, node); | 1633 node.parentNode.replaceChild(p, node); |
| 1628 | 1634 |
| 1629 return ret; | 1635 return ret; |
| 1630 }, | 1636 }, |
| 1631 | 1637 |
| 1632 // Returns true if the NodeList contains a double <BR>. | 1638 // Returns true if the NodeList contains a double <BR>. |
| 1633 hasDoubleBr: function(nodeList) { | 1639 hasDoubleBr: function(nodeList) { |
| 1634 for (var i = 0; i < nodeList.length; nodeList++) { | 1640 for (var i = 0; i < nodeList.length; nodeList++) { |
| 1635 if (readability.isMultipleBr(nodeList[i], true)) { | 1641 if (readability.isMultipleBr(nodeList[i], true)) { |
| 1636 return true; | 1642 return true; |
| 1637 } | 1643 } |
| 1638 } | 1644 } |
| 1639 return false; | 1645 return false; |
| 1640 }, | 1646 }, |
| 1641 | 1647 |
| 1642 // Replaces double <BR> tags with <P> tags. | 1648 // Replaces double <BR> tags with <P> tags. |
| 1643 replaceDoubleBrsWithPs: function(node) { | 1649 replaceDoubleBrsWithPs: function(node) { |
| 1644 var allElements = node.getElementsByTagName('BR'); | 1650 var allElements = node.getElementsByTagName('BR'); |
| 1645 var node = null; | 1651 var node = null; |
| 1646 while (allElements && allElements.length > 0 && | 1652 while (allElements && allElements.length > 0 && |
| 1647 readability.hasDoubleBr(allElements)) { | 1653 readability.hasDoubleBr(allElements)) { |
| 1648 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex +
= 1) { | 1654 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex +
= 1) { |
| 1649 var next = node; | 1655 var next = node; |
| 1650 while (next = readability.replaceDoubleBrWithP(next)); | 1656 while (next = readability.replaceDoubleBrWithP(next)); |
| 1651 } | 1657 } |
| 1652 allElements = document.body.getElementsByTagName('BR'); | 1658 allElements = document.body.getElementsByTagName('BR'); |
| 1653 } | 1659 } |
| 1654 }, | 1660 }, |
| 1655 | 1661 |
| 1656 | 1662 |
| 1657 // Replaces a BR and the whitespace that follows it with a P. | 1663 // Replaces a BR and the whitespace that follows it with a P. |
| 1658 replaceBrWithP: function(node) { | 1664 replaceBrWithP: function(node) { |
| 1659 if (!readability.isBrNode(node)) { | 1665 if (!readability.isBrNode(node)) { |
| 1660 return; | 1666 return; |
| 1661 } | 1667 } |
| 1662 var p = document.createElement('p'); | 1668 var p = document.createElement('p'); |
| 1663 var curr = node.nextSibling; | 1669 var curr = node.nextSibling; |
| 1664 while (curr && !isBrNode(curr)) { | 1670 while (curr && !isBrNode(curr)) { |
| 1665 var next = curr.nextSibling; | 1671 var next = curr.nextSibling; |
| 1666 if (readability.isWhitespaceNode(curr)) { | 1672 if (readability.isWhitespaceNode(curr)) { |
| 1667 curr.parentNode.removeChild(curr); | 1673 curr.parentNode.removeChild(curr); |
| 1668 } else { | 1674 } else { |
| 1669 p.appendChild(curr.parentNode.removeChild(curr)); | 1675 p.appendChild(curr.parentNode.removeChild(curr)); |
| 1670 } | 1676 } |
| 1671 curr = next; | 1677 curr = next; |
| 1672 } | 1678 } |
| 1673 node.parentNode.replaceChild(p, node); | 1679 node.parentNode.replaceChild(p, node); |
| 1674 return curr; | 1680 return curr; |
| 1675 }, | 1681 }, |
| 1676 | 1682 |
| 1677 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t
ag | 1683 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> t
ag |
| 1678 // children of the <P>. | 1684 // children of the <P>. |
| 1679 replaceBrsWithPs: function(node) { | 1685 replaceBrsWithPs: function(node) { |
| 1680 var allElements = node.getElementsByTagName('BR'); | 1686 var allElements = node.getElementsByTagName('BR'); |
| 1681 var node = null; | 1687 var node = null; |
| 1682 while (allElements && allElements.length > 0) { | 1688 while (allElements && allElements.length > 0) { |
| 1683 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex +
= 1) { | 1689 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex +
= 1) { |
| 1684 var next = node; | 1690 var next = node; |
| 1685 while (next = readability.replaceBrWithP(next)); | 1691 while (next = readability.replaceBrWithP(next)); |
| 1686 } | 1692 } |
| 1687 allElements = document.body.getElementsByTagName('BR'); | 1693 allElements = document.body.getElementsByTagName('BR'); |
| 1688 } | 1694 } |
| 1689 }, | 1695 }, |
| 1690 | 1696 |
| 1691 // Replaces any tag with any other tag. | 1697 // Replaces any tag with any other tag. |
| 1692 replaceTagsWithTags: function(node, srcTag, destTag) { | 1698 replaceTagsWithTags: function(node, srcTag, destTag) { |
| 1693 var allElements = node.getElementsByTagName(srcTag); | 1699 var allElements = node.getElementsByTagName(srcTag); |
| 1694 for (var i = 0; i < allElements.length; i++) { | 1700 for (var i = 0; i < allElements.length; i++) { |
| 1695 var dest = document.createElement(destTag); | 1701 var dest = document.createElement(destTag); |
| 1696 readability.moveNodeInnards(allElements[i], dest); | 1702 readability.moveNodeInnards(allElements[i], dest); |
| 1697 node.replaceNode(dest, allElements[i]); | 1703 allElements[i].parentNode.replaceChild(dest, allElements[i]); |
| 1698 } | 1704 } |
| 1699 }, | 1705 }, |
| 1700 | 1706 |
| 1701 // Replaces all <noscript> tags with <p> tags. | 1707 // Replaces all <noscript> tags with <p> tags. |
| 1702 replaceNoscriptsWithPs: function(node) { | 1708 replaceNoscriptsWithPs: function(node) { |
| 1703 readability.replaceTagsWithTags(node, 'noscript', 'p'); | 1709 readability.replaceTagsWithTags(node, 'noscript', 'p'); |
| 1704 }, | 1710 }, |
| 1705 | 1711 |
| 1706 // Replaces all <font> tags with <span> tags. | 1712 // Replaces all <font> tags with <span> tags. |
| 1707 replaceFontsWithSpans: function(node) { | 1713 replaceFontsWithSpans: function(node) { |
| 1708 readability.replaceTagsWithTags(node, 'font', 'span'); | 1714 readability.replaceTagsWithTags(node, 'font', 'span'); |
| 1709 }, | 1715 }, |
| 1710 | 1716 |
| 1711 // Returns a list of image URLs in the distilled article. | 1717 // Returns a list of image URLs in the distilled article. |
| 1712 getImages : function() { | 1718 getImages : function() { |
| 1713 var images = document.getElementsByTagName('img'); | 1719 var images = document.getElementsByTagName('img'); |
| 1714 var result = new Array(images.length); | 1720 var result = new Array(images.length); |
| 1715 dbg("Number of images: " + images.length); | 1721 dbg("Number of images: " + images.length); |
| 1716 for(i = 0; i < images.length; i++) { | 1722 for(i = 0; i < images.length; i++) { |
| 1717 result[i] = images[i].src; | 1723 result[i] = images[i].src; |
| 1718 dbg("Image: " + result[i]); | 1724 dbg("Image: " + result[i]); |
| 1719 } | 1725 } |
| 1720 return result; | 1726 return result; |
| 1721 }, | 1727 }, |
| 1722 | 1728 |
| 1723 // Returns the distilled article HTML from the page(s). | 1729 // Returns the distilled article HTML from the page(s). |
| 1724 getDistilledArticleHTML : function() { | 1730 getDistilledArticleHTML : function() { |
| 1725 return readability.distilledHTML; | 1731 return readability.distilledHTML; |
| 1732 }, |
| 1733 |
| 1734 // Returns the next page of this article. |
| 1735 getNextPageLink : function() { |
| 1736 return readability.nextPageLink; |
| 1726 } | 1737 } |
| 1727 }; | 1738 }; |
| 1728 | 1739 |
// Extracts long-form content from a page and returns an array laid out as:
//   [0]   the article title,
//   [1]   HTML containing the long-form content,
//   [2]   the link to the next page of the article,
//   [3..] the URLs returned by readability.getImages().
// NOTE(review): image URLs now start at index 3. Any consumer that maps
// <img> id fields to indices of this array (previously documented as
// id = k - 2 for the URL at index k) should be re-checked against this layout.
(function () {
    readability.init();
    var header = [
        readability.getArticleTitle(),
        readability.getDistilledArticleHTML(),
        readability.getNextPageLink()
    ];
    return header.concat(readability.getImages());
}())
| 1741 | 1753 |
| OLD | NEW |