OLD | NEW |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 /* | 5 /* |
6 * Parts of this file are adapted from Readability. | 6 * Parts of this file are adapted from Readability. |
7 * | 7 * |
8 * Readability is Copyright (c) 2010 Src90 Inc | 8 * Readability is Copyright (c) 2010 Src90 Inc |
9 * and licenced under the Apache License, Version 2.0. | 9 * and licenced under the Apache License, Version 2.0. |
10 */ | 10 */ |
11 | 11 |
12 package org.chromium.distiller; | 12 package org.chromium.distiller; |
13 | 13 |
14 import org.chromium.distiller.proto.DomDistillerProtos; | 14 import org.chromium.distiller.proto.DomDistillerProtos; |
15 | 15 |
16 import com.google.gwt.dom.client.AnchorElement; | 16 import com.google.gwt.dom.client.AnchorElement; |
17 import com.google.gwt.dom.client.BaseElement; | 17 import com.google.gwt.dom.client.BaseElement; |
18 import com.google.gwt.dom.client.Document; | 18 import com.google.gwt.dom.client.Document; |
19 import com.google.gwt.dom.client.Element; | 19 import com.google.gwt.dom.client.Element; |
20 import com.google.gwt.dom.client.NodeList; | 20 import com.google.gwt.dom.client.NodeList; |
21 import com.google.gwt.regexp.shared.RegExp; | 21 import com.google.gwt.regexp.shared.RegExp; |
22 import com.google.gwt.user.client.Window; | |
23 | 22 |
24 import java.util.ArrayList; | |
25 import java.util.Arrays; | |
26 import java.util.Collections; | |
27 import java.util.HashMap; | 23 import java.util.HashMap; |
28 import java.util.HashSet; | 24 import java.util.HashSet; |
29 import java.util.List; | |
30 import java.util.Map; | 25 import java.util.Map; |
31 import java.util.Set; | 26 import java.util.Set; |
32 | 27 |
33 /** | 28 /** |
34 * This class finds the next and previous page links for the distilled document.
The functionality | 29 * This class finds the next and previous page links for the distilled document.
The functionality |
35 * for next page links is migrated from readability.getArticleTitle() in chromiu
m codebase's | 30 * for next page links is migrated from readability.getArticleTitle() in chromiu
m codebase's |
36 * third_party/readability/js/readability.js, and then expanded for previous pag
e links; boilerpipe | 31 * third_party/readability/js/readability.js, and then expanded for previous pag
e links; boilerpipe |
37 * doesn't have such capability. | 32 * doesn't have such capability. |
38 * First, it determines the prefix URL of the document. Then, for each anchor i
n the document, its | 33 * First, it determines the prefix URL of the document. Then, for each anchor i
n the document, its |
39 * href and text are compared to the prefix URL and examined for next- or previo
us-paging-related | 34 * href and text are compared to the prefix URL and examined for next- or previo
us-paging-related |
(...skipping 14 matching lines...) Expand all Loading... |
54 + "|tool|widget", | 49 + "|tool|widget", |
55 "i"); | 50 "i"); |
56 private static final RegExp REG_EXTRANEOUS = RegExp.compile( | 51 private static final RegExp REG_EXTRANEOUS = RegExp.compile( |
57 "print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sig
n|single" | 52 "print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sig
n|single" |
58 + "|as one|article|post|篇", | 53 + "|as one|article|post|篇", |
59 "i"); | 54 "i"); |
60 private static final RegExp REG_PAGINATION = RegExp.compile("pag(e|ing|inat)
", "i"); | 55 private static final RegExp REG_PAGINATION = RegExp.compile("pag(e|ing|inat)
", "i"); |
61 private static final RegExp REG_LINK_PAGINATION = | 56 private static final RegExp REG_LINK_PAGINATION = |
62 RegExp.compile("p(a|g|ag)?(e|ing|ination)?(=|\\/)[0-9]{1,2}$", "i"); | 57 RegExp.compile("p(a|g|ag)?(e|ing|ination)?(=|\\/)[0-9]{1,2}$", "i"); |
63 private static final RegExp REG_FIRST_LAST = RegExp.compile("(first|last)",
"i"); | 58 private static final RegExp REG_FIRST_LAST = RegExp.compile("(first|last)",
"i"); |
64 // Examples that match PAGE_NUMBER_REGEX are: "_p3", "-pg3", "p3", "_1", "-1
2-2". | |
65 // Examples that don't match PAGE_NUMBER_REGEX are: "_p3 ", "p", "p123". | |
66 private static final RegExp REG_PAGE_NUMBER = | |
67 RegExp.compile("((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$", "gi"); | |
68 | |
69 private static final RegExp REG_HREF_CLEANER = RegExp.compile("/?(#.*)?$"); | 59 private static final RegExp REG_HREF_CLEANER = RegExp.compile("/?(#.*)?$"); |
70 private static final RegExp REG_NUMBER = RegExp.compile("\\d"); | 60 private static final RegExp REG_NUMBER = RegExp.compile("\\d"); |
71 | 61 |
72 public static DomDistillerProtos.PaginationInfo getPaginationInfo(String ori
ginal_url) { | 62 public static DomDistillerProtos.PaginationInfo getPaginationInfo(String ori
ginal_url) { |
73 DomDistillerProtos.PaginationInfo info = DomDistillerProtos.PaginationIn
fo.create(); | 63 DomDistillerProtos.PaginationInfo info = DomDistillerProtos.PaginationIn
fo.create(); |
74 String next = findNext(Document.get().getDocumentElement(), original_url
); | 64 String next = findNext(Document.get().getDocumentElement(), original_url
); |
75 if (next != null) { | 65 if (next != null) { |
76 info.setNextPage(next); | 66 info.setNextPage(next); |
77 } | 67 } |
78 return info; | 68 return info; |
(...skipping 295 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
374 | 364 |
375 BaseElement base = doc.createBaseElement(); | 365 BaseElement base = doc.createBaseElement(); |
376 base.setHref(base_url); | 366 base.setHref(base_url); |
377 doc.getHead().appendChild(base); | 367 doc.getHead().appendChild(base); |
378 | 368 |
379 AnchorElement a = doc.createAnchorElement(); | 369 AnchorElement a = doc.createAnchorElement(); |
380 doc.getBody().appendChild(a); | 370 doc.getBody().appendChild(a); |
381 return a; | 371 return a; |
382 } | 372 } |
383 | 373 |
384 private static String fixMissingScheme(String url) { | |
385 if (url.isEmpty()) return ""; | |
386 if (!url.contains("://")) return "http://" + url; | |
387 return url; | |
388 } | |
389 | |
390 // The link is resolved using an anchor within a new HTML document with a ba
se tag. | 374 // The link is resolved using an anchor within a new HTML document with a ba
se tag. |
391 public static String resolveLinkHref(AnchorElement link, AnchorElement baseA
nchor) { | 375 public static String resolveLinkHref(AnchorElement link, AnchorElement baseA
nchor) { |
392 String linkHref = link.getAttribute("href"); | 376 String linkHref = link.getAttribute("href"); |
393 return resolveLinkHref(linkHref, baseAnchor); | 377 return resolveLinkHref(linkHref, baseAnchor); |
394 } | 378 } |
395 | 379 |
396 public static String resolveLinkHref(String linkHref, AnchorElement baseAnch
or) { | 380 public static String resolveLinkHref(String linkHref, AnchorElement baseAnch
or) { |
397 baseAnchor.setAttribute("href", linkHref); | 381 baseAnchor.setAttribute("href", linkHref); |
398 return baseAnchor.getHref(); | 382 return baseAnchor.getHref(); |
399 } | 383 } |
400 | 384 |
401 private static String getScheme(String url) { | 385 private static String getScheme(String url) { |
402 return StringUtil.split(url, ":\\/\\/")[0]; | 386 return StringUtil.split(url, ":\\/\\/")[0]; |
403 } | 387 } |
404 | 388 |
405 // Port number is also included if it exists. | 389 // Port number is also included if it exists. |
406 private static String getHostname(String url) { | 390 private static String getHostname(String url) { |
407 url = StringUtil.split(url, ":\\/\\/")[1]; | 391 url = StringUtil.split(url, ":\\/\\/")[1]; |
408 if (!url.contains("/")) return url; | 392 if (!url.contains("/")) return url; |
409 return StringUtil.split(url, "\\/")[0]; | 393 return StringUtil.split(url, "\\/")[0]; |
410 } | 394 } |
411 | 395 |
412 private static String getPath(String url) { | |
413 url = StringUtil.split(url, ":\\/\\/")[1]; | |
414 if (!url.contains("/")) return ""; | |
415 return StringUtil.findAndReplace(url, "^([^/]*)/", ""); | |
416 } | |
417 | |
418 public static Integer pageDiff(String url, String linkHref, AnchorElement li
nk, int skip) { | 396 public static Integer pageDiff(String url, String linkHref, AnchorElement li
nk, int skip) { |
419 int commonLen = skip; | 397 int commonLen = skip; |
420 int i; | 398 int i; |
421 for (i=skip; i<Math.min(url.length(), linkHref.length()); i++) { | 399 for (i=skip; i<Math.min(url.length(), linkHref.length()); i++) { |
422 if (url.charAt(i) != linkHref.charAt(i)) { | 400 if (url.charAt(i) != linkHref.charAt(i)) { |
423 break; | 401 break; |
424 } | 402 } |
425 } | 403 } |
426 commonLen = i; | 404 commonLen = i; |
427 url = url.substring(commonLen, url.length()); | 405 url = url.substring(commonLen, url.length()); |
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
499 } | 477 } |
500 | 478 |
501 private enum PageLink { | 479 private enum PageLink { |
502 NEXT, | 480 NEXT, |
503 PREV, | 481 PREV, |
504 } | 482 } |
505 | 483 |
506 private static final Map<Element, String> mLinkDebugInfo = new HashMap<Eleme
nt, String>(); | 484 private static final Map<Element, String> mLinkDebugInfo = new HashMap<Eleme
nt, String>(); |
507 | 485 |
508 } | 486 } |
OLD | NEW |