Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(167)

Side by Side Diff: java/org/chromium/distiller/PagingLinksFinder.java

Issue 1725243002: Fix some warnings in Eclipse (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: rebase Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 /* 5 /*
6 * Parts of this file are adapted from Readability. 6 * Parts of this file are adapted from Readability.
7 * 7 *
8 * Readability is Copyright (c) 2010 Src90 Inc 8 * Readability is Copyright (c) 2010 Src90 Inc
9 * and licenced under the Apache License, Version 2.0. 9 * and licenced under the Apache License, Version 2.0.
10 */ 10 */
11 11
12 package org.chromium.distiller; 12 package org.chromium.distiller;
13 13
14 import org.chromium.distiller.proto.DomDistillerProtos; 14 import org.chromium.distiller.proto.DomDistillerProtos;
15 15
16 import com.google.gwt.dom.client.AnchorElement; 16 import com.google.gwt.dom.client.AnchorElement;
17 import com.google.gwt.dom.client.BaseElement; 17 import com.google.gwt.dom.client.BaseElement;
18 import com.google.gwt.dom.client.Document; 18 import com.google.gwt.dom.client.Document;
19 import com.google.gwt.dom.client.Element; 19 import com.google.gwt.dom.client.Element;
20 import com.google.gwt.dom.client.NodeList; 20 import com.google.gwt.dom.client.NodeList;
21 import com.google.gwt.regexp.shared.RegExp; 21 import com.google.gwt.regexp.shared.RegExp;
22 import com.google.gwt.user.client.Window;
23 22
24 import java.util.ArrayList;
25 import java.util.Arrays;
26 import java.util.Collections;
27 import java.util.HashMap; 23 import java.util.HashMap;
28 import java.util.HashSet; 24 import java.util.HashSet;
29 import java.util.List;
30 import java.util.Map; 25 import java.util.Map;
31 import java.util.Set; 26 import java.util.Set;
32 27
33 /** 28 /**
34 * This class finds the next and previous page links for the distilled document. The functionality 29 * This class finds the next and previous page links for the distilled document. The functionality
35 * for next page links is migrated from readability.getArticleTitle() in chromium codebase's 30 * for next page links is migrated from readability.getArticleTitle() in chromium codebase's
36 * third_party/readability/js/readability.js, and then expanded for previous page links; boilerpipe 31 * third_party/readability/js/readability.js, and then expanded for previous page links; boilerpipe
37 * doesn't have such capability. 32 * doesn't have such capability.
38 * First, it determines the prefix URL of the document. Then, for each anchor in the document, its 33 * First, it determines the prefix URL of the document. Then, for each anchor in the document, its
39 * href and text are compared to the prefix URL and examined for next- or previous-paging-related 34 * href and text are compared to the prefix URL and examined for next- or previous-paging-related
(...skipping 14 matching lines...) Expand all
54 + "|tool|widget", 49 + "|tool|widget",
55 "i"); 50 "i");
56 private static final RegExp REG_EXTRANEOUS = RegExp.compile( 51 private static final RegExp REG_EXTRANEOUS = RegExp.compile(
57 "print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sig n|single" 52 "print|archive|comment|discuss|e[\\-]?mail|share|reply|all|login|sig n|single"
58 + "|as one|article|post|篇", 53 + "|as one|article|post|篇",
59 "i"); 54 "i");
60 private static final RegExp REG_PAGINATION = RegExp.compile("pag(e|ing|inat) ", "i"); 55 private static final RegExp REG_PAGINATION = RegExp.compile("pag(e|ing|inat) ", "i");
61 private static final RegExp REG_LINK_PAGINATION = 56 private static final RegExp REG_LINK_PAGINATION =
62 RegExp.compile("p(a|g|ag)?(e|ing|ination)?(=|\\/)[0-9]{1,2}$", "i"); 57 RegExp.compile("p(a|g|ag)?(e|ing|ination)?(=|\\/)[0-9]{1,2}$", "i");
63 private static final RegExp REG_FIRST_LAST = RegExp.compile("(first|last)", "i"); 58 private static final RegExp REG_FIRST_LAST = RegExp.compile("(first|last)", "i");
64 // Examples that match PAGE_NUMBER_REGEX are: "_p3", "-pg3", "p3", "_1", "-12-2".
65 // Examples that don't match PAGE_NUMBER_REGEX are: "_p3 ", "p", "p123".
66 private static final RegExp REG_PAGE_NUMBER =
67 RegExp.compile("((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$", "gi");
68
69 private static final RegExp REG_HREF_CLEANER = RegExp.compile("/?(#.*)?$"); 59 private static final RegExp REG_HREF_CLEANER = RegExp.compile("/?(#.*)?$");
70 private static final RegExp REG_NUMBER = RegExp.compile("\\d"); 60 private static final RegExp REG_NUMBER = RegExp.compile("\\d");
71 61
72 public static DomDistillerProtos.PaginationInfo getPaginationInfo(String ori ginal_url) { 62 public static DomDistillerProtos.PaginationInfo getPaginationInfo(String ori ginal_url) {
73 DomDistillerProtos.PaginationInfo info = DomDistillerProtos.PaginationIn fo.create(); 63 DomDistillerProtos.PaginationInfo info = DomDistillerProtos.PaginationIn fo.create();
74 String next = findNext(Document.get().getDocumentElement(), original_url ); 64 String next = findNext(Document.get().getDocumentElement(), original_url );
75 if (next != null) { 65 if (next != null) {
76 info.setNextPage(next); 66 info.setNextPage(next);
77 } 67 }
78 return info; 68 return info;
(...skipping 295 matching lines...) Expand 10 before | Expand all | Expand 10 after
374 364
375 BaseElement base = doc.createBaseElement(); 365 BaseElement base = doc.createBaseElement();
376 base.setHref(base_url); 366 base.setHref(base_url);
377 doc.getHead().appendChild(base); 367 doc.getHead().appendChild(base);
378 368
379 AnchorElement a = doc.createAnchorElement(); 369 AnchorElement a = doc.createAnchorElement();
380 doc.getBody().appendChild(a); 370 doc.getBody().appendChild(a);
381 return a; 371 return a;
382 } 372 }
383 373
384 private static String fixMissingScheme(String url) {
385 if (url.isEmpty()) return "";
386 if (!url.contains("://")) return "http://" + url;
387 return url;
388 }
389
390 // The link is resolved using an anchor within a new HTML document with a base tag. 374 // The link is resolved using an anchor within a new HTML document with a base tag.
391 public static String resolveLinkHref(AnchorElement link, AnchorElement baseA nchor) { 375 public static String resolveLinkHref(AnchorElement link, AnchorElement baseA nchor) {
392 String linkHref = link.getAttribute("href"); 376 String linkHref = link.getAttribute("href");
393 return resolveLinkHref(linkHref, baseAnchor); 377 return resolveLinkHref(linkHref, baseAnchor);
394 } 378 }
395 379
396 public static String resolveLinkHref(String linkHref, AnchorElement baseAnch or) { 380 public static String resolveLinkHref(String linkHref, AnchorElement baseAnch or) {
397 baseAnchor.setAttribute("href", linkHref); 381 baseAnchor.setAttribute("href", linkHref);
398 return baseAnchor.getHref(); 382 return baseAnchor.getHref();
399 } 383 }
400 384
401 private static String getScheme(String url) { 385 private static String getScheme(String url) {
402 return StringUtil.split(url, ":\\/\\/")[0]; 386 return StringUtil.split(url, ":\\/\\/")[0];
403 } 387 }
404 388
405 // Port number is also included if it exists. 389 // Port number is also included if it exists.
406 private static String getHostname(String url) { 390 private static String getHostname(String url) {
407 url = StringUtil.split(url, ":\\/\\/")[1]; 391 url = StringUtil.split(url, ":\\/\\/")[1];
408 if (!url.contains("/")) return url; 392 if (!url.contains("/")) return url;
409 return StringUtil.split(url, "\\/")[0]; 393 return StringUtil.split(url, "\\/")[0];
410 } 394 }
411 395
412 private static String getPath(String url) {
413 url = StringUtil.split(url, ":\\/\\/")[1];
414 if (!url.contains("/")) return "";
415 return StringUtil.findAndReplace(url, "^([^/]*)/", "");
416 }
417
418 public static Integer pageDiff(String url, String linkHref, AnchorElement li nk, int skip) { 396 public static Integer pageDiff(String url, String linkHref, AnchorElement li nk, int skip) {
419 int commonLen = skip; 397 int commonLen = skip;
420 int i; 398 int i;
421 for (i=skip; i<Math.min(url.length(), linkHref.length()); i++) { 399 for (i=skip; i<Math.min(url.length(), linkHref.length()); i++) {
422 if (url.charAt(i) != linkHref.charAt(i)) { 400 if (url.charAt(i) != linkHref.charAt(i)) {
423 break; 401 break;
424 } 402 }
425 } 403 }
426 commonLen = i; 404 commonLen = i;
427 url = url.substring(commonLen, url.length()); 405 url = url.substring(commonLen, url.length());
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after
499 } 477 }
500 478
501 private enum PageLink { 479 private enum PageLink {
502 NEXT, 480 NEXT,
503 PREV, 481 PREV,
504 } 482 }
505 483
506 private static final Map<Element, String> mLinkDebugInfo = new HashMap<Eleme nt, String>(); 484 private static final Map<Element, String> mLinkDebugInfo = new HashMap<Eleme nt, String>();
507 485
508 } 486 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698