OLD | NEW |
1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 package org.chromium.distiller; | 5 package org.chromium.distiller; |
6 | 6 |
7 import java.util.ArrayList; | 7 import java.util.ArrayList; |
| 8 import java.util.HashSet; |
8 import java.util.List; | 9 import java.util.List; |
| 10 import java.util.Set; |
9 | 11 |
10 /** | 12 /** |
11 * This class stores information about the page parameter detected from potentia
l pagination URLs | 13 * This class stores information about the page parameter detected from potentia
l pagination URLs |
12 * with numeric anchor text: | 14 * with numeric anchor text: |
13 * - type of page parameter detected | 15 * - type of page parameter detected |
14 * - list of pagination URLs with their page numbers | 16 * - list of pagination URLs with their page numbers |
15 * - coefficient and delta values of the linear formula formed by the pagination
URLs: | 17 * - coefficient and delta values of the linear formula formed by the pagination
URLs: |
16 * pageParamValue = coefficient * pageNum + delta | 18 * pageParamValue = coefficient * pageNum + delta. |
17 * - detected single page URL. | |
18 */ | 19 */ |
19 class PageParamInfo implements Cloneable { | 20 class PageParamInfo implements Cloneable { |
| 21 private static final int MIN_LINKS_TO_JUSTIFY_LINEAR_MAP = 2; |
| 22 |
| 23 static final int PAGE_NUM_ADJACENT_MASK = 1 << 0; |
| 24 static final int PAGE_NUM_CONSECUTIVE_MASK = 1 << 1; |
20 | 25 |
21 /** | 26 /** |
22 * Stores potential pagination info: | 27 * Stores potential pagination info: |
23 * - page number represented as original plain text in document URL | 28 * - page number represented as original plain text in document URL |
24 * - if the info is extracted from an anchor, its HRef. | 29 * - if the info is extracted from an anchor, its HRef. |
25 */ | 30 */ |
26 static class PageInfo implements Comparable<PageInfo> { | 31 static class PageInfo implements Comparable<PageInfo> { |
27 int mPageNum; | 32 int mPageNum; |
28 String mUrl; | 33 String mUrl; |
29 | 34 |
(...skipping 13 matching lines...) Expand all Loading... |
43 | 48 |
44 @Override | 49 @Override |
45 public String toString() { // For debugging. | 50 public String toString() { // For debugging. |
46 return new String("pg" + mPageNum + ": " + mUrl); | 51 return new String("pg" + mPageNum + ": " + mUrl); |
47 } | 52 } |
48 } | 53 } |
49 | 54 |
50 /** | 55 /** |
51 * Stores the coefficient and delta values of the linear formula: | 56 * Stores the coefficient and delta values of the linear formula: |
52 * pageParamValue = coefficient * pageNum + delta. | 57 * pageParamValue = coefficient * pageNum + delta. |
53 * See PageParamterDetector.getPageParamLinearFormula() for details. | 58 * See getLinearFormula() for details. |
54 */ | 59 */ |
55 static class LinearFormula { | 60 static class LinearFormula { |
56 final int mCoefficient; | 61 final int mCoefficient; |
57 final int mDelta; | 62 final int mDelta; |
58 | 63 |
59 LinearFormula(int coefficient, int delta) { | 64 LinearFormula(int coefficient, int delta) { |
60 mCoefficient = coefficient; | 65 mCoefficient = coefficient; |
61 mDelta = delta; | 66 mDelta = delta; |
62 } | 67 } |
63 | 68 |
64 @Override | 69 @Override |
65 public String toString() { // For debugging. | 70 public String toString() { // For debugging. |
66 return new String("coefficient=" + mCoefficient + ", delta=" + mDelt
a); | 71 return new String("coefficient=" + mCoefficient + ", delta=" + mDelt
a); |
67 } | 72 } |
68 } | 73 } |
69 | 74 |
70 /** | 75 /** |
71 * Types of page parameter values in paging URLs. | 76 * Types of page parameter values in paging URLs. |
72 */ | 77 */ |
73 static enum Type { | 78 static enum Type { |
74 UNSET, // Initialized type to indicate empty PageParamInfo. | 79 UNSET, // Initialized type to indicate empty PageParamInfo. |
75 PAGE_NUMBER, // Value is a page number. | 80 PAGE_NUMBER, // Value is a page number. |
76 UNKNOWN, // None of the above. | 81 UNKNOWN, // None of the above. |
77 } | 82 } |
78 | 83 |
79 Type mType = Type.UNSET; | 84 Type mType = Type.UNSET; |
80 List<PageInfo> mAllPageInfo = new ArrayList<PageInfo>(); | 85 List<PageInfo> mAllPageInfo; |
81 LinearFormula mFormula = null; | 86 LinearFormula mFormula = null; |
82 String mSinglePageUrl = ""; | 87 |
| 88 PageParamInfo() { |
| 89 mAllPageInfo = new ArrayList<PageInfo>(); |
| 90 } |
| 91 |
| 92 PageParamInfo(Type type, List<PageInfo> allPageInfo, LinearFormula formula)
{ |
| 93 mType = type; |
| 94 mAllPageInfo = allPageInfo; |
| 95 mFormula = formula; |
| 96 } |
| 97 |
| 98 /** |
| 99 * Evaluates if the given list of PageLinkInfo's is a list of paging URLs: |
| 100 * - page numbers in list of PageLinkInfo's must be adjacent |
| 101 * - page numbers in list of ascending numbers must either |
| 102 * - be consecutive and form a page number sequence, or |
| 103 * - must construct a linear map with a linear formula: page_parameter = a
* page_number + b |
| 104 * - if there's only 1 PageLinkInfo, the first ascending number must be page
1, first page URL |
| 105 * must match page pattern, and the only outlink must be 2nd or 3rd page. |
| 106 * |
| 107 * Returns a populated PageParamInfo if evaluated true. Otherwise, returns
null. |
| 108 * |
| 109 * @param allLinkInfo the list of PageLinkInfo's to evaluate |
| 110 * @param pagePattern the URL pattern to use |
| 111 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's |
| 112 * @param firstPageUrl the URL of the PageInfo with mPageNum=1 |
| 113 */ |
| 114 static PageParamInfo evaluate(PageParameterDetector.PagePattern pagePattern, |
| 115 List<PageLinkInfo> allLinkInfo, List<PageInfo> ascendingNumbers, Str
ing firstPageUrl) { |
| 116 if (allLinkInfo.size() >= MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) { |
| 117 int result = arePageNumsAdjacentAndConsecutive(allLinkInfo, ascendin
gNumbers); |
| 118 if ((result & PAGE_NUM_ADJACENT_MASK) == 0) return null; |
| 119 |
| 120 LinearFormula linearFormula = getLinearFormula(allLinkInfo); |
| 121 |
| 122 // Type.PAGE_NUMBER: ascending numbers must be consecutive and form
a page number |
| 123 // sequence. |
| 124 if ((result & PAGE_NUM_CONSECUTIVE_MASK) == 0) return null; |
| 125 if (!isPageNumberSequence(ascendingNumbers)) return null; |
| 126 |
| 127 List<PageInfo> allPageInfo = new ArrayList<PageInfo>(); |
| 128 for (PageLinkInfo link : allLinkInfo) { |
| 129 allPageInfo.add(new PageInfo(link.mPageNum, |
| 130 ascendingNumbers.get(link.mPosInAscendingList).mUrl)); |
| 131 } |
| 132 return new PageParamInfo(Type.PAGE_NUMBER, allPageInfo, linearFormul
a); |
| 133 } |
| 134 |
| 135 // Most of news article have no more than 3 pages and the first page pro
bably doesn't have |
| 136 // any page parameter. If the first page url matches the the page patte
rn, we treat it as |
| 137 // the first page of this pattern. |
| 138 if (allLinkInfo.size() == 1 && !firstPageUrl.isEmpty()) { |
| 139 final PageLinkInfo onlyLink = allLinkInfo.get(0); |
| 140 boolean secondPageIsOutlink = onlyLink.mPageNum == 2 && |
| 141 onlyLink.mPosInAscendingList == 1; |
| 142 boolean thirdPageIsOutlink = onlyLink.mPageNum == 3 && |
| 143 onlyLink.mPosInAscendingList == 2 && |
| 144 // onlyLink's pos is 2 (evaluated right before), so ascendin
gNumbers has >= 3 |
| 145 // elements; check if previous element is previous page. |
| 146 ascendingNumbers.get(1).mPageNum == 2; |
| 147 // 1 PageLinkInfo means ascendingNumbers has >= 1 element. |
| 148 if (ascendingNumbers.get(0).mPageNum == 1 && |
| 149 (secondPageIsOutlink || thirdPageIsOutlink) && |
| 150 pagePattern.isPagingUrl(firstPageUrl)) { |
| 151 // Has valid PageParamInfo, create and populate it. |
| 152 int coefficient; |
| 153 int delta = onlyLink.mPageParamValue - onlyLink.mPageNum; |
| 154 if (delta == 0 || delta == 1) { |
| 155 coefficient = 1; |
| 156 } else { |
| 157 coefficient = onlyLink.mPageParamValue; |
| 158 delta = 0; |
| 159 } |
| 160 List<PageInfo> allPageInfo = new ArrayList<PageInfo>(); |
| 161 allPageInfo.add(new PageInfo(1, firstPageUrl)); |
| 162 allPageInfo.add(new PageInfo(onlyLink.mPageNum, |
| 163 ascendingNumbers.get(onlyLink.mPosInAscendingList).mUrl)
); |
| 164 return new PageParamInfo(Type.PAGE_NUMBER, allPageInfo, |
| 165 new LinearFormula(coefficient, delta)); |
| 166 } |
| 167 } |
| 168 |
| 169 return null; |
| 170 } |
83 | 171 |
84 /** | 172 /** |
85 * Returns a new PageParamInfo that is a clone of this object. | 173 * Returns a new PageParamInfo that is a clone of this object. |
86 */ | 174 */ |
87 public Object clone() { | 175 public Object clone() { |
88 PageParamInfo info = new PageParamInfo(); | 176 return new PageParamInfo(this.mType, this.mAllPageInfo, this.mFormula); |
89 info.mType = this.mType; | |
90 info.mAllPageInfo = this.mAllPageInfo; | |
91 info.mFormula = this.mFormula; | |
92 info.mSinglePageUrl = this.mSinglePageUrl; | |
93 return info; | |
94 } | 177 } |
95 | 178 |
96 /** | 179 /** |
97 * Evaluates which PageParamInfo is better based on the properties of PagePa
ramInfo. | 180 * Evaluates which PageParamInfo is better based on the properties of PagePa
ramInfo. |
98 * We prefer the one if the list of PageParameterDetector.LinkInfo forms a l
inear formula, see | 181 * We prefer the one if the list of PageLinkInfo forms a linear formula, see
getLinearFormula(). |
99 * PageParamterDetector.getPageParamLinearFormula(). | |
100 * | 182 * |
101 * Returns 1 if this is better, -1 if other is better and 0 if they are equa
l. | 183 * Returns 1 if this is better, -1 if other is better and 0 if they are equa
l. |
102 * | 184 * |
103 */ | 185 */ |
104 int compareTo(PageParamInfo other) { | 186 int compareTo(PageParamInfo other) { |
105 // We prefer the one where the LinkInfo array forms a linear formula, se
e isLinearFormula. | 187 // We prefer the one where the LinkInfo array forms a linear formula, se
e isLinearFormula. |
106 if (mFormula != null && other.mFormula == null) return 1; | 188 if (mFormula != null && other.mFormula == null) return 1; |
107 if (mFormula == null && other.mFormula != null) return -1; | 189 if (mFormula == null && other.mFormula != null) return -1; |
108 | 190 |
109 if (mType == other.mType) return 0; | 191 if (mType == other.mType) return 0; |
110 | 192 |
111 // For different page param types, we prefer PAGE_NUMBER. | 193 // For different page param types, we prefer PAGE_NUMBER. |
112 if (mType == Type.PAGE_NUMBER) return 1; | 194 if (mType == Type.PAGE_NUMBER) return 1; |
113 if (other.mType == Type.PAGE_NUMBER) return -1; | 195 if (other.mType == Type.PAGE_NUMBER) return -1; |
114 // Can't decide as they have unknown page type. | 196 // Can't decide as they have unknown page type. |
115 return 0; | 197 return 0; |
116 } | 198 } |
117 | 199 |
118 @Override | 200 @Override |
119 public String toString() { // For debugging. | 201 public String toString() { // For debugging. |
120 String str = new String("Type: " + mType + "\nPageInfo: " + mAllPageInfo.
size()); | 202 String str = new String("Type: " + mType + "\nPageInfo: " + mAllPageInfo.
size()); |
121 for (PageInfo page : mAllPageInfo) { | 203 for (PageInfo page : mAllPageInfo) { |
122 str += "\n " + page.toString(); | 204 str += "\n " + page.toString(); |
123 } | 205 } |
124 str += "\nsinglePageUrl: " + mSinglePageUrl + | 206 str += "\nformula: " + (mFormula == null ? "null" : mFormula.toString()); |
125 "\nformula: " + (mFormula == null ? "null" : mFormula.toString())
; | |
126 return str; | 207 return str; |
127 } | 208 } |
128 | 209 |
| 210 /** |
| 211 * Detects if page numbers in list of PageLinkInfo's are adjacent, and if pa
ge numbers in list |
| 212 * of PageInfo's are consecutive. |
| 213 * |
| 214 * For adjacency, the page numbers in list of PageLinkInfo's must either be
adjacent, or |
| 215 * separated by at most 1 plain text number which must represent the current
page number in one |
| 216 * of the PageInfo's. |
| 217 * For consecutiveness, there must be at least one pair of consecutive numbe
r values in the list |
| 218 * of PageLinkInfo's, or between a PageLinkInfo and a plain text number. |
| 219 * |
| 220 * Returns a int value that is a combination of bits: |
| 221 * - bit for PAGE_PARAM_ADJACENT_MASK is set if allLinkInfo are adjacent |
| 222 * - bit for PAGE_PARAM_CONSECUTIVE_MASK is set if ascendingNumbers are cons
ecutive. |
| 223 * |
| 224 * @param allLinkInfo the list of PageLinkInfo's to evaluate |
| 225 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's |
| 226 */ |
| 227 static int arePageNumsAdjacentAndConsecutive(List<PageLinkInfo> allLinkInfo, |
| 228 List<PageInfo> ascendingNumbers) { |
| 229 int result = 0; |
| 230 |
| 231 // Check if elements in allLinkInfo are adjacent or there's only 1 gap i
.e. the gap is |
| 232 // current page number respresented in plain text. |
| 233 int firstPos = -1; |
| 234 int lastPos = -1; |
| 235 int gapPos = -1; |
| 236 Set<Integer> pageParamSet = new HashSet<Integer>(); // To check that pa
ge number is unique. |
| 237 for (PageLinkInfo linkInfo : allLinkInfo) { |
| 238 final int currPos = linkInfo.mPosInAscendingList; |
| 239 if (lastPos == -1) { |
| 240 firstPos = currPos; |
| 241 } else if (currPos != lastPos + 1) { |
| 242 // If position is not strictly ascending, or the gap size is > 1
(e.g. "[3] [4] 5 6 |
| 243 // [7]"), or there's more than 1 gap (e.g. "[3] 4 [5] 6 [7]"), a
llLinkInfo is not |
| 244 // adjacent. |
| 245 if (currPos <= lastPos || currPos != lastPos + 2 || gapPos != -1
) return result; |
| 246 gapPos = currPos - 1; |
| 247 } |
| 248 // Make sure page param value, i.e. page number represented in plain
text, is unique. |
| 249 if (!pageParamSet.add(linkInfo.mPageParamValue)) return result; |
| 250 lastPos = currPos; |
| 251 } // for all LinkInfo's |
| 252 |
| 253 result |= PAGE_NUM_ADJACENT_MASK; |
| 254 |
| 255 // Now, determine if page numbers in ascendingNumbers are consecutive. |
| 256 |
| 257 // First, handle the gap. |
| 258 if (gapPos != -1) { |
| 259 if (gapPos <= 0 || gapPos >= ascendingNumbers.size() - 1) return resu
lt; |
| 260 // The "gap" should represent current page number in plain text. |
| 261 // Check if its adjacent page numbers are consecutive. |
| 262 // e.g. "[1] [5] 6 [7] [12]" is accepted; "[4] 8 [16]" is rejected. |
| 263 // This can eliminate links affecting the number of items on a page. |
| 264 final int currPageNum = ascendingNumbers.get(gapPos).mPageNum; |
| 265 if (ascendingNumbers.get(gapPos - 1).mPageNum == currPageNum - 1 && |
| 266 ascendingNumbers.get(gapPos + 1).mPageNum == currPageNum + 1)
{ |
| 267 return result | PAGE_NUM_CONSECUTIVE_MASK; |
| 268 } |
| 269 return result; |
| 270 } |
| 271 |
| 272 // There is no gap. Check if at least one of the following cases is sat
isfied: |
| 273 // Case #1: "[1] [2] ..." or "1 [2] ... ". |
| 274 if ((firstPos == 0 || firstPos == 1) && ascendingNumbers.get(0).mPageNum
== 1 && |
| 275 ascendingNumbers.get(1).mPageNum == 2) { |
| 276 return result | PAGE_NUM_CONSECUTIVE_MASK; |
| 277 } |
| 278 // Case #2: "[1] 2 [3] ..." where [1] doesn't belong to current pattern. |
| 279 if (firstPos == 2 && ascendingNumbers.get(2).mPageNum == 3 && |
| 280 ascendingNumbers.get(1).mUrl.isEmpty() && !ascendingNumbers.get(
0).mUrl.isEmpty()) { |
| 281 return result | PAGE_NUM_CONSECUTIVE_MASK; |
| 282 } |
| 283 // Case #3: "... [n-1] [n]" or "... [n - 1] n". |
| 284 final int numbersSize = ascendingNumbers.size(); |
| 285 if ((lastPos == numbersSize - 1 || lastPos == numbersSize - 2) && |
| 286 ascendingNumbers.get(numbersSize - 2).mPageNum + 1 == |
| 287 ascendingNumbers.get(numbersSize - 1).mPageNum) { |
| 288 return result | PAGE_NUM_CONSECUTIVE_MASK; |
| 289 } |
| 290 // Case #4: "... [i-1] [i] [i+1] ...". |
| 291 for (int i = firstPos + 1; i < lastPos; i++) { |
| 292 if (ascendingNumbers.get(i - 1).mPageNum + 2 == ascendingNumbers.get
(i + 1).mPageNum) { |
| 293 return result | PAGE_NUM_CONSECUTIVE_MASK; |
| 294 } |
| 295 } |
| 296 |
| 297 // Otherwise, there's no pair of consecutive values. |
| 298 return result; |
| 299 } |
| 300 |
| 301 /** |
| 302 * Determines if the list of PageLinkInfo's form a linear formula: |
| 303 * pageParamValue = coefficient * pageNum + delta (delta == -coefficient o
r delta == 0). |
| 304 * |
| 305 * The coefficient and delta are calculated from the page parameter values a
nd page numbers of 2 |
| 306 * PageLinkInfo's, and then validated against the remaining PageLinkInfo's. |
| 307 * The order of page numbers doesn't matter. |
| 308 * |
| 309 * Returns LinearFormula, containing the coefficient and delta, if the page |
| 310 * parameter forumla could be determined. Otherwise, returns null. |
| 311 * |
| 312 * @param allLinkInfo the list of PageLinkInfo's to evaluate |
| 313 */ |
| 314 // TODO(kuan): As this gets rolled out, reassesss the necessity of non-1 coe
fficient support. |
| 315 private static LinearFormula getLinearFormula(List<PageLinkInfo> allLinkInfo
) { |
| 316 if (allLinkInfo.size() < MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) return null; |
| 317 |
| 318 final PageLinkInfo firstLink = allLinkInfo.get(0); |
| 319 final PageLinkInfo secondLink = allLinkInfo.get(1); |
| 320 |
| 321 if (allLinkInfo.size() == 2 && Math.max(firstLink.mPageNum, secondLink.m
PageNum) > 4) { |
| 322 return null; |
| 323 } |
| 324 |
| 325 int deltaX = secondLink.mPageNum - firstLink.mPageNum; |
| 326 if (deltaX == 0) return null; |
| 327 |
| 328 int deltaY = secondLink.mPageParamValue - firstLink.mPageParamValue; |
| 329 int coefficient = deltaY / deltaX; |
| 330 if (coefficient == 0) return null; |
| 331 |
| 332 int delta = firstLink.mPageParamValue - coefficient * firstLink.mPageNum
; |
| 333 if (delta != 0 && delta != -coefficient) return null; |
| 334 |
| 335 // Check if the remaining elements are on the same linear map. |
| 336 for (int i = 2; i < allLinkInfo.size(); i++) { |
| 337 final PageLinkInfo link = allLinkInfo.get(i); |
| 338 if (link.mPageParamValue != coefficient * link.mPageNum + delta) ret
urn null; |
| 339 } |
| 340 |
| 341 return new LinearFormula(coefficient, delta); |
| 342 } |
| 343 |
| 344 /** |
| 345 * Returns true if page numbers in list of PageInfo's form a sequence, based
on a pipeline of |
| 346 * rules: |
| 347 * - first PageInfo must have a URL unless it is the first page |
| 348 * - there's only one plain number without URL in list |
| 349 * - if only two pages, they must be siblings - 2nd page number must follow
1st |
| 350 * - page numbers must be adjacent and consecutive; otherwise, 2 non-consecu
tive numbers must be |
| 351 * head/tail or have URLs. |
| 352 * |
| 353 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's |
| 354 */ |
| 355 private static boolean isPageNumberSequence(List<PageInfo> ascendingNumbers)
{ |
| 356 if (ascendingNumbers.size() <= 1) return false; |
| 357 |
| 358 // The first one must have a URL unless it is the first page. |
| 359 final PageInfo firstPage = ascendingNumbers.get(0); |
| 360 if (firstPage.mPageNum != 1 && firstPage.mUrl.isEmpty()) return false; |
| 361 |
| 362 // There's only one plain number without URL in ascending numbers group. |
| 363 boolean hasPlainNum = false; |
| 364 for (PageInfo page : ascendingNumbers) { |
| 365 if (page.mUrl.isEmpty()) { |
| 366 if (hasPlainNum) return false; |
| 367 hasPlainNum = true; |
| 368 } |
| 369 } |
| 370 |
| 371 // If there are only two pages, they must be siblings. |
| 372 if (ascendingNumbers.size() == 2) { |
| 373 return firstPage.mPageNum + 1 == ascendingNumbers.get(1).mPageNum; |
| 374 } |
| 375 |
| 376 // Check if page numbers in ascendingNumbers are adjacent and consecutiv
e. |
| 377 for (int i = 1; i < ascendingNumbers.size(); i++) { |
| 378 // If two adjacent numbers are not consecutive, we accept them only
when: |
| 379 // 1) one of them is head/tail, like [1],[n-i][n-i+1]..[n] or [1],[2
], [3]...[i], [n]. |
| 380 // 2) both of them have URLs. |
| 381 final PageInfo currPage = ascendingNumbers.get(i); |
| 382 final PageInfo prevPage = ascendingNumbers.get(i - 1); |
| 383 if (currPage.mPageNum - prevPage.mPageNum != 1) { |
| 384 if (i != 1 && i != ascendingNumbers.size() - 1) return false; |
| 385 if (currPage.mUrl.isEmpty() || prevPage.mUrl.isEmpty()) return f
alse; |
| 386 } |
| 387 } |
| 388 |
| 389 return true; |
| 390 } |
| 391 |
129 } | 392 } |
OLD | NEW |