| OLD | NEW |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "pdf/pdfium/pdfium_page.h" | 5 #include "pdf/pdfium/pdfium_page.h" |
| 6 | 6 |
| 7 #include <math.h> | 7 #include <math.h> |
| 8 #include <stddef.h> | 8 #include <stddef.h> |
| 9 | 9 |
| 10 #include <algorithm> | 10 #include <algorithm> |
| 11 #include <memory> | 11 #include <memory> |
| 12 #include <utility> | 12 #include <utility> |
| 13 | 13 |
| 14 #include "base/logging.h" | 14 #include "base/logging.h" |
| 15 #include "base/strings/string_number_conversions.h" | 15 #include "base/strings/string_number_conversions.h" |
| 16 #include "base/strings/string_util.h" | 16 #include "base/strings/string_util.h" |
| 17 #include "base/strings/utf_string_conversions.h" | 17 #include "base/strings/utf_string_conversions.h" |
| 18 #include "base/values.h" | |
| 19 #include "pdf/pdfium/pdfium_api_string_buffer_adapter.h" | 18 #include "pdf/pdfium/pdfium_api_string_buffer_adapter.h" |
| 20 #include "pdf/pdfium/pdfium_engine.h" | 19 #include "pdf/pdfium/pdfium_engine.h" |
| 21 #include "printing/units.h" | 20 #include "printing/units.h" |
| 22 | 21 |
| 23 // Used when doing hit detection. | 22 // Used when doing hit detection. |
| 24 #define kTolerance 20.0 | 23 #define kTolerance 20.0 |
| 25 | 24 |
| 26 using printing::ConvertUnitDouble; | 25 using printing::ConvertUnitDouble; |
| 27 using printing::kPointsPerInch; | 26 using printing::kPointsPerInch; |
| 28 using printing::kPixelsPerInch; | 27 using printing::kPixelsPerInch; |
| 29 | 28 |
| 30 namespace { | 29 namespace { |
| 31 | 30 |
| 32 // Dictionary Value key names for returning the accessible page content as JSON. | |
| 33 const char kPageWidth[] = "width"; | |
| 34 const char kPageHeight[] = "height"; | |
| 35 const char kPageTextBox[] = "textBox"; | |
| 36 const char kTextBoxLeft[] = "left"; | |
| 37 const char kTextBoxTop[] = "top"; | |
| 38 const char kTextBoxWidth[] = "width"; | |
| 39 const char kTextBoxHeight[] = "height"; | |
| 40 const char kTextBoxFontSize[] = "fontSize"; | |
| 41 const char kTextBoxNodes[] = "textNodes"; | |
| 42 const char kTextNodeType[] = "type"; | |
| 43 const char kTextNodeText[] = "text"; | |
| 44 const char kTextNodeTypeText[] = "text"; | |
| 45 | |
| 46 pp::Rect PageRectToGViewRect(FPDF_PAGE page, const pp::Rect& input) { | |
| 47 int output_width = FPDF_GetPageWidth(page); | |
| 48 int output_height = FPDF_GetPageHeight(page); | |
| 49 | |
| 50 int min_x; | |
| 51 int min_y; | |
| 52 int max_x; | |
| 53 int max_y; | |
| 54 FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, | |
| 55 input.x(), input.y(), &min_x, &min_y); | |
| 56 FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, | |
| 57 input.right(), input.bottom(), &max_x, &max_y); | |
| 58 | |
| 59 if (max_x < min_x) | |
| 60 std::swap(min_x, max_x); | |
| 61 if (max_y < min_y) | |
| 62 std::swap(min_y, max_y); | |
| 63 | |
| 64 pp::Rect output_rect(min_x, min_y, max_x - min_x, max_y - min_y); | |
| 65 output_rect.Intersect(pp::Rect(0, 0, output_width, output_height)); | |
| 66 return output_rect; | |
| 67 } | |
| 68 | |
| 69 pp::FloatRect FloatPageRectToPixelRect(FPDF_PAGE page, | 31 pp::FloatRect FloatPageRectToPixelRect(FPDF_PAGE page, |
| 70 const pp::FloatRect& input) { | 32 const pp::FloatRect& input) { |
| 71 int output_width = FPDF_GetPageWidth(page); | 33 int output_width = FPDF_GetPageWidth(page); |
| 72 int output_height = FPDF_GetPageHeight(page); | 34 int output_height = FPDF_GetPageHeight(page); |
| 73 | 35 |
| 74 int min_x; | 36 int min_x; |
| 75 int min_y; | 37 int min_y; |
| 76 int max_x; | 38 int max_x; |
| 77 int max_y; | 39 int max_y; |
| 78 FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, input.x(), | 40 FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, input.x(), |
| 79 input.y(), &min_x, &min_y); | 41 input.y(), &min_x, &min_y); |
| 80 FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, input.right(), | 42 FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, input.right(), |
| 81 input.bottom(), &max_x, &max_y); | 43 input.bottom(), &max_x, &max_y); |
| 82 | 44 |
| 83 if (max_x < min_x) | 45 if (max_x < min_x) |
| 84 std::swap(min_x, max_x); | 46 std::swap(min_x, max_x); |
| 85 if (max_y < min_y) | 47 if (max_y < min_y) |
| 86 std::swap(min_y, max_y); | 48 std::swap(min_y, max_y); |
| 87 | 49 |
| 88 pp::FloatRect output_rect( | 50 pp::FloatRect output_rect( |
| 89 ConvertUnitDouble(min_x, kPointsPerInch, kPixelsPerInch), | 51 ConvertUnitDouble(min_x, kPointsPerInch, kPixelsPerInch), |
| 90 ConvertUnitDouble(min_y, kPointsPerInch, kPixelsPerInch), | 52 ConvertUnitDouble(min_y, kPointsPerInch, kPixelsPerInch), |
| 91 ConvertUnitDouble(max_x - min_x, kPointsPerInch, kPixelsPerInch), | 53 ConvertUnitDouble(max_x - min_x, kPointsPerInch, kPixelsPerInch), |
| 92 ConvertUnitDouble(max_y - min_y, kPointsPerInch, kPixelsPerInch)); | 54 ConvertUnitDouble(max_y - min_y, kPointsPerInch, kPixelsPerInch)); |
| 93 return output_rect; | 55 return output_rect; |
| 94 } | 56 } |
| 95 | 57 |
| 96 pp::Rect GetCharRectInGViewCoords(FPDF_PAGE page, FPDF_TEXTPAGE text_page, | |
| 97 int index) { | |
| 98 double left, right, bottom, top; | |
| 99 FPDFText_GetCharBox(text_page, index, &left, &right, &bottom, &top); | |
| 100 if (right < left) | |
| 101 std::swap(left, right); | |
| 102 if (bottom < top) | |
| 103 std::swap(top, bottom); | |
| 104 pp::Rect page_coords(left, top, right - left, bottom - top); | |
| 105 return PageRectToGViewRect(page, page_coords); | |
| 106 } | |
| 107 | |
| 108 pp::FloatRect GetFloatCharRectInPixels(FPDF_PAGE page, | 58 pp::FloatRect GetFloatCharRectInPixels(FPDF_PAGE page, |
| 109 FPDF_TEXTPAGE text_page, | 59 FPDF_TEXTPAGE text_page, |
| 110 int index) { | 60 int index) { |
| 111 double left, right, bottom, top; | 61 double left, right, bottom, top; |
| 112 FPDFText_GetCharBox(text_page, index, &left, &right, &bottom, &top); | 62 FPDFText_GetCharBox(text_page, index, &left, &right, &bottom, &top); |
| 113 if (right < left) | 63 if (right < left) |
| 114 std::swap(left, right); | 64 std::swap(left, right); |
| 115 if (bottom < top) | 65 if (bottom < top) |
| 116 std::swap(top, bottom); | 66 std::swap(top, bottom); |
| 117 pp::FloatRect page_coords(left, top, right - left, bottom - top); | 67 pp::FloatRect page_coords(left, top, right - left, bottom - top); |
| 118 return FloatPageRectToPixelRect(page, page_coords); | 68 return FloatPageRectToPixelRect(page, page_coords); |
| 119 } | 69 } |
| 120 | 70 |
| 121 // This is the character PDFium inserts where a word is broken across lines. | |
| 122 const unsigned int kSoftHyphen = 0x02; | |
| 123 | |
| 124 // The following characters should all be recognized as Unicode newlines: | |
| 125 // LF: Line Feed, U+000A | |
| 126 // VT: Vertical Tab, U+000B | |
| 127 // FF: Form Feed, U+000C | |
| 128 // CR: Carriage Return, U+000D | |
| 129 // CR+LF: CR (U+000D) followed by LF (U+000A) | |
| 130 // NEL: Next Line, U+0085 | |
| 131 // LS: Line Separator, U+2028 | |
| 132 // PS: Paragraph Separator, U+2029. | |
| 133 // Source: http://en.wikipedia.org/wiki/Newline#Unicode . | |
| 134 const unsigned int kUnicodeNewlines[] = { | |
| 135 0xA, 0xB, 0xC, 0xD, 0X85, 0x2028, 0x2029 | |
| 136 }; | |
| 137 | |
| 138 bool IsSoftHyphen(unsigned int character) { | |
| 139 return kSoftHyphen == character; | |
| 140 } | |
| 141 | |
| 142 bool OverlapsOnYAxis(const pp::Rect &a, const pp::Rect& b) { | |
| 143 return !(a.IsEmpty() || b.IsEmpty() || | |
| 144 a.bottom() < b.y() || b.bottom() < a.y()); | |
| 145 } | |
| 146 | |
| 147 bool OverlapsOnYAxis(const pp::FloatRect &a, const pp::FloatRect& b) { | 71 bool OverlapsOnYAxis(const pp::FloatRect &a, const pp::FloatRect& b) { |
| 148 return !(a.IsEmpty() || b.IsEmpty() || | 72 return !(a.IsEmpty() || b.IsEmpty() || |
| 149 a.bottom() < b.y() || b.bottom() < a.y()); | 73 a.bottom() < b.y() || b.bottom() < a.y()); |
| 150 } | 74 } |
| 151 | 75 |
| 152 bool IsEol(unsigned int character) { | |
| 153 const unsigned int* first = kUnicodeNewlines; | |
| 154 const unsigned int* last = kUnicodeNewlines + arraysize(kUnicodeNewlines); | |
| 155 return std::find(first, last, character) != last; | |
| 156 } | |
| 157 | |
| 158 } // namespace | 76 } // namespace |
| 159 | 77 |
| 160 namespace chrome_pdf { | 78 namespace chrome_pdf { |
| 161 | 79 |
| 162 PDFiumPage::PDFiumPage(PDFiumEngine* engine, | 80 PDFiumPage::PDFiumPage(PDFiumEngine* engine, |
| 163 int i, | 81 int i, |
| 164 const pp::Rect& r, | 82 const pp::Rect& r, |
| 165 bool available) | 83 bool available) |
| 166 : engine_(engine), | 84 : engine_(engine), |
| 167 page_(NULL), | 85 page_(NULL), |
| (...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 235 FPDF_TEXTPAGE PDFiumPage::GetTextPage() { | 153 FPDF_TEXTPAGE PDFiumPage::GetTextPage() { |
| 236 if (!available_) | 154 if (!available_) |
| 237 return NULL; | 155 return NULL; |
| 238 if (!text_page_) { | 156 if (!text_page_) { |
| 239 ScopedLoadCounter scoped_load(this); | 157 ScopedLoadCounter scoped_load(this); |
| 240 text_page_ = FPDFText_LoadPage(GetPage()); | 158 text_page_ = FPDFText_LoadPage(GetPage()); |
| 241 } | 159 } |
| 242 return text_page_; | 160 return text_page_; |
| 243 } | 161 } |
| 244 | 162 |
| 245 base::Value* PDFiumPage::GetAccessibleContentAsValue(int rotation) { | |
| 246 base::DictionaryValue* node = new base::DictionaryValue(); | |
| 247 | |
| 248 if (!available_) | |
| 249 return node; | |
| 250 | |
| 251 FPDF_PAGE page = GetPage(); | |
| 252 FPDF_TEXTPAGE text_page = GetTextPage(); | |
| 253 | |
| 254 double width = FPDF_GetPageWidth(page); | |
| 255 double height = FPDF_GetPageHeight(page); | |
| 256 | |
| 257 node->SetDouble(kPageWidth, width); | |
| 258 node->SetDouble(kPageHeight, height); | |
| 259 std::unique_ptr<base::ListValue> text(new base::ListValue()); | |
| 260 | |
| 261 int chars_count = FPDFText_CountChars(text_page); | |
| 262 pp::Rect line_rect; | |
| 263 pp::Rect word_rect; | |
| 264 bool seen_literal_text_in_word = false; | |
| 265 | |
| 266 // Iterate over all of the chars on the page. Explicitly run the loop | |
| 267 // with |i == chars_count|, which is one past the last character, and | |
| 268 // pretend it's a newline character in order to ensure we always flush | |
| 269 // the last line. | |
| 270 base::string16 line; | |
| 271 for (int i = 0; i <= chars_count; i++) { | |
| 272 unsigned int character; | |
| 273 pp::Rect char_rect; | |
| 274 | |
| 275 if (i < chars_count) { | |
| 276 character = FPDFText_GetUnicode(text_page, i); | |
| 277 char_rect = GetCharRectInGViewCoords(page, text_page, i); | |
| 278 } else { | |
| 279 // Make the last character a newline so the last line isn't lost. | |
| 280 character = '\n'; | |
| 281 } | |
| 282 | |
| 283 // There are spurious STX chars appearing in place | |
| 284 // of ligatures. Apply a heuristic to check that some vertical displacement | |
| 285 // is involved before assuming they are line-breaks. | |
| 286 bool is_intraword_linebreak = false; | |
| 287 if (i < chars_count - 1 && IsSoftHyphen(character)) { | |
| 288 // check if the next char and this char are in different lines. | |
| 289 pp::Rect next_char_rect = GetCharRectInGViewCoords( | |
| 290 page, text_page, i + 1); | |
| 291 | |
| 292 // TODO(dmazzoni): this assumes horizontal text. | |
| 293 // https://crbug.com/580311 | |
| 294 is_intraword_linebreak = !OverlapsOnYAxis(char_rect, next_char_rect); | |
| 295 } | |
| 296 if (is_intraword_linebreak || | |
| 297 base::IsUnicodeWhitespace(character) || | |
| 298 IsEol(character)) { | |
| 299 if (!word_rect.IsEmpty() && seen_literal_text_in_word) { | |
| 300 word_rect = pp::Rect(); | |
| 301 seen_literal_text_in_word = false; | |
| 302 } | |
| 303 } | |
| 304 | |
| 305 if (is_intraword_linebreak || IsEol(character)) { | |
| 306 if (!line_rect.IsEmpty()) { | |
| 307 if (is_intraword_linebreak) { | |
| 308 // Add a 0-width hyphen. | |
| 309 line.push_back('-'); | |
| 310 } | |
| 311 | |
| 312 std::unique_ptr<base::DictionaryValue> text_node( | |
| 313 new base::DictionaryValue()); | |
| 314 text_node->SetString(kTextNodeType, kTextNodeTypeText); | |
| 315 text_node->SetString(kTextNodeText, line); | |
| 316 | |
| 317 base::ListValue* text_nodes = new base::ListValue(); | |
| 318 text_nodes->Append(std::move(text_node)); | |
| 319 | |
| 320 std::unique_ptr<base::DictionaryValue> line_node( | |
| 321 new base::DictionaryValue()); | |
| 322 line_node->SetDouble(kTextBoxLeft, line_rect.x()); | |
| 323 line_node->SetDouble(kTextBoxTop, line_rect.y()); | |
| 324 line_node->SetDouble(kTextBoxWidth, line_rect.width()); | |
| 325 line_node->SetDouble(kTextBoxHeight, line_rect.height()); | |
| 326 line_node->SetDouble(kTextBoxFontSize, | |
| 327 FPDFText_GetFontSize(text_page, i)); | |
| 328 line_node->Set(kTextBoxNodes, text_nodes); | |
| 329 text->Append(std::move(line_node)); | |
| 330 | |
| 331 line.clear(); | |
| 332 line_rect = pp::Rect(); | |
| 333 word_rect = pp::Rect(); | |
| 334 seen_literal_text_in_word = false; | |
| 335 } | |
| 336 continue; | |
| 337 } | |
| 338 seen_literal_text_in_word = seen_literal_text_in_word || | |
| 339 !base::IsUnicodeWhitespace(character); | |
| 340 line.push_back(character); | |
| 341 | |
| 342 if (!char_rect.IsEmpty()) { | |
| 343 line_rect = line_rect.Union(char_rect); | |
| 344 | |
| 345 if (!base::IsUnicodeWhitespace(character)) | |
| 346 word_rect = word_rect.Union(char_rect); | |
| 347 } | |
| 348 } | |
| 349 | |
| 350 node->Set(kPageTextBox, text.release()); // Takes ownership of |text| | |
| 351 | |
| 352 return node; | |
| 353 } | |
| 354 | |
| 355 void PDFiumPage::GetTextRunInfo(int start_char_index, | 163 void PDFiumPage::GetTextRunInfo(int start_char_index, |
| 356 uint32_t* out_len, | 164 uint32_t* out_len, |
| 357 double* out_font_size, | 165 double* out_font_size, |
| 358 pp::FloatRect* out_bounds) { | 166 pp::FloatRect* out_bounds) { |
| 359 FPDF_PAGE page = GetPage(); | 167 FPDF_PAGE page = GetPage(); |
| 360 FPDF_TEXTPAGE text_page = GetTextPage(); | 168 FPDF_TEXTPAGE text_page = GetTextPage(); |
| 361 int chars_count = FPDFText_CountChars(text_page); | 169 int chars_count = FPDFText_CountChars(text_page); |
| 362 int char_index = start_char_index; | 170 int char_index = start_char_index; |
| 363 while ( | 171 while ( |
| 364 char_index < chars_count && | 172 char_index < chars_count && |
| (...skipping 324 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 689 page_->loading_count_--; | 497 page_->loading_count_--; |
| 690 } | 498 } |
| 691 | 499 |
| 692 PDFiumPage::Link::Link() { | 500 PDFiumPage::Link::Link() { |
| 693 } | 501 } |
| 694 | 502 |
| 695 PDFiumPage::Link::~Link() { | 503 PDFiumPage::Link::~Link() { |
| 696 } | 504 } |
| 697 | 505 |
| 698 } // namespace chrome_pdf | 506 } // namespace chrome_pdf |
| OLD | NEW |