OLD | NEW |
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "pdf/pdfium/pdfium_page.h" | 5 #include "pdf/pdfium/pdfium_page.h" |
6 | 6 |
7 #include <math.h> | 7 #include <math.h> |
8 #include <stddef.h> | 8 #include <stddef.h> |
9 | 9 |
| 10 #include <algorithm> |
| 11 |
10 #include "base/logging.h" | 12 #include "base/logging.h" |
11 #include "base/strings/string_number_conversions.h" | 13 #include "base/strings/string_number_conversions.h" |
12 #include "base/strings/string_util.h" | 14 #include "base/strings/string_util.h" |
13 #include "base/strings/utf_string_conversions.h" | 15 #include "base/strings/utf_string_conversions.h" |
14 #include "base/values.h" | 16 #include "base/values.h" |
15 #include "pdf/pdfium/pdfium_api_string_buffer_adapter.h" | 17 #include "pdf/pdfium/pdfium_api_string_buffer_adapter.h" |
16 #include "pdf/pdfium/pdfium_engine.h" | 18 #include "pdf/pdfium/pdfium_engine.h" |
17 | 19 |
// NOTE(review): defined as a macro rather than a typed constant. In the
// visible code it is passed as both tolerance arguments of
// FPDFText_GetCharIndexAtPos().
18 // Used when doing hit detection. | 20 // Used when doing hit detection. |
19 #define kTolerance 20.0 | 21 #define kTolerance 20.0 |
20 | 22 |
21 namespace { | 23 namespace { |
22 | 24 |
23 // Dictionary Value key names for returning the accessible page content as JSON. | 25 // Dictionary Value key names for returning the accessible page content as JSON. |
// Keys set on the top-level page dictionary.
24 const char kPageWidth[] = "width"; | 26 const char kPageWidth[] = "width"; |
25 const char kPageHeight[] = "height"; | 27 const char kPageHeight[] = "height"; |
26 const char kPageTextBox[] = "textBox"; | 28 const char kPageTextBox[] = "textBox"; |
// Keys set on each text box dictionary appended to the "textBox" list.
27 const char kTextBoxLeft[] = "left"; | 29 const char kTextBoxLeft[] = "left"; |
28 const char kTextBoxTop[] = "top"; | 30 const char kTextBoxTop[] = "top"; |
29 const char kTextBoxWidth[] = "width"; | 31 const char kTextBoxWidth[] = "width"; |
30 const char kTextBoxHeight[] = "height"; | 32 const char kTextBoxHeight[] = "height"; |
31 const char kTextBoxFontSize[] = "fontSize"; | 33 const char kTextBoxFontSize[] = "fontSize"; |
32 const char kTextBoxNodes[] = "textNodes"; | 34 const char kTextBoxNodes[] = "textNodes"; |
// Keys and type values used by the entries of a text box's "textNodes" list.
// The URL key and the URL type value exist on the OLD side only.
33 const char kTextNodeType[] = "type"; | 35 const char kTextNodeType[] = "type"; |
34 const char kTextNodeText[] = "text"; | 36 const char kTextNodeText[] = "text"; |
35 const char kTextNodeURL[] = "url"; | |
36 const char kTextNodeTypeText[] = "text"; | 37 const char kTextNodeTypeText[] = "text"; |
37 const char kTextNodeTypeURL[] = "url"; | 38 |
38 const char kDocLinkURLPrefix[] = "#page"; | 39 pp::Rect PageRectToGViewRect(FPDF_PAGE page, const pp::Rect& input) { |
| 40 int output_width = FPDF_GetPageWidth(page); |
| 41 int output_height = FPDF_GetPageHeight(page); |
| 42 |
| 43 int min_x; |
| 44 int min_y; |
| 45 int max_x; |
| 46 int max_y; |
| 47 FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, |
| 48 input.x(), input.y(), &min_x, &min_y); |
| 49 FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, |
| 50 input.right(), input.bottom(), &max_x, &max_y); |
| 51 |
| 52 if (max_x < min_x) |
| 53 std::swap(min_x, max_x); |
| 54 if (max_y < min_y) |
| 55 std::swap(min_y, max_y); |
| 56 |
| 57 pp::Rect output_rect(min_x, min_y, max_x - min_x, max_y - min_y); |
| 58 output_rect.Intersect(pp::Rect(0, 0, output_width, output_height)); |
| 59 return output_rect; |
| 60 } |
| 61 |
| 62 pp::Rect GetCharRectInGViewCoords(FPDF_PAGE page, FPDF_TEXTPAGE text_page, |
| 63 int index) { |
| 64 double left, right, bottom, top; |
| 65 FPDFText_GetCharBox(text_page, index, &left, &right, &bottom, &top); |
| 66 if (right < left) |
| 67 std::swap(left, right); |
| 68 if (bottom < top) |
| 69 std::swap(top, bottom); |
| 70 pp::Rect page_coords(left, top, right - left, bottom - top); |
| 71 return PageRectToGViewRect(page, page_coords); |
| 72 } |
| 73 |
// 0x02 is the value IsSoftHyphen() compares against.
| 74 // This is the character PDFium inserts where a word is broken across lines. |
| 75 const unsigned int kSoftHyphen = 0x02; |
| 76 |
| 77 // The following characters should all be recognized as Unicode newlines: |
| 78 // LF: Line Feed, U+000A |
| 79 // VT: Vertical Tab, U+000B |
| 80 // FF: Form Feed, U+000C |
| 81 // CR: Carriage Return, U+000D |
| 82 // CR+LF: CR (U+000D) followed by LF (U+000A) |
| 83 // NEL: Next Line, U+0085 |
| 84 // LS: Line Separator, U+2028 |
| 85 // PS: Paragraph Separator, U+2029. |
| 86 // Source: http://en.wikipedia.org/wiki/Newline#Unicode . |
// Table searched by IsEol(). CR+LF has no entry of its own; its two code
// points are listed individually.
| 87 const unsigned int kUnicodeNewlines[] = { |
| 88 0xA, 0xB, 0xC, 0xD, 0X85, 0x2028, 0x2029 |
| 89 }; |
| 90 |
// Returns true when |character| equals kSoftHyphen.
| 91 bool IsSoftHyphen(unsigned int character) { |
| 92 return kSoftHyphen == character; |
| 93 } |
| 94 |
// Returns true when both rects are non-empty and their vertical ranges
// (y() to bottom()) are not disjoint. The comparisons are strict, so two rects
// whose edges merely touch (a.bottom() == b.y()) are reported as overlapping.
| 95 bool OverlapsOnYAxis(const pp::Rect &a, const pp::Rect& b) { |
| 96 return !(a.IsEmpty() || b.IsEmpty() || |
| 97 a.bottom() < b.y() || b.bottom() < a.y()); |
| 98 } |
| 99 |
// Returns true when |character| is one of the code points listed in
// kUnicodeNewlines, found by a linear std::find() over the table.
// arraysize() is not defined in the visible code; presumably a project macro
// yielding the element count of the array.
| 100 bool IsEol(unsigned int character) { |
| 101 const unsigned int* first = kUnicodeNewlines; |
| 102 const unsigned int* last = kUnicodeNewlines + arraysize(kUnicodeNewlines); |
| 103 return std::find(first, last, character) != last; |
| 104 } |
39 | 105 |
40 } // namespace | 106 } // namespace |
41 | 107 |
42 namespace chrome_pdf { | 108 namespace chrome_pdf { |
43 | 109 |
44 PDFiumPage::PDFiumPage(PDFiumEngine* engine, | 110 PDFiumPage::PDFiumPage(PDFiumEngine* engine, |
45 int i, | 111 int i, |
46 const pp::Rect& r, | 112 const pp::Rect& r, |
47 bool available) | 113 bool available) |
48 : engine_(engine), | 114 : engine_(engine), |
(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
123 } | 189 } |
124 return text_page_; | 190 return text_page_; |
125 } | 191 } |
126 | 192 |
127 base::Value* PDFiumPage::GetAccessibleContentAsValue(int rotation) { | 193 base::Value* PDFiumPage::GetAccessibleContentAsValue(int rotation) { |
128 base::DictionaryValue* node = new base::DictionaryValue(); | 194 base::DictionaryValue* node = new base::DictionaryValue(); |
129 | 195 |
130 if (!available_) | 196 if (!available_) |
131 return node; | 197 return node; |
132 | 198 |
133 double width = FPDF_GetPageWidth(GetPage()); | 199 FPDF_PAGE page = GetPage(); |
134 double height = FPDF_GetPageHeight(GetPage()); | 200 FPDF_TEXTPAGE text_page = GetTextPage(); |
135 | 201 |
136 base::ListValue* text = new base::ListValue(); | 202 double width = FPDF_GetPageWidth(page); |
137 int box_count = FPDFText_CountRects(GetTextPage(), 0, GetCharCount()); | 203 double height = FPDF_GetPageHeight(page); |
138 for (int i = 0; i < box_count; i++) { | |
139 double left, top, right, bottom; | |
140 FPDFText_GetRect(GetTextPage(), i, &left, &top, &right, &bottom); | |
141 text->Append( | |
142 GetTextBoxAsValue(height, left, top, right, bottom, rotation)); | |
143 } | |
144 | 204 |
145 node->SetDouble(kPageWidth, width); | 205 node->SetDouble(kPageWidth, width); |
146 node->SetDouble(kPageHeight, height); | 206 node->SetDouble(kPageHeight, height); |
147 node->Set(kPageTextBox, text); // Takes ownership of |text| | 207 scoped_ptr<base::ListValue> text(new base::ListValue()); |
| 208 |
| 209 int chars_count = FPDFText_CountChars(text_page); |
| 210 pp::Rect line_rect; |
| 211 pp::Rect word_rect; |
| 212 bool seen_literal_text_in_word = false; |
| 213 |
| 214 // Iterate over all of the chars on the page. Explicitly run the loop |
| 215 // with |i == chars_count|, which is one past the last character, and |
| 216 // pretend it's a newline character in order to ensure we always flush |
| 217 // the last line. |
| 218 base::string16 line; |
| 219 for (int i = 0; i <= chars_count; i++) { |
| 220 unsigned int character; |
| 221 pp::Rect char_rect; |
| 222 |
| 223 if (i < chars_count) { |
| 224 character = FPDFText_GetUnicode(text_page, i); |
| 225 char_rect = GetCharRectInGViewCoords(page, text_page, i); |
| 226 } else { |
| 227 // Make the last character a newline so the last line isn't lost. |
| 228 character = '\n'; |
| 229 } |
| 230 |
| 231 // There are spurious STX chars appearing in place |
| 232 // of ligatures. Apply a heuristic to check that some vertical displacement |
| 233 // is involved before assuming they are line-breaks. |
| 234 bool is_intraword_linebreak = false; |
| 235 if (i < chars_count - 1 && IsSoftHyphen(character)) { |
| 236 // check if the next char and this char are in different lines. |
| 237 pp::Rect next_char_rect = GetCharRectInGViewCoords( |
| 238 page, text_page, i + 1); |
| 239 |
| 240 // TODO(dmazzoni): this assumes horizontal text. |
| 241 // https://crbug.com/580311 |
| 242 is_intraword_linebreak = !OverlapsOnYAxis(char_rect, next_char_rect); |
| 243 } |
| 244 if (is_intraword_linebreak || |
| 245 base::IsUnicodeWhitespace(character) || |
| 246 IsEol(character)) { |
| 247 if (!word_rect.IsEmpty() && seen_literal_text_in_word) { |
| 248 word_rect = pp::Rect(); |
| 249 seen_literal_text_in_word = false; |
| 250 } |
| 251 } |
| 252 |
| 253 if (is_intraword_linebreak || IsEol(character)) { |
| 254 if (!line_rect.IsEmpty()) { |
| 255 if (is_intraword_linebreak) { |
| 256 // Add a 0-width hyphen. |
| 257 line.push_back('-'); |
| 258 } |
| 259 |
| 260 base::DictionaryValue* text_node = new base::DictionaryValue(); |
| 261 text_node->SetString(kTextNodeType, kTextNodeTypeText); |
| 262 text_node->SetString(kTextNodeText, line); |
| 263 |
| 264 base::ListValue* text_nodes = new base::ListValue(); |
| 265 text_nodes->Append(text_node); |
| 266 |
| 267 base::DictionaryValue* line_node = new base::DictionaryValue(); |
| 268 line_node->SetDouble(kTextBoxLeft, line_rect.x()); |
| 269 line_node->SetDouble(kTextBoxTop, line_rect.y()); |
| 270 line_node->SetDouble(kTextBoxWidth, line_rect.width()); |
| 271 line_node->SetDouble(kTextBoxHeight, line_rect.height()); |
| 272 line_node->SetDouble(kTextBoxFontSize, |
| 273 FPDFText_GetFontSize(text_page, i)); |
| 274 line_node->Set(kTextBoxNodes, text_nodes); |
| 275 text->Append(line_node); |
| 276 |
| 277 line.clear(); |
| 278 line_rect = pp::Rect(); |
| 279 word_rect = pp::Rect(); |
| 280 seen_literal_text_in_word = false; |
| 281 } |
| 282 continue; |
| 283 } |
| 284 seen_literal_text_in_word = seen_literal_text_in_word || |
| 285 !base::IsUnicodeWhitespace(character); |
| 286 line.push_back(character); |
| 287 |
| 288 if (!char_rect.IsEmpty()) { |
| 289 line_rect = line_rect.Union(char_rect); |
| 290 |
| 291 if (!base::IsUnicodeWhitespace(character)) |
| 292 word_rect = word_rect.Union(char_rect); |
| 293 } |
| 294 } |
| 295 |
| 296 node->Set(kPageTextBox, text.release()); // Takes ownership of |text| |
148 | 297 |
149 return node; | 298 return node; |
150 } | 299 } |
151 | 300 |
// Present on the OLD side of this diff only.
// Builds a dictionary for the text box bounded by |left|, |top|, |right| and
// |bottom|: its position and size, a font size, and a "textNodes" list made of
// plain-text and URL nodes depending on which links are found for the box.
// Returns a newly allocated dictionary.
152 base::Value* PDFiumPage::GetTextBoxAsValue(double page_height, | |
153 double left, double top, | |
154 double right, double bottom, | |
155 int rotation) { | |
156 base::string16 text_utf16; | |
// The first call passes a NULL buffer and a length of 0; its return value is
// used as the number of characters to request in the second call.
157 int char_count = | |
158 FPDFText_GetBoundedText(GetTextPage(), left, top, right, bottom, NULL, 0); | |
159 if (char_count > 0) { | |
160 unsigned short* data = reinterpret_cast<unsigned short*>( | |
161 base::WriteInto(&text_utf16, char_count + 1)); | |
162 FPDFText_GetBoundedText(GetTextPage(), | |
163 left, top, right, bottom, | |
164 data, char_count); | |
165 } | |
166 std::string text_utf8 = base::UTF16ToUTF8(text_utf16); | |
167 | |
// Classify the box. A link object found at (left, top) decides the area via
// GetLinkTarget(); otherwise the box is a WEBLINK_AREA when GetLinks() reports
// any target for its screen rect, and a TEXT_AREA when it reports none.
168 FPDF_LINK link = FPDFLink_GetLinkAtPoint(GetPage(), left, top); | |
169 Area area; | |
170 std::vector<LinkTarget> targets; | |
171 if (link) { | |
172 targets.push_back(LinkTarget()); | |
173 area = GetLinkTarget(link, &targets[0]); | |
174 } else { | |
175 pp::Rect rect( | |
176 PageToScreen(pp::Point(), 1.0, left, top, right, bottom, rotation)); | |
177 GetLinks(rect, &targets); | |
178 area = targets.empty() ? TEXT_AREA : WEBLINK_AREA; | |
179 } | |
180 | |
// The font size comes from the character looked up near (left, top), with
// kTolerance on each axis.
// NOTE(review): |char_index| is passed on without being checked.
181 int char_index = FPDFText_GetCharIndexAtPos(GetTextPage(), left, top, | |
182 kTolerance, kTolerance); | |
183 double font_size = FPDFText_GetFontSize(GetTextPage(), char_index); | |
184 | |
185 base::DictionaryValue* node = new base::DictionaryValue(); | |
186 node->SetDouble(kTextBoxLeft, left); | |
187 node->SetDouble(kTextBoxTop, page_height - top); | |
188 node->SetDouble(kTextBoxWidth, right - left); | |
189 node->SetDouble(kTextBoxHeight, top - bottom); | |
190 node->SetDouble(kTextBoxFontSize, font_size); | |
191 | |
192 base::ListValue* text_nodes = new base::ListValue(); | |
193 | |
194 if (area == DOCLINK_AREA) { | |
195 std::string url = kDocLinkURLPrefix + base::IntToString(targets[0].page); | |
196 text_nodes->Append(CreateURLNode(text_utf8, url)); | |
197 } else if (area == WEBLINK_AREA && link) { | |
198 text_nodes->Append(CreateURLNode(text_utf8, targets[0].url)); | |
199 } else if (area == WEBLINK_AREA && !link) { | |
// No link object: walk the targets in order, searching the text from |start|
// for each URL and emitting the text before it as a text node and the match
// as a URL node.
200 size_t start = 0; | |
201 for (const auto& target : targets) { | |
202 // If there is an extra NULL character at end, find() will not return any | |
203 // matches. There should not be any though. | |
204 if (!target.url.empty()) | |
205 DCHECK_NE(target.url.back(), '\0'); | |
206 | |
207 // PDFium may change the case of generated links. | |
// NOTE(review): lowerCaseText does not depend on |target| but is rebuilt on
// every iteration.
208 std::string lowerCaseURL = base::ToLowerASCII(target.url); | |
209 std::string lowerCaseText = base::ToLowerASCII(text_utf8); | |
210 size_t pos = lowerCaseText.find(lowerCaseURL, start); | |
211 size_t length = target.url.size(); | |
212 if (pos == std::string::npos) { | |
213 // Check if the link is a "mailto:" URL | |
// Retry the search without the 7-character scheme and shorten the matched
// length by the same amount.
214 if (lowerCaseURL.compare(0, 7, "mailto:") == 0) { | |
215 pos = lowerCaseText.find(lowerCaseURL.substr(7), start); | |
216 length -= 7; | |
217 } | |
218 | |
219 if (pos == std::string::npos) { | |
220 // No match has been found. This should never happen. | |
221 continue; | |
222 } | |
223 } | |
224 | |
225 std::string before_text = text_utf8.substr(start, pos - start); | |
226 if (!before_text.empty()) | |
227 text_nodes->Append(CreateTextNode(before_text)); | |
228 std::string link_text = text_utf8.substr(pos, length); | |
229 text_nodes->Append(CreateURLNode(link_text, target.url)); | |
230 | |
231 start = pos + length; | |
232 } | |
// Whatever follows the last matched URL becomes a trailing text node.
233 std::string before_text = text_utf8.substr(start); | |
234 if (!before_text.empty()) | |
235 text_nodes->Append(CreateTextNode(before_text)); | |
236 } else { | |
237 text_nodes->Append(CreateTextNode(text_utf8)); | |
238 } | |
239 | |
240 node->Set(kTextBoxNodes, text_nodes); // Takes ownership of |text_nodes|. | |
241 return node; | |
242 } | |
243 | |
// Present on the OLD side of this diff only.
// Returns a newly allocated dictionary whose "type" is kTextNodeTypeText and
// whose "text" is |text|.
244 base::Value* PDFiumPage::CreateTextNode(const std::string& text) { | |
245 base::DictionaryValue* node = new base::DictionaryValue(); | |
246 node->SetString(kTextNodeType, kTextNodeTypeText); | |
247 node->SetString(kTextNodeText, text); | |
248 return node; | |
249 } | |
250 | |
// Present on the OLD side of this diff only.
// Returns a newly allocated dictionary whose "type" is kTextNodeTypeURL, whose
// "text" is |text| and whose "url" is |url|.
251 base::Value* PDFiumPage::CreateURLNode(const std::string& text, | |
252 const std::string& url) { | |
253 base::DictionaryValue* node = new base::DictionaryValue(); | |
254 node->SetString(kTextNodeType, kTextNodeTypeURL); | |
255 node->SetString(kTextNodeText, text); | |
256 node->SetString(kTextNodeURL, url); | |
257 return node; | |
258 } | |
259 | |
260 PDFiumPage::Area PDFiumPage::GetCharIndex(const pp::Point& point, | 301 PDFiumPage::Area PDFiumPage::GetCharIndex(const pp::Point& point, |
261 int rotation, | 302 int rotation, |
262 int* char_index, | 303 int* char_index, |
263 int* form_type, | 304 int* form_type, |
264 LinkTarget* target) { | 305 LinkTarget* target) { |
265 if (!available_) | 306 if (!available_) |
266 return NONSELECTABLE_AREA; | 307 return NONSELECTABLE_AREA; |
267 pp::Point point2 = point - rect_.point(); | 308 pp::Point point2 = point - rect_.point(); |
268 double new_x; | 309 double new_x; |
269 double new_y; | 310 double new_y; |
(...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
525 page_->loading_count_--; | 566 page_->loading_count_--; |
526 } | 567 } |
527 | 568 |
// Constructor with an empty body; identical on both sides of the diff.
528 PDFiumPage::Link::Link() { | 569 PDFiumPage::Link::Link() { |
529 } | 570 } |
530 | 571 |
// Destructor with an empty body; identical on both sides of the diff.
531 PDFiumPage::Link::~Link() { | 572 PDFiumPage::Link::~Link() { |
532 } | 573 } |
533 | 574 |
534 } // namespace chrome_pdf | 575 } // namespace chrome_pdf |
OLD | NEW |