Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "pdf/pdfium/pdfium_page.h" | 5 #include "pdf/pdfium/pdfium_page.h" |
| 6 | 6 |
| 7 #include <math.h> | 7 #include <math.h> |
| 8 #include <stddef.h> | 8 #include <stddef.h> |
| 9 | 9 |
| 10 #include "base/logging.h" | 10 #include "base/logging.h" |
| (...skipping 14 matching lines...) | |
| 25 const char kPageHeight[] = "height"; | 25 const char kPageHeight[] = "height"; |
| 26 const char kPageTextBox[] = "textBox"; | 26 const char kPageTextBox[] = "textBox"; |
| 27 const char kTextBoxLeft[] = "left"; | 27 const char kTextBoxLeft[] = "left"; |
| 28 const char kTextBoxTop[] = "top"; | 28 const char kTextBoxTop[] = "top"; |
| 29 const char kTextBoxWidth[] = "width"; | 29 const char kTextBoxWidth[] = "width"; |
| 30 const char kTextBoxHeight[] = "height"; | 30 const char kTextBoxHeight[] = "height"; |
| 31 const char kTextBoxFontSize[] = "fontSize"; | 31 const char kTextBoxFontSize[] = "fontSize"; |
| 32 const char kTextBoxNodes[] = "textNodes"; | 32 const char kTextBoxNodes[] = "textNodes"; |
| 33 const char kTextNodeType[] = "type"; | 33 const char kTextNodeType[] = "type"; |
| 34 const char kTextNodeText[] = "text"; | 34 const char kTextNodeText[] = "text"; |
| 35 const char kTextNodeURL[] = "url"; | |
| 36 const char kTextNodeTypeText[] = "text"; | 35 const char kTextNodeTypeText[] = "text"; |
| 37 const char kTextNodeTypeURL[] = "url"; | 36 |
| 38 const char kDocLinkURLPrefix[] = "#page"; | 37 pp::Rect PageRectToGViewRect(FPDF_PAGE page, const pp::Rect& input) { |
| 38 int output_width = FPDF_GetPageWidth(page); | |
| 39 int output_height = FPDF_GetPageHeight(page); | |
| 40 | |
| 41 int min_x, min_y; | |
|
Lei Zhang
2016/01/14 02:45:06
One var per line please.
dmazzoni
2016/01/21 23:10:43
Done.
| |
| 42 int max_x, max_y; | |
| 43 FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, | |
| 44 input.x(), input.y(), &min_x, &min_y); | |
| 45 FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, | |
| 46 input.right(), input.bottom(), &max_x, &max_y); | |
| 47 | |
| 48 if (max_x < min_x) | |
|
jbreiden
2016/01/14 01:07:56
Hmmm.... this works best for left-to-right languag
dmazzoni
2016/01/21 23:10:43
No, the issue here is that a pp::Rect isn't allowe
| |
| 49 std::swap(min_x, max_x); | |
| 50 if (max_y < min_y) | |
| 51 std::swap(min_y, max_y); | |
| 52 | |
| 53 pp::Rect output_rect(min_x, min_y, max_x - min_x, max_y - min_y); | |
| 54 output_rect.Intersect(pp::Rect(0, 0, output_width, output_height)); | |
| 55 return output_rect; | |
| 56 } | |
| 57 | |
| 58 pp::Rect GetCharRectInGViewCoords(FPDF_PAGE page, FPDF_TEXTPAGE text_page, | |
| 59 int index) { | |
| 60 double left, right, bottom, top; | |
| 61 FPDFText_GetCharBox(text_page, index, &left, &right, &bottom, &top); | |
| 62 if (right < left) | |
|
jbreiden
2016/01/14 01:07:56
Same as above.
dmazzoni
2016/01/21 23:10:43
Same answer. We have to swap or it doesn't work.
| |
| 63 std::swap(left, right); | |
| 64 if (bottom < top) | |
| 65 std::swap(top, bottom); | |
| 66 pp::Rect page_coords(left, top, right - left, bottom - top); | |
| 67 return PageRectToGViewRect(page, page_coords); | |
| 68 } | |
| 69 | |
| 70 // This is the character PDFium inserts where a word is broken across lines. | |
|
Lei Zhang
2016/01/14 02:45:06
PDFium
dmazzoni
2016/01/21 23:10:43
Done.
| |
| 71 const unsigned int kSoftHyphen = 0x02; | |
| 72 | |
| 73 // The following characters should all be recognized as Unicode newlines: | |
| 74 // LF: Line Feed, U+000A | |
| 75 // VT: Vertical Tab, U+000B | |
| 76 // FF: Form Feed, U+000C | |
| 77 // CR: Carriage Return, U+000D | |
| 78 // CR+LF: CR (U+000D) followed by LF (U+000A) | |
| 79 // NEL: Next Line, U+0085 | |
| 80 // LS: Line Separator, U+2028 | |
| 81 // PS: Paragraph Separator, U+2029. | |
| 82 // Source: http://en.wikipedia.org/wiki/Newline#Unicode . | |
| 83 const unsigned int kUnicodeNewlines[] = { | |
| 84 0xA, 0xB, 0xC, 0xD, 0X85, 0x2028, 0x2029 | |
| 85 }; | |
| 86 | |
| 87 bool IsSoftHyphen(unsigned int character) { | |
| 88 return kSoftHyphen == character; | |
| 89 } | |
| 90 | |
| 91 bool OverlapsOnYAxis(const pp::Rect &a, const pp::Rect& b) { | |
| 92 return !(a.IsEmpty() || b.IsEmpty() || | |
| 93 a.bottom() < b.y() || b.bottom() < a.y()); | |
| 94 } | |
| 95 | |
| 96 bool IsEol(unsigned int character) { | |
| 97 const unsigned int* first = kUnicodeNewlines; | |
| 98 const unsigned int* last = kUnicodeNewlines + arraysize(kUnicodeNewlines); | |
| 99 return std::find(first, last, character) != last; | |
| 100 } | |
| 39 | 101 |
| 40 } // namespace | 102 } // namespace |
| 41 | 103 |
| 42 namespace chrome_pdf { | 104 namespace chrome_pdf { |
| 43 | 105 |
| 44 PDFiumPage::PDFiumPage(PDFiumEngine* engine, | 106 PDFiumPage::PDFiumPage(PDFiumEngine* engine, |
| 45 int i, | 107 int i, |
| 46 const pp::Rect& r, | 108 const pp::Rect& r, |
| 47 bool available) | 109 bool available) |
| 48 : engine_(engine), | 110 : engine_(engine), |
| (...skipping 74 matching lines...) | |
| 123 } | 185 } |
| 124 return text_page_; | 186 return text_page_; |
| 125 } | 187 } |
| 126 | 188 |
| 127 base::Value* PDFiumPage::GetAccessibleContentAsValue(int rotation) { | 189 base::Value* PDFiumPage::GetAccessibleContentAsValue(int rotation) { |
| 128 base::DictionaryValue* node = new base::DictionaryValue(); | 190 base::DictionaryValue* node = new base::DictionaryValue(); |
| 129 | 191 |
| 130 if (!available_) | 192 if (!available_) |
| 131 return node; | 193 return node; |
| 132 | 194 |
| 133 double width = FPDF_GetPageWidth(GetPage()); | 195 FPDF_PAGE page = GetPage(); |
| 134 double height = FPDF_GetPageHeight(GetPage()); | 196 FPDF_TEXTPAGE text_page = GetTextPage(); |
| 135 | 197 |
| 136 base::ListValue* text = new base::ListValue(); | 198 double width = FPDF_GetPageWidth(page); |
| 137 int box_count = FPDFText_CountRects(GetTextPage(), 0, GetCharCount()); | 199 double height = FPDF_GetPageHeight(page); |
| 138 for (int i = 0; i < box_count; i++) { | |
| 139 double left, top, right, bottom; | |
| 140 FPDFText_GetRect(GetTextPage(), i, &left, &top, &right, &bottom); | |
| 141 text->Append( | |
| 142 GetTextBoxAsValue(height, left, top, right, bottom, rotation)); | |
| 143 } | |
| 144 | 200 |
| 145 node->SetDouble(kPageWidth, width); | 201 node->SetDouble(kPageWidth, width); |
| 146 node->SetDouble(kPageHeight, height); | 202 node->SetDouble(kPageHeight, height); |
| 203 base::ListValue* text = new base::ListValue(); | |
| 204 | |
| 205 int chars_count = FPDFText_CountChars(text_page); | |
| 206 pp::Rect line_rect; | |
| 207 pp::Rect word_rect; | |
| 208 bool seen_literal_text_in_word = false; | |
| 209 | |
| 210 base::string16 line; | |
| 211 for (int i = 0; i <= chars_count; i++) { | |
|
Lei Zhang
2016/01/14 02:45:06
This went it i < chars_count in patch set 2 and ba
Lei Zhang
2016/01/14 02:47:58
"This went to"
dmazzoni
2016/01/21 23:10:43
This is on purpose. When i == chars_count, we pret
| |
| 212 unsigned int character; | |
| 213 pp::Rect char_rect; | |
| 214 | |
| 215 if (i < chars_count) { | |
| 216 character = FPDFText_GetUnicode(text_page, i); | |
| 217 char_rect = GetCharRectInGViewCoords(page, text_page, i); | |
| 218 } else { | |
| 219 // Make the last character a newline so the last line isn't lost. | |
| 220 character = '\n'; | |
| 221 } | |
| 222 | |
| 223 // There are spurious STX chars appearing in place | |
| 224 // of ligatures. Apply a heuristic to check that some vertical displacement | |
| 225 // is involved before assuming they are line-breaks. | |
| 226 bool is_intraword_linebreak = false; | |
| 227 if (i < chars_count - 1 && IsSoftHyphen(character)) { | |
| 228 // check if the next char and this char are in different lines. | |
| 229 pp::Rect next_char_rect = GetCharRectInGViewCoords( | |
| 230 page, text_page, i + 1); | |
| 231 is_intraword_linebreak = !OverlapsOnYAxis(char_rect, next_char_rect); | |
|
jbreiden
2016/01/14 01:07:56
Maybe a comment somewhere in this function mention
dmazzoni
2016/01/21 23:10:43
Added a link to a bug.
| |
| 232 } | |
| 233 if (is_intraword_linebreak || | |
| 234 base::IsUnicodeWhitespace(character) || | |
|
Lei Zhang
2016/01/14 02:45:06
funny indentation
dmazzoni
2016/01/21 23:10:43
Done.
| |
| 235 IsEol(character)) { | |
| 236 if (!word_rect.IsEmpty() && seen_literal_text_in_word) { | |
| 237 word_rect = pp::Rect(); | |
| 238 seen_literal_text_in_word = false; | |
| 239 } | |
| 240 } | |
| 241 | |
| 242 if (IsEol(character) || is_intraword_linebreak) { | |
|
Lei Zhang
2016/01/14 02:45:06
You can also check |is_intraword_linebreak| first
dmazzoni
2016/01/21 23:10:43
Done.
| |
| 243 if (!line_rect.IsEmpty()) { | |
| 244 if (is_intraword_linebreak) { | |
| 245 // Add a 0-width hyphen. | |
| 246 line.push_back('-'); | |
| 247 } | |
| 248 base::DictionaryValue* line_node = new base::DictionaryValue(); | |
|
Lei Zhang
2016/01/14 02:45:06
Can you create/initialize the Values in order? |te
dmazzoni
2016/01/21 23:10:43
Done.
| |
| 249 line_node->SetDouble(kTextBoxLeft, line_rect.x()); | |
| 250 line_node->SetDouble(kTextBoxTop, line_rect.y()); | |
| 251 line_node->SetDouble(kTextBoxWidth, line_rect.width()); | |
| 252 line_node->SetDouble(kTextBoxHeight, line_rect.height()); | |
| 253 line_node->SetDouble(kTextBoxFontSize, | |
| 254 FPDFText_GetFontSize(text_page, i)); | |
| 255 | |
| 256 base::ListValue* text_nodes = new base::ListValue(); | |
| 257 base::DictionaryValue* text_node = new base::DictionaryValue(); | |
| 258 text_node->SetString(kTextNodeType, kTextNodeTypeText); | |
| 259 text_node->SetString(kTextNodeText, line); | |
| 260 text_nodes->Append(text_node); | |
| 261 | |
| 262 line_node->Set(kTextBoxNodes, text_nodes); | |
| 263 text->Append(line_node); | |
| 264 | |
| 265 line.clear(); | |
| 266 line_rect = pp::Rect(); | |
| 267 word_rect = pp::Rect(); | |
| 268 seen_literal_text_in_word = false; | |
| 269 } | |
| 270 continue; | |
| 271 } | |
| 272 seen_literal_text_in_word = seen_literal_text_in_word || | |
| 273 !base::IsUnicodeWhitespace(character); | |
| 274 line.push_back(character); | |
| 275 | |
| 276 if (!char_rect.IsEmpty()) { | |
| 277 line_rect = line_rect.Union(char_rect); | |
| 278 | |
| 279 if (!base::IsUnicodeWhitespace(character)) | |
| 280 word_rect = word_rect.Union(char_rect); | |
| 281 } | |
| 282 } | |
| 283 | |
| 147 node->Set(kPageTextBox, text); // Takes ownership of |text| | 284 node->Set(kPageTextBox, text); // Takes ownership of |text| |
| 148 | 285 |
| 149 return node; | 286 return node; |
| 150 } | 287 } |
| 151 | 288 |
| 152 base::Value* PDFiumPage::GetTextBoxAsValue(double page_height, | |
| 153 double left, double top, | |
| 154 double right, double bottom, | |
| 155 int rotation) { | |
| 156 base::string16 text_utf16; | |
| 157 int char_count = | |
| 158 FPDFText_GetBoundedText(GetTextPage(), left, top, right, bottom, NULL, 0); | |
| 159 if (char_count > 0) { | |
| 160 unsigned short* data = reinterpret_cast<unsigned short*>( | |
| 161 base::WriteInto(&text_utf16, char_count + 1)); | |
| 162 FPDFText_GetBoundedText(GetTextPage(), | |
| 163 left, top, right, bottom, | |
| 164 data, char_count); | |
| 165 } | |
| 166 std::string text_utf8 = base::UTF16ToUTF8(text_utf16); | |
| 167 | |
| 168 FPDF_LINK link = FPDFLink_GetLinkAtPoint(GetPage(), left, top); | |
| 169 Area area; | |
| 170 std::vector<LinkTarget> targets; | |
| 171 if (link) { | |
| 172 targets.push_back(LinkTarget()); | |
| 173 area = GetLinkTarget(link, &targets[0]); | |
| 174 } else { | |
| 175 pp::Rect rect( | |
| 176 PageToScreen(pp::Point(), 1.0, left, top, right, bottom, rotation)); | |
| 177 GetLinks(rect, &targets); | |
| 178 area = targets.empty() ? TEXT_AREA : WEBLINK_AREA; | |
| 179 } | |
| 180 | |
| 181 int char_index = FPDFText_GetCharIndexAtPos(GetTextPage(), left, top, | |
| 182 kTolerance, kTolerance); | |
| 183 double font_size = FPDFText_GetFontSize(GetTextPage(), char_index); | |
| 184 | |
| 185 base::DictionaryValue* node = new base::DictionaryValue(); | |
| 186 node->SetDouble(kTextBoxLeft, left); | |
| 187 node->SetDouble(kTextBoxTop, page_height - top); | |
| 188 node->SetDouble(kTextBoxWidth, right - left); | |
| 189 node->SetDouble(kTextBoxHeight, top - bottom); | |
| 190 node->SetDouble(kTextBoxFontSize, font_size); | |
| 191 | |
| 192 base::ListValue* text_nodes = new base::ListValue(); | |
| 193 | |
| 194 if (area == DOCLINK_AREA) { | |
| 195 std::string url = kDocLinkURLPrefix + base::IntToString(targets[0].page); | |
| 196 text_nodes->Append(CreateURLNode(text_utf8, url)); | |
| 197 } else if (area == WEBLINK_AREA && link) { | |
| 198 text_nodes->Append(CreateURLNode(text_utf8, targets[0].url)); | |
| 199 } else if (area == WEBLINK_AREA && !link) { | |
| 200 size_t start = 0; | |
| 201 for (const auto& target : targets) { | |
| 202 // If there is an extra NULL character at end, find() will not return any | |
| 203 // matches. There should not be any though. | |
| 204 if (!target.url.empty()) | |
| 205 DCHECK_NE(target.url.back(), '\0'); | |
| 206 | |
| 207 // PDFium may change the case of generated links. | |
| 208 std::string lowerCaseURL = base::ToLowerASCII(target.url); | |
| 209 std::string lowerCaseText = base::ToLowerASCII(text_utf8); | |
| 210 size_t pos = lowerCaseText.find(lowerCaseURL, start); | |
| 211 size_t length = target.url.size(); | |
| 212 if (pos == std::string::npos) { | |
| 213 // Check if the link is a "mailto:" URL | |
| 214 if (lowerCaseURL.compare(0, 7, "mailto:") == 0) { | |
| 215 pos = lowerCaseText.find(lowerCaseURL.substr(7), start); | |
| 216 length -= 7; | |
| 217 } | |
| 218 | |
| 219 if (pos == std::string::npos) { | |
| 220 // No match has been found. This should never happen. | |
| 221 continue; | |
| 222 } | |
| 223 } | |
| 224 | |
| 225 std::string before_text = text_utf8.substr(start, pos - start); | |
| 226 if (!before_text.empty()) | |
| 227 text_nodes->Append(CreateTextNode(before_text)); | |
| 228 std::string link_text = text_utf8.substr(pos, length); | |
| 229 text_nodes->Append(CreateURLNode(link_text, target.url)); | |
| 230 | |
| 231 start = pos + length; | |
| 232 } | |
| 233 std::string before_text = text_utf8.substr(start); | |
| 234 if (!before_text.empty()) | |
| 235 text_nodes->Append(CreateTextNode(before_text)); | |
| 236 } else { | |
| 237 text_nodes->Append(CreateTextNode(text_utf8)); | |
| 238 } | |
| 239 | |
| 240 node->Set(kTextBoxNodes, text_nodes); // Takes ownership of |text_nodes|. | |
| 241 return node; | |
| 242 } | |
| 243 | |
| 244 base::Value* PDFiumPage::CreateTextNode(const std::string& text) { | |
| 245 base::DictionaryValue* node = new base::DictionaryValue(); | |
| 246 node->SetString(kTextNodeType, kTextNodeTypeText); | |
| 247 node->SetString(kTextNodeText, text); | |
| 248 return node; | |
| 249 } | |
| 250 | |
| 251 base::Value* PDFiumPage::CreateURLNode(const std::string& text, | |
| 252 const std::string& url) { | |
| 253 base::DictionaryValue* node = new base::DictionaryValue(); | |
| 254 node->SetString(kTextNodeType, kTextNodeTypeURL); | |
| 255 node->SetString(kTextNodeText, text); | |
| 256 node->SetString(kTextNodeURL, url); | |
| 257 return node; | |
| 258 } | |
| 259 | |
| 260 PDFiumPage::Area PDFiumPage::GetCharIndex(const pp::Point& point, | 289 PDFiumPage::Area PDFiumPage::GetCharIndex(const pp::Point& point, |
| 261 int rotation, | 290 int rotation, |
| 262 int* char_index, | 291 int* char_index, |
| 263 int* form_type, | 292 int* form_type, |
| 264 LinkTarget* target) { | 293 LinkTarget* target) { |
| 265 if (!available_) | 294 if (!available_) |
| 266 return NONSELECTABLE_AREA; | 295 return NONSELECTABLE_AREA; |
| 267 pp::Point point2 = point - rect_.point(); | 296 pp::Point point2 = point - rect_.point(); |
| 268 double new_x; | 297 double new_x; |
| 269 double new_y; | 298 double new_y; |
| (...skipping 255 matching lines...) | |
| 525 page_->loading_count_--; | 554 page_->loading_count_--; |
| 526 } | 555 } |
| 527 | 556 |
| 528 PDFiumPage::Link::Link() { | 557 PDFiumPage::Link::Link() { |
| 529 } | 558 } |
| 530 | 559 |
| 531 PDFiumPage::Link::~Link() { | 560 PDFiumPage::Link::~Link() { |
| 532 } | 561 } |
| 533 | 562 |
| 534 } // namespace chrome_pdf | 563 } // namespace chrome_pdf |
| OLD | NEW |