Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(231)

Side by Side Diff: pdf/pdfium/pdfium_page.cc

Issue 1568723002: Improve extraction of accessible text from PDF. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@fix_pdf
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « pdf/pdfium/pdfium_page.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "pdf/pdfium/pdfium_page.h" 5 #include "pdf/pdfium/pdfium_page.h"
6 6
7 #include <math.h> 7 #include <math.h>
8 #include <stddef.h> 8 #include <stddef.h>
9 9
10 #include "base/logging.h" 10 #include "base/logging.h"
(...skipping 14 matching lines...) Expand all
25 const char kPageHeight[] = "height"; 25 const char kPageHeight[] = "height";
26 const char kPageTextBox[] = "textBox"; 26 const char kPageTextBox[] = "textBox";
27 const char kTextBoxLeft[] = "left"; 27 const char kTextBoxLeft[] = "left";
28 const char kTextBoxTop[] = "top"; 28 const char kTextBoxTop[] = "top";
29 const char kTextBoxWidth[] = "width"; 29 const char kTextBoxWidth[] = "width";
30 const char kTextBoxHeight[] = "height"; 30 const char kTextBoxHeight[] = "height";
31 const char kTextBoxFontSize[] = "fontSize"; 31 const char kTextBoxFontSize[] = "fontSize";
32 const char kTextBoxNodes[] = "textNodes"; 32 const char kTextBoxNodes[] = "textNodes";
33 const char kTextNodeType[] = "type"; 33 const char kTextNodeType[] = "type";
34 const char kTextNodeText[] = "text"; 34 const char kTextNodeText[] = "text";
35 const char kTextNodeURL[] = "url";
36 const char kTextNodeTypeText[] = "text"; 35 const char kTextNodeTypeText[] = "text";
37 const char kTextNodeTypeURL[] = "url"; 36
38 const char kDocLinkURLPrefix[] = "#page"; 37 pp::Rect PageRectToGViewRect(const pp::Rect &input, FPDF_PAGE page) {
Lei Zhang 2016/01/08 04:01:36 "pp::Rect& input", put |page| as the first param.
dmazzoni 2016/01/11 19:58:01 Done.
38 int output_width = FPDF_GetPageWidth(page);
39 int output_height = FPDF_GetPageHeight(page);
40
41 int min_x, min_y;
42 int max_x, max_y;
43 FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0,
44 input.x(), input.y(), &min_x, &min_y);
45 FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0,
46 input.right(), input.bottom(), &max_x, &max_y);
47
48 if (max_x < min_x)
49 std::swap(min_x, max_x);
50 if (max_y < min_y)
51 std::swap(min_y, max_y);
52
53 pp::Rect output_rect(min_x, min_y, max_x - min_x, max_y - min_y);
54 output_rect.Intersect(pp::Rect(0, 0, output_width, output_height));
55 if (output_rect.IsEmpty()) {
56 VLOG(9) << "xml-invalid-rectangle";
57 }
58 return output_rect;
59 }
60
61 pp::Rect GetCharRectInGViewCoords(FPDF_PAGE page, FPDF_TEXTPAGE text_page,
62 int index) {
63 double left, right, bottom, top;
64 FPDFText_GetCharBox(text_page, index, &left, &right, &bottom, &top);
65 if (right < left)
66 std::swap(left, right);
67 if (bottom < top)
68 std::swap(top, bottom);
69 pp::Rect page_coords(left, top, right - left, bottom - top);
70 return PageRectToGViewRect(page_coords, page);
71 }
72
73 // This is the character foxit inserts where a word is broken across lines.
74 const unsigned int kSoftHyphen = 0x02;
75 // The following characters should all be recognized as Unicode newlines:
76 // LF: Line Feed, U+000A
77 // VT: Vertical Tab, U+000B
78 // FF: Form Feed, U+000C
79 // CR: Carriage Return, U+000D
80 // CR+LF: CR (U+000D) followed by LF (U+000A)
81 // NEL: Next Line, U+0085
82 // LS: Line Separator, U+2028
83 // PS: Paragraph Separator, U+2029.
84 // Source: http://en.wikipedia.org/wiki/Newline#Unicode .
85 const unsigned int kUnicodeNewlines[] = {
86 0xA, 0xB, 0xC, 0xD, 0X85, 0x2028, 0x2029, 0
Lei Zhang 2016/01/08 04:01:36 Don't need the 0 sentinel value at the end?
dmazzoni 2016/01/11 19:58:02 Done.
87 };
88
89 bool IsSoftHyphen(unsigned int character) {
90 return kSoftHyphen == character;
91 }
92
93 bool OverlapsOnYAxis(const pp::Rect &a, const pp::Rect& b) {
94 return !(a.IsEmpty() || b.IsEmpty() ||
95 a.bottom() < b.y() || b.bottom() < a.y());
96 }
97
98 bool IsEol(unsigned int character) {
99 const unsigned int* first = kUnicodeNewlines;
100 const unsigned int* last = kUnicodeNewlines + arraysize(kUnicodeNewlines);
101 return std::find(first, last, character) != last;
102 }
39 103
40 } // namespace 104 } // namespace
41 105
42 namespace chrome_pdf { 106 namespace chrome_pdf {
43 107
44 PDFiumPage::PDFiumPage(PDFiumEngine* engine, 108 PDFiumPage::PDFiumPage(PDFiumEngine* engine,
45 int i, 109 int i,
46 const pp::Rect& r, 110 const pp::Rect& r,
47 bool available) 111 bool available)
48 : engine_(engine), 112 : engine_(engine),
(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after
123 } 187 }
124 return text_page_; 188 return text_page_;
125 } 189 }
126 190
127 base::Value* PDFiumPage::GetAccessibleContentAsValue(int rotation) { 191 base::Value* PDFiumPage::GetAccessibleContentAsValue(int rotation) {
128 base::DictionaryValue* node = new base::DictionaryValue(); 192 base::DictionaryValue* node = new base::DictionaryValue();
129 193
130 if (!available_) 194 if (!available_)
131 return node; 195 return node;
132 196
133 double width = FPDF_GetPageWidth(GetPage()); 197 FPDF_PAGE page = GetPage();
134 double height = FPDF_GetPageHeight(GetPage()); 198 FPDF_TEXTPAGE text_page = GetTextPage();
135 199
136 base::ListValue* text = new base::ListValue(); 200 double width = FPDF_GetPageWidth(page);
137 int box_count = FPDFText_CountRects(GetTextPage(), 0, GetCharCount()); 201 double height = FPDF_GetPageHeight(page);
138 for (int i = 0; i < box_count; i++) {
139 double left, top, right, bottom;
140 FPDFText_GetRect(GetTextPage(), i, &left, &top, &right, &bottom);
141 text->Append(
142 GetTextBoxAsValue(height, left, top, right, bottom, rotation));
143 }
144 202
145 node->SetDouble(kPageWidth, width); 203 node->SetDouble(kPageWidth, width);
146 node->SetDouble(kPageHeight, height); 204 node->SetDouble(kPageHeight, height);
205 base::ListValue* text = new base::ListValue();
206
207 int chars_count = FPDFText_CountChars(text_page);
208 pp::Rect block_rect;
Lei Zhang 2016/01/08 04:01:36 Is this needed? It's being written to, but not read…
dmazzoni 2016/01/11 19:58:02 Done.
209 pp::Rect line_rect;
210 pp::Rect word_rect;
211 bool seen_literal_text_in_word = false;
212
213 base::string16 line;
214 for (int i = 0; i <= chars_count; i++) {
Lei Zhang 2016/01/08 04:01:36 Isn't the last iteration going out of bounds?
dmazzoni 2016/01/11 19:58:02 Done.
215 unsigned int character = FPDFText_GetUnicode(text_page, i);
216 pp::Rect char_rect = GetCharRectInGViewCoords(page, text_page, i);
217
218 // Due to b/9598615 there are spurious STX chars appearing in place
Lei Zhang 2016/01/08 04:01:36 Has that bug been fixed?
dmazzoni 2016/01/11 19:58:02 @jbreiden: Not sure. Honestly it looks reasonable…
219 // of ligatures. Apply a heuristic to check that some vertical displacement
220 // is involved before assuming they are line-breaks.
221 bool is_intraword_linebreak = false;
222 if (IsSoftHyphen(character)) {
223 if (i < chars_count) {
224 // check if the next char and this char are in different lines.
225 pp::Rect next_char_rect = GetCharRectInGViewCoords(
226 page, text_page, i + 1);
227 is_intraword_linebreak = !OverlapsOnYAxis(char_rect, next_char_rect);
228 }
229 }
230 if (base::IsUnicodeWhitespace(character) ||
231 IsEol(character) || is_intraword_linebreak) {
Lei Zhang 2016/01/08 04:01:36 Check |is_intraword_linebreak| first since that's
dmazzoni 2016/01/11 19:58:02 Done.
232 if (!word_rect.IsEmpty() && seen_literal_text_in_word) {
233 word_rect = pp::Rect(); //.SetEmpty();
234 seen_literal_text_in_word = false;
235 }
236 }
237
238 if (IsEol(character) || is_intraword_linebreak) {
239 if (!line_rect.IsEmpty()) {
240 if (is_intraword_linebreak) {
241 // Add a 0-width hyphen. Projector will treat the first word of the
Lei Zhang 2016/01/08 04:01:36 Reference to internal project name?
dmazzoni 2016/01/11 19:58:01 Done.
242 // next line as a continuation.
243 line.push_back('-');
244 }
245 base::DictionaryValue* line_node = new base::DictionaryValue();
246 line_node->SetDouble(kTextBoxLeft, line_rect.x());
247 line_node->SetDouble(kTextBoxTop, line_rect.y());
248 line_node->SetDouble(kTextBoxWidth, line_rect.width());
249 line_node->SetDouble(kTextBoxHeight, line_rect.height());
250 line_node->SetDouble(kTextBoxFontSize,
251 FPDFText_GetFontSize(text_page, i));
252
253 base::ListValue* text_nodes = new base::ListValue();
254 base::DictionaryValue* text_node = new base::DictionaryValue();
255 text_node->SetString(kTextNodeType, kTextNodeTypeText);
256 text_node->SetString(kTextNodeText, line);
257 text_nodes->Append(text_node);
258
259 line_node->Set(kTextBoxNodes, text_nodes);
260 text->Append(line_node);
261
262 if (!IsSoftHyphen(character))
263 block_rect = pp::Rect();
264 line.clear();
265 line_rect = pp::Rect();
266 word_rect = pp::Rect();
267 seen_literal_text_in_word = false;
268 }
269 continue;
270 }
271 seen_literal_text_in_word = seen_literal_text_in_word ||
272 !base::IsUnicodeWhitespace(character);
273 line.push_back(character);
274
275 if (!char_rect.IsEmpty()) {
276 if (line_rect.IsEmpty())
277 line_rect = char_rect;
278 else
279 line_rect.Union(char_rect);
Lei Zhang 2016/01/08 04:01:36 Union() calls like this have no effect.
dmazzoni 2016/01/11 19:58:01 Thanks! I meant to assign it.
280 if (block_rect.IsEmpty())
281 block_rect = char_rect;
282 else
283 block_rect.Union(char_rect);
284
285 if (!base::IsUnicodeWhitespace(character)) {
286 if (word_rect.IsEmpty())
287 word_rect = char_rect;
288 else
289 word_rect.Union(char_rect);
290 }
291 }
292 }
293
147 node->Set(kPageTextBox, text); // Takes ownership of |text| 294 node->Set(kPageTextBox, text); // Takes ownership of |text|
148 295
149 return node; 296 return node;
150 } 297 }
151 298
152 base::Value* PDFiumPage::GetTextBoxAsValue(double page_height,
153 double left, double top,
154 double right, double bottom,
155 int rotation) {
156 base::string16 text_utf16;
157 int char_count =
158 FPDFText_GetBoundedText(GetTextPage(), left, top, right, bottom, NULL, 0);
159 if (char_count > 0) {
160 unsigned short* data = reinterpret_cast<unsigned short*>(
161 base::WriteInto(&text_utf16, char_count + 1));
162 FPDFText_GetBoundedText(GetTextPage(),
163 left, top, right, bottom,
164 data, char_count);
165 }
166 std::string text_utf8 = base::UTF16ToUTF8(text_utf16);
167
168 FPDF_LINK link = FPDFLink_GetLinkAtPoint(GetPage(), left, top);
169 Area area;
170 std::vector<LinkTarget> targets;
171 if (link) {
172 targets.push_back(LinkTarget());
173 area = GetLinkTarget(link, &targets[0]);
174 } else {
175 pp::Rect rect(
176 PageToScreen(pp::Point(), 1.0, left, top, right, bottom, rotation));
177 GetLinks(rect, &targets);
178 area = targets.size() == 0 ? TEXT_AREA : WEBLINK_AREA;
179 }
180
181 int char_index = FPDFText_GetCharIndexAtPos(GetTextPage(), left, top,
182 kTolerance, kTolerance);
183 double font_size = FPDFText_GetFontSize(GetTextPage(), char_index);
184
185 base::DictionaryValue* node = new base::DictionaryValue();
186 node->SetDouble(kTextBoxLeft, left);
187 node->SetDouble(kTextBoxTop, page_height - top);
188 node->SetDouble(kTextBoxWidth, right - left);
189 node->SetDouble(kTextBoxHeight, top - bottom);
190 node->SetDouble(kTextBoxFontSize, font_size);
191
192 base::ListValue* text_nodes = new base::ListValue();
193
194 if (area == DOCLINK_AREA) {
195 std::string url = kDocLinkURLPrefix + base::IntToString(targets[0].page);
196 text_nodes->Append(CreateURLNode(text_utf8, url));
197 } else if (area == WEBLINK_AREA && link) {
198 text_nodes->Append(CreateURLNode(text_utf8, targets[0].url));
199 } else if (area == WEBLINK_AREA && !link) {
200 size_t start = 0;
201 for (size_t i = 0; i < targets.size(); ++i) {
202 // If there is an extra NULL character at end, find() will not return any
203 // matches. There should not be any though.
204 if (!targets[i].url.empty())
205 DCHECK(targets[i].url[targets[i].url.size() - 1] != '\0');
206
207 // PDFium may change the case of generated links.
208 std::string lowerCaseURL = base::ToLowerASCII(targets[i].url);
209 std::string lowerCaseText = base::ToLowerASCII(text_utf8);
210 size_t pos = lowerCaseText.find(lowerCaseURL, start);
211 size_t length = targets[i].url.size();
212 if (pos == std::string::npos) {
213 // Check if the link is a "mailto:" URL
214 if (lowerCaseURL.compare(0, 7, "mailto:") == 0) {
215 pos = lowerCaseText.find(lowerCaseURL.substr(7), start);
216 length -= 7;
217 }
218
219 if (pos == std::string::npos) {
220 // No match has been found. This should never happen.
221 continue;
222 }
223 }
224
225 std::string before_text = text_utf8.substr(start, pos - start);
226 if (before_text.size() > 0)
227 text_nodes->Append(CreateTextNode(before_text));
228 std::string link_text = text_utf8.substr(pos, length);
229 text_nodes->Append(CreateURLNode(link_text, targets[i].url));
230
231 start = pos + length;
232 }
233 std::string before_text = text_utf8.substr(start);
234 if (before_text.size() > 0)
235 text_nodes->Append(CreateTextNode(before_text));
236 } else {
237 text_nodes->Append(CreateTextNode(text_utf8));
238 }
239
240 node->Set(kTextBoxNodes, text_nodes); // Takes ownership of |text_nodes|.
241 return node;
242 }
243
244 base::Value* PDFiumPage::CreateTextNode(const std::string& text) {
245 base::DictionaryValue* node = new base::DictionaryValue();
246 node->SetString(kTextNodeType, kTextNodeTypeText);
247 node->SetString(kTextNodeText, text);
248 return node;
249 }
250
251 base::Value* PDFiumPage::CreateURLNode(const std::string& text,
252 const std::string& url) {
253 base::DictionaryValue* node = new base::DictionaryValue();
254 node->SetString(kTextNodeType, kTextNodeTypeURL);
255 node->SetString(kTextNodeText, text);
256 node->SetString(kTextNodeURL, url);
257 return node;
258 }
259
260 PDFiumPage::Area PDFiumPage::GetCharIndex(const pp::Point& point, 299 PDFiumPage::Area PDFiumPage::GetCharIndex(const pp::Point& point,
261 int rotation, 300 int rotation,
262 int* char_index, 301 int* char_index,
263 int* form_type, 302 int* form_type,
264 LinkTarget* target) { 303 LinkTarget* target) {
265 if (!available_) 304 if (!available_)
266 return NONSELECTABLE_AREA; 305 return NONSELECTABLE_AREA;
267 pp::Point point2 = point - rect_.point(); 306 pp::Point point2 = point - rect_.point();
268 double new_x; 307 double new_x;
269 double new_y; 308 double new_y;
(...skipping 258 matching lines...) Expand 10 before | Expand all | Expand 10 after
528 page_->loading_count_--; 567 page_->loading_count_--;
529 } 568 }
530 569
531 PDFiumPage::Link::Link() { 570 PDFiumPage::Link::Link() {
532 } 571 }
533 572
534 PDFiumPage::Link::~Link() { 573 PDFiumPage::Link::~Link() {
535 } 574 }
536 575
537 } // namespace chrome_pdf 576 } // namespace chrome_pdf
OLD | NEW
« no previous file with comments | « pdf/pdfium/pdfium_page.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698