pdf/pdfium/pdfium_page.cc - Issue 1568723002: Improve extraction of accessible text from PDF.

Side by Side Diff: pdf/pdfium/pdfium_page.cc

Issue 1568723002: Improve extraction of accessible text from PDF. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@fix_pdf

Patch Set: Get rid of unnecessary assertion in test Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "pdf/pdfium/pdfium_page.h"	5 #include "pdf/pdfium/pdfium_page.h"

6	6

7 #include <math.h>	7 #include <math.h>

8 #include <stddef.h>	8 #include <stddef.h>

9	9

	10 #include <algorithm>

	11

10 #include "base/logging.h"	12 #include "base/logging.h"

11 #include "base/strings/string_number_conversions.h"	13 #include "base/strings/string_number_conversions.h"

12 #include "base/strings/string_util.h"	14 #include "base/strings/string_util.h"

13 #include "base/strings/utf_string_conversions.h"	15 #include "base/strings/utf_string_conversions.h"

14 #include "base/values.h"	16 #include "base/values.h"

15 #include "pdf/pdfium/pdfium_api_string_buffer_adapter.h"	17 #include "pdf/pdfium/pdfium_api_string_buffer_adapter.h"

16 #include "pdf/pdfium/pdfium_engine.h"	18 #include "pdf/pdfium/pdfium_engine.h"

17	19

18 // Used when doing hit detection.	20 // Used when doing hit detection.

19 #define kTolerance 20.0	21 #define kTolerance 20.0

20	22

21 namespace {	23 namespace {

22	24

23 // Dictionary Value key names for returning the accessible page content as JSON.	25 // Dictionary Value key names for returning the accessible page content as JSON.

24 const char kPageWidth[] = "width";	26 const char kPageWidth[] = "width";

25 const char kPageHeight[] = "height";	27 const char kPageHeight[] = "height";

26 const char kPageTextBox[] = "textBox";	28 const char kPageTextBox[] = "textBox";

27 const char kTextBoxLeft[] = "left";	29 const char kTextBoxLeft[] = "left";

28 const char kTextBoxTop[] = "top";	30 const char kTextBoxTop[] = "top";

29 const char kTextBoxWidth[] = "width";	31 const char kTextBoxWidth[] = "width";

30 const char kTextBoxHeight[] = "height";	32 const char kTextBoxHeight[] = "height";

31 const char kTextBoxFontSize[] = "fontSize";	33 const char kTextBoxFontSize[] = "fontSize";

32 const char kTextBoxNodes[] = "textNodes";	34 const char kTextBoxNodes[] = "textNodes";

33 const char kTextNodeType[] = "type";	35 const char kTextNodeType[] = "type";

34 const char kTextNodeText[] = "text";	36 const char kTextNodeText[] = "text";

35 const char kTextNodeURL[] = "url";

36 const char kTextNodeTypeText[] = "text";	37 const char kTextNodeTypeText[] = "text";

37 const char kTextNodeTypeURL[] = "url";	38

38 const char kDocLinkURLPrefix[] = "#page";	39 pp::Rect PageRectToGViewRect(FPDF_PAGE page, const pp::Rect& input) {

	40 int output_width = FPDF_GetPageWidth(page);

	41 int output_height = FPDF_GetPageHeight(page);

	42

	43 int min_x;

	44 int min_y;

	45 int max_x;

	46 int max_y;

	47 FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0,

	48 input.x(), input.y(), &min_x, &min_y);

	49 FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0,

	50 input.right(), input.bottom(), &max_x, &max_y);

	51

	52 if (max_x < min_x)

	53 std::swap(min_x, max_x);

	54 if (max_y < min_y)

	55 std::swap(min_y, max_y);

	56

	57 pp::Rect output_rect(min_x, min_y, max_x - min_x, max_y - min_y);

	58 output_rect.Intersect(pp::Rect(0, 0, output_width, output_height));

	59 return output_rect;

	60 }

	61

	62 pp::Rect GetCharRectInGViewCoords(FPDF_PAGE page, FPDF_TEXTPAGE text_page,

	63 int index) {

	64 double left, right, bottom, top;

	65 FPDFText_GetCharBox(text_page, index, &left, &right, &bottom, &top);

	66 if (right < left)

	67 std::swap(left, right);

	68 if (bottom < top)

	69 std::swap(top, bottom);

	70 pp::Rect page_coords(left, top, right - left, bottom - top);

	71 return PageRectToGViewRect(page, page_coords);

	72 }

	73

	74 // This is the character PDFium inserts where a word is broken across lines.

	75 const unsigned int kSoftHyphen = 0x02;

	76

	77 // The following characters should all be recognized as Unicode newlines:

	78 // LF: Line Feed, U+000A

	79 // VT: Vertical Tab, U+000B

	80 // FF: Form Feed, U+000C

	81 // CR: Carriage Return, U+000D

	82 // CR+LF: CR (U+000D) followed by LF (U+000A)

	83 // NEL: Next Line, U+0085

	84 // LS: Line Separator, U+2028

	85 // PS: Paragraph Separator, U+2029.

	86 // Source: http://en.wikipedia.org/wiki/Newline#Unicode .

	87 const unsigned int kUnicodeNewlines[] = {

	88 0xA, 0xB, 0xC, 0xD, 0X85, 0x2028, 0x2029

	89 };

	90

	91 bool IsSoftHyphen(unsigned int character) {

	92 return kSoftHyphen == character;

	93 }

	94

	95 bool OverlapsOnYAxis(const pp::Rect &a, const pp::Rect& b) {

	96 return !(a.IsEmpty() || b.IsEmpty() ||

	97 a.bottom() < b.y() || b.bottom() < a.y());

	98 }

	99

	100 bool IsEol(unsigned int character) {

	101 const unsigned int* first = kUnicodeNewlines;

	102 const unsigned int* last = kUnicodeNewlines + arraysize(kUnicodeNewlines);

	103 return std::find(first, last, character) != last;

	104 }

39	105

40 } // namespace	106 } // namespace

41	107

42 namespace chrome_pdf {	108 namespace chrome_pdf {

43	109

44 PDFiumPage::PDFiumPage(PDFiumEngine* engine,	110 PDFiumPage::PDFiumPage(PDFiumEngine* engine,

45 int i,	111 int i,

46 const pp::Rect& r,	112 const pp::Rect& r,

47 bool available)	113 bool available)

48 : engine_(engine),	114 : engine_(engine),

(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
123 }	189 }

124 return text_page_;	190 return text_page_;

125 }	191 }

126	192

127 base::Value* PDFiumPage::GetAccessibleContentAsValue(int rotation) {	193 base::Value* PDFiumPage::GetAccessibleContentAsValue(int rotation) {

128 base::DictionaryValue* node = new base::DictionaryValue();	194 base::DictionaryValue* node = new base::DictionaryValue();

129	195

130 if (!available_)	196 if (!available_)

131 return node;	197 return node;

132	198

133 double width = FPDF_GetPageWidth(GetPage());	199 FPDF_PAGE page = GetPage();

134 double height = FPDF_GetPageHeight(GetPage());	200 FPDF_TEXTPAGE text_page = GetTextPage();

135	201

136 base::ListValue* text = new base::ListValue();	202 double width = FPDF_GetPageWidth(page);

137 int box_count = FPDFText_CountRects(GetTextPage(), 0, GetCharCount());	203 double height = FPDF_GetPageHeight(page);

138 for (int i = 0; i < box_count; i++) {

139 double left, top, right, bottom;

140 FPDFText_GetRect(GetTextPage(), i, &left, &top, &right, &bottom);

141 text->Append(

142 GetTextBoxAsValue(height, left, top, right, bottom, rotation));

143 }

144	204

145 node->SetDouble(kPageWidth, width);	205 node->SetDouble(kPageWidth, width);

146 node->SetDouble(kPageHeight, height);	206 node->SetDouble(kPageHeight, height);

147 node->Set(kPageTextBox, text); // Takes ownership of |text|	207 scoped_ptr<base::ListValue> text(new base::ListValue());

	208

	209 int chars_count = FPDFText_CountChars(text_page);

	210 pp::Rect line_rect;

	211 pp::Rect word_rect;

	212 bool seen_literal_text_in_word = false;

	213

	214 // Iterate over all of the chars on the page. Explicitly run the loop

	215 // with |i == chars_count|, which is one past the last character, and

	216 // pretend it's a newline character in order to ensure we always flush

	217 // the last line.

	218 base::string16 line;

	219 for (int i = 0; i <= chars_count; i++) {

	220 unsigned int character;

	221 pp::Rect char_rect;

	222

	223 if (i < chars_count) {

	224 character = FPDFText_GetUnicode(text_page, i);

	225 char_rect = GetCharRectInGViewCoords(page, text_page, i);

	226 } else {

	227 // Make the last character a newline so the last line isn't lost.

	228 character = '\n';

	229 }

	230

	231 // There are spurious STX chars appearing in place

	232 // of ligatures. Apply a heuristic to check that some vertical displacement

	233 // is involved before assuming they are line-breaks.

	234 bool is_intraword_linebreak = false;

	235 if (i < chars_count - 1 && IsSoftHyphen(character)) {

	236 // Check whether this char and the next char are on different lines.

	237 pp::Rect next_char_rect = GetCharRectInGViewCoords(

	238 page, text_page, i + 1);

	239

	240 // TODO(dmazzoni): this assumes horizontal text.

	241 // https://crbug.com/580311

	242 is_intraword_linebreak = !OverlapsOnYAxis(char_rect, next_char_rect);

	243 }

	244 if (is_intraword_linebreak ||

	245 base::IsUnicodeWhitespace(character) ||

	246 IsEol(character)) {

	247 if (!word_rect.IsEmpty() && seen_literal_text_in_word) {

	248 word_rect = pp::Rect();

	249 seen_literal_text_in_word = false;

	250 }

	251 }

	252

	253 if (is_intraword_linebreak || IsEol(character)) {

	254 if (!line_rect.IsEmpty()) {

	255 if (is_intraword_linebreak) {

	256 // Add a 0-width hyphen.

	257 line.push_back('-');

	258 }

	259

	260 base::DictionaryValue* text_node = new base::DictionaryValue();

	261 text_node->SetString(kTextNodeType, kTextNodeTypeText);

	262 text_node->SetString(kTextNodeText, line);

	263

	264 base::ListValue* text_nodes = new base::ListValue();

	265 text_nodes->Append(text_node);

	266

	267 base::DictionaryValue* line_node = new base::DictionaryValue();

	268 line_node->SetDouble(kTextBoxLeft, line_rect.x());

	269 line_node->SetDouble(kTextBoxTop, line_rect.y());

	270 line_node->SetDouble(kTextBoxWidth, line_rect.width());

	271 line_node->SetDouble(kTextBoxHeight, line_rect.height());

	272 line_node->SetDouble(kTextBoxFontSize,

	273 FPDFText_GetFontSize(text_page, i));

	274 line_node->Set(kTextBoxNodes, text_nodes);

	275 text->Append(line_node);

	276

	277 line.clear();

	278 line_rect = pp::Rect();

	279 word_rect = pp::Rect();

	280 seen_literal_text_in_word = false;

	281 }

	282 continue;

	283 }

	284 seen_literal_text_in_word = seen_literal_text_in_word ||

	285 !base::IsUnicodeWhitespace(character);

	286 line.push_back(character);

	287

	288 if (!char_rect.IsEmpty()) {

	289 line_rect = line_rect.Union(char_rect);

	290

	291 if (!base::IsUnicodeWhitespace(character))

	292 word_rect = word_rect.Union(char_rect);

	293 }

	294 }

	295

	296 node->Set(kPageTextBox, text.release()); // Takes ownership of |text|

148	297

149 return node;	298 return node;

150 }	299 }

151	300

152 base::Value* PDFiumPage::GetTextBoxAsValue(double page_height,

153 double left, double top,

154 double right, double bottom,

155 int rotation) {

156 base::string16 text_utf16;

157 int char_count =

158 FPDFText_GetBoundedText(GetTextPage(), left, top, right, bottom, NULL, 0);

159 if (char_count > 0) {

160 unsigned short* data = reinterpret_cast<unsigned short*>(

161 base::WriteInto(&text_utf16, char_count + 1));

162 FPDFText_GetBoundedText(GetTextPage(),

163 left, top, right, bottom,

164 data, char_count);

165 }

166 std::string text_utf8 = base::UTF16ToUTF8(text_utf16);

167

168 FPDF_LINK link = FPDFLink_GetLinkAtPoint(GetPage(), left, top);

169 Area area;

170 std::vector<LinkTarget> targets;

171 if (link) {

172 targets.push_back(LinkTarget());

173 area = GetLinkTarget(link, &targets[0]);

174 } else {

175 pp::Rect rect(

176 PageToScreen(pp::Point(), 1.0, left, top, right, bottom, rotation));

177 GetLinks(rect, &targets);

178 area = targets.empty() ? TEXT_AREA : WEBLINK_AREA;

179 }

180

181 int char_index = FPDFText_GetCharIndexAtPos(GetTextPage(), left, top,

182 kTolerance, kTolerance);

183 double font_size = FPDFText_GetFontSize(GetTextPage(), char_index);

184

185 base::DictionaryValue* node = new base::DictionaryValue();

186 node->SetDouble(kTextBoxLeft, left);

187 node->SetDouble(kTextBoxTop, page_height - top);

188 node->SetDouble(kTextBoxWidth, right - left);

189 node->SetDouble(kTextBoxHeight, top - bottom);

190 node->SetDouble(kTextBoxFontSize, font_size);

191

192 base::ListValue* text_nodes = new base::ListValue();

193

194 if (area == DOCLINK_AREA) {

195 std::string url = kDocLinkURLPrefix + base::IntToString(targets[0].page);

196 text_nodes->Append(CreateURLNode(text_utf8, url));

197 } else if (area == WEBLINK_AREA && link) {

198 text_nodes->Append(CreateURLNode(text_utf8, targets[0].url));

199 } else if (area == WEBLINK_AREA && !link) {

200 size_t start = 0;

201 for (const auto& target : targets) {

202 // If there is an extra NULL character at end, find() will not return any

203 // matches. There should not be any though.

204 if (!target.url.empty())

205 DCHECK_NE(target.url.back(), '\0');

206

207 // PDFium may change the case of generated links.

208 std::string lowerCaseURL = base::ToLowerASCII(target.url);

209 std::string lowerCaseText = base::ToLowerASCII(text_utf8);

210 size_t pos = lowerCaseText.find(lowerCaseURL, start);

211 size_t length = target.url.size();

212 if (pos == std::string::npos) {

213 // Check if the link is a "mailto:" URL

214 if (lowerCaseURL.compare(0, 7, "mailto:") == 0) {

215 pos = lowerCaseText.find(lowerCaseURL.substr(7), start);

216 length -= 7;

217 }

218

219 if (pos == std::string::npos) {

220 // No match has been found. This should never happen.

221 continue;

222 }

223 }

224

225 std::string before_text = text_utf8.substr(start, pos - start);

226 if (!before_text.empty())

227 text_nodes->Append(CreateTextNode(before_text));

228 std::string link_text = text_utf8.substr(pos, length);

229 text_nodes->Append(CreateURLNode(link_text, target.url));

230

231 start = pos + length;

232 }

233 std::string before_text = text_utf8.substr(start);

234 if (!before_text.empty())

235 text_nodes->Append(CreateTextNode(before_text));

236 } else {

237 text_nodes->Append(CreateTextNode(text_utf8));

238 }

239

240 node->Set(kTextBoxNodes, text_nodes); // Takes ownership of |text_nodes|.

241 return node;

242 }

243

244 base::Value* PDFiumPage::CreateTextNode(const std::string& text) {

245 base::DictionaryValue* node = new base::DictionaryValue();

246 node->SetString(kTextNodeType, kTextNodeTypeText);

247 node->SetString(kTextNodeText, text);

248 return node;

249 }

250

251 base::Value* PDFiumPage::CreateURLNode(const std::string& text,

252 const std::string& url) {

253 base::DictionaryValue* node = new base::DictionaryValue();

254 node->SetString(kTextNodeType, kTextNodeTypeURL);

255 node->SetString(kTextNodeText, text);

256 node->SetString(kTextNodeURL, url);

257 return node;

258 }

259

260 PDFiumPage::Area PDFiumPage::GetCharIndex(const pp::Point& point,	301 PDFiumPage::Area PDFiumPage::GetCharIndex(const pp::Point& point,

261 int rotation,	302 int rotation,

262 int* char_index,	303 int* char_index,

263 int* form_type,	304 int* form_type,

264 LinkTarget* target) {	305 LinkTarget* target) {

265 if (!available_)	306 if (!available_)

266 return NONSELECTABLE_AREA;	307 return NONSELECTABLE_AREA;

267 pp::Point point2 = point - rect_.point();	308 pp::Point point2 = point - rect_.point();

268 double new_x;	309 double new_x;

269 double new_y;	310 double new_y;

(...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
525 page_->loading_count_--;	566 page_->loading_count_--;

526 }	567 }

527	568

528 PDFiumPage::Link::Link() {	569 PDFiumPage::Link::Link() {

529 }	570 }

530	571

531 PDFiumPage::Link::~Link() {	572 PDFiumPage::Link::~Link() {

532 }	573 }

533	574

534 } // namespace chrome_pdf	575 } // namespace chrome_pdf

OLD	NEW

« no previous file with comments | « pdf/pdfium/pdfium_page.h ('k') | no next file » | no next file with comments »