Index: pdf/pdfium/pdfium_page.cc |
diff --git a/pdf/pdfium/pdfium_page.cc b/pdf/pdfium/pdfium_page.cc |
index c7e53393331ec021073af5f2afc69a08e4354f59..5c1eb319787f5ae19bc4db62d40e3ffd53cccf64 100644 |
--- a/pdf/pdfium/pdfium_page.cc |
+++ b/pdf/pdfium/pdfium_page.cc |
@@ -372,8 +372,8 @@ void PDFiumPage::GetTextRunInfo(int start_char_index, |
if (!base::IsUnicodeWhitespace(character)) { |
// TODO(dmazzoni): this assumes horizontal text. |
// https://crbug.com/580311 |
- pp::FloatRect char_rect = |
- GetFloatCharRectInPixels(page, text_page, char_index); |
+ pp::FloatRect char_rect = GetFloatCharRectInPixels( |
+ page, text_page, char_index); |
if (!char_rect.IsEmpty() && !OverlapsOnYAxis(text_run_bounds, char_rect)) |
break; |
@@ -381,12 +381,27 @@ void PDFiumPage::GetTextRunInfo(int start_char_index, |
if (font_size != text_run_font_size) |
break; |
+ // Heuristic: split a text run after a space longer than 3 average |
+ // characters. |
+ double avg_char_width = |
+ text_run_bounds.width() / (char_index - start_char_index); |
+ if (char_rect.x() - text_run_bounds.right() > avg_char_width * 3) |
+ break; |
+ |
text_run_bounds = text_run_bounds.Union(char_rect); |
} |
char_index++; |
} |
+ // Some PDFs have missing or obviously bogus font sizes; substitute the |
+ // height of the bounding box in those cases. |
+ if (text_run_font_size <= 1 || |
+ text_run_font_size < text_run_bounds.height() / 2 || |
+ text_run_font_size > text_run_bounds.height() * 2) { |
+ text_run_font_size = text_run_bounds.height(); |
+ } |
+ |
*out_len = char_index - start_char_index; |
*out_font_size = text_run_font_size; |
*out_bounds = text_run_bounds; |
@@ -397,10 +412,10 @@ uint32_t PDFiumPage::GetCharUnicode(int char_index) { |
return FPDFText_GetUnicode(text_page, char_index); |
} |
-double PDFiumPage::GetCharWidth(int char_index) { |
+pp::FloatRect PDFiumPage::GetCharBounds(int char_index) { |
FPDF_PAGE page = GetPage(); |
FPDF_TEXTPAGE text_page = GetTextPage(); |
- return GetFloatCharRectInPixels(page, text_page, char_index).width(); |
+ return GetFloatCharRectInPixels(page, text_page, char_index); |
} |
PDFiumPage::Area PDFiumPage::GetCharIndex(const pp::Point& point, |