Index: pdf/pdfium/pdfium_page.cc |
diff --git a/pdf/pdfium/pdfium_page.cc b/pdf/pdfium/pdfium_page.cc |
index 5e0b192b5ab4182ba079f27534706609dfbc5046..c68873093ce78b1edd53948753cafc211b985e34 100644 |
--- a/pdf/pdfium/pdfium_page.cc |
+++ b/pdf/pdfium/pdfium_page.cc |
@@ -32,10 +32,74 @@ const char kTextBoxFontSize[] = "fontSize"; |
const char kTextBoxNodes[] = "textNodes"; |
const char kTextNodeType[] = "type"; |
const char kTextNodeText[] = "text"; |
-const char kTextNodeURL[] = "url"; |
const char kTextNodeTypeText[] = "text"; |
-const char kTextNodeTypeURL[] = "url"; |
-const char kDocLinkURLPrefix[] = "#page"; |
+ |
+pp::Rect PageRectToGViewRect(const pp::Rect &input, FPDF_PAGE page) { |
Lei Zhang
2016/01/08 04:01:36
"pp::Rect& input", put |page| as the first param.
dmazzoni
2016/01/11 19:58:01
Done.
|
+ int output_width = FPDF_GetPageWidth(page); |
+ int output_height = FPDF_GetPageHeight(page); |
+ |
+ int min_x, min_y; |
+ int max_x, max_y; |
+ FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, |
+ input.x(), input.y(), &min_x, &min_y); |
+ FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, |
+ input.right(), input.bottom(), &max_x, &max_y); |
+ |
+ if (max_x < min_x) |
+ std::swap(min_x, max_x); |
+ if (max_y < min_y) |
+ std::swap(min_y, max_y); |
+ |
+ pp::Rect output_rect(min_x, min_y, max_x - min_x, max_y - min_y); |
+ output_rect.Intersect(pp::Rect(0, 0, output_width, output_height)); |
+ if (output_rect.IsEmpty()) { |
+ VLOG(9) << "xml-invalid-rectangle"; |
+ } |
+ return output_rect; |
+} |
+ |
+pp::Rect GetCharRectInGViewCoords(FPDF_PAGE page, FPDF_TEXTPAGE text_page, |
+ int index) { |
+ double left, right, bottom, top; |
+ FPDFText_GetCharBox(text_page, index, &left, &right, &bottom, &top); |
+ if (right < left) |
+ std::swap(left, right); |
+ if (bottom < top) |
+ std::swap(top, bottom); |
+ pp::Rect page_coords(left, top, right - left, bottom - top); |
+ return PageRectToGViewRect(page_coords, page); |
+} |
+ |
+// This is the character Foxit inserts where a word is broken across lines. |
+const unsigned int kSoftHyphen = 0x02; |
+// The following characters should all be recognized as Unicode newlines: |
+// LF: Line Feed, U+000A |
+// VT: Vertical Tab, U+000B |
+// FF: Form Feed, U+000C |
+// CR: Carriage Return, U+000D |
+// CR+LF: CR (U+000D) followed by LF (U+000A) |
+// NEL: Next Line, U+0085 |
+// LS: Line Separator, U+2028 |
+// PS: Paragraph Separator, U+2029. |
+// Source: http://en.wikipedia.org/wiki/Newline#Unicode . |
+const unsigned int kUnicodeNewlines[] = { |
+ 0xA, 0xB, 0xC, 0xD, 0X85, 0x2028, 0x2029, 0 |
Lei Zhang
2016/01/08 04:01:36
Don't need the 0 sentinel value at the end?
dmazzoni
2016/01/11 19:58:02
Done.
|
+}; |
+ |
+bool IsSoftHyphen(unsigned int character) { |
+ return kSoftHyphen == character; |
+} |
+ |
+bool OverlapsOnYAxis(const pp::Rect &a, const pp::Rect& b) { |
+ return !(a.IsEmpty() || b.IsEmpty() || |
+ a.bottom() < b.y() || b.bottom() < a.y()); |
+} |
+ |
+bool IsEol(unsigned int character) { |
+ const unsigned int* first = kUnicodeNewlines; |
+ const unsigned int* last = kUnicodeNewlines + arraysize(kUnicodeNewlines); |
+ return std::find(first, last, character) != last; |
+} |
} // namespace |
@@ -130,130 +194,105 @@ base::Value* PDFiumPage::GetAccessibleContentAsValue(int rotation) { |
if (!available_) |
return node; |
- double width = FPDF_GetPageWidth(GetPage()); |
- double height = FPDF_GetPageHeight(GetPage()); |
+ FPDF_PAGE page = GetPage(); |
+ FPDF_TEXTPAGE text_page = GetTextPage(); |
- base::ListValue* text = new base::ListValue(); |
- int box_count = FPDFText_CountRects(GetTextPage(), 0, GetCharCount()); |
- for (int i = 0; i < box_count; i++) { |
- double left, top, right, bottom; |
- FPDFText_GetRect(GetTextPage(), i, &left, &top, &right, &bottom); |
- text->Append( |
- GetTextBoxAsValue(height, left, top, right, bottom, rotation)); |
- } |
+ double width = FPDF_GetPageWidth(page); |
+ double height = FPDF_GetPageHeight(page); |
node->SetDouble(kPageWidth, width); |
node->SetDouble(kPageHeight, height); |
- node->Set(kPageTextBox, text); // Takes ownership of |text| |
- |
- return node; |
-} |
- |
-base::Value* PDFiumPage::GetTextBoxAsValue(double page_height, |
- double left, double top, |
- double right, double bottom, |
- int rotation) { |
- base::string16 text_utf16; |
- int char_count = |
- FPDFText_GetBoundedText(GetTextPage(), left, top, right, bottom, NULL, 0); |
- if (char_count > 0) { |
- unsigned short* data = reinterpret_cast<unsigned short*>( |
- base::WriteInto(&text_utf16, char_count + 1)); |
- FPDFText_GetBoundedText(GetTextPage(), |
- left, top, right, bottom, |
- data, char_count); |
- } |
- std::string text_utf8 = base::UTF16ToUTF8(text_utf16); |
- |
- FPDF_LINK link = FPDFLink_GetLinkAtPoint(GetPage(), left, top); |
- Area area; |
- std::vector<LinkTarget> targets; |
- if (link) { |
- targets.push_back(LinkTarget()); |
- area = GetLinkTarget(link, &targets[0]); |
- } else { |
- pp::Rect rect( |
- PageToScreen(pp::Point(), 1.0, left, top, right, bottom, rotation)); |
- GetLinks(rect, &targets); |
- area = targets.size() == 0 ? TEXT_AREA : WEBLINK_AREA; |
- } |
- |
- int char_index = FPDFText_GetCharIndexAtPos(GetTextPage(), left, top, |
- kTolerance, kTolerance); |
- double font_size = FPDFText_GetFontSize(GetTextPage(), char_index); |
+ base::ListValue* text = new base::ListValue(); |
- base::DictionaryValue* node = new base::DictionaryValue(); |
- node->SetDouble(kTextBoxLeft, left); |
- node->SetDouble(kTextBoxTop, page_height - top); |
- node->SetDouble(kTextBoxWidth, right - left); |
- node->SetDouble(kTextBoxHeight, top - bottom); |
- node->SetDouble(kTextBoxFontSize, font_size); |
- |
- base::ListValue* text_nodes = new base::ListValue(); |
- |
- if (area == DOCLINK_AREA) { |
- std::string url = kDocLinkURLPrefix + base::IntToString(targets[0].page); |
- text_nodes->Append(CreateURLNode(text_utf8, url)); |
- } else if (area == WEBLINK_AREA && link) { |
- text_nodes->Append(CreateURLNode(text_utf8, targets[0].url)); |
- } else if (area == WEBLINK_AREA && !link) { |
- size_t start = 0; |
- for (size_t i = 0; i < targets.size(); ++i) { |
- // If there is an extra NULL character at end, find() will not return any |
- // matches. There should not be any though. |
- if (!targets[i].url.empty()) |
- DCHECK(targets[i].url[targets[i].url.size() - 1] != '\0'); |
- |
- // PDFium may change the case of generated links. |
- std::string lowerCaseURL = base::ToLowerASCII(targets[i].url); |
- std::string lowerCaseText = base::ToLowerASCII(text_utf8); |
- size_t pos = lowerCaseText.find(lowerCaseURL, start); |
- size_t length = targets[i].url.size(); |
- if (pos == std::string::npos) { |
- // Check if the link is a "mailto:" URL |
- if (lowerCaseURL.compare(0, 7, "mailto:") == 0) { |
- pos = lowerCaseText.find(lowerCaseURL.substr(7), start); |
- length -= 7; |
- } |
+ int chars_count = FPDFText_CountChars(text_page); |
+ pp::Rect block_rect; |
Lei Zhang
2016/01/08 04:01:36
Is this needed? It's being written to, but not rea
dmazzoni
2016/01/11 19:58:02
Done.
|
+ pp::Rect line_rect; |
+ pp::Rect word_rect; |
+ bool seen_literal_text_in_word = false; |
+ |
+ base::string16 line; |
+ for (int i = 0; i <= chars_count; i++) { |
Lei Zhang
2016/01/08 04:01:36
Isn't the last iteration going out of bounds?
dmazzoni
2016/01/11 19:58:02
Done.
|
+ unsigned int character = FPDFText_GetUnicode(text_page, i); |
+ pp::Rect char_rect = GetCharRectInGViewCoords(page, text_page, i); |
+ |
+ // Due to b/9598615 there are spurious STX chars appearing in place |
Lei Zhang
2016/01/08 04:01:36
Has that bug been fixed?
dmazzoni
2016/01/11 19:58:02
@jbreiden:
Not sure.
Honestly it looks reasonabl
|
+ // of ligatures. Apply a heuristic to check that some vertical displacement |
+ // is involved before assuming they are line breaks. |
+ bool is_intraword_linebreak = false; |
+ if (IsSoftHyphen(character)) { |
+ if (i < chars_count) { |
+ // Check whether this char and the next char are on different lines. |
+ pp::Rect next_char_rect = GetCharRectInGViewCoords( |
+ page, text_page, i + 1); |
+ is_intraword_linebreak = !OverlapsOnYAxis(char_rect, next_char_rect); |
+ } |
+ } |
+ if (base::IsUnicodeWhitespace(character) || |
+ IsEol(character) || is_intraword_linebreak) { |
Lei Zhang
2016/01/08 04:01:36
Check |is_intraword_linebreak| first since that's
dmazzoni
2016/01/11 19:58:02
Done.
|
+ if (!word_rect.IsEmpty() && seen_literal_text_in_word) { |
+ word_rect = pp::Rect(); //.SetEmpty(); |
+ seen_literal_text_in_word = false; |
+ } |
+ } |
- if (pos == std::string::npos) { |
- // No match has been found. This should never happen. |
- continue; |
+ if (IsEol(character) || is_intraword_linebreak) { |
+ if (!line_rect.IsEmpty()) { |
+ if (is_intraword_linebreak) { |
+ // Add a 0-width hyphen. Projector will treat the first word of the |
Lei Zhang
2016/01/08 04:01:36
Reference to internal project name?
dmazzoni
2016/01/11 19:58:01
Done.
|
+ // next line as a continuation. |
+ line.push_back('-'); |
} |
+ base::DictionaryValue* line_node = new base::DictionaryValue(); |
+ line_node->SetDouble(kTextBoxLeft, line_rect.x()); |
+ line_node->SetDouble(kTextBoxTop, line_rect.y()); |
+ line_node->SetDouble(kTextBoxWidth, line_rect.width()); |
+ line_node->SetDouble(kTextBoxHeight, line_rect.height()); |
+ line_node->SetDouble(kTextBoxFontSize, |
+ FPDFText_GetFontSize(text_page, i)); |
+ |
+ base::ListValue* text_nodes = new base::ListValue(); |
+ base::DictionaryValue* text_node = new base::DictionaryValue(); |
+ text_node->SetString(kTextNodeType, kTextNodeTypeText); |
+ text_node->SetString(kTextNodeText, line); |
+ text_nodes->Append(text_node); |
+ |
+ line_node->Set(kTextBoxNodes, text_nodes); |
+ text->Append(line_node); |
+ |
+ if (!IsSoftHyphen(character)) |
+ block_rect = pp::Rect(); |
+ line.clear(); |
+ line_rect = pp::Rect(); |
+ word_rect = pp::Rect(); |
+ seen_literal_text_in_word = false; |
+ } |
+ continue; |
+ } |
+ seen_literal_text_in_word = seen_literal_text_in_word || |
+ !base::IsUnicodeWhitespace(character); |
+ line.push_back(character); |
+ |
+ if (!char_rect.IsEmpty()) { |
+ if (line_rect.IsEmpty()) |
+ line_rect = char_rect; |
+ else |
+ line_rect.Union(char_rect); |
Lei Zhang
2016/01/08 04:01:36
Union() calls like this have no effect.
dmazzoni
2016/01/11 19:58:01
Thanks! I meant to assign it.
|
+ if (block_rect.IsEmpty()) |
+ block_rect = char_rect; |
+ else |
+ block_rect.Union(char_rect); |
+ |
+ if (!base::IsUnicodeWhitespace(character)) { |
+ if (word_rect.IsEmpty()) |
+ word_rect = char_rect; |
+ else |
+ word_rect.Union(char_rect); |
} |
- |
- std::string before_text = text_utf8.substr(start, pos - start); |
- if (before_text.size() > 0) |
- text_nodes->Append(CreateTextNode(before_text)); |
- std::string link_text = text_utf8.substr(pos, length); |
- text_nodes->Append(CreateURLNode(link_text, targets[i].url)); |
- |
- start = pos + length; |
} |
- std::string before_text = text_utf8.substr(start); |
- if (before_text.size() > 0) |
- text_nodes->Append(CreateTextNode(before_text)); |
- } else { |
- text_nodes->Append(CreateTextNode(text_utf8)); |
} |
- node->Set(kTextBoxNodes, text_nodes); // Takes ownership of |text_nodes|. |
- return node; |
-} |
- |
-base::Value* PDFiumPage::CreateTextNode(const std::string& text) { |
- base::DictionaryValue* node = new base::DictionaryValue(); |
- node->SetString(kTextNodeType, kTextNodeTypeText); |
- node->SetString(kTextNodeText, text); |
- return node; |
-} |
+ node->Set(kPageTextBox, text); // Takes ownership of |text| |
-base::Value* PDFiumPage::CreateURLNode(const std::string& text, |
- const std::string& url) { |
- base::DictionaryValue* node = new base::DictionaryValue(); |
- node->SetString(kTextNodeType, kTextNodeTypeURL); |
- node->SetString(kTextNodeText, text); |
- node->SetString(kTextNodeURL, url); |
return node; |
} |