Index: pdf/pdfium/pdfium_page.cc |
diff --git a/pdf/pdfium/pdfium_page.cc b/pdf/pdfium/pdfium_page.cc |
index 3b7787f08f226622945255dcb5b8b093ff5adf9f..e95296986d823d84491fdd0fd8cdeea7ea80daf2 100644 |
--- a/pdf/pdfium/pdfium_page.cc |
+++ b/pdf/pdfium/pdfium_page.cc |
@@ -15,7 +15,6 @@ |
#include "base/strings/string_number_conversions.h" |
#include "base/strings/string_util.h" |
#include "base/strings/utf_string_conversions.h" |
-#include "base/values.h" |
#include "pdf/pdfium/pdfium_api_string_buffer_adapter.h" |
#include "pdf/pdfium/pdfium_engine.h" |
#include "printing/units.h" |
@@ -29,43 +28,6 @@ using printing::kPixelsPerInch; |
namespace { |
-// Dictionary Value key names for returning the accessible page content as JSON. |
-const char kPageWidth[] = "width"; |
-const char kPageHeight[] = "height"; |
-const char kPageTextBox[] = "textBox"; |
-const char kTextBoxLeft[] = "left"; |
-const char kTextBoxTop[] = "top"; |
-const char kTextBoxWidth[] = "width"; |
-const char kTextBoxHeight[] = "height"; |
-const char kTextBoxFontSize[] = "fontSize"; |
-const char kTextBoxNodes[] = "textNodes"; |
-const char kTextNodeType[] = "type"; |
-const char kTextNodeText[] = "text"; |
-const char kTextNodeTypeText[] = "text"; |
- |
-pp::Rect PageRectToGViewRect(FPDF_PAGE page, const pp::Rect& input) { |
- int output_width = FPDF_GetPageWidth(page); |
- int output_height = FPDF_GetPageHeight(page); |
- |
- int min_x; |
- int min_y; |
- int max_x; |
- int max_y; |
- FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, |
- input.x(), input.y(), &min_x, &min_y); |
- FPDF_PageToDevice(page, 0, 0, output_width, output_height, 0, |
- input.right(), input.bottom(), &max_x, &max_y); |
- |
- if (max_x < min_x) |
- std::swap(min_x, max_x); |
- if (max_y < min_y) |
- std::swap(min_y, max_y); |
- |
- pp::Rect output_rect(min_x, min_y, max_x - min_x, max_y - min_y); |
- output_rect.Intersect(pp::Rect(0, 0, output_width, output_height)); |
- return output_rect; |
-} |
- |
pp::FloatRect FloatPageRectToPixelRect(FPDF_PAGE page, |
const pp::FloatRect& input) { |
int output_width = FPDF_GetPageWidth(page); |
@@ -93,18 +55,6 @@ pp::FloatRect FloatPageRectToPixelRect(FPDF_PAGE page, |
return output_rect; |
} |
-pp::Rect GetCharRectInGViewCoords(FPDF_PAGE page, FPDF_TEXTPAGE text_page, |
- int index) { |
- double left, right, bottom, top; |
- FPDFText_GetCharBox(text_page, index, &left, &right, &bottom, &top); |
- if (right < left) |
- std::swap(left, right); |
- if (bottom < top) |
- std::swap(top, bottom); |
- pp::Rect page_coords(left, top, right - left, bottom - top); |
- return PageRectToGViewRect(page, page_coords); |
-} |
- |
pp::FloatRect GetFloatCharRectInPixels(FPDF_PAGE page, |
FPDF_TEXTPAGE text_page, |
int index) { |
@@ -118,43 +68,11 @@ pp::FloatRect GetFloatCharRectInPixels(FPDF_PAGE page, |
return FloatPageRectToPixelRect(page, page_coords); |
} |
-// This is the character PDFium inserts where a word is broken across lines. |
-const unsigned int kSoftHyphen = 0x02; |
- |
-// The following characters should all be recognized as Unicode newlines: |
-// LF: Line Feed, U+000A |
-// VT: Vertical Tab, U+000B |
-// FF: Form Feed, U+000C |
-// CR: Carriage Return, U+000D |
-// CR+LF: CR (U+000D) followed by LF (U+000A) |
-// NEL: Next Line, U+0085 |
-// LS: Line Separator, U+2028 |
-// PS: Paragraph Separator, U+2029. |
-// Source: http://en.wikipedia.org/wiki/Newline#Unicode . |
-const unsigned int kUnicodeNewlines[] = { |
- 0xA, 0xB, 0xC, 0xD, 0X85, 0x2028, 0x2029 |
-}; |
- |
-bool IsSoftHyphen(unsigned int character) { |
- return kSoftHyphen == character; |
-} |
- |
-bool OverlapsOnYAxis(const pp::Rect &a, const pp::Rect& b) { |
- return !(a.IsEmpty() || b.IsEmpty() || |
- a.bottom() < b.y() || b.bottom() < a.y()); |
-} |
- |
bool OverlapsOnYAxis(const pp::FloatRect &a, const pp::FloatRect& b) { |
return !(a.IsEmpty() || b.IsEmpty() || |
a.bottom() < b.y() || b.bottom() < a.y()); |
} |
-bool IsEol(unsigned int character) { |
- const unsigned int* first = kUnicodeNewlines; |
- const unsigned int* last = kUnicodeNewlines + arraysize(kUnicodeNewlines); |
- return std::find(first, last, character) != last; |
-} |
- |
} // namespace |
namespace chrome_pdf { |
@@ -242,116 +160,6 @@ FPDF_TEXTPAGE PDFiumPage::GetTextPage() { |
return text_page_; |
} |
-base::Value* PDFiumPage::GetAccessibleContentAsValue(int rotation) { |
- base::DictionaryValue* node = new base::DictionaryValue(); |
- |
- if (!available_) |
- return node; |
- |
- FPDF_PAGE page = GetPage(); |
- FPDF_TEXTPAGE text_page = GetTextPage(); |
- |
- double width = FPDF_GetPageWidth(page); |
- double height = FPDF_GetPageHeight(page); |
- |
- node->SetDouble(kPageWidth, width); |
- node->SetDouble(kPageHeight, height); |
- std::unique_ptr<base::ListValue> text(new base::ListValue()); |
- |
- int chars_count = FPDFText_CountChars(text_page); |
- pp::Rect line_rect; |
- pp::Rect word_rect; |
- bool seen_literal_text_in_word = false; |
- |
- // Iterate over all of the chars on the page. Explicitly run the loop |
- // with |i == chars_count|, which is one past the last character, and |
- // pretend it's a newline character in order to ensure we always flush |
- // the last line. |
- base::string16 line; |
- for (int i = 0; i <= chars_count; i++) { |
- unsigned int character; |
- pp::Rect char_rect; |
- |
- if (i < chars_count) { |
- character = FPDFText_GetUnicode(text_page, i); |
- char_rect = GetCharRectInGViewCoords(page, text_page, i); |
- } else { |
- // Make the last character a newline so the last line isn't lost. |
- character = '\n'; |
- } |
- |
- // There are spurious STX chars appearing in place |
- // of ligatures. Apply a heuristic to check that some vertical displacement |
- // is involved before assuming they are line-breaks. |
- bool is_intraword_linebreak = false; |
- if (i < chars_count - 1 && IsSoftHyphen(character)) { |
- // check if the next char and this char are in different lines. |
- pp::Rect next_char_rect = GetCharRectInGViewCoords( |
- page, text_page, i + 1); |
- |
- // TODO(dmazzoni): this assumes horizontal text. |
- // https://crbug.com/580311 |
- is_intraword_linebreak = !OverlapsOnYAxis(char_rect, next_char_rect); |
- } |
- if (is_intraword_linebreak || |
- base::IsUnicodeWhitespace(character) || |
- IsEol(character)) { |
- if (!word_rect.IsEmpty() && seen_literal_text_in_word) { |
- word_rect = pp::Rect(); |
- seen_literal_text_in_word = false; |
- } |
- } |
- |
- if (is_intraword_linebreak || IsEol(character)) { |
- if (!line_rect.IsEmpty()) { |
- if (is_intraword_linebreak) { |
- // Add a 0-width hyphen. |
- line.push_back('-'); |
- } |
- |
- std::unique_ptr<base::DictionaryValue> text_node( |
- new base::DictionaryValue()); |
- text_node->SetString(kTextNodeType, kTextNodeTypeText); |
- text_node->SetString(kTextNodeText, line); |
- |
- base::ListValue* text_nodes = new base::ListValue(); |
- text_nodes->Append(std::move(text_node)); |
- |
- std::unique_ptr<base::DictionaryValue> line_node( |
- new base::DictionaryValue()); |
- line_node->SetDouble(kTextBoxLeft, line_rect.x()); |
- line_node->SetDouble(kTextBoxTop, line_rect.y()); |
- line_node->SetDouble(kTextBoxWidth, line_rect.width()); |
- line_node->SetDouble(kTextBoxHeight, line_rect.height()); |
- line_node->SetDouble(kTextBoxFontSize, |
- FPDFText_GetFontSize(text_page, i)); |
- line_node->Set(kTextBoxNodes, text_nodes); |
- text->Append(std::move(line_node)); |
- |
- line.clear(); |
- line_rect = pp::Rect(); |
- word_rect = pp::Rect(); |
- seen_literal_text_in_word = false; |
- } |
- continue; |
- } |
- seen_literal_text_in_word = seen_literal_text_in_word || |
- !base::IsUnicodeWhitespace(character); |
- line.push_back(character); |
- |
- if (!char_rect.IsEmpty()) { |
- line_rect = line_rect.Union(char_rect); |
- |
- if (!base::IsUnicodeWhitespace(character)) |
- word_rect = word_rect.Union(char_rect); |
- } |
- } |
- |
- node->Set(kPageTextBox, text.release()); // Takes ownership of |text| |
- |
- return node; |
-} |
- |
void PDFiumPage::GetTextRunInfo(int start_char_index, |
uint32_t* out_len, |
double* out_font_size, |