| OLD | NEW |
| 1 // Copyright 2014 PDFium Authors. All rights reserved. | 1 // Copyright 2014 PDFium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com | 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
| 6 | 6 |
| 7 #ifndef CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_ | 7 #ifndef CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_ |
| 8 #define CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_ | 8 #define CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_ |
| 9 | 9 |
| 10 #include "../fpdfapi/fpdf_parser.h" | 10 #include "../fpdfapi/fpdf_parser.h" |
| 11 | 11 |
| 12 class CPDF_Page; | 12 class CPDF_Page; |
| 13 class CPDF_PageObjects; | 13 class CPDF_PageObjects; |
| 14 class CPDF_TextObject; | 14 class CPDF_TextObject; |
| 15 class IPDF_LinkExtract; | 15 class IPDF_LinkExtract; |
| 16 class IPDF_ReflowedPage; | 16 class IPDF_ReflowedPage; |
| 17 class IPDF_TextPage; | 17 class IPDF_TextPage; |
| 18 class IPDF_TextPageFind; | 18 class IPDF_TextPageFind; |
| 19 | 19 |
| 20 #define PDF2TXT_AUTO_ROTATE» » 1 | 20 #define PDF2TXT_AUTO_ROTATE 1 |
| 21 #define PDF2TXT_AUTO_WIDTH» » 2 | 21 #define PDF2TXT_AUTO_WIDTH 2 |
| 22 #define PDF2TXT_KEEP_COLUMN» » 4 | 22 #define PDF2TXT_KEEP_COLUMN 4 |
| 23 #define PDF2TXT_USE_OCR»» » 8 | 23 #define PDF2TXT_USE_OCR 8 |
| 24 #define PDF2TXT_INCLUDE_INVISIBLE» 16 | 24 #define PDF2TXT_INCLUDE_INVISIBLE 16 |
| 25 void PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dicti
onary* pPage, | 25 void PDF_GetPageText(CFX_ByteStringArray& lines, |
| 26 int iMinWidth, FX_DWORD flags); | 26 CPDF_Document* pDoc, |
| 27 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CP
DF_Dictionary* pPage, | 27 CPDF_Dictionary* pPage, |
| 28 int iMinWidth, FX_DWORD flags); | 28 int iMinWidth, |
| 29 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPD
F_Dictionary* pPage, | 29 FX_DWORD flags); |
| 30 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, |
| 31 CPDF_Document* pDoc, |
| 32 CPDF_Dictionary* pPage, |
| 33 int iMinWidth, |
| 34 FX_DWORD flags); |
| 35 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, |
| 36 CPDF_Document* pDoc, |
| 37 CPDF_Dictionary* pPage, |
| 30 FX_DWORD flags); | 38 FX_DWORD flags); |
| 31 CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, CPDF_Dictionary
* pPage); | 39 CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, |
| 32 #define CHAR_ERROR» » » -1 | 40 CPDF_Dictionary* pPage); |
| 33 #define CHAR_NORMAL» » » 0 | 41 #define CHAR_ERROR -1 |
| 34 #define CHAR_GENERATED» » 1 | 42 #define CHAR_NORMAL 0 |
| 35 #define CHAR_UNUNICODE» » 2 | 43 #define CHAR_GENERATED 1 |
| 44 #define CHAR_UNUNICODE 2 |
| 36 typedef struct { | 45 typedef struct { |
| 37 FX_WCHAR» » » m_Unicode; | 46 FX_WCHAR m_Unicode; |
| 38 FX_WCHAR» » » m_Charcode; | 47 FX_WCHAR m_Charcode; |
| 39 int32_t» » » m_Flag; | 48 int32_t m_Flag; |
| 40 FX_FLOAT» » » m_FontSize; | 49 FX_FLOAT m_FontSize; |
| 41 FX_FLOAT» » » m_OriginX; | 50 FX_FLOAT m_OriginX; |
| 42 FX_FLOAT» » » m_OriginY; | 51 FX_FLOAT m_OriginY; |
| 43 CFX_FloatRect» » m_CharBox; | 52 CFX_FloatRect m_CharBox; |
| 44 CPDF_TextObject*» m_pTextObj; | 53 CPDF_TextObject* m_pTextObj; |
| 45 CFX_AffineMatrix» m_Matrix; | 54 CFX_AffineMatrix m_Matrix; |
| 46 } FPDF_CHAR_INFO; | 55 } FPDF_CHAR_INFO; |
| 47 typedef»CFX_ArrayTemplate<CFX_FloatRect> CFX_RectArray; | 56 typedef CFX_ArrayTemplate<CFX_FloatRect> CFX_RectArray; |
| 48 #define FPDFTEXT_LRTB» 0 | 57 #define FPDFTEXT_LRTB 0 |
| 49 #define FPDFTEXT_RLTB» 1 | 58 #define FPDFTEXT_RLTB 1 |
| 50 #define FPDFTEXT_TBRL» 2 | 59 #define FPDFTEXT_TBRL 2 |
| 51 #define FPDFTEXT_LEFT» » » -1 | 60 #define FPDFTEXT_LEFT -1 |
| 52 #define FPDFTEXT_RIGHT» » » 1 | 61 #define FPDFTEXT_RIGHT 1 |
| 53 #define FPDFTEXT_UP» » » » -2 | 62 #define FPDFTEXT_UP -2 |
| 54 #define FPDFTEXT_DOWN» » » 2 | 63 #define FPDFTEXT_DOWN 2 |
| 55 #define FPDFTEXT_WRITINGMODE_UNKNOW» 0 | 64 #define FPDFTEXT_WRITINGMODE_UNKNOW 0 |
| 56 #define FPDFTEXT_WRITINGMODE_LRTB» 1 | 65 #define FPDFTEXT_WRITINGMODE_LRTB 1 |
| 57 #define FPDFTEXT_WRITINGMODE_RLTB» 2 | 66 #define FPDFTEXT_WRITINGMODE_RLTB 2 |
| 58 #define FPDFTEXT_WRITINGMODE_TBRL» 3 | 67 #define FPDFTEXT_WRITINGMODE_TBRL 3 |
| 59 class CPDFText_ParseOptions | 68 class CPDFText_ParseOptions { |
| 60 { | 69 public: |
| 61 public: | 70 CPDFText_ParseOptions(); |
| 71 FX_BOOL m_bGetCharCodeOnly; |
| 72 FX_BOOL m_bNormalizeObjs; |
| 73 FX_BOOL m_bOutputHyphen; |
| 74 }; |
| 75 class IPDF_TextPage { |
| 76 public: |
| 77 virtual ~IPDF_TextPage() {} |
| 78 static IPDF_TextPage* CreateTextPage(const CPDF_Page* pPage, |
| 79 CPDFText_ParseOptions ParserOptions); |
| 80 static IPDF_TextPage* CreateTextPage(const CPDF_Page* pPage, int flags = 0); |
| 81 static IPDF_TextPage* CreateTextPage(const CPDF_PageObjects* pObjs, |
| 82 int flags = 0); |
| 83 static IPDF_TextPage* CreateReflowTextPage(IPDF_ReflowedPage* pRefPage); |
| 62 | 84 |
| 63 CPDFText_ParseOptions(); | 85 virtual void NormalizeObjects(FX_BOOL bNormalize) = 0; |
| 64 FX_BOOL» » » m_bGetCharCodeOnly; | 86 |
| 65 FX_BOOL» » » m_bNormalizeObjs; | 87 virtual FX_BOOL ParseTextPage() = 0; |
| 66 FX_BOOL» » » m_bOutputHyphen; | 88 |
| 89 virtual FX_BOOL IsParsered() const = 0; |
| 90 |
| 91 public: |
| 92 virtual int CharIndexFromTextIndex(int TextIndex) const = 0; |
| 93 |
| 94 virtual int TextIndexFromCharIndex(int CharIndex) const = 0; |
| 95 |
| 96 virtual int CountChars() const = 0; |
| 97 |
| 98 virtual void GetCharInfo(int index, FPDF_CHAR_INFO& info) const = 0; |
| 99 |
| 100 virtual void GetRectArray(int start, |
| 101 int nCount, |
| 102 CFX_RectArray& rectArray) const = 0; |
| 103 |
| 104 virtual int GetIndexAtPos(CPDF_Point point, |
| 105 FX_FLOAT xTorelance, |
| 106 FX_FLOAT yTorelance) const = 0; |
| 107 |
| 108 virtual int GetIndexAtPos(FX_FLOAT x, |
| 109 FX_FLOAT y, |
| 110 FX_FLOAT xTorelance, |
| 111 FX_FLOAT yTorelance) const = 0; |
| 112 |
| 113 virtual CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const = 0; |
| 114 |
| 115 virtual void GetRectsArrayByRect(const CFX_FloatRect& rect, |
| 116 CFX_RectArray& resRectArray) const = 0; |
| 117 |
| 118 virtual int CountRects(int start, int nCount) = 0; |
| 119 |
| 120 virtual void GetRect(int rectIndex, |
| 121 FX_FLOAT& left, |
| 122 FX_FLOAT& top, |
| 123 FX_FLOAT& right, |
| 124 FX_FLOAT& bottom) const = 0; |
| 125 |
| 126 virtual FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate) = 0; |
| 127 |
| 128 virtual FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate) = 0; |
| 129 |
| 130 virtual int CountBoundedSegments(FX_FLOAT left, |
| 131 FX_FLOAT top, |
| 132 FX_FLOAT right, |
| 133 FX_FLOAT bottom, |
| 134 FX_BOOL bContains = FALSE) = 0; |
| 135 |
| 136 virtual void GetBoundedSegment(int index, int& start, int& count) const = 0; |
| 137 |
| 138 virtual int GetWordBreak(int index, int direction) const = 0; |
| 139 |
| 140 virtual CFX_WideString GetPageText(int start = 0, int nCount = -1) const = 0; |
| 67 }; | 141 }; |
| 68 class IPDF_TextPage | 142 #define FPDFTEXT_MATCHCASE 0x00000001 |
| 69 { | 143 #define FPDFTEXT_MATCHWHOLEWORD 0x00000002 |
| 70 public: | 144 #define FPDFTEXT_CONSECUTIVE 0x00000004 |
| 145 class IPDF_TextPageFind { |
| 146 public: |
| 147 virtual ~IPDF_TextPageFind() {} |
| 71 | 148 |
| 72 virtual ~IPDF_TextPage() {} | 149 static IPDF_TextPageFind* CreatePageFind(const IPDF_TextPage* pTextPage); |
| 73 static IPDF_TextPage*» CreateTextPage(const CPDF_Page* pPage, CPDFText_
ParseOptions ParserOptions); | |
| 74 static IPDF_TextPage*» CreateTextPage(const CPDF_Page* pPage, int flags
= 0); | |
| 75 static IPDF_TextPage*» CreateTextPage(const CPDF_PageObjects* pObjs, in
t flags = 0); | |
| 76 static IPDF_TextPage*» CreateReflowTextPage(IPDF_ReflowedPage* pRefPage
); | |
| 77 | 150 |
| 78 virtual void» » » NormalizeObjects(FX_BOOL bNormalize) = 0
; | 151 public: |
| 152 virtual FX_BOOL FindFirst(const CFX_WideString& findwhat, |
| 153 int flags, |
| 154 int startPos = 0) = 0; |
| 79 | 155 |
| 80 virtual FX_BOOL» » » ParseTextPage() = 0; | 156 virtual FX_BOOL FindNext() = 0; |
| 81 | 157 |
| 158 virtual FX_BOOL FindPrev() = 0; |
| 82 | 159 |
| 83 virtual FX_BOOL» » » IsParsered() const = 0; | 160 virtual void GetRectArray(CFX_RectArray& rects) const = 0; |
| 84 public: | |
| 85 | 161 |
| 86 virtual int CharIndexFromTextIndex(int TextIndex) const = 0; | 162 virtual int GetCurOrder() const = 0; |
| 87 | 163 |
| 88 virtual int TextIndexFromCharIndex(int CharIndex) const = 0; | 164 virtual int GetMatchedCount() const = 0; |
| 165 }; |
| 166 class IPDF_LinkExtract { |
| 167 public: |
| 168 virtual ~IPDF_LinkExtract() {} |
| 89 | 169 |
| 170 static IPDF_LinkExtract* CreateLinkExtract(); |
| 90 | 171 |
| 91 virtual int»» » » CountChars() const = 0; | 172 virtual FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage) = 0; |
| 92 | 173 |
| 93 virtual» void» » » GetCharInfo(int index, FPDF_CHAR_INFO &
info) const = 0; | 174 public: |
| 175 virtual int CountLinks() const = 0; |
| 94 | 176 |
| 95 virtual void» » » GetRectArray(int start, int nCount, CFX_
RectArray& rectArray) const = 0; | 177 virtual CFX_WideString GetURL(int index) const = 0; |
| 96 | 178 |
| 179 virtual void GetBoundedSegment(int index, int& start, int& count) const = 0; |
| 97 | 180 |
| 98 | 181 virtual void GetRects(int index, CFX_RectArray& rects) const = 0; |
| 99 virtual int»» » » GetIndexAtPos(CPDF_Point point, FX_FLOAT
xTorelance, FX_FLOAT yTorelance) const = 0; | |
| 100 | |
| 101 virtual int»» » » GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX
_FLOAT xTorelance, FX_FLOAT yTorelance) const = 0; | |
| 102 | |
| 103 virtual CFX_WideString» GetTextByRect(const CFX_FloatRect& rect) const =
0; | |
| 104 | |
| 105 virtual void» » » GetRectsArrayByRect(const CFX_FloatRect&
rect, CFX_RectArray& resRectArray) const = 0; | |
| 106 | |
| 107 | |
| 108 virtual int»» » » CountRects(int start, int nCount) = 0; | |
| 109 | |
| 110 virtual» void» » » GetRect(int rectIndex, FX_FLOAT& left, F
X_FLOAT& top, FX_FLOAT& right, FX_FLOAT &bottom) const = 0; | |
| 111 | |
| 112 virtual FX_BOOL» » » GetBaselineRotate(int rectIndex, int& Ro
tate) = 0; | |
| 113 | |
| 114 virtual FX_BOOL» » » GetBaselineRotate(const CFX_FloatRect& r
ect, int& Rotate) = 0; | |
| 115 | |
| 116 virtual» int» » » » CountBoundedSegments(FX_FLOAT le
ft, FX_FLOAT top, FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains = FALSE) =
0; | |
| 117 | |
| 118 virtual» void» » » GetBoundedSegment(int index, int& start,
int& count) const = 0; | |
| 119 | |
| 120 | |
| 121 virtual int»» » » GetWordBreak(int index, int direction) c
onst = 0; | |
| 122 | |
| 123 virtual CFX_WideString» GetPageText(int start = 0, int nCount = -1 ) con
st = 0; | |
| 124 }; | |
| 125 #define FPDFTEXT_MATCHCASE 0x00000001 | |
| 126 #define FPDFTEXT_MATCHWHOLEWORD 0x00000002 | |
| 127 #define FPDFTEXT_CONSECUTIVE» 0x00000004 | |
| 128 class IPDF_TextPageFind | |
| 129 { | |
| 130 public: | |
| 131 | |
| 132 virtual» ~IPDF_TextPageFind() {} | |
| 133 | |
| 134 static» IPDF_TextPageFind*» CreatePageFind(const IPDF_TextPage* pTex
tPage); | |
| 135 public: | |
| 136 | |
| 137 virtual» FX_BOOL»» » » FindFirst(const CFX_WideString&
findwhat, int flags, int startPos = 0) = 0; | |
| 138 | |
| 139 virtual» FX_BOOL»» » » FindNext() = 0; | |
| 140 | |
| 141 virtual» FX_BOOL»» » » FindPrev() = 0; | |
| 142 | |
| 143 virtual void» » » » GetRectArray(CFX_RectArray& rect
s) const = 0; | |
| 144 | |
| 145 virtual int»» » » » GetCurOrder() const = 0; | |
| 146 | |
| 147 virtual int»» » » » GetMatchedCount() const = 0; | |
| 148 }; | |
| 149 class IPDF_LinkExtract | |
| 150 { | |
| 151 public: | |
| 152 | |
| 153 virtual» ~IPDF_LinkExtract() {} | |
| 154 | |
| 155 static» IPDF_LinkExtract*» CreateLinkExtract(); | |
| 156 | |
| 157 virtual FX_BOOL» » » » ExtractLinks(const IPDF_TextPage
* pTextPage) = 0; | |
| 158 public: | |
| 159 | |
| 160 virtual int»» » » » CountLinks() const = 0; | |
| 161 | |
| 162 virtual CFX_WideString» » GetURL(int index) const = 0; | |
| 163 | |
| 164 virtual» void» » » » GetBoundedSegment(int index, int
& start, int& count) const = 0; | |
| 165 | |
| 166 virtual void» » » » GetRects(int index, CFX_RectArra
y& rects) const = 0; | |
| 167 }; | 182 }; |
| 168 | 183 |
| 169 #endif // CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_ | 184 #endif // CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_ |
| OLD | NEW |