OLD | NEW |
1 // Copyright 2014 PDFium Authors. All rights reserved. | 1 // Copyright 2014 PDFium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com | 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
6 | 6 |
7 #ifndef CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_ | 7 #ifndef CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_ |
8 #define CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_ | 8 #define CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_ |
9 | 9 |
10 #include "../fpdfapi/fpdf_parser.h" | 10 #include "../fpdfapi/fpdf_parser.h" |
11 | 11 |
12 class CPDF_Page; | 12 class CPDF_Page; |
13 class CPDF_PageObjects; | 13 class CPDF_PageObjects; |
14 class CPDF_TextObject; | 14 class CPDF_TextObject; |
15 class IPDF_LinkExtract; | 15 class IPDF_LinkExtract; |
16 class IPDF_ReflowedPage; | 16 class IPDF_ReflowedPage; |
17 class IPDF_TextPage; | 17 class IPDF_TextPage; |
18 class IPDF_TextPageFind; | 18 class IPDF_TextPageFind; |
19 | 19 |
20 #define PDF2TXT_AUTO_ROTATE» » 1 | 20 #define PDF2TXT_AUTO_ROTATE 1 |
21 #define PDF2TXT_AUTO_WIDTH» » 2 | 21 #define PDF2TXT_AUTO_WIDTH 2 |
22 #define PDF2TXT_KEEP_COLUMN» » 4 | 22 #define PDF2TXT_KEEP_COLUMN 4 |
23 #define PDF2TXT_USE_OCR»» » 8 | 23 #define PDF2TXT_USE_OCR 8 |
24 #define PDF2TXT_INCLUDE_INVISIBLE» 16 | 24 #define PDF2TXT_INCLUDE_INVISIBLE 16 |
25 void PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dicti
onary* pPage, | 25 void PDF_GetPageText(CFX_ByteStringArray& lines, |
26 int iMinWidth, FX_DWORD flags); | 26 CPDF_Document* pDoc, |
27 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CP
DF_Dictionary* pPage, | 27 CPDF_Dictionary* pPage, |
28 int iMinWidth, FX_DWORD flags); | 28 int iMinWidth, |
29 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPD
F_Dictionary* pPage, | 29 FX_DWORD flags); |
| 30 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, |
| 31 CPDF_Document* pDoc, |
| 32 CPDF_Dictionary* pPage, |
| 33 int iMinWidth, |
| 34 FX_DWORD flags); |
| 35 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, |
| 36 CPDF_Document* pDoc, |
| 37 CPDF_Dictionary* pPage, |
30 FX_DWORD flags); | 38 FX_DWORD flags); |
31 CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, CPDF_Dictionary
* pPage); | 39 CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, |
32 #define CHAR_ERROR» » » -1 | 40 CPDF_Dictionary* pPage); |
33 #define CHAR_NORMAL» » » 0 | 41 #define CHAR_ERROR -1 |
34 #define CHAR_GENERATED» » 1 | 42 #define CHAR_NORMAL 0 |
35 #define CHAR_UNUNICODE» » 2 | 43 #define CHAR_GENERATED 1 |
| 44 #define CHAR_UNUNICODE 2 |
36 typedef struct { | 45 typedef struct { |
37 FX_WCHAR» » » m_Unicode; | 46 FX_WCHAR m_Unicode; |
38 FX_WCHAR» » » m_Charcode; | 47 FX_WCHAR m_Charcode; |
39 int32_t» » » m_Flag; | 48 int32_t m_Flag; |
40 FX_FLOAT» » » m_FontSize; | 49 FX_FLOAT m_FontSize; |
41 FX_FLOAT» » » m_OriginX; | 50 FX_FLOAT m_OriginX; |
42 FX_FLOAT» » » m_OriginY; | 51 FX_FLOAT m_OriginY; |
43 CFX_FloatRect» » m_CharBox; | 52 CFX_FloatRect m_CharBox; |
44 CPDF_TextObject*» m_pTextObj; | 53 CPDF_TextObject* m_pTextObj; |
45 CFX_AffineMatrix» m_Matrix; | 54 CFX_AffineMatrix m_Matrix; |
46 } FPDF_CHAR_INFO; | 55 } FPDF_CHAR_INFO; |
47 typedef»CFX_ArrayTemplate<CFX_FloatRect> CFX_RectArray; | 56 typedef CFX_ArrayTemplate<CFX_FloatRect> CFX_RectArray; |
48 #define FPDFTEXT_LRTB» 0 | 57 #define FPDFTEXT_LRTB 0 |
49 #define FPDFTEXT_RLTB» 1 | 58 #define FPDFTEXT_RLTB 1 |
50 #define FPDFTEXT_TBRL» 2 | 59 #define FPDFTEXT_TBRL 2 |
51 #define FPDFTEXT_LEFT» » » -1 | 60 #define FPDFTEXT_LEFT -1 |
52 #define FPDFTEXT_RIGHT» » » 1 | 61 #define FPDFTEXT_RIGHT 1 |
53 #define FPDFTEXT_UP» » » » -2 | 62 #define FPDFTEXT_UP -2 |
54 #define FPDFTEXT_DOWN» » » 2 | 63 #define FPDFTEXT_DOWN 2 |
55 #define FPDFTEXT_WRITINGMODE_UNKNOW» 0 | 64 #define FPDFTEXT_WRITINGMODE_UNKNOW 0 |
56 #define FPDFTEXT_WRITINGMODE_LRTB» 1 | 65 #define FPDFTEXT_WRITINGMODE_LRTB 1 |
57 #define FPDFTEXT_WRITINGMODE_RLTB» 2 | 66 #define FPDFTEXT_WRITINGMODE_RLTB 2 |
58 #define FPDFTEXT_WRITINGMODE_TBRL» 3 | 67 #define FPDFTEXT_WRITINGMODE_TBRL 3 |
59 class CPDFText_ParseOptions | 68 class CPDFText_ParseOptions { |
60 { | 69 public: |
61 public: | 70 CPDFText_ParseOptions(); |
| 71 FX_BOOL m_bGetCharCodeOnly; |
| 72 FX_BOOL m_bNormalizeObjs; |
| 73 FX_BOOL m_bOutputHyphen; |
| 74 }; |
| 75 class IPDF_TextPage { |
| 76 public: |
| 77 virtual ~IPDF_TextPage() {} |
| 78 static IPDF_TextPage* CreateTextPage(const CPDF_Page* pPage, |
| 79 CPDFText_ParseOptions ParserOptions); |
| 80 static IPDF_TextPage* CreateTextPage(const CPDF_Page* pPage, int flags = 0); |
| 81 static IPDF_TextPage* CreateTextPage(const CPDF_PageObjects* pObjs, |
| 82 int flags = 0); |
| 83 static IPDF_TextPage* CreateReflowTextPage(IPDF_ReflowedPage* pRefPage); |
62 | 84 |
63 CPDFText_ParseOptions(); | 85 virtual void NormalizeObjects(FX_BOOL bNormalize) = 0; |
64 FX_BOOL» » » m_bGetCharCodeOnly; | 86 |
65 FX_BOOL» » » m_bNormalizeObjs; | 87 virtual FX_BOOL ParseTextPage() = 0; |
66 FX_BOOL» » » m_bOutputHyphen; | 88 |
| 89 virtual FX_BOOL IsParsered() const = 0; |
| 90 |
| 91 public: |
| 92 virtual int CharIndexFromTextIndex(int TextIndex) const = 0; |
| 93 |
| 94 virtual int TextIndexFromCharIndex(int CharIndex) const = 0; |
| 95 |
| 96 virtual int CountChars() const = 0; |
| 97 |
| 98 virtual void GetCharInfo(int index, FPDF_CHAR_INFO& info) const = 0; |
| 99 |
| 100 virtual void GetRectArray(int start, |
| 101 int nCount, |
| 102 CFX_RectArray& rectArray) const = 0; |
| 103 |
| 104 virtual int GetIndexAtPos(CPDF_Point point, |
| 105 FX_FLOAT xTorelance, |
| 106 FX_FLOAT yTorelance) const = 0; |
| 107 |
| 108 virtual int GetIndexAtPos(FX_FLOAT x, |
| 109 FX_FLOAT y, |
| 110 FX_FLOAT xTorelance, |
| 111 FX_FLOAT yTorelance) const = 0; |
| 112 |
| 113 virtual CFX_WideString GetTextByRect(const CFX_FloatRect& rect) const = 0; |
| 114 |
| 115 virtual void GetRectsArrayByRect(const CFX_FloatRect& rect, |
| 116 CFX_RectArray& resRectArray) const = 0; |
| 117 |
| 118 virtual int CountRects(int start, int nCount) = 0; |
| 119 |
| 120 virtual void GetRect(int rectIndex, |
| 121 FX_FLOAT& left, |
| 122 FX_FLOAT& top, |
| 123 FX_FLOAT& right, |
| 124 FX_FLOAT& bottom) const = 0; |
| 125 |
| 126 virtual FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate) = 0; |
| 127 |
| 128 virtual FX_BOOL GetBaselineRotate(const CFX_FloatRect& rect, int& Rotate) = 0; |
| 129 |
| 130 virtual int CountBoundedSegments(FX_FLOAT left, |
| 131 FX_FLOAT top, |
| 132 FX_FLOAT right, |
| 133 FX_FLOAT bottom, |
| 134 FX_BOOL bContains = FALSE) = 0; |
| 135 |
| 136 virtual void GetBoundedSegment(int index, int& start, int& count) const = 0; |
| 137 |
| 138 virtual int GetWordBreak(int index, int direction) const = 0; |
| 139 |
| 140 virtual CFX_WideString GetPageText(int start = 0, int nCount = -1) const = 0; |
67 }; | 141 }; |
68 class IPDF_TextPage | 142 #define FPDFTEXT_MATCHCASE 0x00000001 |
69 { | 143 #define FPDFTEXT_MATCHWHOLEWORD 0x00000002 |
70 public: | 144 #define FPDFTEXT_CONSECUTIVE 0x00000004 |
| 145 class IPDF_TextPageFind { |
| 146 public: |
| 147 virtual ~IPDF_TextPageFind() {} |
71 | 148 |
72 virtual ~IPDF_TextPage() {} | 149 static IPDF_TextPageFind* CreatePageFind(const IPDF_TextPage* pTextPage); |
73 static IPDF_TextPage*» CreateTextPage(const CPDF_Page* pPage, CPDFText_
ParseOptions ParserOptions); | |
74 static IPDF_TextPage*» CreateTextPage(const CPDF_Page* pPage, int flags
= 0); | |
75 static IPDF_TextPage*» CreateTextPage(const CPDF_PageObjects* pObjs, in
t flags = 0); | |
76 static IPDF_TextPage*» CreateReflowTextPage(IPDF_ReflowedPage* pRefPage
); | |
77 | 150 |
78 virtual void» » » NormalizeObjects(FX_BOOL bNormalize) = 0
; | 151 public: |
| 152 virtual FX_BOOL FindFirst(const CFX_WideString& findwhat, |
| 153 int flags, |
| 154 int startPos = 0) = 0; |
79 | 155 |
80 virtual FX_BOOL» » » ParseTextPage() = 0; | 156 virtual FX_BOOL FindNext() = 0; |
81 | 157 |
| 158 virtual FX_BOOL FindPrev() = 0; |
82 | 159 |
83 virtual FX_BOOL» » » IsParsered() const = 0; | 160 virtual void GetRectArray(CFX_RectArray& rects) const = 0; |
84 public: | |
85 | 161 |
86 virtual int CharIndexFromTextIndex(int TextIndex) const = 0; | 162 virtual int GetCurOrder() const = 0; |
87 | 163 |
88 virtual int TextIndexFromCharIndex(int CharIndex) const = 0; | 164 virtual int GetMatchedCount() const = 0; |
| 165 }; |
| 166 class IPDF_LinkExtract { |
| 167 public: |
| 168 virtual ~IPDF_LinkExtract() {} |
89 | 169 |
| 170 static IPDF_LinkExtract* CreateLinkExtract(); |
90 | 171 |
91 virtual int»» » » CountChars() const = 0; | 172 virtual FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage) = 0; |
92 | 173 |
93 virtual» void» » » GetCharInfo(int index, FPDF_CHAR_INFO &
info) const = 0; | 174 public: |
| 175 virtual int CountLinks() const = 0; |
94 | 176 |
95 virtual void» » » GetRectArray(int start, int nCount, CFX_
RectArray& rectArray) const = 0; | 177 virtual CFX_WideString GetURL(int index) const = 0; |
96 | 178 |
| 179 virtual void GetBoundedSegment(int index, int& start, int& count) const = 0; |
97 | 180 |
98 | 181 virtual void GetRects(int index, CFX_RectArray& rects) const = 0; |
99 virtual int»» » » GetIndexAtPos(CPDF_Point point, FX_FLOAT
xTorelance, FX_FLOAT yTorelance) const = 0; | |
100 | |
101 virtual int»» » » GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX
_FLOAT xTorelance, FX_FLOAT yTorelance) const = 0; | |
102 | |
103 virtual CFX_WideString» GetTextByRect(const CFX_FloatRect& rect) const =
0; | |
104 | |
105 virtual void» » » GetRectsArrayByRect(const CFX_FloatRect&
rect, CFX_RectArray& resRectArray) const = 0; | |
106 | |
107 | |
108 virtual int»» » » CountRects(int start, int nCount) = 0; | |
109 | |
110 virtual» void» » » GetRect(int rectIndex, FX_FLOAT& left, F
X_FLOAT& top, FX_FLOAT& right, FX_FLOAT &bottom) const = 0; | |
111 | |
112 virtual FX_BOOL» » » GetBaselineRotate(int rectIndex, int& Ro
tate) = 0; | |
113 | |
114 virtual FX_BOOL» » » GetBaselineRotate(const CFX_FloatRect& r
ect, int& Rotate) = 0; | |
115 | |
116 virtual» int» » » » CountBoundedSegments(FX_FLOAT le
ft, FX_FLOAT top, FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains = FALSE) =
0; | |
117 | |
118 virtual» void» » » GetBoundedSegment(int index, int& start,
int& count) const = 0; | |
119 | |
120 | |
121 virtual int»» » » GetWordBreak(int index, int direction) c
onst = 0; | |
122 | |
123 virtual CFX_WideString» GetPageText(int start = 0, int nCount = -1 ) con
st = 0; | |
124 }; | |
125 #define FPDFTEXT_MATCHCASE 0x00000001 | |
126 #define FPDFTEXT_MATCHWHOLEWORD 0x00000002 | |
127 #define FPDFTEXT_CONSECUTIVE» 0x00000004 | |
128 class IPDF_TextPageFind | |
129 { | |
130 public: | |
131 | |
132 virtual» ~IPDF_TextPageFind() {} | |
133 | |
134 static» IPDF_TextPageFind*» CreatePageFind(const IPDF_TextPage* pTex
tPage); | |
135 public: | |
136 | |
137 virtual» FX_BOOL»» » » FindFirst(const CFX_WideString&
findwhat, int flags, int startPos = 0) = 0; | |
138 | |
139 virtual» FX_BOOL»» » » FindNext() = 0; | |
140 | |
141 virtual» FX_BOOL»» » » FindPrev() = 0; | |
142 | |
143 virtual void» » » » GetRectArray(CFX_RectArray& rect
s) const = 0; | |
144 | |
145 virtual int»» » » » GetCurOrder() const = 0; | |
146 | |
147 virtual int»» » » » GetMatchedCount() const = 0; | |
148 }; | |
149 class IPDF_LinkExtract | |
150 { | |
151 public: | |
152 | |
153 virtual» ~IPDF_LinkExtract() {} | |
154 | |
155 static» IPDF_LinkExtract*» CreateLinkExtract(); | |
156 | |
157 virtual FX_BOOL» » » » ExtractLinks(const IPDF_TextPage
* pTextPage) = 0; | |
158 public: | |
159 | |
160 virtual int»» » » » CountLinks() const = 0; | |
161 | |
162 virtual CFX_WideString» » GetURL(int index) const = 0; | |
163 | |
164 virtual» void» » » » GetBoundedSegment(int index, int
& start, int& count) const = 0; | |
165 | |
166 virtual void» » » » GetRects(int index, CFX_RectArra
y& rects) const = 0; | |
167 }; | 182 }; |
168 | 183 |
169 #endif // CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_ | 184 #endif // CORE_INCLUDE_FPDFTEXT_FPDF_TEXT_H_ |
OLD | NEW |