OLD | NEW |
1 // Copyright 2014 PDFium Authors. All rights reserved. | 1 // Copyright 2014 PDFium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com | 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
6 | 6 |
7 #ifndef _PDF_TEXT_INT_H_ | 7 #ifndef _PDF_TEXT_INT_H_ |
8 #define _PDF_TEXT_INT_H_ | 8 #define _PDF_TEXT_INT_H_ |
9 class CPDF_TextParseOptions : public CFX_Object | 9 class CPDF_TextParseOptions : public CFX_Object { |
10 { | 10 public: |
11 public: | 11 CPDF_TextParseOptions(); |
12 CPDF_TextParseOptions(); | 12 FX_BOOL m_bCheckObjectOrder; |
13 FX_BOOL» » » m_bCheckObjectOrder; | 13 FX_BOOL m_bCheckDirection; |
14 FX_BOOL» » » m_bCheckDirection; | 14 int m_nCheckSameObject; |
15 int»» » » m_nCheckSameObject; | |
16 }; | 15 }; |
17 class CPDF_TextPage; | 16 class CPDF_TextPage; |
18 class CPDF_LinkExtract; | 17 class CPDF_LinkExtract; |
19 class CPDF_TextPageFind; | 18 class CPDF_TextPageFind; |
20 class CPDF_DocProgressiveSearch; | 19 class CPDF_DocProgressiveSearch; |
21 #define FPDFTEXT_CHAR_ERROR» » » -1 | 20 #define FPDFTEXT_CHAR_ERROR -1 |
22 #define FPDFTEXT_CHAR_NORMAL» » 0 | 21 #define FPDFTEXT_CHAR_NORMAL 0 |
23 #define FPDFTEXT_CHAR_GENERATED»» 1 | 22 #define FPDFTEXT_CHAR_GENERATED 1 |
24 #define FPDFTEXT_CHAR_UNUNICODE»» 2 | 23 #define FPDFTEXT_CHAR_UNUNICODE 2 |
25 #define FPDFTEXT_CHAR_HYPHEN» » 3 | 24 #define FPDFTEXT_CHAR_HYPHEN 3 |
26 #define FPDFTEXT_CHAR_PIECE» » » 4 | 25 #define FPDFTEXT_CHAR_PIECE 4 |
27 #define FPDFTEXT_MC_PASS» » » 0 | 26 #define FPDFTEXT_MC_PASS 0 |
28 #define FPDFTEXT_MC_DONE» » » 1 | 27 #define FPDFTEXT_MC_DONE 1 |
29 #define FPDFTEXT_MC_DELAY» » » 2 | 28 #define FPDFTEXT_MC_DELAY 2 |
30 typedef struct _PAGECHAR_INFO: public CFX_Object { | 29 typedef struct _PAGECHAR_INFO : public CFX_Object { |
31 int»» » » » m_CharCode; | 30 int m_CharCode; |
32 FX_WCHAR» » » m_Unicode; | 31 FX_WCHAR m_Unicode; |
33 FX_FLOAT» » » m_OriginX; | 32 FX_FLOAT m_OriginX; |
34 FX_FLOAT» » » m_OriginY; | 33 FX_FLOAT m_OriginY; |
35 FX_INT32» » » m_Flag; | 34 FX_INT32 m_Flag; |
36 CFX_FloatRect» » m_CharBox; | 35 CFX_FloatRect m_CharBox; |
37 CPDF_TextObject*» m_pTextObj; | 36 CPDF_TextObject* m_pTextObj; |
38 CFX_AffineMatrix» m_Matrix; | 37 CFX_AffineMatrix m_Matrix; |
39 int»» » » » m_Index; | 38 int m_Index; |
40 } PAGECHAR_INFO; | 39 } PAGECHAR_INFO; |
41 typedef»CFX_SegmentedArray<PAGECHAR_INFO> PAGECHAR_InfoArray; | 40 typedef CFX_SegmentedArray<PAGECHAR_INFO> PAGECHAR_InfoArray; |
42 typedef struct { | 41 typedef struct { |
43 int»m_Start; | 42 int m_Start; |
44 int m_nCount; | 43 int m_nCount; |
45 } FPDF_SEGMENT; | 44 } FPDF_SEGMENT; |
46 typedef CFX_ArrayTemplate<FPDF_SEGMENT> SEGMENT_Array; | 45 typedef CFX_ArrayTemplate<FPDF_SEGMENT> SEGMENT_Array; |
47 typedef struct { | 46 typedef struct { |
48 CPDF_TextObject*» m_pTextObj; | 47 CPDF_TextObject* m_pTextObj; |
49 CFX_AffineMatrix» m_formMatrix; | 48 CFX_AffineMatrix m_formMatrix; |
50 } PDFTEXT_Obj; | 49 } PDFTEXT_Obj; |
51 typedef CFX_ArrayTemplate<PDFTEXT_Obj> LINEOBJ; | 50 typedef CFX_ArrayTemplate<PDFTEXT_Obj> LINEOBJ; |
52 class CPDF_TextPage: public IPDF_TextPage | 51 class CPDF_TextPage : public IPDF_TextPage { |
53 { | 52 public: |
54 public: | 53 CPDF_TextPage(const CPDF_Page* pPage, int flags = 0); |
55 CPDF_TextPage(const CPDF_Page* pPage, int flags = 0); | 54 CPDF_TextPage(const CPDF_PageObjects* pPage, int flags = 0); |
56 CPDF_TextPage(const CPDF_PageObjects* pPage, int flags = 0); | 55 CPDF_TextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions); |
57 CPDF_TextPage(const CPDF_Page* pPage, CPDFText_ParseOptions ParserOptions); | 56 virtual FX_BOOL ParseTextPage(); |
58 virtual FX_BOOL ParseTextPage(); | 57 virtual void NormalizeObjects(FX_BOOL bNormalize); |
59 virtual void NormalizeObjects(FX_BOOL
bNormalize); | 58 virtual FX_BOOL IsParsered() const { return m_IsParsered; } |
60 virtual FX_BOOL IsParsered() const | 59 virtual ~CPDF_TextPage(){}; |
61 { | 60 |
62 return m_IsParsered; | 61 public: |
63 } | 62 virtual int CharIndexFromTextIndex(int TextIndex) const; |
64 virtual ~CPDF_TextPage() {}; | 63 virtual int TextIndexFromCharIndex(int CharIndex) const; |
65 public: | 64 virtual int CountChars() const; |
66 virtual int CharIndexFromTextIndex(int TextIndex)const ; | 65 virtual void GetCharInfo(int index, FPDF_CHAR_INFO& info) const; |
67 virtual int TextIndexFromCharIndex(int CharIndex)const; | 66 virtual void GetRectArray(int start, |
68 virtual int CountChars() const; | 67 int nCount, |
69 virtual void GetCharInfo(int index, F
PDF_CHAR_INFO & info) const; | 68 CFX_RectArray& rectArray) const; |
70 virtual void GetRectArray(int start,
int nCount, CFX_RectArray& rectArray) const; | 69 virtual int GetIndexAtPos(CPDF_Point point, |
71 virtual int GetIndexAtPos(CPDF_Point
point, FX_FLOAT xTorelance, FX_FLOAT yTorelance) const; | 70 FX_FLOAT xTorelance, |
72 virtual int GetIndexAtPos(FX_FLOAT x
, FX_FLOAT y, FX_FLOAT xTorelance, | 71 FX_FLOAT yTorelance) const; |
73 FX_FLOAT yTorelance) const; | 72 virtual int GetIndexAtPos(FX_FLOAT x, |
74 virtual CFX_WideString GetTextByRect(CFX_FloatRect rect
) const; | 73 FX_FLOAT y, |
75 virtual void GetRectsArrayByRect(CFX_
FloatRect rect, CFX_RectArray& resRectArray) const; | 74 FX_FLOAT xTorelance, |
76 virtual int GetOrderByDirect
ion(int order, int direction) const; | 75 FX_FLOAT yTorelance) const; |
77 virtual CFX_WideString GetPageText(int start = 0, int n
Count = -1) const; | 76 virtual CFX_WideString GetTextByRect(CFX_FloatRect rect) const; |
78 | 77 virtual void GetRectsArrayByRect(CFX_FloatRect rect, |
79 virtual int CountRects(int start, in
t nCount); | 78 CFX_RectArray& resRectArray) const; |
80 virtual void GetRect(int rectIndex, F
X_FLOAT& left, FX_FLOAT& top | 79 virtual int GetOrderByDirection(int order, int direction) const; |
81 , FX_FLOAT& right, FX_FLOAT &bottom)
const; | 80 virtual CFX_WideString GetPageText(int start = 0, int nCount = -1) const; |
82 virtual FX_BOOL GetBaselineRotate(int re
ctIndex, int& Rotate); | 81 |
83 virtual FX_BOOL GetBaselineRotate(CFX_Fl
oatRect rect, int& Rotate); | 82 virtual int CountRects(int start, int nCount); |
84 virtual int CountBoundedSegm
ents(FX_FLOAT left, FX_FLOAT top, | 83 virtual void GetRect(int rectIndex, |
85 FX_FLOAT right, FX_FLOAT bottom, FX_BOOL bContains = FALSE); | 84 FX_FLOAT& left, |
86 virtual void GetBoundedSegment(int in
dex, int& start, int& count) const; | 85 FX_FLOAT& top, |
87 virtual int GetWordBreak(int index,
int direction) const; | 86 FX_FLOAT& right, |
88 public: | 87 FX_FLOAT& bottom) const; |
89 const PAGECHAR_InfoArray* GetCharList() const | 88 virtual FX_BOOL GetBaselineRotate(int rectIndex, int& Rotate); |
90 { | 89 virtual FX_BOOL GetBaselineRotate(CFX_FloatRect rect, int& Rotate); |
91 return &m_charList; | 90 virtual int CountBoundedSegments(FX_FLOAT left, |
92 } | 91 FX_FLOAT top, |
93 static FX_BOOL IsRectIntersect(CFX_Floa
tRect rect1, CFX_FloatRect rect2); | 92 FX_FLOAT right, |
94 static FX_BOOL IsLetter(FX_WCHAR unicod
e); | 93 FX_FLOAT bottom, |
95 private: | 94 FX_BOOL bContains = FALSE); |
96 FX_BOOL IsHyphen(FX_WCHA
R curChar); | 95 virtual void GetBoundedSegment(int index, int& start, int& count) const; |
97 FX_BOOL IsControlChar(PA
GECHAR_INFO* pCharInfo); | 96 virtual int GetWordBreak(int index, int direction) const; |
98 FX_BOOL GetBaselineRotat
e(int start, int end, int& Rotate); | 97 |
99 void ProcessObject(); | 98 public: |
100 void ProcessFormObjec
t(CPDF_FormObject* pFormObj, CFX_AffineMatrix formMatrix); | 99 const PAGECHAR_InfoArray* GetCharList() const { return &m_charList; } |
101 void ProcessTextObjec
t(PDFTEXT_Obj pObj); | 100 static FX_BOOL IsRectIntersect(CFX_FloatRect rect1, CFX_FloatRect rect2); |
102 void ProcessTextObjec
t(CPDF_TextObject* pTextObj, CFX_AffineMatrix formMatrix, FX_POSITION ObjPo
s); | 101 static FX_BOOL IsLetter(FX_WCHAR unicode); |
103 int ProcessInsertObj
ect(const CPDF_TextObject* pObj, CFX_AffineMatrix formMatrix); | 102 |
104 FX_BOOL GenerateCharInfo
(FX_WCHAR unicode, PAGECHAR_INFO& info); | 103 private: |
105 FX_BOOL IsSameAsPreTextO
bject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos); | 104 FX_BOOL IsHyphen(FX_WCHAR curChar); |
106 FX_BOOL IsSameTextObject
(CPDF_TextObject* pTextObj1, CPDF_TextObject* pTextObj2); | 105 FX_BOOL IsControlChar(PAGECHAR_INFO* pCharInfo); |
107 int GetCharWidth(FX_
DWORD charCode, CPDF_Font* pFont) const; | 106 FX_BOOL GetBaselineRotate(int start, int end, int& Rotate); |
108 void CloseTempLine(); | 107 void ProcessObject(); |
109 void OnPiece(IFX_Bidi
Char* pBidi, CFX_WideString& str); | 108 void ProcessFormObject(CPDF_FormObject* pFormObj, |
110 FX_INT32 PreMarkedContent(PDFTEXT_Obj pObj); | 109 CFX_AffineMatrix formMatrix); |
111 void ProcessMarkedContent(PDFTEXT_Obj pObj); | 110 void ProcessTextObject(PDFTEXT_Obj pObj); |
112 void CheckMarkedContentObject(FX_INT32& start, FX_INT32& nCou
nt) const; | 111 void ProcessTextObject(CPDF_TextObject* pTextObj, |
113 void FindPreviousTextObject(void); | 112 CFX_AffineMatrix formMatrix, |
114 void AddCharInfoByLRDirection(CFX_WideString& str, int i); | 113 FX_POSITION ObjPos); |
115 void AddCharInfoByRLDirection(CFX_WideString& str, int i); | 114 int ProcessInsertObject(const CPDF_TextObject* pObj, |
116 FX_INT32 GetTextObjectWritingMode(const CPDF_TextObject* pTextObj); | 115 CFX_AffineMatrix formMatrix); |
117 FX_INT32 FindTextlineFlowDirection(); | 116 FX_BOOL GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info); |
118 protected: | 117 FX_BOOL IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSITION ObjPos); |
119 CPDFText_ParseOptions m_ParseOptions; | 118 FX_BOOL IsSameTextObject(CPDF_TextObject* pTextObj1, |
120 CFX_WordArray m_CharIndex; | 119 CPDF_TextObject* pTextObj2); |
121 const CPDF_PageObjects* m_pPage; | 120 int GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const; |
122 PAGECHAR_InfoArray m_charList; | 121 void CloseTempLine(); |
123 CFX_WideTextBuf m_TextBuf; | 122 void OnPiece(IFX_BidiChar* pBidi, CFX_WideString& str); |
124 PAGECHAR_InfoArray m_TempCharList; | 123 FX_INT32 PreMarkedContent(PDFTEXT_Obj pObj); |
125 CFX_WideTextBuf m_TempTextBuf; | 124 void ProcessMarkedContent(PDFTEXT_Obj pObj); |
126 int m_parserflag; | 125 void CheckMarkedContentObject(FX_INT32& start, FX_INT32& nCount) const; |
127 CPDF_TextObject* m_pPreTextObj; | 126 void FindPreviousTextObject(void); |
128 CFX_AffineMatrix m_perMatrix; | 127 void AddCharInfoByLRDirection(CFX_WideString& str, int i); |
129 FX_BOOL m_IsParsered; | 128 void AddCharInfoByRLDirection(CFX_WideString& str, int i); |
130 CFX_AffineMatrix m_DisplayMatrix; | 129 FX_INT32 GetTextObjectWritingMode(const CPDF_TextObject* pTextObj); |
131 | 130 FX_INT32 FindTextlineFlowDirection(); |
132 SEGMENT_Array m_Segment; | 131 |
133 CFX_RectArray m_SelRects; | 132 protected: |
134 LINEOBJ m_LineObj; | 133 CPDFText_ParseOptions m_ParseOptions; |
135 FX_BOOL m_TextlineDir; | 134 CFX_WordArray m_CharIndex; |
136 CFX_FloatRect m_CurlineRect; | 135 const CPDF_PageObjects* m_pPage; |
137 }; | 136 PAGECHAR_InfoArray m_charList; |
138 class CPDF_TextPageFind: public IPDF_TextPageFind | 137 CFX_WideTextBuf m_TextBuf; |
139 { | 138 PAGECHAR_InfoArray m_TempCharList; |
140 public: | 139 CFX_WideTextBuf m_TempTextBuf; |
141 CPDF_TextPageFind(const IPDF_TextPage* pTextPage); | 140 int m_parserflag; |
142 virtual ~CPDF_TextPageFi
nd() {}; | 141 CPDF_TextObject* m_pPreTextObj; |
143 public: | 142 CFX_AffineMatrix m_perMatrix; |
144 virtual FX_BOOL FindFirst(CFX_WideString
findwhat, int flags, int startPos = 0); | 143 FX_BOOL m_IsParsered; |
145 virtual FX_BOOL FindNext(); | 144 CFX_AffineMatrix m_DisplayMatrix; |
146 virtual FX_BOOL FindPrev(); | 145 |
147 | 146 SEGMENT_Array m_Segment; |
148 virtual void GetRectArray(CFX_RectArr
ay& rects) const; | 147 CFX_RectArray m_SelRects; |
149 virtual int GetCurOrder() const; | 148 LINEOBJ m_LineObj; |
150 virtual int GetMatchedCount()const; | 149 FX_BOOL m_TextlineDir; |
151 protected: | 150 CFX_FloatRect m_CurlineRect; |
152 void ExtractFindWhat(
CFX_WideString findwhat); | 151 }; |
153 FX_BOOL IsMatchWholeWord
(CFX_WideString csPageText, int startPos, int endPos); | 152 class CPDF_TextPageFind : public IPDF_TextPageFind { |
154 FX_BOOL ExtractSubString
(CFX_WideString& rString, FX_LPCWSTR lpszFullString, | 153 public: |
155 int iSubString, FX_WCHAR chSep); | 154 CPDF_TextPageFind(const IPDF_TextPage* pTextPage); |
156 CFX_WideString MakeReverse(const CFX_Wi
deString str); | 155 virtual ~CPDF_TextPageFind(){}; |
157 int ReverseFind(CFX_
WideString csPageText, CFX_WideString csWord, int nStartPos, int& WordLength); | 156 |
158 int GetCharIndex(int
index) const; | 157 public: |
159 private: | 158 virtual FX_BOOL FindFirst(CFX_WideString findwhat, |
160 CFX_WordArray m_CharIndex; | 159 int flags, |
161 const IPDF_TextPage* m_pTextPage; | 160 int startPos = 0); |
162 CFX_WideString m_strText; | 161 virtual FX_BOOL FindNext(); |
163 CFX_WideString m_findWhat; | 162 virtual FX_BOOL FindPrev(); |
164 int m_flags; | 163 |
165 CFX_WideStringArray m_csFindWhatArray; | 164 virtual void GetRectArray(CFX_RectArray& rects) const; |
166 int m_findNextStart; | 165 virtual int GetCurOrder() const; |
167 int m_findPreStart; | 166 virtual int GetMatchedCount() const; |
168 FX_BOOL m_bMatchCase; | 167 |
169 FX_BOOL m_bMatchWholeWor
d; | 168 protected: |
170 int m_resStart; | 169 void ExtractFindWhat(CFX_WideString findwhat); |
171 int m_resEnd; | 170 FX_BOOL IsMatchWholeWord(CFX_WideString csPageText, int startPos, int endPos); |
172 CFX_RectArray m_resArray; | 171 FX_BOOL ExtractSubString(CFX_WideString& rString, |
173 FX_BOOL m_IsFind; | 172 FX_LPCWSTR lpszFullString, |
174 }; | 173 int iSubString, |
175 class CPDF_LinkExt: public CFX_Object | 174 FX_WCHAR chSep); |
176 { | 175 CFX_WideString MakeReverse(const CFX_WideString str); |
177 public: | 176 int ReverseFind(CFX_WideString csPageText, |
178 CPDF_LinkExt() {}; | 177 CFX_WideString csWord, |
179 int m_Start; | 178 int nStartPos, |
180 int m_Count; | 179 int& WordLength); |
181 CFX_WideString m_strUrl; | 180 int GetCharIndex(int index) const; |
182 virtual ~CPDF_LinkExt()
{}; | 181 |
| 182 private: |
| 183 CFX_WordArray m_CharIndex; |
| 184 const IPDF_TextPage* m_pTextPage; |
| 185 CFX_WideString m_strText; |
| 186 CFX_WideString m_findWhat; |
| 187 int m_flags; |
| 188 CFX_WideStringArray m_csFindWhatArray; |
| 189 int m_findNextStart; |
| 190 int m_findPreStart; |
| 191 FX_BOOL m_bMatchCase; |
| 192 FX_BOOL m_bMatchWholeWord; |
| 193 int m_resStart; |
| 194 int m_resEnd; |
| 195 CFX_RectArray m_resArray; |
| 196 FX_BOOL m_IsFind; |
| 197 }; |
| 198 class CPDF_LinkExt : public CFX_Object { |
| 199 public: |
| 200 CPDF_LinkExt(){}; |
| 201 int m_Start; |
| 202 int m_Count; |
| 203 CFX_WideString m_strUrl; |
| 204 virtual ~CPDF_LinkExt(){}; |
183 }; | 205 }; |
184 typedef CFX_ArrayTemplate<CPDF_LinkExt*> LINK_InfoArray; | 206 typedef CFX_ArrayTemplate<CPDF_LinkExt*> LINK_InfoArray; |
185 class CPDF_LinkExtract: public IPDF_LinkExtract | 207 class CPDF_LinkExtract : public IPDF_LinkExtract { |
186 { | 208 public: |
187 public: | 209 CPDF_LinkExtract(); |
188 CPDF_LinkExtract(); | 210 virtual ~CPDF_LinkExtract(); |
189 virtual» » » » » » » ~CPDF_LinkExtrac
t(); | 211 virtual FX_BOOL ExtractLinks(const IPDF_TextPage* pTextPage); |
190 virtual FX_BOOL» » » » » ExtractLinks(const IPDF_
TextPage* pTextPage); | 212 virtual FX_BOOL IsExtract() const { return m_IsParserd; } |
191 virtual» FX_BOOL»» » » » IsExtract() const | 213 |
192 { | 214 public: |
193 return m_IsParserd; | 215 virtual int CountLinks() const; |
194 } | 216 virtual CFX_WideString GetURL(int index) const; |
195 public: | 217 virtual void GetBoundedSegment(int index, int& start, int& count) const; |
196 virtual int»» » » » » CountLinks() const; | 218 virtual void GetRects(int index, CFX_RectArray& rects) const; |
197 virtual» CFX_WideString» » » GetURL(int index) const; | 219 |
198 virtual» void» » » » » GetBoundedSegment(int in
dex, int& start, int& count) const; | 220 protected: |
199 virtual» void» » » » » GetRects(int index, CFX_
RectArray& rects)const; | 221 void parserLink(); |
200 protected: | 222 void DeleteLinkList(); |
201 void» » » » » » » parserLink(); | 223 FX_BOOL CheckWebLink(CFX_WideString& strBeCheck); |
202 void» » » » » » » DeleteLinkList()
; | 224 FX_BOOL CheckMailLink(CFX_WideString& str); |
203 FX_BOOL» » » » » » » CheckWebLink(CFX
_WideString& strBeCheck); | 225 FX_BOOL AppendToLinkList(int start, int count, CFX_WideString strUrl); |
204 FX_BOOL» » » » » » » CheckMailLink(CF
X_WideString& str); | 226 |
205 FX_BOOL» » » » » » » AppendToLinkList
(int start, int count, CFX_WideString strUrl); | 227 private: |
206 private: | 228 LINK_InfoArray m_LinkList; |
207 LINK_InfoArray» » » » » m_LinkList; | 229 const CPDF_TextPage* m_pTextPage; |
208 const CPDF_TextPage*» » » m_pTextPage; | 230 CFX_WideString m_strPageText; |
209 CFX_WideString» » » » » m_strPageText; | 231 FX_BOOL m_IsParserd; |
210 FX_BOOL» » » » » » » m_IsParserd; | |
211 }; | 232 }; |
212 FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_LPWSTR pDst); | 233 FX_STRSIZE FX_Unicode_GetNormalization(FX_WCHAR wch, FX_LPWSTR pDst); |
213 void NormalizeString(CFX_WideString& str); | 234 void NormalizeString(CFX_WideString& str); |
214 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest); | 235 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest); |
215 #endif | 236 #endif |
OLD | NEW |