Index: core/fpdftext/fpdf_text_int.cpp |
diff --git a/core/fpdftext/fpdf_text_int.cpp b/core/fpdftext/fpdf_text_int.cpp |
index 8e8686c4a1ac6bcba681f41e5446c829543dfc78..741331fb7711984b851d25053cefc5def0d50828 100644 |
--- a/core/fpdftext/fpdf_text_int.cpp |
+++ b/core/fpdftext/fpdf_text_int.cpp |
@@ -4,8 +4,6 @@ |
// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
-#include "core/fpdftext/fpdf_text_int.h" |
- |
#include <algorithm> |
#include <cctype> |
#include <cwctype> |
@@ -14,15 +12,17 @@ |
#include <vector> |
#include "core/fpdfapi/fpdf_font/include/cpdf_font.h" |
+#include "core/fpdfapi/fpdf_page/include/cpdf_form.h" |
#include "core/fpdfapi/fpdf_page/include/cpdf_formobject.h" |
+#include "core/fpdfapi/fpdf_page/include/cpdf_page.h" |
#include "core/fpdfapi/fpdf_page/include/cpdf_pageobject.h" |
#include "core/fpdfapi/fpdf_page/include/cpdf_textobject.h" |
#include "core/fpdfapi/fpdf_parser/include/cpdf_dictionary.h" |
#include "core/fpdfapi/fpdf_parser/include/cpdf_string.h" |
-#include "core/fpdftext/include/ipdf_linkextract.h" |
-#include "core/fpdftext/include/ipdf_textpage.h" |
-#include "core/fpdftext/include/ipdf_textpagefind.h" |
-#include "core/fpdftext/unicodenormalization.h" |
+#include "core/fpdftext/include/cpdf_linkextract.h" |
+#include "core/fpdftext/include/cpdf_textpage.h" |
+#include "core/fpdftext/include/cpdf_textpagefind.h" |
+#include "core/fpdftext/unicodenormalizationdata.h" |
#include "core/fxcrt/fx_bidi.h" |
#include "core/fxcrt/include/fx_ext.h" |
#include "core/fxcrt/include/fx_ucd.h" |
@@ -36,9 +36,24 @@ |
#define FPDFTEXT_MATCHWHOLEWORD 0x00000002 |
#define FPDFTEXT_CONSECUTIVE 0x00000004 |
+#define FPDFTEXT_CHAR_ERROR -1 /* FPDFTEXT_CHAR_* group: distinct codes -1..4; their users are outside this hunk — presumably per-character kind flags, TODO confirm */ |
+#define FPDFTEXT_CHAR_NORMAL 0 |
+#define FPDFTEXT_CHAR_GENERATED 1 |
+#define FPDFTEXT_CHAR_UNUNICODE 2 |
+#define FPDFTEXT_CHAR_HYPHEN 3 |
+#define FPDFTEXT_CHAR_PIECE 4 |
+#define FPDFTEXT_MC_PASS 0 /* FPDFTEXT_MC_* group: distinct codes 0..2; their users are outside this hunk — meaning to be confirmed against the callers */ |
+#define FPDFTEXT_MC_DONE 1 |
+#define FPDFTEXT_MC_DELAY 2 |
+ |
namespace { |
-FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) { |
+const FX_FLOAT kDefaultFontSize = 1.0f; |
+const uint16_t* const g_UnicodeData_Normalization_Maps[5] = { /* Table-of-tables indexed by Unicode_GetNormalization(); slot 0 holds nullptr, slots 1..4 point at Map1..Map4 */ |
+ nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2, |
+ g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4}; |
+ |
+FX_BOOL IsIgnoreSpaceCharacter(FX_WCHAR curChar) { |
if (curChar < 255) { |
return FALSE; |
} |
@@ -55,7 +70,7 @@ FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) { |
return TRUE; |
} |
-FX_FLOAT _NormalizeThreshold(FX_FLOAT threshold) { |
+FX_FLOAT NormalizeThreshold(FX_FLOAT threshold) { |
if (threshold < 300) { |
return threshold / 2.0f; |
} |
@@ -68,8 +83,8 @@ FX_FLOAT _NormalizeThreshold(FX_FLOAT threshold) { |
return threshold / 6.0f; |
} |
-FX_FLOAT _CalculateBaseSpace(const CPDF_TextObject* pTextObj, |
- const CFX_Matrix& matrix) { |
+FX_FLOAT CalculateBaseSpace(const CPDF_TextObject* pTextObj, |
+ const CFX_Matrix& matrix) { |
FX_FLOAT baseSpace = 0.0; |
const int nItems = pTextObj->CountItems(); |
if (pTextObj->m_TextState.GetObject()->m_CharSpace && nItems >= 3) { |
@@ -94,23 +109,39 @@ FX_FLOAT _CalculateBaseSpace(const CPDF_TextObject* pTextObj, |
return baseSpace; |
} |
-const FX_FLOAT kDefaultFontSize = 1.0f; |
- |
-} // namespace |
- |
-IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage, |
- int flags) { |
- return new CPDF_TextPage(pPage, flags); |
-} |
- |
-IPDF_TextPageFind* IPDF_TextPageFind::CreatePageFind( |
- const IPDF_TextPage* pTextPage) { |
- return pTextPage ? new CPDF_TextPageFind(pTextPage) : nullptr; |
+FX_STRSIZE Unicode_GetNormalization(FX_WCHAR wch, FX_WCHAR* pDst) { /* Looks up the replacement sequence for wch in the g_UnicodeData_Normalization tables and returns its length; the sequence is written to pDst only when pDst is non-null, so callers may pass null first to size a buffer. */ |
+ wch = wch & 0xFFFF; /* only the low 16 bits are used as the lookup index */ |
+ FX_WCHAR wFind = g_UnicodeData_Normalization[wch]; /* packed entry; 0 means "no mapping" */ |
+ if (!wFind) { |
+ if (pDst) { |
+ *pDst = wch; /* unmapped: the (masked) character stands for itself */ |
+ } |
+ return 1; |
+ } |
+ if (wFind >= 0x8000) { /* entry >= 0x8000: the remainder is an offset into table 1 */ |
+ wch = wFind - 0x8000; |
+ wFind = 1; |
+ } else { /* otherwise: low 12 bits are the offset, the bits above select the table */ |
+ wch = wFind & 0x0FFF; |
+ wFind >>= 12; |
+ } |
+ const uint16_t* pMap = g_UnicodeData_Normalization_Maps[wFind]; /* NOTE(review): wFind can be 0..7 here but the table has 5 slots and slot 0 is nullptr — presumably the data never encodes 0 or 5..7; confirm */ |
+ if (pMap == g_UnicodeData_Normalization_Map4) { /* Map4 entries carry their own length in the first element */ |
+ pMap = g_UnicodeData_Normalization_Map4 + wch; |
+ wFind = (FX_WCHAR)(*pMap++); /* read the length and leave pMap on the first output character */ |
+ } else { /* for the other tables the table index itself is used as the output length */ |
+ pMap += wch; |
+ } |
+ if (pDst) { |
+ FX_WCHAR n = wFind; |
+ while (n--) { /* copy wFind characters; pDst must have room for that many */ |
+ *pDst++ = *pMap++; |
+ } |
+ } |
+ return (FX_STRSIZE)wFind; |
 } |
-IPDF_LinkExtract* IPDF_LinkExtract::CreateLinkExtract() { |
- return new CPDF_LinkExtract(); |
-} |
+} // namespace |
#define TEXT_BLANK_CHAR L' ' |
#define TEXT_LINEFEED_CHAR L'\n' |
@@ -932,10 +963,10 @@ void CPDF_TextPage::AddCharInfoByLRDirection(FX_WCHAR wChar, |
info.m_Index = m_TextBuf.GetLength(); |
if (wChar >= 0xFB00 && wChar <= 0xFB06) { |
FX_WCHAR* pDst = NULL; |
- FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); |
+ FX_STRSIZE nCount = Unicode_GetNormalization(wChar, pDst); |
if (nCount >= 1) { |
pDst = FX_Alloc(FX_WCHAR, nCount); |
- FX_Unicode_GetNormalization(wChar, pDst); |
+ Unicode_GetNormalization(wChar, pDst); |
for (int nIndex = 0; nIndex < nCount; nIndex++) { |
PAGECHAR_INFO info2 = info; |
info2.m_Unicode = pDst[nIndex]; |
@@ -960,10 +991,10 @@ void CPDF_TextPage::AddCharInfoByRLDirection(FX_WCHAR wChar, |
info.m_Index = m_TextBuf.GetLength(); |
wChar = FX_GetMirrorChar(wChar, TRUE, FALSE); |
FX_WCHAR* pDst = NULL; |
- FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); |
+ FX_STRSIZE nCount = Unicode_GetNormalization(wChar, pDst); |
if (nCount >= 1) { |
pDst = FX_Alloc(FX_WCHAR, nCount); |
- FX_Unicode_GetNormalization(wChar, pDst); |
+ Unicode_GetNormalization(wChar, pDst); |
for (int nIndex = 0; nIndex < nCount; nIndex++) { |
PAGECHAR_INFO info2 = info; |
info2.m_Unicode = pDst[nIndex]; |
@@ -1377,7 +1408,7 @@ void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) { |
m_pPreTextObj = pTextObj; |
m_perMatrix.Copy(formMatrix); |
int nItems = pTextObj->CountItems(); |
- FX_FLOAT baseSpace = _CalculateBaseSpace(pTextObj, matrix); |
+ FX_FLOAT baseSpace = CalculateBaseSpace(pTextObj, matrix); |
const FX_BOOL bR2L = IsRightToLeft(pTextObj, pFont, nItems); |
const FX_BOOL bIsBidiAndMirrorInverse = |
@@ -1430,7 +1461,7 @@ void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) { |
int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont)); |
threshold = this_width > last_width ? (FX_FLOAT)this_width |
: (FX_FLOAT)last_width; |
- threshold = _NormalizeThreshold(threshold); |
+ threshold = NormalizeThreshold(threshold); |
threshold = fontsize_h * threshold / 1000; |
} |
if (threshold && (spacing && spacing >= threshold)) { |
@@ -1898,7 +1929,7 @@ FX_BOOL CPDF_TextPage::IsLetter(FX_WCHAR unicode) { |
return TRUE; |
} |
-CPDF_TextPageFind::CPDF_TextPageFind(const IPDF_TextPage* pTextPage) |
+CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage) |
: m_pTextPage(pTextPage), |
m_flags(0), |
m_findNextStart(-1), |
@@ -2054,8 +2085,8 @@ FX_BOOL CPDF_TextPageFind::FindNext() { |
CFX_WideString lastWord = m_csFindWhatArray[iWord - 1]; |
int lastChar = lastWord.GetAt(lastWord.GetLength() - 1); |
if (nStartPos == nResultPos && |
- !(_IsIgnoreSpaceCharacter(lastChar) || |
- _IsIgnoreSpaceCharacter(curChar))) { |
+ !(IsIgnoreSpaceCharacter(lastChar) || |
+ IsIgnoreSpaceCharacter(curChar))) { |
bMatch = FALSE; |
} |
for (int d = PreResEndPos; d < nResultPos; d++) { |
@@ -2174,7 +2205,7 @@ void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) { |
while (pos < csWord.GetLength()) { |
CFX_WideString curStr = csWord.Mid(pos, 1); |
FX_WCHAR curChar = csWord.GetAt(pos); |
- if (_IsIgnoreSpaceCharacter(curChar)) { |
+ if (IsIgnoreSpaceCharacter(curChar)) { |
if (pos > 0 && curChar == 0x2019) { |
pos++; |
continue; |
@@ -2306,7 +2337,7 @@ CPDF_LinkExtract::~CPDF_LinkExtract() { |
DeleteLinkList(); |
} |
-FX_BOOL CPDF_LinkExtract::ExtractLinks(const IPDF_TextPage* pTextPage) { |
+FX_BOOL CPDF_LinkExtract::ExtractLinks(const CPDF_TextPage* pTextPage) { |
if (!pTextPage || !pTextPage->IsParsed()) |
return FALSE; |