core/fpdftext/cpdf_textpage.cpp - Issue 2286723003: Split fpdf_text_int into classes

Side by Side Diff: core/fpdftext/cpdf_textpage.cpp

Issue 2286723003: Split fpdf_text_int into classes (Closed)

Patch Set: Fix bots Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2014 PDFium Authors. All rights reserved.	1 // Copyright 2014 PDFium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com	5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

6	6

	7 #include "core/fpdftext/include/cpdf_textpage.h"

	8

7 #include <algorithm>	9 #include <algorithm>

8 #include <cctype>

9 #include <cwctype>

10 #include <memory>

11 #include <utility>	10 #include <utility>

12 #include <vector>	11 #include <vector>

13	12

14 #include "core/fpdfapi/fpdf_font/include/cpdf_font.h"	13 #include "core/fpdfapi/fpdf_font/include/cpdf_font.h"

15 #include "core/fpdfapi/fpdf_page/include/cpdf_form.h"	14 #include "core/fpdfapi/fpdf_page/include/cpdf_form.h"

16 #include "core/fpdfapi/fpdf_page/include/cpdf_formobject.h"	15 #include "core/fpdfapi/fpdf_page/include/cpdf_formobject.h"

17 #include "core/fpdfapi/fpdf_page/include/cpdf_page.h"	16 #include "core/fpdfapi/fpdf_page/include/cpdf_page.h"

18 #include "core/fpdfapi/fpdf_page/include/cpdf_pageobject.h"	17 #include "core/fpdfapi/fpdf_page/include/cpdf_pageobject.h"

19 #include "core/fpdfapi/fpdf_page/include/cpdf_textobject.h"	18 #include "core/fpdfapi/fpdf_page/include/cpdf_textobject.h"

20 #include "core/fpdfapi/fpdf_parser/include/cpdf_dictionary.h"	19 #include "core/fpdfapi/fpdf_parser/include/cpdf_dictionary.h"

21 #include "core/fpdfapi/fpdf_parser/include/cpdf_string.h"	20 #include "core/fpdfapi/fpdf_parser/include/cpdf_string.h"

22 #include "core/fpdftext/include/cpdf_linkextract.h"

23 #include "core/fpdftext/include/cpdf_textpage.h"

24 #include "core/fpdftext/include/cpdf_textpagefind.h"

25 #include "core/fpdftext/unicodenormalizationdata.h"	21 #include "core/fpdftext/unicodenormalizationdata.h"

26 #include "core/fxcrt/fx_bidi.h"	22 #include "core/fxcrt/fx_bidi.h"

27 #include "core/fxcrt/include/fx_ext.h"	23 #include "core/fxcrt/include/fx_ext.h"

28 #include "core/fxcrt/include/fx_ucd.h"	24 #include "core/fxcrt/include/fx_ucd.h"

29 #include "third_party/base/stl_util.h"	25 #include "third_party/base/stl_util.h"

30	26

31 #define FPDFTEXT_MATCHCASE 0x00000001

32 #define FPDFTEXT_MATCHWHOLEWORD 0x00000002

33 #define FPDFTEXT_CONSECUTIVE 0x00000004

34

35 #define FPDFTEXT_CHAR_ERROR -1

36 #define FPDFTEXT_CHAR_NORMAL 0

37 #define FPDFTEXT_CHAR_GENERATED 1

38 #define FPDFTEXT_CHAR_UNUNICODE 2

39 #define FPDFTEXT_CHAR_HYPHEN 3

40 #define FPDFTEXT_CHAR_PIECE 4

41

42 #define TEXT_SPACE_CHAR L' '

43 #define TEXT_LINEFEED_CHAR L'\n'

44 #define TEXT_RETURN_CHAR L'\r'

45 #define TEXT_EMPTY L""

46 #define TEXT_SPACE L" "

47 #define TEXT_RETURN_LINEFEED L"\r\n"

48 #define TEXT_LINEFEED L"\n"

49 #define TEXT_CHARRATIO_GAPDELTA 0.070

50

51 namespace {	27 namespace {

52	28

53 const FX_FLOAT kDefaultFontSize = 1.0f;	29 const FX_FLOAT kDefaultFontSize = 1.0f;

54 const uint16_t* const g_UnicodeData_Normalization_Maps[5] = {	30 const uint16_t* const g_UnicodeData_Normalization_Maps[5] = {

55 nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2,	31 nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2,

56 g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4};	32 g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4};

57	33

58 FX_BOOL IsIgnoreSpaceCharacter(FX_WCHAR curChar) {

59 if (curChar < 255)

60 return FALSE;

61 if ((curChar >= 0x0600 && curChar <= 0x06FF) ||

62 (curChar >= 0xFE70 && curChar <= 0xFEFF) ||

63 (curChar >= 0xFB50 && curChar <= 0xFDFF) ||

64 (curChar >= 0x0400 && curChar <= 0x04FF) ||

65 (curChar >= 0x0500 && curChar <= 0x052F) ||

66 (curChar >= 0xA640 && curChar <= 0xA69F) ||

67 (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 ||

68 (curChar >= 0x2000 && curChar <= 0x206F)) {

69 return FALSE;

70 }

71 return TRUE;

72 }

73

74 FX_FLOAT NormalizeThreshold(FX_FLOAT threshold) {	34 FX_FLOAT NormalizeThreshold(FX_FLOAT threshold) {

75 if (threshold < 300)	35 if (threshold < 300)

76 return threshold / 2.0f;	36 return threshold / 2.0f;

77 if (threshold < 500)	37 if (threshold < 500)

78 return threshold / 4.0f;	38 return threshold / 4.0f;

79 if (threshold < 700)	39 if (threshold < 700)

80 return threshold / 5.0f;	40 return threshold / 5.0f;

81 return threshold / 6.0f;	41 return threshold / 6.0f;

82 }	42 }

83	43

(...skipping 1496 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1580 info.m_OriginY);	1540 info.m_OriginY);

1581 return TRUE;	1541 return TRUE;

1582 }	1542 }

1583	1543

1584 FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1,	1544 FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1,

1585 const CFX_FloatRect& rect2) {	1545 const CFX_FloatRect& rect2) {

1586 CFX_FloatRect rect = rect1;	1546 CFX_FloatRect rect = rect1;

1587 rect.Intersect(rect2);	1547 rect.Intersect(rect2);

1588 return !rect.IsEmpty();	1548 return !rect.IsEmpty();

1589 }	1549 }

1590

1591 CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)

1592 : m_pTextPage(pTextPage),

1593 m_flags(0),

1594 m_findNextStart(-1),

1595 m_findPreStart(-1),

1596 m_bMatchCase(FALSE),

1597 m_bMatchWholeWord(FALSE),

1598 m_resStart(0),

1599 m_resEnd(-1),

1600 m_IsFind(FALSE) {

1601 m_strText = m_pTextPage->GetPageText();

1602 int nCount = pTextPage->CountChars();

1603 if (nCount) {

1604 m_CharIndex.push_back(0);

1605 }

1606 for (int i = 0; i < nCount; i++) {

1607 FPDF_CHAR_INFO info;

1608 pTextPage->GetCharInfo(i, &info);

1609 int indexSize = pdfium::CollectionSize<int>(m_CharIndex);

1610 if (info.m_Flag == FPDFTEXT_CHAR_NORMAL ||

1611 info.m_Flag == FPDFTEXT_CHAR_GENERATED) {

1612 if (indexSize % 2) {

1613 m_CharIndex.push_back(1);

1614 } else {

1615 if (indexSize <= 0) {

1616 continue;

1617 }

1618 m_CharIndex[indexSize - 1] += 1;

1619 }

1620 } else {

1621 if (indexSize % 2) {

1622 if (indexSize <= 0) {

1623 continue;

1624 }

1625 m_CharIndex[indexSize - 1] = i + 1;

1626 } else {

1627 m_CharIndex.push_back(i + 1);

1628 }

1629 }

1630 }

1631 int indexSize = pdfium::CollectionSize<int>(m_CharIndex);

1632 if (indexSize % 2) {

1633 m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);

1634 }

1635 }

1636

1637 CPDF_TextPageFind::~CPDF_TextPageFind() {}

1638

1639 int CPDF_TextPageFind::GetCharIndex(int index) const {

1640 return m_pTextPage->CharIndexFromTextIndex(index);

1641 }

1642

1643 FX_BOOL CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat,

1644 int flags,

1645 int startPos) {

1646 if (!m_pTextPage) {

1647 return FALSE;

1648 }

1649 if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) {

1650 m_strText = m_pTextPage->GetPageText();

1651 }

1652 CFX_WideString findwhatStr = findwhat;

1653 m_findWhat = findwhatStr;

1654 m_flags = flags;

1655 m_bMatchCase = flags & FPDFTEXT_MATCHCASE;

1656 if (m_strText.IsEmpty()) {

1657 m_IsFind = FALSE;

1658 return TRUE;

1659 }

1660 FX_STRSIZE len = findwhatStr.GetLength();

1661 if (!m_bMatchCase) {

1662 findwhatStr.MakeLower();

1663 m_strText.MakeLower();

1664 }

1665 m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD;

1666 m_findNextStart = startPos;

1667 if (startPos == -1) {

1668 m_findPreStart = m_strText.GetLength() - 1;

1669 } else {

1670 m_findPreStart = startPos;

1671 }

1672 m_csFindWhatArray.clear();

1673 int i = 0;

1674 while (i < len) {

1675 if (findwhatStr.GetAt(i) != ' ') {

1676 break;

1677 }

1678 i++;

1679 }

1680 if (i < len) {

1681 ExtractFindWhat(findwhatStr);

1682 } else {

1683 m_csFindWhatArray.push_back(findwhatStr);

1684 }

1685 if (m_csFindWhatArray.empty()) {

1686 return FALSE;

1687 }

1688 m_IsFind = TRUE;

1689 m_resStart = 0;

1690 m_resEnd = -1;

1691 return TRUE;

1692 }

1693

1694 FX_BOOL CPDF_TextPageFind::FindNext() {

1695 if (!m_pTextPage) {

1696 return FALSE;

1697 }

1698 m_resArray.clear();

1699 if (m_findNextStart == -1) {

1700 return FALSE;

1701 }

1702 if (m_strText.IsEmpty()) {

1703 m_IsFind = FALSE;

1704 return m_IsFind;

1705 }

1706 int strLen = m_strText.GetLength();

1707 if (m_findNextStart > strLen - 1) {

1708 m_IsFind = FALSE;

1709 return m_IsFind;

1710 }

1711 int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray);

1712 int nResultPos = 0;

1713 int nStartPos = 0;

1714 nStartPos = m_findNextStart;

1715 FX_BOOL bSpaceStart = FALSE;

1716 for (int iWord = 0; iWord < nCount; iWord++) {

1717 CFX_WideString csWord = m_csFindWhatArray[iWord];

1718 if (csWord.IsEmpty()) {

1719 if (iWord == nCount - 1) {

1720 FX_WCHAR strInsert = m_strText.GetAt(nStartPos);

1721 if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR ||

1722 strInsert == TEXT_RETURN_CHAR || strInsert == 160) {

1723 nResultPos = nStartPos + 1;

1724 break;

1725 }

1726 iWord = -1;

1727 } else if (iWord == 0) {

1728 bSpaceStart = TRUE;

1729 }

1730 continue;

1731 }

1732 int endIndex;

1733 nResultPos = m_strText.Find(csWord.c_str(), nStartPos);

1734 if (nResultPos == -1) {

1735 m_IsFind = FALSE;

1736 return m_IsFind;

1737 }

1738 endIndex = nResultPos + csWord.GetLength() - 1;

1739 if (iWord == 0) {

1740 m_resStart = nResultPos;

1741 }

1742 FX_BOOL bMatch = TRUE;

1743 if (iWord != 0 && !bSpaceStart) {

1744 int PreResEndPos = nStartPos;

1745 int curChar = csWord.GetAt(0);

1746 CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];

1747 int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);

1748 if (nStartPos == nResultPos &&

1749 !(IsIgnoreSpaceCharacter(lastChar) ||

1750 IsIgnoreSpaceCharacter(curChar))) {

1751 bMatch = FALSE;

1752 }

1753 for (int d = PreResEndPos; d < nResultPos; d++) {

1754 FX_WCHAR strInsert = m_strText.GetAt(d);

1755 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&

1756 strInsert != TEXT_RETURN_CHAR && strInsert != 160) {

1757 bMatch = FALSE;

1758 break;

1759 }

1760 }

1761 } else if (bSpaceStart) {

1762 if (nResultPos > 0) {

1763 FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);

1764 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&

1765 strInsert != TEXT_RETURN_CHAR && strInsert != 160) {

1766 bMatch = FALSE;

1767 m_resStart = nResultPos;

1768 } else {

1769 m_resStart = nResultPos - 1;

1770 }

1771 }

1772 }

1773 if (m_bMatchWholeWord && bMatch) {

1774 bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);

1775 }

1776 nStartPos = endIndex + 1;

1777 if (!bMatch) {

1778 iWord = -1;

1779 if (bSpaceStart) {

1780 nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();

1781 } else {

1782 nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();

1783 }

1784 }

1785 }

1786 m_resEnd = nResultPos + m_csFindWhatArray.back().GetLength() - 1;

1787 m_IsFind = TRUE;

1788 int resStart = GetCharIndex(m_resStart);

1789 int resEnd = GetCharIndex(m_resEnd);

1790 m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1);

1791 if (m_flags & FPDFTEXT_CONSECUTIVE) {

1792 m_findNextStart = m_resStart + 1;

1793 m_findPreStart = m_resEnd - 1;

1794 } else {

1795 m_findNextStart = m_resEnd + 1;

1796 m_findPreStart = m_resStart - 1;

1797 }

1798 return m_IsFind;

1799 }

1800

1801 FX_BOOL CPDF_TextPageFind::FindPrev() {

1802 if (!m_pTextPage) {

1803 return FALSE;

1804 }

1805 m_resArray.clear();

1806 if (m_strText.IsEmpty() || m_findPreStart < 0) {

1807 m_IsFind = FALSE;

1808 return m_IsFind;

1809 }

1810 CPDF_TextPageFind findEngine(m_pTextPage);

1811 FX_BOOL ret = findEngine.FindFirst(m_findWhat, m_flags);

1812 if (!ret) {

1813 m_IsFind = FALSE;

1814 return m_IsFind;

1815 }

1816 int order = -1, MatchedCount = 0;

1817 while (ret) {

1818 ret = findEngine.FindNext();

1819 if (ret) {

1820 int order1 = findEngine.GetCurOrder();

1821 int MatchedCount1 = findEngine.GetMatchedCount();

1822 if (((order1 + MatchedCount1) - 1) > m_findPreStart) {

1823 break;

1824 }

1825 order = order1;

1826 MatchedCount = MatchedCount1;

1827 }

1828 }

1829 if (order == -1) {

1830 m_IsFind = FALSE;

1831 return m_IsFind;

1832 }

1833 m_resStart = m_pTextPage->TextIndexFromCharIndex(order);

1834 m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);

1835 m_IsFind = TRUE;

1836 m_resArray = m_pTextPage->GetRectArray(order, MatchedCount);

1837 if (m_flags & FPDFTEXT_CONSECUTIVE) {

1838 m_findNextStart = m_resStart + 1;

1839 m_findPreStart = m_resEnd - 1;

1840 } else {

1841 m_findNextStart = m_resEnd + 1;

1842 m_findPreStart = m_resStart - 1;

1843 }

1844 return m_IsFind;

1845 }

1846

1847 void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) {

1848 if (findwhat.IsEmpty()) {

1849 return;

1850 }

1851 int index = 0;

1852 while (1) {

1853 CFX_WideString csWord = TEXT_EMPTY;

1854 int ret =

1855 ExtractSubString(csWord, findwhat.c_str(), index, TEXT_SPACE_CHAR);

1856 if (csWord.IsEmpty()) {

1857 if (ret) {

1858 m_csFindWhatArray.push_back(L"");

1859 index++;

1860 continue;

1861 } else {

1862 break;

1863 }

1864 }

1865 int pos = 0;

1866 while (pos < csWord.GetLength()) {

1867 CFX_WideString curStr = csWord.Mid(pos, 1);

1868 FX_WCHAR curChar = csWord.GetAt(pos);

1869 if (IsIgnoreSpaceCharacter(curChar)) {

1870 if (pos > 0 && curChar == 0x2019) {

1871 pos++;

1872 continue;

1873 }

1874 if (pos > 0) {

1875 m_csFindWhatArray.push_back(csWord.Mid(0, pos));

1876 }

1877 m_csFindWhatArray.push_back(curStr);

1878 if (pos == csWord.GetLength() - 1) {

1879 csWord.clear();

1880 break;

1881 }

1882 csWord = csWord.Right(csWord.GetLength() - pos - 1);

1883 pos = 0;

1884 continue;

1885 }

1886 pos++;

1887 }

1888 if (!csWord.IsEmpty()) {

1889 m_csFindWhatArray.push_back(csWord);

1890 }

1891 index++;

1892 }

1893 }

1894

1895 FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText,

1896 int startPos,

1897 int endPos) {

1898 FX_WCHAR char_left = 0;

1899 FX_WCHAR char_right = 0;

1900 int char_count = endPos - startPos + 1;

1901 if (char_count < 1) {

1902 return FALSE;

1903 }

1904 if (char_count == 1 && csPageText.GetAt(startPos) > 255) {

1905 return TRUE;

1906 }

1907 if (startPos - 1 >= 0) {

1908 char_left = csPageText.GetAt(startPos - 1);

1909 }

1910 if (startPos + char_count < csPageText.GetLength()) {

1911 char_right = csPageText.GetAt(startPos + char_count);

1912 }

1913 if ((char_left > 'A' && char_left < 'a') ||

1914 (char_left > 'a' && char_left < 'z') ||

1915 (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) ||

1916 (char_right > 'A' && char_right < 'a') ||

1917 (char_right > 'a' && char_right < 'z') ||

1918 (char_right > 0xfb00 && char_right < 0xfb06) ||

1919 std::iswdigit(char_right)) {

1920 return FALSE;

1921 }

1922 if (!(('A' > char_left || char_left > 'Z') &&

1923 ('a' > char_left || char_left > 'z') &&

1924 ('A' > char_right || char_right > 'Z') &&

1925 ('a' > char_right || char_right > 'z'))) {

1926 return FALSE;

1927 }

1928 if (char_count > 0) {

1929 if (csPageText.GetAt(startPos) >= L'0' &&

1930 csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' &&

1931 char_left <= L'9') {

1932 return FALSE;

1933 }

1934 if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' &&

1935 char_right >= L'0' && char_right <= L'9') {

1936 return FALSE;

1937 }

1938 }

1939 return TRUE;

1940 }

1941

1942 FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString,

1943 const FX_WCHAR* lpszFullString,

1944 int iSubString,

1945 FX_WCHAR chSep) {

1946 if (!lpszFullString) {

1947 return FALSE;

1948 }

1949 while (iSubString--) {

1950 lpszFullString = wcschr(lpszFullString, chSep);

1951 if (!lpszFullString) {

1952 rString.clear();

1953 return FALSE;

1954 }

1955 lpszFullString++;

1956 while (*lpszFullString == chSep) {

1957 lpszFullString++;

1958 }

1959 }

1960 const FX_WCHAR* lpchEnd = wcschr(lpszFullString, chSep);

1961 int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString)

1962 : (int)FXSYS_wcslen(lpszFullString);

1963 ASSERT(nLen >= 0);

1964 FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString,

1965 nLen * sizeof(FX_WCHAR));

1966 rString.ReleaseBuffer();

1967 return TRUE;

1968 }

1969

1970 CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) {

1971 CFX_WideString str2;

1972 str2.clear();

1973 int nlen = str.GetLength();

1974 for (int i = nlen - 1; i >= 0; i--) {

1975 str2 += str.GetAt(i);

1976 }

1977 return str2;

1978 }

1979

1980 int CPDF_TextPageFind::GetCurOrder() const {

1981 return GetCharIndex(m_resStart);

1982 }

1983

1984 int CPDF_TextPageFind::GetMatchedCount() const {

1985 int resStart = GetCharIndex(m_resStart);

1986 int resEnd = GetCharIndex(m_resEnd);

1987 return resEnd - resStart + 1;

1988 }

1989

1990 CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)

1991 : m_pTextPage(pTextPage) {}

1992

1993 CPDF_LinkExtract::~CPDF_LinkExtract() {

1994 }

1995

1996 void CPDF_LinkExtract::ExtractLinks() {

1997 m_LinkArray.clear();

1998 if (!m_pTextPage->IsParsed())

1999 return;

2000

2001 m_strPageText = m_pTextPage->GetPageText(0, -1);

2002 if (m_strPageText.IsEmpty())

2003 return;

2004

2005 ParseLink();

2006 }

2007

2008 void CPDF_LinkExtract::ParseLink() {

2009 int start = 0, pos = 0;

2010 int TotalChar = m_pTextPage->CountChars();

2011 while (pos < TotalChar) {

2012 FPDF_CHAR_INFO pageChar;

2013 m_pTextPage->GetCharInfo(pos, &pageChar);

2014 if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED ||

2015 pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) {

2016 int nCount = pos - start;

2017 if (pos == TotalChar - 1) {

2018 nCount++;

2019 }

2020 CFX_WideString strBeCheck;

2021 strBeCheck = m_pTextPage->GetPageText(start, nCount);

2022 if (strBeCheck.GetLength() > 5) {

2023 while (strBeCheck.GetLength() > 0) {

2024 FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);

2025 if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {

2026 strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);

2027 nCount--;

2028 } else {

2029 break;

2030 }

2031 }

2032 if (nCount > 5 &&

2033 (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {

2034 m_LinkArray.push_back({start, nCount, strBeCheck});

2035 }

2036 }

2037 start = ++pos;

2038 } else {

2039 pos++;

2040 }

2041 }

2042 }

2043

2044 bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {

2045 CFX_WideString str = strBeCheck;

2046 str.MakeLower();

2047 if (str.Find(L"http://www.") != -1) {

2048 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www."));

2049 return true;

2050 }

2051 if (str.Find(L"http://") != -1) {

2052 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));

2053 return true;

2054 }

2055 if (str.Find(L"https://www.") != -1) {

2056 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www."));

2057 return true;

2058 }

2059 if (str.Find(L"https://") != -1) {

2060 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));

2061 return true;

2062 }

2063 if (str.Find(L"www.") != -1) {

2064 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));

2065 strBeCheck = L"http://" + strBeCheck;

2066 return true;

2067 }

2068 return false;

2069 }

2070

2071 bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {

2072 int aPos = str.Find(L'@');

2073 // Invalid when no '@'.

2074 if (aPos < 1)

2075 return false;

2076

2077 // Check the local part.

2078 int pPos = aPos; // Used to track the position of '@' or '.'.

2079 for (int i = aPos - 1; i >= 0; i--) {

2080 FX_WCHAR ch = str.GetAt(i);

2081 if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))

2082 continue;

2083

2084 if (ch != L'.' || i == pPos - 1 || i == 0) {

2085 if (i == aPos - 1) {

2086 // There is '.' or invalid char before '@'.

2087 return FALSE;

2088 }

2089 // End extracting for other invalid chars, '.' at the beginning, or

2090 // consecutive '.'.

2091 int removed_len = i == pPos - 1 ? i + 2 : i + 1;

2092 str = str.Right(str.GetLength() - removed_len);

2093 break;

2094 }

2095 // Found a valid '.'.

2096 pPos = i;

2097 }

2098

2099 // Check the domain name part.

2100 aPos = str.Find(L'@');

2101 if (aPos < 1)

2102 return false;

2103

2104 str.TrimRight(L'.');

2105 // At least one '.' in domain name, but not at the beginning.

2106 // TODO(weili): RFC5322 allows domain names to be a local name without '.'.

2107 // Check whether we should remove this check.

2108 int ePos = str.Find(L'.', aPos + 1);

2109 if (ePos == -1 || ePos == aPos + 1)

2110 return false;

2111

2112 // Validate all other chars in domain name.

2113 int nLen = str.GetLength();

2114 pPos = 0; // Used to track the position of '.'.

2115 for (int i = aPos + 1; i < nLen; i++) {

2116 FX_WCHAR wch = str.GetAt(i);

2117 if (wch == L'-' || FXSYS_iswalnum(wch))

2118 continue;

2119

2120 if (wch != L'.' || i == pPos + 1) {

2121 // Domain name should end before invalid char.

2122 int host_end = i == pPos + 1 ? i - 2 : i - 1;

2123 if (pPos > 0 && host_end - aPos >= 3) {

2124 // Trim the ending invalid chars if there is at least one '.' and name.

2125 str = str.Left(host_end + 1);

2126 break;

2127 }

2128 return false;

2129 }

2130 pPos = i;

2131 }

2132

2133 if (str.Find(L"mailto:") == -1)

2134 str = L"mailto:" + str;

2135

2136 return true;

2137 }

2138

2139 CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const {

2140 return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L"";

2141 }

2142

2143 std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {

2144 if (index >= m_LinkArray.size())

2145 return std::vector<CFX_FloatRect>();

2146

2147 return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,

2148 m_LinkArray[index].m_Count);

2149 }

OLD	NEW

« no previous file with comments | « core/fpdftext/cpdf_linkextract.cpp ('k') | core/fpdftext/cpdf_textpagefind.cpp » ('j') | no next file with comments »