OLD | NEW |
1 // Copyright 2014 PDFium Authors. All rights reserved. | 1 // Copyright 2014 PDFium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com | 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
6 | 6 |
| 7 #include "core/fpdftext/include/cpdf_textpage.h" |
| 8 |
7 #include <algorithm> | 9 #include <algorithm> |
8 #include <cctype> | |
9 #include <cwctype> | |
10 #include <memory> | |
11 #include <utility> | 10 #include <utility> |
12 #include <vector> | 11 #include <vector> |
13 | 12 |
14 #include "core/fpdfapi/fpdf_font/include/cpdf_font.h" | 13 #include "core/fpdfapi/fpdf_font/include/cpdf_font.h" |
15 #include "core/fpdfapi/fpdf_page/include/cpdf_form.h" | 14 #include "core/fpdfapi/fpdf_page/include/cpdf_form.h" |
16 #include "core/fpdfapi/fpdf_page/include/cpdf_formobject.h" | 15 #include "core/fpdfapi/fpdf_page/include/cpdf_formobject.h" |
17 #include "core/fpdfapi/fpdf_page/include/cpdf_page.h" | 16 #include "core/fpdfapi/fpdf_page/include/cpdf_page.h" |
18 #include "core/fpdfapi/fpdf_page/include/cpdf_pageobject.h" | 17 #include "core/fpdfapi/fpdf_page/include/cpdf_pageobject.h" |
19 #include "core/fpdfapi/fpdf_page/include/cpdf_textobject.h" | 18 #include "core/fpdfapi/fpdf_page/include/cpdf_textobject.h" |
20 #include "core/fpdfapi/fpdf_parser/include/cpdf_dictionary.h" | 19 #include "core/fpdfapi/fpdf_parser/include/cpdf_dictionary.h" |
21 #include "core/fpdfapi/fpdf_parser/include/cpdf_string.h" | 20 #include "core/fpdfapi/fpdf_parser/include/cpdf_string.h" |
22 #include "core/fpdftext/include/cpdf_linkextract.h" | |
23 #include "core/fpdftext/include/cpdf_textpage.h" | |
24 #include "core/fpdftext/include/cpdf_textpagefind.h" | |
25 #include "core/fpdftext/unicodenormalizationdata.h" | 21 #include "core/fpdftext/unicodenormalizationdata.h" |
26 #include "core/fxcrt/fx_bidi.h" | 22 #include "core/fxcrt/fx_bidi.h" |
27 #include "core/fxcrt/include/fx_ext.h" | 23 #include "core/fxcrt/include/fx_ext.h" |
28 #include "core/fxcrt/include/fx_ucd.h" | 24 #include "core/fxcrt/include/fx_ucd.h" |
29 #include "third_party/base/stl_util.h" | 25 #include "third_party/base/stl_util.h" |
30 | 26 |
// Option bits tested against the |flags| argument of
// CPDF_TextPageFind::FindFirst() and the stored m_flags.
#define FPDFTEXT_MATCHCASE 0x00000001
#define FPDFTEXT_MATCHWHOLEWORD 0x00000002
#define FPDFTEXT_CONSECUTIVE 0x00000004

// Character classification values. NORMAL and GENERATED are compared against
// FPDF_CHAR_INFO::m_Flag in this file.
#define FPDFTEXT_CHAR_ERROR -1
#define FPDFTEXT_CHAR_NORMAL 0
#define FPDFTEXT_CHAR_GENERATED 1
#define FPDFTEXT_CHAR_UNUNICODE 2
#define FPDFTEXT_CHAR_HYPHEN 3
#define FPDFTEXT_CHAR_PIECE 4

// Wide-character and wide-string literals shared by the text routines.
#define TEXT_SPACE_CHAR L' '
#define TEXT_LINEFEED_CHAR L'\n'
#define TEXT_RETURN_CHAR L'\r'
#define TEXT_EMPTY L""
#define TEXT_SPACE L" "
#define TEXT_RETURN_LINEFEED L"\r\n"
#define TEXT_LINEFEED L"\n"
#define TEXT_CHARRATIO_GAPDELTA 0.070
50 | |
51 namespace { | 27 namespace { |
52 | 28 |
53 const FX_FLOAT kDefaultFontSize = 1.0f; | 29 const FX_FLOAT kDefaultFontSize = 1.0f; |
54 const uint16_t* const g_UnicodeData_Normalization_Maps[5] = { | 30 const uint16_t* const g_UnicodeData_Normalization_Maps[5] = { |
55 nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2, | 31 nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2, |
56 g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4}; | 32 g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4}; |
57 | 33 |
58 FX_BOOL IsIgnoreSpaceCharacter(FX_WCHAR curChar) { | |
59 if (curChar < 255) | |
60 return FALSE; | |
61 if ((curChar >= 0x0600 && curChar <= 0x06FF) || | |
62 (curChar >= 0xFE70 && curChar <= 0xFEFF) || | |
63 (curChar >= 0xFB50 && curChar <= 0xFDFF) || | |
64 (curChar >= 0x0400 && curChar <= 0x04FF) || | |
65 (curChar >= 0x0500 && curChar <= 0x052F) || | |
66 (curChar >= 0xA640 && curChar <= 0xA69F) || | |
67 (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 || | |
68 (curChar >= 0x2000 && curChar <= 0x206F)) { | |
69 return FALSE; | |
70 } | |
71 return TRUE; | |
72 } | |
73 | |
74 FX_FLOAT NormalizeThreshold(FX_FLOAT threshold) { | 34 FX_FLOAT NormalizeThreshold(FX_FLOAT threshold) { |
75 if (threshold < 300) | 35 if (threshold < 300) |
76 return threshold / 2.0f; | 36 return threshold / 2.0f; |
77 if (threshold < 500) | 37 if (threshold < 500) |
78 return threshold / 4.0f; | 38 return threshold / 4.0f; |
79 if (threshold < 700) | 39 if (threshold < 700) |
80 return threshold / 5.0f; | 40 return threshold / 5.0f; |
81 return threshold / 6.0f; | 41 return threshold / 6.0f; |
82 } | 42 } |
83 | 43 |
(...skipping 1496 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1580 info.m_OriginY); | 1540 info.m_OriginY); |
1581 return TRUE; | 1541 return TRUE; |
1582 } | 1542 } |
1583 | 1543 |
1584 FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1, | 1544 FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1, |
1585 const CFX_FloatRect& rect2) { | 1545 const CFX_FloatRect& rect2) { |
1586 CFX_FloatRect rect = rect1; | 1546 CFX_FloatRect rect = rect1; |
1587 rect.Intersect(rect2); | 1547 rect.Intersect(rect2); |
1588 return !rect.IsEmpty(); | 1548 return !rect.IsEmpty(); |
1589 } | 1549 } |
1590 | |
1591 CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage) | |
1592 : m_pTextPage(pTextPage), | |
1593 m_flags(0), | |
1594 m_findNextStart(-1), | |
1595 m_findPreStart(-1), | |
1596 m_bMatchCase(FALSE), | |
1597 m_bMatchWholeWord(FALSE), | |
1598 m_resStart(0), | |
1599 m_resEnd(-1), | |
1600 m_IsFind(FALSE) { | |
1601 m_strText = m_pTextPage->GetPageText(); | |
1602 int nCount = pTextPage->CountChars(); | |
1603 if (nCount) { | |
1604 m_CharIndex.push_back(0); | |
1605 } | |
1606 for (int i = 0; i < nCount; i++) { | |
1607 FPDF_CHAR_INFO info; | |
1608 pTextPage->GetCharInfo(i, &info); | |
1609 int indexSize = pdfium::CollectionSize<int>(m_CharIndex); | |
1610 if (info.m_Flag == FPDFTEXT_CHAR_NORMAL || | |
1611 info.m_Flag == FPDFTEXT_CHAR_GENERATED) { | |
1612 if (indexSize % 2) { | |
1613 m_CharIndex.push_back(1); | |
1614 } else { | |
1615 if (indexSize <= 0) { | |
1616 continue; | |
1617 } | |
1618 m_CharIndex[indexSize - 1] += 1; | |
1619 } | |
1620 } else { | |
1621 if (indexSize % 2) { | |
1622 if (indexSize <= 0) { | |
1623 continue; | |
1624 } | |
1625 m_CharIndex[indexSize - 1] = i + 1; | |
1626 } else { | |
1627 m_CharIndex.push_back(i + 1); | |
1628 } | |
1629 } | |
1630 } | |
1631 int indexSize = pdfium::CollectionSize<int>(m_CharIndex); | |
1632 if (indexSize % 2) { | |
1633 m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1); | |
1634 } | |
1635 } | |
1636 | |
1637 CPDF_TextPageFind::~CPDF_TextPageFind() {} | |
1638 | |
1639 int CPDF_TextPageFind::GetCharIndex(int index) const { | |
1640 return m_pTextPage->CharIndexFromTextIndex(index); | |
1641 } | |
1642 | |
1643 FX_BOOL CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat, | |
1644 int flags, | |
1645 int startPos) { | |
1646 if (!m_pTextPage) { | |
1647 return FALSE; | |
1648 } | |
1649 if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) { | |
1650 m_strText = m_pTextPage->GetPageText(); | |
1651 } | |
1652 CFX_WideString findwhatStr = findwhat; | |
1653 m_findWhat = findwhatStr; | |
1654 m_flags = flags; | |
1655 m_bMatchCase = flags & FPDFTEXT_MATCHCASE; | |
1656 if (m_strText.IsEmpty()) { | |
1657 m_IsFind = FALSE; | |
1658 return TRUE; | |
1659 } | |
1660 FX_STRSIZE len = findwhatStr.GetLength(); | |
1661 if (!m_bMatchCase) { | |
1662 findwhatStr.MakeLower(); | |
1663 m_strText.MakeLower(); | |
1664 } | |
1665 m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD; | |
1666 m_findNextStart = startPos; | |
1667 if (startPos == -1) { | |
1668 m_findPreStart = m_strText.GetLength() - 1; | |
1669 } else { | |
1670 m_findPreStart = startPos; | |
1671 } | |
1672 m_csFindWhatArray.clear(); | |
1673 int i = 0; | |
1674 while (i < len) { | |
1675 if (findwhatStr.GetAt(i) != ' ') { | |
1676 break; | |
1677 } | |
1678 i++; | |
1679 } | |
1680 if (i < len) { | |
1681 ExtractFindWhat(findwhatStr); | |
1682 } else { | |
1683 m_csFindWhatArray.push_back(findwhatStr); | |
1684 } | |
1685 if (m_csFindWhatArray.empty()) { | |
1686 return FALSE; | |
1687 } | |
1688 m_IsFind = TRUE; | |
1689 m_resStart = 0; | |
1690 m_resEnd = -1; | |
1691 return TRUE; | |
1692 } | |
1693 | |
1694 FX_BOOL CPDF_TextPageFind::FindNext() { | |
1695 if (!m_pTextPage) { | |
1696 return FALSE; | |
1697 } | |
1698 m_resArray.clear(); | |
1699 if (m_findNextStart == -1) { | |
1700 return FALSE; | |
1701 } | |
1702 if (m_strText.IsEmpty()) { | |
1703 m_IsFind = FALSE; | |
1704 return m_IsFind; | |
1705 } | |
1706 int strLen = m_strText.GetLength(); | |
1707 if (m_findNextStart > strLen - 1) { | |
1708 m_IsFind = FALSE; | |
1709 return m_IsFind; | |
1710 } | |
1711 int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray); | |
1712 int nResultPos = 0; | |
1713 int nStartPos = 0; | |
1714 nStartPos = m_findNextStart; | |
1715 FX_BOOL bSpaceStart = FALSE; | |
1716 for (int iWord = 0; iWord < nCount; iWord++) { | |
1717 CFX_WideString csWord = m_csFindWhatArray[iWord]; | |
1718 if (csWord.IsEmpty()) { | |
1719 if (iWord == nCount - 1) { | |
1720 FX_WCHAR strInsert = m_strText.GetAt(nStartPos); | |
1721 if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR || | |
1722 strInsert == TEXT_RETURN_CHAR || strInsert == 160) { | |
1723 nResultPos = nStartPos + 1; | |
1724 break; | |
1725 } | |
1726 iWord = -1; | |
1727 } else if (iWord == 0) { | |
1728 bSpaceStart = TRUE; | |
1729 } | |
1730 continue; | |
1731 } | |
1732 int endIndex; | |
1733 nResultPos = m_strText.Find(csWord.c_str(), nStartPos); | |
1734 if (nResultPos == -1) { | |
1735 m_IsFind = FALSE; | |
1736 return m_IsFind; | |
1737 } | |
1738 endIndex = nResultPos + csWord.GetLength() - 1; | |
1739 if (iWord == 0) { | |
1740 m_resStart = nResultPos; | |
1741 } | |
1742 FX_BOOL bMatch = TRUE; | |
1743 if (iWord != 0 && !bSpaceStart) { | |
1744 int PreResEndPos = nStartPos; | |
1745 int curChar = csWord.GetAt(0); | |
1746 CFX_WideString lastWord = m_csFindWhatArray[iWord - 1]; | |
1747 int lastChar = lastWord.GetAt(lastWord.GetLength() - 1); | |
1748 if (nStartPos == nResultPos && | |
1749 !(IsIgnoreSpaceCharacter(lastChar) || | |
1750 IsIgnoreSpaceCharacter(curChar))) { | |
1751 bMatch = FALSE; | |
1752 } | |
1753 for (int d = PreResEndPos; d < nResultPos; d++) { | |
1754 FX_WCHAR strInsert = m_strText.GetAt(d); | |
1755 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR && | |
1756 strInsert != TEXT_RETURN_CHAR && strInsert != 160) { | |
1757 bMatch = FALSE; | |
1758 break; | |
1759 } | |
1760 } | |
1761 } else if (bSpaceStart) { | |
1762 if (nResultPos > 0) { | |
1763 FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1); | |
1764 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR && | |
1765 strInsert != TEXT_RETURN_CHAR && strInsert != 160) { | |
1766 bMatch = FALSE; | |
1767 m_resStart = nResultPos; | |
1768 } else { | |
1769 m_resStart = nResultPos - 1; | |
1770 } | |
1771 } | |
1772 } | |
1773 if (m_bMatchWholeWord && bMatch) { | |
1774 bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex); | |
1775 } | |
1776 nStartPos = endIndex + 1; | |
1777 if (!bMatch) { | |
1778 iWord = -1; | |
1779 if (bSpaceStart) { | |
1780 nStartPos = m_resStart + m_csFindWhatArray[1].GetLength(); | |
1781 } else { | |
1782 nStartPos = m_resStart + m_csFindWhatArray[0].GetLength(); | |
1783 } | |
1784 } | |
1785 } | |
1786 m_resEnd = nResultPos + m_csFindWhatArray.back().GetLength() - 1; | |
1787 m_IsFind = TRUE; | |
1788 int resStart = GetCharIndex(m_resStart); | |
1789 int resEnd = GetCharIndex(m_resEnd); | |
1790 m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1); | |
1791 if (m_flags & FPDFTEXT_CONSECUTIVE) { | |
1792 m_findNextStart = m_resStart + 1; | |
1793 m_findPreStart = m_resEnd - 1; | |
1794 } else { | |
1795 m_findNextStart = m_resEnd + 1; | |
1796 m_findPreStart = m_resStart - 1; | |
1797 } | |
1798 return m_IsFind; | |
1799 } | |
1800 | |
1801 FX_BOOL CPDF_TextPageFind::FindPrev() { | |
1802 if (!m_pTextPage) { | |
1803 return FALSE; | |
1804 } | |
1805 m_resArray.clear(); | |
1806 if (m_strText.IsEmpty() || m_findPreStart < 0) { | |
1807 m_IsFind = FALSE; | |
1808 return m_IsFind; | |
1809 } | |
1810 CPDF_TextPageFind findEngine(m_pTextPage); | |
1811 FX_BOOL ret = findEngine.FindFirst(m_findWhat, m_flags); | |
1812 if (!ret) { | |
1813 m_IsFind = FALSE; | |
1814 return m_IsFind; | |
1815 } | |
1816 int order = -1, MatchedCount = 0; | |
1817 while (ret) { | |
1818 ret = findEngine.FindNext(); | |
1819 if (ret) { | |
1820 int order1 = findEngine.GetCurOrder(); | |
1821 int MatchedCount1 = findEngine.GetMatchedCount(); | |
1822 if (((order1 + MatchedCount1) - 1) > m_findPreStart) { | |
1823 break; | |
1824 } | |
1825 order = order1; | |
1826 MatchedCount = MatchedCount1; | |
1827 } | |
1828 } | |
1829 if (order == -1) { | |
1830 m_IsFind = FALSE; | |
1831 return m_IsFind; | |
1832 } | |
1833 m_resStart = m_pTextPage->TextIndexFromCharIndex(order); | |
1834 m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1); | |
1835 m_IsFind = TRUE; | |
1836 m_resArray = m_pTextPage->GetRectArray(order, MatchedCount); | |
1837 if (m_flags & FPDFTEXT_CONSECUTIVE) { | |
1838 m_findNextStart = m_resStart + 1; | |
1839 m_findPreStart = m_resEnd - 1; | |
1840 } else { | |
1841 m_findNextStart = m_resEnd + 1; | |
1842 m_findPreStart = m_resStart - 1; | |
1843 } | |
1844 return m_IsFind; | |
1845 } | |
1846 | |
1847 void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) { | |
1848 if (findwhat.IsEmpty()) { | |
1849 return; | |
1850 } | |
1851 int index = 0; | |
1852 while (1) { | |
1853 CFX_WideString csWord = TEXT_EMPTY; | |
1854 int ret = | |
1855 ExtractSubString(csWord, findwhat.c_str(), index, TEXT_SPACE_CHAR); | |
1856 if (csWord.IsEmpty()) { | |
1857 if (ret) { | |
1858 m_csFindWhatArray.push_back(L""); | |
1859 index++; | |
1860 continue; | |
1861 } else { | |
1862 break; | |
1863 } | |
1864 } | |
1865 int pos = 0; | |
1866 while (pos < csWord.GetLength()) { | |
1867 CFX_WideString curStr = csWord.Mid(pos, 1); | |
1868 FX_WCHAR curChar = csWord.GetAt(pos); | |
1869 if (IsIgnoreSpaceCharacter(curChar)) { | |
1870 if (pos > 0 && curChar == 0x2019) { | |
1871 pos++; | |
1872 continue; | |
1873 } | |
1874 if (pos > 0) { | |
1875 m_csFindWhatArray.push_back(csWord.Mid(0, pos)); | |
1876 } | |
1877 m_csFindWhatArray.push_back(curStr); | |
1878 if (pos == csWord.GetLength() - 1) { | |
1879 csWord.clear(); | |
1880 break; | |
1881 } | |
1882 csWord = csWord.Right(csWord.GetLength() - pos - 1); | |
1883 pos = 0; | |
1884 continue; | |
1885 } | |
1886 pos++; | |
1887 } | |
1888 if (!csWord.IsEmpty()) { | |
1889 m_csFindWhatArray.push_back(csWord); | |
1890 } | |
1891 index++; | |
1892 } | |
1893 } | |
1894 | |
1895 FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText, | |
1896 int startPos, | |
1897 int endPos) { | |
1898 FX_WCHAR char_left = 0; | |
1899 FX_WCHAR char_right = 0; | |
1900 int char_count = endPos - startPos + 1; | |
1901 if (char_count < 1) { | |
1902 return FALSE; | |
1903 } | |
1904 if (char_count == 1 && csPageText.GetAt(startPos) > 255) { | |
1905 return TRUE; | |
1906 } | |
1907 if (startPos - 1 >= 0) { | |
1908 char_left = csPageText.GetAt(startPos - 1); | |
1909 } | |
1910 if (startPos + char_count < csPageText.GetLength()) { | |
1911 char_right = csPageText.GetAt(startPos + char_count); | |
1912 } | |
1913 if ((char_left > 'A' && char_left < 'a') || | |
1914 (char_left > 'a' && char_left < 'z') || | |
1915 (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) || | |
1916 (char_right > 'A' && char_right < 'a') || | |
1917 (char_right > 'a' && char_right < 'z') || | |
1918 (char_right > 0xfb00 && char_right < 0xfb06) || | |
1919 std::iswdigit(char_right)) { | |
1920 return FALSE; | |
1921 } | |
1922 if (!(('A' > char_left || char_left > 'Z') && | |
1923 ('a' > char_left || char_left > 'z') && | |
1924 ('A' > char_right || char_right > 'Z') && | |
1925 ('a' > char_right || char_right > 'z'))) { | |
1926 return FALSE; | |
1927 } | |
1928 if (char_count > 0) { | |
1929 if (csPageText.GetAt(startPos) >= L'0' && | |
1930 csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' && | |
1931 char_left <= L'9') { | |
1932 return FALSE; | |
1933 } | |
1934 if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' && | |
1935 char_right >= L'0' && char_right <= L'9') { | |
1936 return FALSE; | |
1937 } | |
1938 } | |
1939 return TRUE; | |
1940 } | |
1941 | |
1942 FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString, | |
1943 const FX_WCHAR* lpszFullString, | |
1944 int iSubString, | |
1945 FX_WCHAR chSep) { | |
1946 if (!lpszFullString) { | |
1947 return FALSE; | |
1948 } | |
1949 while (iSubString--) { | |
1950 lpszFullString = wcschr(lpszFullString, chSep); | |
1951 if (!lpszFullString) { | |
1952 rString.clear(); | |
1953 return FALSE; | |
1954 } | |
1955 lpszFullString++; | |
1956 while (*lpszFullString == chSep) { | |
1957 lpszFullString++; | |
1958 } | |
1959 } | |
1960 const FX_WCHAR* lpchEnd = wcschr(lpszFullString, chSep); | |
1961 int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString) | |
1962 : (int)FXSYS_wcslen(lpszFullString); | |
1963 ASSERT(nLen >= 0); | |
1964 FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString, | |
1965 nLen * sizeof(FX_WCHAR)); | |
1966 rString.ReleaseBuffer(); | |
1967 return TRUE; | |
1968 } | |
1969 | |
1970 CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) { | |
1971 CFX_WideString str2; | |
1972 str2.clear(); | |
1973 int nlen = str.GetLength(); | |
1974 for (int i = nlen - 1; i >= 0; i--) { | |
1975 str2 += str.GetAt(i); | |
1976 } | |
1977 return str2; | |
1978 } | |
1979 | |
1980 int CPDF_TextPageFind::GetCurOrder() const { | |
1981 return GetCharIndex(m_resStart); | |
1982 } | |
1983 | |
1984 int CPDF_TextPageFind::GetMatchedCount() const { | |
1985 int resStart = GetCharIndex(m_resStart); | |
1986 int resEnd = GetCharIndex(m_resEnd); | |
1987 return resEnd - resStart + 1; | |
1988 } | |
1989 | |
1990 CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage) | |
1991 : m_pTextPage(pTextPage) {} | |
1992 | |
1993 CPDF_LinkExtract::~CPDF_LinkExtract() { | |
1994 } | |
1995 | |
1996 void CPDF_LinkExtract::ExtractLinks() { | |
1997 m_LinkArray.clear(); | |
1998 if (!m_pTextPage->IsParsed()) | |
1999 return; | |
2000 | |
2001 m_strPageText = m_pTextPage->GetPageText(0, -1); | |
2002 if (m_strPageText.IsEmpty()) | |
2003 return; | |
2004 | |
2005 ParseLink(); | |
2006 } | |
2007 | |
2008 void CPDF_LinkExtract::ParseLink() { | |
2009 int start = 0, pos = 0; | |
2010 int TotalChar = m_pTextPage->CountChars(); | |
2011 while (pos < TotalChar) { | |
2012 FPDF_CHAR_INFO pageChar; | |
2013 m_pTextPage->GetCharInfo(pos, &pageChar); | |
2014 if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED || | |
2015 pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) { | |
2016 int nCount = pos - start; | |
2017 if (pos == TotalChar - 1) { | |
2018 nCount++; | |
2019 } | |
2020 CFX_WideString strBeCheck; | |
2021 strBeCheck = m_pTextPage->GetPageText(start, nCount); | |
2022 if (strBeCheck.GetLength() > 5) { | |
2023 while (strBeCheck.GetLength() > 0) { | |
2024 FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1); | |
2025 if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') { | |
2026 strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1); | |
2027 nCount--; | |
2028 } else { | |
2029 break; | |
2030 } | |
2031 } | |
2032 if (nCount > 5 && | |
2033 (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) { | |
2034 m_LinkArray.push_back({start, nCount, strBeCheck}); | |
2035 } | |
2036 } | |
2037 start = ++pos; | |
2038 } else { | |
2039 pos++; | |
2040 } | |
2041 } | |
2042 } | |
2043 | |
2044 bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) { | |
2045 CFX_WideString str = strBeCheck; | |
2046 str.MakeLower(); | |
2047 if (str.Find(L"http://www.") != -1) { | |
2048 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www.")); | |
2049 return true; | |
2050 } | |
2051 if (str.Find(L"http://") != -1) { | |
2052 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://")); | |
2053 return true; | |
2054 } | |
2055 if (str.Find(L"https://www.") != -1) { | |
2056 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www.")); | |
2057 return true; | |
2058 } | |
2059 if (str.Find(L"https://") != -1) { | |
2060 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://")); | |
2061 return true; | |
2062 } | |
2063 if (str.Find(L"www.") != -1) { | |
2064 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www.")); | |
2065 strBeCheck = L"http://" + strBeCheck; | |
2066 return true; | |
2067 } | |
2068 return false; | |
2069 } | |
2070 | |
2071 bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { | |
2072 int aPos = str.Find(L'@'); | |
2073 // Invalid when no '@'. | |
2074 if (aPos < 1) | |
2075 return false; | |
2076 | |
2077 // Check the local part. | |
2078 int pPos = aPos; // Used to track the position of '@' or '.'. | |
2079 for (int i = aPos - 1; i >= 0; i--) { | |
2080 FX_WCHAR ch = str.GetAt(i); | |
2081 if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) | |
2082 continue; | |
2083 | |
2084 if (ch != L'.' || i == pPos - 1 || i == 0) { | |
2085 if (i == aPos - 1) { | |
2086 // There is '.' or invalid char before '@'. | |
2087 return FALSE; | |
2088 } | |
2089 // End extracting for other invalid chars, '.' at the beginning, or | |
2090 // consecutive '.'. | |
2091 int removed_len = i == pPos - 1 ? i + 2 : i + 1; | |
2092 str = str.Right(str.GetLength() - removed_len); | |
2093 break; | |
2094 } | |
2095 // Found a valid '.'. | |
2096 pPos = i; | |
2097 } | |
2098 | |
2099 // Check the domain name part. | |
2100 aPos = str.Find(L'@'); | |
2101 if (aPos < 1) | |
2102 return false; | |
2103 | |
2104 str.TrimRight(L'.'); | |
2105 // At least one '.' in domain name, but not at the beginning. | |
2106 // TODO(weili): RFC5322 allows domain names to be a local name without '.'. | |
2107 // Check whether we should remove this check. | |
2108 int ePos = str.Find(L'.', aPos + 1); | |
2109 if (ePos == -1 || ePos == aPos + 1) | |
2110 return false; | |
2111 | |
2112 // Validate all other chars in domain name. | |
2113 int nLen = str.GetLength(); | |
2114 pPos = 0; // Used to track the position of '.'. | |
2115 for (int i = aPos + 1; i < nLen; i++) { | |
2116 FX_WCHAR wch = str.GetAt(i); | |
2117 if (wch == L'-' || FXSYS_iswalnum(wch)) | |
2118 continue; | |
2119 | |
2120 if (wch != L'.' || i == pPos + 1) { | |
2121 // Domain name should end before invalid char. | |
2122 int host_end = i == pPos + 1 ? i - 2 : i - 1; | |
2123 if (pPos > 0 && host_end - aPos >= 3) { | |
2124 // Trim the ending invalid chars if there is at least one '.' and name. | |
2125 str = str.Left(host_end + 1); | |
2126 break; | |
2127 } | |
2128 return false; | |
2129 } | |
2130 pPos = i; | |
2131 } | |
2132 | |
2133 if (str.Find(L"mailto:") == -1) | |
2134 str = L"mailto:" + str; | |
2135 | |
2136 return true; | |
2137 } | |
2138 | |
2139 CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const { | |
2140 return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L""; | |
2141 } | |
2142 | |
2143 std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const { | |
2144 if (index >= m_LinkArray.size()) | |
2145 return std::vector<CFX_FloatRect>(); | |
2146 | |
2147 return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start, | |
2148 m_LinkArray[index].m_Count); | |
2149 } | |
OLD | NEW |