| OLD | NEW |
| 1 // Copyright 2014 PDFium Authors. All rights reserved. | 1 // Copyright 2014 PDFium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com | 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
| 6 | 6 |
| 7 #include "core/fpdftext/include/cpdf_textpage.h" |
| 8 |
| 7 #include <algorithm> | 9 #include <algorithm> |
| 8 #include <cctype> | |
| 9 #include <cwctype> | |
| 10 #include <memory> | |
| 11 #include <utility> | 10 #include <utility> |
| 12 #include <vector> | 11 #include <vector> |
| 13 | 12 |
| 14 #include "core/fpdfapi/fpdf_font/include/cpdf_font.h" | 13 #include "core/fpdfapi/fpdf_font/include/cpdf_font.h" |
| 15 #include "core/fpdfapi/fpdf_page/include/cpdf_form.h" | 14 #include "core/fpdfapi/fpdf_page/include/cpdf_form.h" |
| 16 #include "core/fpdfapi/fpdf_page/include/cpdf_formobject.h" | 15 #include "core/fpdfapi/fpdf_page/include/cpdf_formobject.h" |
| 17 #include "core/fpdfapi/fpdf_page/include/cpdf_page.h" | 16 #include "core/fpdfapi/fpdf_page/include/cpdf_page.h" |
| 18 #include "core/fpdfapi/fpdf_page/include/cpdf_pageobject.h" | 17 #include "core/fpdfapi/fpdf_page/include/cpdf_pageobject.h" |
| 19 #include "core/fpdfapi/fpdf_page/include/cpdf_textobject.h" | 18 #include "core/fpdfapi/fpdf_page/include/cpdf_textobject.h" |
| 20 #include "core/fpdfapi/fpdf_parser/include/cpdf_dictionary.h" | 19 #include "core/fpdfapi/fpdf_parser/include/cpdf_dictionary.h" |
| 21 #include "core/fpdfapi/fpdf_parser/include/cpdf_string.h" | 20 #include "core/fpdfapi/fpdf_parser/include/cpdf_string.h" |
| 22 #include "core/fpdftext/include/cpdf_linkextract.h" | |
| 23 #include "core/fpdftext/include/cpdf_textpage.h" | |
| 24 #include "core/fpdftext/include/cpdf_textpagefind.h" | |
| 25 #include "core/fpdftext/unicodenormalizationdata.h" | 21 #include "core/fpdftext/unicodenormalizationdata.h" |
| 26 #include "core/fxcrt/fx_bidi.h" | 22 #include "core/fxcrt/fx_bidi.h" |
| 27 #include "core/fxcrt/include/fx_ext.h" | 23 #include "core/fxcrt/include/fx_ext.h" |
| 28 #include "core/fxcrt/include/fx_ucd.h" | 24 #include "core/fxcrt/include/fx_ucd.h" |
| 29 #include "third_party/base/stl_util.h" | 25 #include "third_party/base/stl_util.h" |
| 30 | 26 |
// Search option bits; CPDF_TextPageFind::FindFirst() tests MATCHCASE and
// MATCHWHOLEWORD on its |flags| argument, and FindNext()/FindPrev() test
// CONSECUTIVE on the stored flags.
| 31 #define FPDFTEXT_MATCHCASE 0x00000001 | |
| 32 #define FPDFTEXT_MATCHWHOLEWORD 0x00000002 | |
| 33 #define FPDFTEXT_CONSECUTIVE 0x00000004 | |
| 34 | |
// Values compared against FPDF_CHAR_INFO::m_Flag in this file (NORMAL and
// GENERATED are the two tested in the code visible here).
| 35 #define FPDFTEXT_CHAR_ERROR -1 | |
| 36 #define FPDFTEXT_CHAR_NORMAL 0 | |
| 37 #define FPDFTEXT_CHAR_GENERATED 1 | |
| 38 #define FPDFTEXT_CHAR_UNUNICODE 2 | |
| 39 #define FPDFTEXT_CHAR_HYPHEN 3 | |
| 40 #define FPDFTEXT_CHAR_PIECE 4 | |
| 41 | |
// Wide character / wide string literals shared by the text-search code.
| 42 #define TEXT_SPACE_CHAR L' ' | |
| 43 #define TEXT_LINEFEED_CHAR L'\n' | |
| 44 #define TEXT_RETURN_CHAR L'\r' | |
| 45 #define TEXT_EMPTY L"" | |
| 46 #define TEXT_SPACE L" " | |
| 47 #define TEXT_RETURN_LINEFEED L"\r\n" | |
| 48 #define TEXT_LINEFEED L"\n" | |
// NOTE(review): not referenced in the portion of the file visible here;
// presumably a character-gap ratio used by the text layout code - confirm.
| 49 #define TEXT_CHARRATIO_GAPDELTA 0.070 | |
| 50 | |
| 51 namespace { | 27 namespace { |
| 52 | 28 |
// NOTE(review): the uses of kDefaultFontSize are outside the visible part of
// this file; presumably a fallback font size - confirm against callers.
| 53 const FX_FLOAT kDefaultFontSize = 1.0f; | 29 const FX_FLOAT kDefaultFontSize = 1.0f; |
// Table of the four normalization maps declared in
// unicodenormalizationdata.h. Slot 0 is intentionally nullptr, so valid map
// indices are 1..4.
| 54 const uint16_t* const g_UnicodeData_Normalization_Maps[5] = { | 30 const uint16_t* const g_UnicodeData_Normalization_Maps[5] = { |
| 55 nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2, | 31 nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2, |
| 56 g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4}; | 32 g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4}; |
| 57 | 33 |
| 58 FX_BOOL IsIgnoreSpaceCharacter(FX_WCHAR curChar) { | |
| 59 if (curChar < 255) | |
| 60 return FALSE; | |
| 61 if ((curChar >= 0x0600 && curChar <= 0x06FF) || | |
| 62 (curChar >= 0xFE70 && curChar <= 0xFEFF) || | |
| 63 (curChar >= 0xFB50 && curChar <= 0xFDFF) || | |
| 64 (curChar >= 0x0400 && curChar <= 0x04FF) || | |
| 65 (curChar >= 0x0500 && curChar <= 0x052F) || | |
| 66 (curChar >= 0xA640 && curChar <= 0xA69F) || | |
| 67 (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 || | |
| 68 (curChar >= 0x2000 && curChar <= 0x206F)) { | |
| 69 return FALSE; | |
| 70 } | |
| 71 return TRUE; | |
| 72 } | |
| 73 | |
| 74 FX_FLOAT NormalizeThreshold(FX_FLOAT threshold) { | 34 FX_FLOAT NormalizeThreshold(FX_FLOAT threshold) { |
| 75 if (threshold < 300) | 35 if (threshold < 300) |
| 76 return threshold / 2.0f; | 36 return threshold / 2.0f; |
| 77 if (threshold < 500) | 37 if (threshold < 500) |
| 78 return threshold / 4.0f; | 38 return threshold / 4.0f; |
| 79 if (threshold < 700) | 39 if (threshold < 700) |
| 80 return threshold / 5.0f; | 40 return threshold / 5.0f; |
| 81 return threshold / 6.0f; | 41 return threshold / 6.0f; |
| 82 } | 42 } |
| 83 | 43 |
| (...skipping 1496 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1580 info.m_OriginY); | 1540 info.m_OriginY); |
| 1581 return TRUE; | 1541 return TRUE; |
| 1582 } | 1542 } |
| 1583 | 1543 |
| 1584 FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1, | 1544 FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1, |
| 1585 const CFX_FloatRect& rect2) { | 1545 const CFX_FloatRect& rect2) { |
| 1586 CFX_FloatRect rect = rect1; | 1546 CFX_FloatRect rect = rect1; |
| 1587 rect.Intersect(rect2); | 1547 rect.Intersect(rect2); |
| 1588 return !rect.IsEmpty(); | 1548 return !rect.IsEmpty(); |
| 1589 } | 1549 } |
| 1590 | |
| 1591 CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage) | |
| 1592 : m_pTextPage(pTextPage), | |
| 1593 m_flags(0), | |
| 1594 m_findNextStart(-1), | |
| 1595 m_findPreStart(-1), | |
| 1596 m_bMatchCase(FALSE), | |
| 1597 m_bMatchWholeWord(FALSE), | |
| 1598 m_resStart(0), | |
| 1599 m_resEnd(-1), | |
| 1600 m_IsFind(FALSE) { | |
| 1601 m_strText = m_pTextPage->GetPageText(); | |
| 1602 int nCount = pTextPage->CountChars(); | |
| 1603 if (nCount) { | |
| 1604 m_CharIndex.push_back(0); | |
| 1605 } | |
| 1606 for (int i = 0; i < nCount; i++) { | |
| 1607 FPDF_CHAR_INFO info; | |
| 1608 pTextPage->GetCharInfo(i, &info); | |
| 1609 int indexSize = pdfium::CollectionSize<int>(m_CharIndex); | |
| 1610 if (info.m_Flag == FPDFTEXT_CHAR_NORMAL || | |
| 1611 info.m_Flag == FPDFTEXT_CHAR_GENERATED) { | |
| 1612 if (indexSize % 2) { | |
| 1613 m_CharIndex.push_back(1); | |
| 1614 } else { | |
| 1615 if (indexSize <= 0) { | |
| 1616 continue; | |
| 1617 } | |
| 1618 m_CharIndex[indexSize - 1] += 1; | |
| 1619 } | |
| 1620 } else { | |
| 1621 if (indexSize % 2) { | |
| 1622 if (indexSize <= 0) { | |
| 1623 continue; | |
| 1624 } | |
| 1625 m_CharIndex[indexSize - 1] = i + 1; | |
| 1626 } else { | |
| 1627 m_CharIndex.push_back(i + 1); | |
| 1628 } | |
| 1629 } | |
| 1630 } | |
| 1631 int indexSize = pdfium::CollectionSize<int>(m_CharIndex); | |
| 1632 if (indexSize % 2) { | |
| 1633 m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1); | |
| 1634 } | |
| 1635 } | |
| 1636 | |
| 1637 CPDF_TextPageFind::~CPDF_TextPageFind() {} | |
| 1638 | |
| 1639 int CPDF_TextPageFind::GetCharIndex(int index) const { | |
| 1640 return m_pTextPage->CharIndexFromTextIndex(index); | |
| 1641 } | |
| 1642 | |
| 1643 FX_BOOL CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat, | |
| 1644 int flags, | |
| 1645 int startPos) { | |
| 1646 if (!m_pTextPage) { | |
| 1647 return FALSE; | |
| 1648 } | |
| 1649 if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) { | |
| 1650 m_strText = m_pTextPage->GetPageText(); | |
| 1651 } | |
| 1652 CFX_WideString findwhatStr = findwhat; | |
| 1653 m_findWhat = findwhatStr; | |
| 1654 m_flags = flags; | |
| 1655 m_bMatchCase = flags & FPDFTEXT_MATCHCASE; | |
| 1656 if (m_strText.IsEmpty()) { | |
| 1657 m_IsFind = FALSE; | |
| 1658 return TRUE; | |
| 1659 } | |
| 1660 FX_STRSIZE len = findwhatStr.GetLength(); | |
| 1661 if (!m_bMatchCase) { | |
| 1662 findwhatStr.MakeLower(); | |
| 1663 m_strText.MakeLower(); | |
| 1664 } | |
| 1665 m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD; | |
| 1666 m_findNextStart = startPos; | |
| 1667 if (startPos == -1) { | |
| 1668 m_findPreStart = m_strText.GetLength() - 1; | |
| 1669 } else { | |
| 1670 m_findPreStart = startPos; | |
| 1671 } | |
| 1672 m_csFindWhatArray.clear(); | |
| 1673 int i = 0; | |
| 1674 while (i < len) { | |
| 1675 if (findwhatStr.GetAt(i) != ' ') { | |
| 1676 break; | |
| 1677 } | |
| 1678 i++; | |
| 1679 } | |
| 1680 if (i < len) { | |
| 1681 ExtractFindWhat(findwhatStr); | |
| 1682 } else { | |
| 1683 m_csFindWhatArray.push_back(findwhatStr); | |
| 1684 } | |
| 1685 if (m_csFindWhatArray.empty()) { | |
| 1686 return FALSE; | |
| 1687 } | |
| 1688 m_IsFind = TRUE; | |
| 1689 m_resStart = 0; | |
| 1690 m_resEnd = -1; | |
| 1691 return TRUE; | |
| 1692 } | |
| 1693 | |
// Searches forward from m_findNextStart for the terms in m_csFindWhatArray
// (prepared by FindFirst()). Consecutive terms must be separated in the page
// text only by space, line feed, carriage return or code 160, except when
// IsIgnoreSpaceCharacter() allows them to be adjacent.
//
// Returns FALSE (and, except for the first two early exits, clears m_IsFind)
// when there is no text page, no forward start position, the page text is
// empty, the start position is past the end of the text, or a term cannot be
// found. On success it stores the match in m_resStart/m_resEnd (text
// indices), fills m_resArray with the match rectangles, advances
// m_findNextStart/m_findPreStart and returns TRUE.
| 1694 FX_BOOL CPDF_TextPageFind::FindNext() { | |
| 1695 if (!m_pTextPage) { | |
| 1696 return FALSE; | |
| 1697 } | |
| 1698 m_resArray.clear(); | |
| 1699 if (m_findNextStart == -1) { | |
| 1700 return FALSE; | |
| 1701 } | |
| 1702 if (m_strText.IsEmpty()) { | |
| 1703 m_IsFind = FALSE; | |
| 1704 return m_IsFind; | |
| 1705 } | |
| 1706 int strLen = m_strText.GetLength(); | |
| 1707 if (m_findNextStart > strLen - 1) { | |
| 1708 m_IsFind = FALSE; | |
| 1709 return m_IsFind; | |
| 1710 } | |
| 1711 int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray); | |
| 1712 int nResultPos = 0; | |
| 1713 int nStartPos = 0; | |
| 1714 nStartPos = m_findNextStart; | |
| 1715 FX_BOOL bSpaceStart = FALSE; | |
// Match the terms in order. Setting iWord to -1 restarts the loop at the
// first term (the loop increment brings it back to 0).
| 1716 for (int iWord = 0; iWord < nCount; iWord++) { | |
| 1717 CFX_WideString csWord = m_csFindWhatArray[iWord]; | |
// An empty term: as the last term it requires a whitespace character at the
// current position; as the first term it only sets bSpaceStart.
| 1718 if (csWord.IsEmpty()) { | |
| 1719 if (iWord == nCount - 1) { | |
| 1720 FX_WCHAR strInsert = m_strText.GetAt(nStartPos); | |
| 1721 if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR || | |
| 1722 strInsert == TEXT_RETURN_CHAR || strInsert == 160) { | |
| 1723 nResultPos = nStartPos + 1; | |
| 1724 break; | |
| 1725 } | |
| 1726 iWord = -1; | |
| 1727 } else if (iWord == 0) { | |
| 1728 bSpaceStart = TRUE; | |
| 1729 } | |
| 1730 continue; | |
| 1731 } | |
| 1732 int endIndex; | |
| 1733 nResultPos = m_strText.Find(csWord.c_str(), nStartPos); | |
| 1734 if (nResultPos == -1) { | |
| 1735 m_IsFind = FALSE; | |
| 1736 return m_IsFind; | |
| 1737 } | |
| 1738 endIndex = nResultPos + csWord.GetLength() - 1; | |
| 1739 if (iWord == 0) { | |
| 1740 m_resStart = nResultPos; | |
| 1741 } | |
| 1742 FX_BOOL bMatch = TRUE; | |
// For a term after the first: reject the candidate if it directly follows
// the previous term (unless IsIgnoreSpaceCharacter() permits that) or if
// anything other than whitespace lies between the two terms.
| 1743 if (iWord != 0 && !bSpaceStart) { | |
| 1744 int PreResEndPos = nStartPos; | |
| 1745 int curChar = csWord.GetAt(0); | |
| 1746 CFX_WideString lastWord = m_csFindWhatArray[iWord - 1]; | |
| 1747 int lastChar = lastWord.GetAt(lastWord.GetLength() - 1); | |
| 1748 if (nStartPos == nResultPos && | |
| 1749 !(IsIgnoreSpaceCharacter(lastChar) || | |
| 1750 IsIgnoreSpaceCharacter(curChar))) { | |
| 1751 bMatch = FALSE; | |
| 1752 } | |
| 1753 for (int d = PreResEndPos; d < nResultPos; d++) { | |
| 1754 FX_WCHAR strInsert = m_strText.GetAt(d); | |
| 1755 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR && | |
| 1756 strInsert != TEXT_RETURN_CHAR && strInsert != 160) { | |
| 1757 bMatch = FALSE; | |
| 1758 break; | |
| 1759 } | |
| 1760 } | |
// When the first term was empty, the character before the hit must be
// whitespace, and the match start is moved back to include it.
| 1761 } else if (bSpaceStart) { | |
| 1762 if (nResultPos > 0) { | |
| 1763 FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1); | |
| 1764 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR && | |
| 1765 strInsert != TEXT_RETURN_CHAR && strInsert != 160) { | |
| 1766 bMatch = FALSE; | |
| 1767 m_resStart = nResultPos; | |
| 1768 } else { | |
| 1769 m_resStart = nResultPos - 1; | |
| 1770 } | |
| 1771 } | |
| 1772 } | |
| 1773 if (m_bMatchWholeWord && bMatch) { | |
| 1774 bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex); | |
| 1775 } | |
| 1776 nStartPos = endIndex + 1; | |
// On a failed candidate, restart from the first term at a position just
// past where the current attempt began.
| 1777 if (!bMatch) { | |
| 1778 iWord = -1; | |
| 1779 if (bSpaceStart) { | |
| 1780 nStartPos = m_resStart + m_csFindWhatArray[1].GetLength(); | |
| 1781 } else { | |
| 1782 nStartPos = m_resStart + m_csFindWhatArray[0].GetLength(); | |
| 1783 } | |
| 1784 } | |
| 1785 } | |
| 1786 m_resEnd = nResultPos + m_csFindWhatArray.back().GetLength() - 1; | |
| 1787 m_IsFind = TRUE; | |
// Convert the text-index range to character indices to fetch the rectangles.
| 1788 int resStart = GetCharIndex(m_resStart); | |
| 1789 int resEnd = GetCharIndex(m_resEnd); | |
| 1790 m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1); | |
// With FPDFTEXT_CONSECUTIVE the next search starts one character after the
// match start; otherwise it starts after the match end.
| 1791 if (m_flags & FPDFTEXT_CONSECUTIVE) { | |
| 1792 m_findNextStart = m_resStart + 1; | |
| 1793 m_findPreStart = m_resEnd - 1; | |
| 1794 } else { | |
| 1795 m_findNextStart = m_resEnd + 1; | |
| 1796 m_findPreStart = m_resStart - 1; | |
| 1797 } | |
| 1798 return m_IsFind; | |
| 1799 } | |
| 1800 | |
| 1801 FX_BOOL CPDF_TextPageFind::FindPrev() { | |
| 1802 if (!m_pTextPage) { | |
| 1803 return FALSE; | |
| 1804 } | |
| 1805 m_resArray.clear(); | |
| 1806 if (m_strText.IsEmpty() || m_findPreStart < 0) { | |
| 1807 m_IsFind = FALSE; | |
| 1808 return m_IsFind; | |
| 1809 } | |
| 1810 CPDF_TextPageFind findEngine(m_pTextPage); | |
| 1811 FX_BOOL ret = findEngine.FindFirst(m_findWhat, m_flags); | |
| 1812 if (!ret) { | |
| 1813 m_IsFind = FALSE; | |
| 1814 return m_IsFind; | |
| 1815 } | |
| 1816 int order = -1, MatchedCount = 0; | |
| 1817 while (ret) { | |
| 1818 ret = findEngine.FindNext(); | |
| 1819 if (ret) { | |
| 1820 int order1 = findEngine.GetCurOrder(); | |
| 1821 int MatchedCount1 = findEngine.GetMatchedCount(); | |
| 1822 if (((order1 + MatchedCount1) - 1) > m_findPreStart) { | |
| 1823 break; | |
| 1824 } | |
| 1825 order = order1; | |
| 1826 MatchedCount = MatchedCount1; | |
| 1827 } | |
| 1828 } | |
| 1829 if (order == -1) { | |
| 1830 m_IsFind = FALSE; | |
| 1831 return m_IsFind; | |
| 1832 } | |
| 1833 m_resStart = m_pTextPage->TextIndexFromCharIndex(order); | |
| 1834 m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1); | |
| 1835 m_IsFind = TRUE; | |
| 1836 m_resArray = m_pTextPage->GetRectArray(order, MatchedCount); | |
| 1837 if (m_flags & FPDFTEXT_CONSECUTIVE) { | |
| 1838 m_findNextStart = m_resStart + 1; | |
| 1839 m_findPreStart = m_resEnd - 1; | |
| 1840 } else { | |
| 1841 m_findNextStart = m_resEnd + 1; | |
| 1842 m_findPreStart = m_resStart - 1; | |
| 1843 } | |
| 1844 return m_IsFind; | |
| 1845 } | |
| 1846 | |
| 1847 void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) { | |
| 1848 if (findwhat.IsEmpty()) { | |
| 1849 return; | |
| 1850 } | |
| 1851 int index = 0; | |
| 1852 while (1) { | |
| 1853 CFX_WideString csWord = TEXT_EMPTY; | |
| 1854 int ret = | |
| 1855 ExtractSubString(csWord, findwhat.c_str(), index, TEXT_SPACE_CHAR); | |
| 1856 if (csWord.IsEmpty()) { | |
| 1857 if (ret) { | |
| 1858 m_csFindWhatArray.push_back(L""); | |
| 1859 index++; | |
| 1860 continue; | |
| 1861 } else { | |
| 1862 break; | |
| 1863 } | |
| 1864 } | |
| 1865 int pos = 0; | |
| 1866 while (pos < csWord.GetLength()) { | |
| 1867 CFX_WideString curStr = csWord.Mid(pos, 1); | |
| 1868 FX_WCHAR curChar = csWord.GetAt(pos); | |
| 1869 if (IsIgnoreSpaceCharacter(curChar)) { | |
| 1870 if (pos > 0 && curChar == 0x2019) { | |
| 1871 pos++; | |
| 1872 continue; | |
| 1873 } | |
| 1874 if (pos > 0) { | |
| 1875 m_csFindWhatArray.push_back(csWord.Mid(0, pos)); | |
| 1876 } | |
| 1877 m_csFindWhatArray.push_back(curStr); | |
| 1878 if (pos == csWord.GetLength() - 1) { | |
| 1879 csWord.clear(); | |
| 1880 break; | |
| 1881 } | |
| 1882 csWord = csWord.Right(csWord.GetLength() - pos - 1); | |
| 1883 pos = 0; | |
| 1884 continue; | |
| 1885 } | |
| 1886 pos++; | |
| 1887 } | |
| 1888 if (!csWord.IsEmpty()) { | |
| 1889 m_csFindWhatArray.push_back(csWord); | |
| 1890 } | |
| 1891 index++; | |
| 1892 } | |
| 1893 } | |
| 1894 | |
| 1895 FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText, | |
| 1896 int startPos, | |
| 1897 int endPos) { | |
| 1898 FX_WCHAR char_left = 0; | |
| 1899 FX_WCHAR char_right = 0; | |
| 1900 int char_count = endPos - startPos + 1; | |
| 1901 if (char_count < 1) { | |
| 1902 return FALSE; | |
| 1903 } | |
| 1904 if (char_count == 1 && csPageText.GetAt(startPos) > 255) { | |
| 1905 return TRUE; | |
| 1906 } | |
| 1907 if (startPos - 1 >= 0) { | |
| 1908 char_left = csPageText.GetAt(startPos - 1); | |
| 1909 } | |
| 1910 if (startPos + char_count < csPageText.GetLength()) { | |
| 1911 char_right = csPageText.GetAt(startPos + char_count); | |
| 1912 } | |
| 1913 if ((char_left > 'A' && char_left < 'a') || | |
| 1914 (char_left > 'a' && char_left < 'z') || | |
| 1915 (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) || | |
| 1916 (char_right > 'A' && char_right < 'a') || | |
| 1917 (char_right > 'a' && char_right < 'z') || | |
| 1918 (char_right > 0xfb00 && char_right < 0xfb06) || | |
| 1919 std::iswdigit(char_right)) { | |
| 1920 return FALSE; | |
| 1921 } | |
| 1922 if (!(('A' > char_left || char_left > 'Z') && | |
| 1923 ('a' > char_left || char_left > 'z') && | |
| 1924 ('A' > char_right || char_right > 'Z') && | |
| 1925 ('a' > char_right || char_right > 'z'))) { | |
| 1926 return FALSE; | |
| 1927 } | |
| 1928 if (char_count > 0) { | |
| 1929 if (csPageText.GetAt(startPos) >= L'0' && | |
| 1930 csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' && | |
| 1931 char_left <= L'9') { | |
| 1932 return FALSE; | |
| 1933 } | |
| 1934 if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' && | |
| 1935 char_right >= L'0' && char_right <= L'9') { | |
| 1936 return FALSE; | |
| 1937 } | |
| 1938 } | |
| 1939 return TRUE; | |
| 1940 } | |
| 1941 | |
| 1942 FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString, | |
| 1943 const FX_WCHAR* lpszFullString, | |
| 1944 int iSubString, | |
| 1945 FX_WCHAR chSep) { | |
| 1946 if (!lpszFullString) { | |
| 1947 return FALSE; | |
| 1948 } | |
| 1949 while (iSubString--) { | |
| 1950 lpszFullString = wcschr(lpszFullString, chSep); | |
| 1951 if (!lpszFullString) { | |
| 1952 rString.clear(); | |
| 1953 return FALSE; | |
| 1954 } | |
| 1955 lpszFullString++; | |
| 1956 while (*lpszFullString == chSep) { | |
| 1957 lpszFullString++; | |
| 1958 } | |
| 1959 } | |
| 1960 const FX_WCHAR* lpchEnd = wcschr(lpszFullString, chSep); | |
| 1961 int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString) | |
| 1962 : (int)FXSYS_wcslen(lpszFullString); | |
| 1963 ASSERT(nLen >= 0); | |
| 1964 FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString, | |
| 1965 nLen * sizeof(FX_WCHAR)); | |
| 1966 rString.ReleaseBuffer(); | |
| 1967 return TRUE; | |
| 1968 } | |
| 1969 | |
| 1970 CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) { | |
| 1971 CFX_WideString str2; | |
| 1972 str2.clear(); | |
| 1973 int nlen = str.GetLength(); | |
| 1974 for (int i = nlen - 1; i >= 0; i--) { | |
| 1975 str2 += str.GetAt(i); | |
| 1976 } | |
| 1977 return str2; | |
| 1978 } | |
| 1979 | |
| 1980 int CPDF_TextPageFind::GetCurOrder() const { | |
| 1981 return GetCharIndex(m_resStart); | |
| 1982 } | |
| 1983 | |
| 1984 int CPDF_TextPageFind::GetMatchedCount() const { | |
| 1985 int resStart = GetCharIndex(m_resStart); | |
| 1986 int resEnd = GetCharIndex(m_resEnd); | |
| 1987 return resEnd - resStart + 1; | |
| 1988 } | |
| 1989 | |
| 1990 CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage) | |
| 1991 : m_pTextPage(pTextPage) {} | |
| 1992 | |
| 1993 CPDF_LinkExtract::~CPDF_LinkExtract() { | |
| 1994 } | |
| 1995 | |
| 1996 void CPDF_LinkExtract::ExtractLinks() { | |
| 1997 m_LinkArray.clear(); | |
| 1998 if (!m_pTextPage->IsParsed()) | |
| 1999 return; | |
| 2000 | |
| 2001 m_strPageText = m_pTextPage->GetPageText(0, -1); | |
| 2002 if (m_strPageText.IsEmpty()) | |
| 2003 return; | |
| 2004 | |
| 2005 ParseLink(); | |
| 2006 } | |
| 2007 | |
| 2008 void CPDF_LinkExtract::ParseLink() { | |
| 2009 int start = 0, pos = 0; | |
| 2010 int TotalChar = m_pTextPage->CountChars(); | |
| 2011 while (pos < TotalChar) { | |
| 2012 FPDF_CHAR_INFO pageChar; | |
| 2013 m_pTextPage->GetCharInfo(pos, &pageChar); | |
| 2014 if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED || | |
| 2015 pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) { | |
| 2016 int nCount = pos - start; | |
| 2017 if (pos == TotalChar - 1) { | |
| 2018 nCount++; | |
| 2019 } | |
| 2020 CFX_WideString strBeCheck; | |
| 2021 strBeCheck = m_pTextPage->GetPageText(start, nCount); | |
| 2022 if (strBeCheck.GetLength() > 5) { | |
| 2023 while (strBeCheck.GetLength() > 0) { | |
| 2024 FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1); | |
| 2025 if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') { | |
| 2026 strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1); | |
| 2027 nCount--; | |
| 2028 } else { | |
| 2029 break; | |
| 2030 } | |
| 2031 } | |
| 2032 if (nCount > 5 && | |
| 2033 (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) { | |
| 2034 m_LinkArray.push_back({start, nCount, strBeCheck}); | |
| 2035 } | |
| 2036 } | |
| 2037 start = ++pos; | |
| 2038 } else { | |
| 2039 pos++; | |
| 2040 } | |
| 2041 } | |
| 2042 } | |
| 2043 | |
| 2044 bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) { | |
| 2045 CFX_WideString str = strBeCheck; | |
| 2046 str.MakeLower(); | |
| 2047 if (str.Find(L"http://www.") != -1) { | |
| 2048 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www.")); | |
| 2049 return true; | |
| 2050 } | |
| 2051 if (str.Find(L"http://") != -1) { | |
| 2052 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://")); | |
| 2053 return true; | |
| 2054 } | |
| 2055 if (str.Find(L"https://www.") != -1) { | |
| 2056 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www.")); | |
| 2057 return true; | |
| 2058 } | |
| 2059 if (str.Find(L"https://") != -1) { | |
| 2060 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://")); | |
| 2061 return true; | |
| 2062 } | |
| 2063 if (str.Find(L"www.") != -1) { | |
| 2064 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www.")); | |
| 2065 strBeCheck = L"http://" + strBeCheck; | |
| 2066 return true; | |
| 2067 } | |
| 2068 return false; | |
| 2069 } | |
| 2070 | |
| 2071 bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { | |
| 2072 int aPos = str.Find(L'@'); | |
| 2073 // Invalid when no '@'. | |
| 2074 if (aPos < 1) | |
| 2075 return false; | |
| 2076 | |
| 2077 // Check the local part. | |
| 2078 int pPos = aPos; // Used to track the position of '@' or '.'. | |
| 2079 for (int i = aPos - 1; i >= 0; i--) { | |
| 2080 FX_WCHAR ch = str.GetAt(i); | |
| 2081 if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) | |
| 2082 continue; | |
| 2083 | |
| 2084 if (ch != L'.' || i == pPos - 1 || i == 0) { | |
| 2085 if (i == aPos - 1) { | |
| 2086 // There is '.' or invalid char before '@'. | |
| 2087 return FALSE; | |
| 2088 } | |
| 2089 // End extracting for other invalid chars, '.' at the beginning, or | |
| 2090 // consecutive '.'. | |
| 2091 int removed_len = i == pPos - 1 ? i + 2 : i + 1; | |
| 2092 str = str.Right(str.GetLength() - removed_len); | |
| 2093 break; | |
| 2094 } | |
| 2095 // Found a valid '.'. | |
| 2096 pPos = i; | |
| 2097 } | |
| 2098 | |
| 2099 // Check the domain name part. | |
| 2100 aPos = str.Find(L'@'); | |
| 2101 if (aPos < 1) | |
| 2102 return false; | |
| 2103 | |
| 2104 str.TrimRight(L'.'); | |
| 2105 // At least one '.' in domain name, but not at the beginning. | |
| 2106 // TODO(weili): RFC5322 allows domain names to be a local name without '.'. | |
| 2107 // Check whether we should remove this check. | |
| 2108 int ePos = str.Find(L'.', aPos + 1); | |
| 2109 if (ePos == -1 || ePos == aPos + 1) | |
| 2110 return false; | |
| 2111 | |
| 2112 // Validate all other chars in domain name. | |
| 2113 int nLen = str.GetLength(); | |
| 2114 pPos = 0; // Used to track the position of '.'. | |
| 2115 for (int i = aPos + 1; i < nLen; i++) { | |
| 2116 FX_WCHAR wch = str.GetAt(i); | |
| 2117 if (wch == L'-' || FXSYS_iswalnum(wch)) | |
| 2118 continue; | |
| 2119 | |
| 2120 if (wch != L'.' || i == pPos + 1) { | |
| 2121 // Domain name should end before invalid char. | |
| 2122 int host_end = i == pPos + 1 ? i - 2 : i - 1; | |
| 2123 if (pPos > 0 && host_end - aPos >= 3) { | |
| 2124 // Trim the ending invalid chars if there is at least one '.' and name. | |
| 2125 str = str.Left(host_end + 1); | |
| 2126 break; | |
| 2127 } | |
| 2128 return false; | |
| 2129 } | |
| 2130 pPos = i; | |
| 2131 } | |
| 2132 | |
| 2133 if (str.Find(L"mailto:") == -1) | |
| 2134 str = L"mailto:" + str; | |
| 2135 | |
| 2136 return true; | |
| 2137 } | |
| 2138 | |
| 2139 CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const { | |
| 2140 return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L""; | |
| 2141 } | |
| 2142 | |
| 2143 std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const { | |
| 2144 if (index >= m_LinkArray.size()) | |
| 2145 return std::vector<CFX_FloatRect>(); | |
| 2146 | |
| 2147 return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start, | |
| 2148 m_LinkArray[index].m_Count); | |
| 2149 } | |
| OLD | NEW |