Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(14)

Side by Side Diff: core/fpdftext/cpdf_textpage.cpp

Issue 2286723003: Split fpdf_text_int into classes (Closed)
Patch Set: Fix bots Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « core/fpdftext/cpdf_linkextract.cpp ('k') | core/fpdftext/cpdf_textpagefind.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 PDFium Authors. All rights reserved. 1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 6
7 #include "core/fpdftext/include/cpdf_textpage.h"
8
7 #include <algorithm> 9 #include <algorithm>
8 #include <cctype>
9 #include <cwctype>
10 #include <memory>
11 #include <utility> 10 #include <utility>
12 #include <vector> 11 #include <vector>
13 12
14 #include "core/fpdfapi/fpdf_font/include/cpdf_font.h" 13 #include "core/fpdfapi/fpdf_font/include/cpdf_font.h"
15 #include "core/fpdfapi/fpdf_page/include/cpdf_form.h" 14 #include "core/fpdfapi/fpdf_page/include/cpdf_form.h"
16 #include "core/fpdfapi/fpdf_page/include/cpdf_formobject.h" 15 #include "core/fpdfapi/fpdf_page/include/cpdf_formobject.h"
17 #include "core/fpdfapi/fpdf_page/include/cpdf_page.h" 16 #include "core/fpdfapi/fpdf_page/include/cpdf_page.h"
18 #include "core/fpdfapi/fpdf_page/include/cpdf_pageobject.h" 17 #include "core/fpdfapi/fpdf_page/include/cpdf_pageobject.h"
19 #include "core/fpdfapi/fpdf_page/include/cpdf_textobject.h" 18 #include "core/fpdfapi/fpdf_page/include/cpdf_textobject.h"
20 #include "core/fpdfapi/fpdf_parser/include/cpdf_dictionary.h" 19 #include "core/fpdfapi/fpdf_parser/include/cpdf_dictionary.h"
21 #include "core/fpdfapi/fpdf_parser/include/cpdf_string.h" 20 #include "core/fpdfapi/fpdf_parser/include/cpdf_string.h"
22 #include "core/fpdftext/include/cpdf_linkextract.h"
23 #include "core/fpdftext/include/cpdf_textpage.h"
24 #include "core/fpdftext/include/cpdf_textpagefind.h"
25 #include "core/fpdftext/unicodenormalizationdata.h" 21 #include "core/fpdftext/unicodenormalizationdata.h"
26 #include "core/fxcrt/fx_bidi.h" 22 #include "core/fxcrt/fx_bidi.h"
27 #include "core/fxcrt/include/fx_ext.h" 23 #include "core/fxcrt/include/fx_ext.h"
28 #include "core/fxcrt/include/fx_ucd.h" 24 #include "core/fxcrt/include/fx_ucd.h"
29 #include "third_party/base/stl_util.h" 25 #include "third_party/base/stl_util.h"
30 26
// Search option bits. CPDF_TextPageFind::FindFirst() tests MATCHCASE and
// MATCHWHOLEWORD in its |flags| argument; FindNext()/FindPrev() test
// CONSECUTIVE to decide where the following search resumes.
#define FPDFTEXT_MATCHCASE 0x00000001
#define FPDFTEXT_MATCHWHOLEWORD 0x00000002
#define FPDFTEXT_CONSECUTIVE 0x00000004

// Character classification codes. NORMAL and GENERATED are compared against
// FPDF_CHAR_INFO::m_Flag in this file. The negative value is parenthesized so
// the macro expands safely inside any expression.
#define FPDFTEXT_CHAR_ERROR (-1)
#define FPDFTEXT_CHAR_NORMAL 0
#define FPDFTEXT_CHAR_GENERATED 1
#define FPDFTEXT_CHAR_UNUNICODE 2
#define FPDFTEXT_CHAR_HYPHEN 3
#define FPDFTEXT_CHAR_PIECE 4

// Wide-character and wide-string constants used by the text search code.
#define TEXT_SPACE_CHAR L' '
#define TEXT_LINEFEED_CHAR L'\n'
#define TEXT_RETURN_CHAR L'\r'
#define TEXT_EMPTY L""
#define TEXT_SPACE L" "
#define TEXT_RETURN_LINEFEED L"\r\n"
#define TEXT_LINEFEED L"\n"
// NOTE(review): no use of this ratio is visible in this part of the file.
#define TEXT_CHARRATIO_GAPDELTA 0.070
50
51 namespace { 27 namespace {
52 28
53 const FX_FLOAT kDefaultFontSize = 1.0f; 29 const FX_FLOAT kDefaultFontSize = 1.0f;
54 const uint16_t* const g_UnicodeData_Normalization_Maps[5] = { 30 const uint16_t* const g_UnicodeData_Normalization_Maps[5] = {
55 nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2, 31 nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2,
56 g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4}; 32 g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4};
57 33
58 FX_BOOL IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
59 if (curChar < 255)
60 return FALSE;
61 if ((curChar >= 0x0600 && curChar <= 0x06FF) ||
62 (curChar >= 0xFE70 && curChar <= 0xFEFF) ||
63 (curChar >= 0xFB50 && curChar <= 0xFDFF) ||
64 (curChar >= 0x0400 && curChar <= 0x04FF) ||
65 (curChar >= 0x0500 && curChar <= 0x052F) ||
66 (curChar >= 0xA640 && curChar <= 0xA69F) ||
67 (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 ||
68 (curChar >= 0x2000 && curChar <= 0x206F)) {
69 return FALSE;
70 }
71 return TRUE;
72 }
73
74 FX_FLOAT NormalizeThreshold(FX_FLOAT threshold) { 34 FX_FLOAT NormalizeThreshold(FX_FLOAT threshold) {
75 if (threshold < 300) 35 if (threshold < 300)
76 return threshold / 2.0f; 36 return threshold / 2.0f;
77 if (threshold < 500) 37 if (threshold < 500)
78 return threshold / 4.0f; 38 return threshold / 4.0f;
79 if (threshold < 700) 39 if (threshold < 700)
80 return threshold / 5.0f; 40 return threshold / 5.0f;
81 return threshold / 6.0f; 41 return threshold / 6.0f;
82 } 42 }
83 43
(...skipping 1496 matching lines...) Expand 10 before | Expand all | Expand 10 after
1580 info.m_OriginY); 1540 info.m_OriginY);
1581 return TRUE; 1541 return TRUE;
1582 } 1542 }
1583 1543
1584 FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1, 1544 FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1,
1585 const CFX_FloatRect& rect2) { 1545 const CFX_FloatRect& rect2) {
1586 CFX_FloatRect rect = rect1; 1546 CFX_FloatRect rect = rect1;
1587 rect.Intersect(rect2); 1547 rect.Intersect(rect2);
1588 return !rect.IsEmpty(); 1548 return !rect.IsEmpty();
1589 } 1549 }
1590
1591 CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)
1592 : m_pTextPage(pTextPage),
1593 m_flags(0),
1594 m_findNextStart(-1),
1595 m_findPreStart(-1),
1596 m_bMatchCase(FALSE),
1597 m_bMatchWholeWord(FALSE),
1598 m_resStart(0),
1599 m_resEnd(-1),
1600 m_IsFind(FALSE) {
1601 m_strText = m_pTextPage->GetPageText();
1602 int nCount = pTextPage->CountChars();
1603 if (nCount) {
1604 m_CharIndex.push_back(0);
1605 }
1606 for (int i = 0; i < nCount; i++) {
1607 FPDF_CHAR_INFO info;
1608 pTextPage->GetCharInfo(i, &info);
1609 int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
1610 if (info.m_Flag == FPDFTEXT_CHAR_NORMAL ||
1611 info.m_Flag == FPDFTEXT_CHAR_GENERATED) {
1612 if (indexSize % 2) {
1613 m_CharIndex.push_back(1);
1614 } else {
1615 if (indexSize <= 0) {
1616 continue;
1617 }
1618 m_CharIndex[indexSize - 1] += 1;
1619 }
1620 } else {
1621 if (indexSize % 2) {
1622 if (indexSize <= 0) {
1623 continue;
1624 }
1625 m_CharIndex[indexSize - 1] = i + 1;
1626 } else {
1627 m_CharIndex.push_back(i + 1);
1628 }
1629 }
1630 }
1631 int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
1632 if (indexSize % 2) {
1633 m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);
1634 }
1635 }
1636
1637 CPDF_TextPageFind::~CPDF_TextPageFind() {}
1638
1639 int CPDF_TextPageFind::GetCharIndex(int index) const {
1640 return m_pTextPage->CharIndexFromTextIndex(index);
1641 }
1642
1643 FX_BOOL CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat,
1644 int flags,
1645 int startPos) {
1646 if (!m_pTextPage) {
1647 return FALSE;
1648 }
1649 if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) {
1650 m_strText = m_pTextPage->GetPageText();
1651 }
1652 CFX_WideString findwhatStr = findwhat;
1653 m_findWhat = findwhatStr;
1654 m_flags = flags;
1655 m_bMatchCase = flags & FPDFTEXT_MATCHCASE;
1656 if (m_strText.IsEmpty()) {
1657 m_IsFind = FALSE;
1658 return TRUE;
1659 }
1660 FX_STRSIZE len = findwhatStr.GetLength();
1661 if (!m_bMatchCase) {
1662 findwhatStr.MakeLower();
1663 m_strText.MakeLower();
1664 }
1665 m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD;
1666 m_findNextStart = startPos;
1667 if (startPos == -1) {
1668 m_findPreStart = m_strText.GetLength() - 1;
1669 } else {
1670 m_findPreStart = startPos;
1671 }
1672 m_csFindWhatArray.clear();
1673 int i = 0;
1674 while (i < len) {
1675 if (findwhatStr.GetAt(i) != ' ') {
1676 break;
1677 }
1678 i++;
1679 }
1680 if (i < len) {
1681 ExtractFindWhat(findwhatStr);
1682 } else {
1683 m_csFindWhatArray.push_back(findwhatStr);
1684 }
1685 if (m_csFindWhatArray.empty()) {
1686 return FALSE;
1687 }
1688 m_IsFind = TRUE;
1689 m_resStart = 0;
1690 m_resEnd = -1;
1691 return TRUE;
1692 }
1693
1694 FX_BOOL CPDF_TextPageFind::FindNext() {
1695 if (!m_pTextPage) {
1696 return FALSE;
1697 }
1698 m_resArray.clear();
1699 if (m_findNextStart == -1) {
1700 return FALSE;
1701 }
1702 if (m_strText.IsEmpty()) {
1703 m_IsFind = FALSE;
1704 return m_IsFind;
1705 }
1706 int strLen = m_strText.GetLength();
1707 if (m_findNextStart > strLen - 1) {
1708 m_IsFind = FALSE;
1709 return m_IsFind;
1710 }
1711 int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray);
1712 int nResultPos = 0;
1713 int nStartPos = 0;
1714 nStartPos = m_findNextStart;
1715 FX_BOOL bSpaceStart = FALSE;
1716 for (int iWord = 0; iWord < nCount; iWord++) {
1717 CFX_WideString csWord = m_csFindWhatArray[iWord];
1718 if (csWord.IsEmpty()) {
1719 if (iWord == nCount - 1) {
1720 FX_WCHAR strInsert = m_strText.GetAt(nStartPos);
1721 if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR ||
1722 strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
1723 nResultPos = nStartPos + 1;
1724 break;
1725 }
1726 iWord = -1;
1727 } else if (iWord == 0) {
1728 bSpaceStart = TRUE;
1729 }
1730 continue;
1731 }
1732 int endIndex;
1733 nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
1734 if (nResultPos == -1) {
1735 m_IsFind = FALSE;
1736 return m_IsFind;
1737 }
1738 endIndex = nResultPos + csWord.GetLength() - 1;
1739 if (iWord == 0) {
1740 m_resStart = nResultPos;
1741 }
1742 FX_BOOL bMatch = TRUE;
1743 if (iWord != 0 && !bSpaceStart) {
1744 int PreResEndPos = nStartPos;
1745 int curChar = csWord.GetAt(0);
1746 CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];
1747 int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);
1748 if (nStartPos == nResultPos &&
1749 !(IsIgnoreSpaceCharacter(lastChar) ||
1750 IsIgnoreSpaceCharacter(curChar))) {
1751 bMatch = FALSE;
1752 }
1753 for (int d = PreResEndPos; d < nResultPos; d++) {
1754 FX_WCHAR strInsert = m_strText.GetAt(d);
1755 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
1756 strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
1757 bMatch = FALSE;
1758 break;
1759 }
1760 }
1761 } else if (bSpaceStart) {
1762 if (nResultPos > 0) {
1763 FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);
1764 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
1765 strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
1766 bMatch = FALSE;
1767 m_resStart = nResultPos;
1768 } else {
1769 m_resStart = nResultPos - 1;
1770 }
1771 }
1772 }
1773 if (m_bMatchWholeWord && bMatch) {
1774 bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);
1775 }
1776 nStartPos = endIndex + 1;
1777 if (!bMatch) {
1778 iWord = -1;
1779 if (bSpaceStart) {
1780 nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
1781 } else {
1782 nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
1783 }
1784 }
1785 }
1786 m_resEnd = nResultPos + m_csFindWhatArray.back().GetLength() - 1;
1787 m_IsFind = TRUE;
1788 int resStart = GetCharIndex(m_resStart);
1789 int resEnd = GetCharIndex(m_resEnd);
1790 m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1);
1791 if (m_flags & FPDFTEXT_CONSECUTIVE) {
1792 m_findNextStart = m_resStart + 1;
1793 m_findPreStart = m_resEnd - 1;
1794 } else {
1795 m_findNextStart = m_resEnd + 1;
1796 m_findPreStart = m_resStart - 1;
1797 }
1798 return m_IsFind;
1799 }
1800
1801 FX_BOOL CPDF_TextPageFind::FindPrev() {
1802 if (!m_pTextPage) {
1803 return FALSE;
1804 }
1805 m_resArray.clear();
1806 if (m_strText.IsEmpty() || m_findPreStart < 0) {
1807 m_IsFind = FALSE;
1808 return m_IsFind;
1809 }
1810 CPDF_TextPageFind findEngine(m_pTextPage);
1811 FX_BOOL ret = findEngine.FindFirst(m_findWhat, m_flags);
1812 if (!ret) {
1813 m_IsFind = FALSE;
1814 return m_IsFind;
1815 }
1816 int order = -1, MatchedCount = 0;
1817 while (ret) {
1818 ret = findEngine.FindNext();
1819 if (ret) {
1820 int order1 = findEngine.GetCurOrder();
1821 int MatchedCount1 = findEngine.GetMatchedCount();
1822 if (((order1 + MatchedCount1) - 1) > m_findPreStart) {
1823 break;
1824 }
1825 order = order1;
1826 MatchedCount = MatchedCount1;
1827 }
1828 }
1829 if (order == -1) {
1830 m_IsFind = FALSE;
1831 return m_IsFind;
1832 }
1833 m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
1834 m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);
1835 m_IsFind = TRUE;
1836 m_resArray = m_pTextPage->GetRectArray(order, MatchedCount);
1837 if (m_flags & FPDFTEXT_CONSECUTIVE) {
1838 m_findNextStart = m_resStart + 1;
1839 m_findPreStart = m_resEnd - 1;
1840 } else {
1841 m_findNextStart = m_resEnd + 1;
1842 m_findPreStart = m_resStart - 1;
1843 }
1844 return m_IsFind;
1845 }
1846
1847 void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) {
1848 if (findwhat.IsEmpty()) {
1849 return;
1850 }
1851 int index = 0;
1852 while (1) {
1853 CFX_WideString csWord = TEXT_EMPTY;
1854 int ret =
1855 ExtractSubString(csWord, findwhat.c_str(), index, TEXT_SPACE_CHAR);
1856 if (csWord.IsEmpty()) {
1857 if (ret) {
1858 m_csFindWhatArray.push_back(L"");
1859 index++;
1860 continue;
1861 } else {
1862 break;
1863 }
1864 }
1865 int pos = 0;
1866 while (pos < csWord.GetLength()) {
1867 CFX_WideString curStr = csWord.Mid(pos, 1);
1868 FX_WCHAR curChar = csWord.GetAt(pos);
1869 if (IsIgnoreSpaceCharacter(curChar)) {
1870 if (pos > 0 && curChar == 0x2019) {
1871 pos++;
1872 continue;
1873 }
1874 if (pos > 0) {
1875 m_csFindWhatArray.push_back(csWord.Mid(0, pos));
1876 }
1877 m_csFindWhatArray.push_back(curStr);
1878 if (pos == csWord.GetLength() - 1) {
1879 csWord.clear();
1880 break;
1881 }
1882 csWord = csWord.Right(csWord.GetLength() - pos - 1);
1883 pos = 0;
1884 continue;
1885 }
1886 pos++;
1887 }
1888 if (!csWord.IsEmpty()) {
1889 m_csFindWhatArray.push_back(csWord);
1890 }
1891 index++;
1892 }
1893 }
1894
1895 FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText,
1896 int startPos,
1897 int endPos) {
1898 FX_WCHAR char_left = 0;
1899 FX_WCHAR char_right = 0;
1900 int char_count = endPos - startPos + 1;
1901 if (char_count < 1) {
1902 return FALSE;
1903 }
1904 if (char_count == 1 && csPageText.GetAt(startPos) > 255) {
1905 return TRUE;
1906 }
1907 if (startPos - 1 >= 0) {
1908 char_left = csPageText.GetAt(startPos - 1);
1909 }
1910 if (startPos + char_count < csPageText.GetLength()) {
1911 char_right = csPageText.GetAt(startPos + char_count);
1912 }
1913 if ((char_left > 'A' && char_left < 'a') ||
1914 (char_left > 'a' && char_left < 'z') ||
1915 (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) ||
1916 (char_right > 'A' && char_right < 'a') ||
1917 (char_right > 'a' && char_right < 'z') ||
1918 (char_right > 0xfb00 && char_right < 0xfb06) ||
1919 std::iswdigit(char_right)) {
1920 return FALSE;
1921 }
1922 if (!(('A' > char_left || char_left > 'Z') &&
1923 ('a' > char_left || char_left > 'z') &&
1924 ('A' > char_right || char_right > 'Z') &&
1925 ('a' > char_right || char_right > 'z'))) {
1926 return FALSE;
1927 }
1928 if (char_count > 0) {
1929 if (csPageText.GetAt(startPos) >= L'0' &&
1930 csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' &&
1931 char_left <= L'9') {
1932 return FALSE;
1933 }
1934 if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' &&
1935 char_right >= L'0' && char_right <= L'9') {
1936 return FALSE;
1937 }
1938 }
1939 return TRUE;
1940 }
1941
1942 FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString,
1943 const FX_WCHAR* lpszFullString,
1944 int iSubString,
1945 FX_WCHAR chSep) {
1946 if (!lpszFullString) {
1947 return FALSE;
1948 }
1949 while (iSubString--) {
1950 lpszFullString = wcschr(lpszFullString, chSep);
1951 if (!lpszFullString) {
1952 rString.clear();
1953 return FALSE;
1954 }
1955 lpszFullString++;
1956 while (*lpszFullString == chSep) {
1957 lpszFullString++;
1958 }
1959 }
1960 const FX_WCHAR* lpchEnd = wcschr(lpszFullString, chSep);
1961 int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString)
1962 : (int)FXSYS_wcslen(lpszFullString);
1963 ASSERT(nLen >= 0);
1964 FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString,
1965 nLen * sizeof(FX_WCHAR));
1966 rString.ReleaseBuffer();
1967 return TRUE;
1968 }
1969
1970 CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) {
1971 CFX_WideString str2;
1972 str2.clear();
1973 int nlen = str.GetLength();
1974 for (int i = nlen - 1; i >= 0; i--) {
1975 str2 += str.GetAt(i);
1976 }
1977 return str2;
1978 }
1979
1980 int CPDF_TextPageFind::GetCurOrder() const {
1981 return GetCharIndex(m_resStart);
1982 }
1983
1984 int CPDF_TextPageFind::GetMatchedCount() const {
1985 int resStart = GetCharIndex(m_resStart);
1986 int resEnd = GetCharIndex(m_resEnd);
1987 return resEnd - resStart + 1;
1988 }
1989
1990 CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
1991 : m_pTextPage(pTextPage) {}
1992
1993 CPDF_LinkExtract::~CPDF_LinkExtract() {
1994 }
1995
1996 void CPDF_LinkExtract::ExtractLinks() {
1997 m_LinkArray.clear();
1998 if (!m_pTextPage->IsParsed())
1999 return;
2000
2001 m_strPageText = m_pTextPage->GetPageText(0, -1);
2002 if (m_strPageText.IsEmpty())
2003 return;
2004
2005 ParseLink();
2006 }
2007
2008 void CPDF_LinkExtract::ParseLink() {
2009 int start = 0, pos = 0;
2010 int TotalChar = m_pTextPage->CountChars();
2011 while (pos < TotalChar) {
2012 FPDF_CHAR_INFO pageChar;
2013 m_pTextPage->GetCharInfo(pos, &pageChar);
2014 if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED ||
2015 pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) {
2016 int nCount = pos - start;
2017 if (pos == TotalChar - 1) {
2018 nCount++;
2019 }
2020 CFX_WideString strBeCheck;
2021 strBeCheck = m_pTextPage->GetPageText(start, nCount);
2022 if (strBeCheck.GetLength() > 5) {
2023 while (strBeCheck.GetLength() > 0) {
2024 FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
2025 if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
2026 strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);
2027 nCount--;
2028 } else {
2029 break;
2030 }
2031 }
2032 if (nCount > 5 &&
2033 (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {
2034 m_LinkArray.push_back({start, nCount, strBeCheck});
2035 }
2036 }
2037 start = ++pos;
2038 } else {
2039 pos++;
2040 }
2041 }
2042 }
2043
2044 bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
2045 CFX_WideString str = strBeCheck;
2046 str.MakeLower();
2047 if (str.Find(L"http://www.") != -1) {
2048 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www."));
2049 return true;
2050 }
2051 if (str.Find(L"http://") != -1) {
2052 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));
2053 return true;
2054 }
2055 if (str.Find(L"https://www.") != -1) {
2056 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www."));
2057 return true;
2058 }
2059 if (str.Find(L"https://") != -1) {
2060 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));
2061 return true;
2062 }
2063 if (str.Find(L"www.") != -1) {
2064 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
2065 strBeCheck = L"http://" + strBeCheck;
2066 return true;
2067 }
2068 return false;
2069 }
2070
2071 bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
2072 int aPos = str.Find(L'@');
2073 // Invalid when no '@'.
2074 if (aPos < 1)
2075 return false;
2076
2077 // Check the local part.
2078 int pPos = aPos; // Used to track the position of '@' or '.'.
2079 for (int i = aPos - 1; i >= 0; i--) {
2080 FX_WCHAR ch = str.GetAt(i);
2081 if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
2082 continue;
2083
2084 if (ch != L'.' || i == pPos - 1 || i == 0) {
2085 if (i == aPos - 1) {
2086 // There is '.' or invalid char before '@'.
2087 return FALSE;
2088 }
2089 // End extracting for other invalid chars, '.' at the beginning, or
2090 // consecutive '.'.
2091 int removed_len = i == pPos - 1 ? i + 2 : i + 1;
2092 str = str.Right(str.GetLength() - removed_len);
2093 break;
2094 }
2095 // Found a valid '.'.
2096 pPos = i;
2097 }
2098
2099 // Check the domain name part.
2100 aPos = str.Find(L'@');
2101 if (aPos < 1)
2102 return false;
2103
2104 str.TrimRight(L'.');
2105 // At least one '.' in domain name, but not at the beginning.
2106 // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
2107 // Check whether we should remove this check.
2108 int ePos = str.Find(L'.', aPos + 1);
2109 if (ePos == -1 || ePos == aPos + 1)
2110 return false;
2111
2112 // Validate all other chars in domain name.
2113 int nLen = str.GetLength();
2114 pPos = 0; // Used to track the position of '.'.
2115 for (int i = aPos + 1; i < nLen; i++) {
2116 FX_WCHAR wch = str.GetAt(i);
2117 if (wch == L'-' || FXSYS_iswalnum(wch))
2118 continue;
2119
2120 if (wch != L'.' || i == pPos + 1) {
2121 // Domain name should end before invalid char.
2122 int host_end = i == pPos + 1 ? i - 2 : i - 1;
2123 if (pPos > 0 && host_end - aPos >= 3) {
2124 // Trim the ending invalid chars if there is at least one '.' and name.
2125 str = str.Left(host_end + 1);
2126 break;
2127 }
2128 return false;
2129 }
2130 pPos = i;
2131 }
2132
2133 if (str.Find(L"mailto:") == -1)
2134 str = L"mailto:" + str;
2135
2136 return true;
2137 }
2138
2139 CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const {
2140 return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L"";
2141 }
2142
2143 std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
2144 if (index >= m_LinkArray.size())
2145 return std::vector<CFX_FloatRect>();
2146
2147 return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
2148 m_LinkArray[index].m_Count);
2149 }
OLDNEW
« no previous file with comments | « core/fpdftext/cpdf_linkextract.cpp ('k') | core/fpdftext/cpdf_textpagefind.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698