OLD | NEW |
1 // Copyright 2014 PDFium Authors. All rights reserved. | 1 // Copyright 2014 PDFium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com | 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
6 | 6 |
7 #include "../../include/fpdfapi/fpdf_resource.h" | 7 #include "../../include/fpdfapi/fpdf_resource.h" |
8 #include "../../include/fpdfapi/fpdf_pageobj.h" | 8 #include "../../include/fpdfapi/fpdf_pageobj.h" |
9 #include "../../include/fpdftext/fpdf_text.h" | 9 #include "../../include/fpdftext/fpdf_text.h" |
10 #include "../../include/fpdfapi/fpdf_page.h" | 10 #include "../../include/fpdfapi/fpdf_page.h" |
11 #include "../../include/fpdfapi/fpdf_module.h" | 11 #include "../../include/fpdfapi/fpdf_module.h" |
12 #include <ctype.h> | 12 #include <ctype.h> |
| 13 #include <algorithm> |
13 #include "text_int.h" | 14 #include "text_int.h" |
| 15 |
| 16 namespace { |
| 17 |
14 FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) | 18 FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) |
15 { | 19 { |
16 if(curChar < 255 ) { | 20 if(curChar < 255 ) { |
17 return FALSE; | 21 return FALSE; |
18 } | 22 } |
19 if ( (curChar >= 0x0600 && curChar <= 0x06FF) | 23 if ( (curChar >= 0x0600 && curChar <= 0x06FF) |
20 || (curChar >= 0xFE70 && curChar <= 0xFEFF) | 24 || (curChar >= 0xFE70 && curChar <= 0xFEFF) |
21 || (curChar >= 0xFB50 && curChar <= 0xFDFF) | 25 || (curChar >= 0xFB50 && curChar <= 0xFDFF) |
22 || (curChar >= 0x0400 && curChar <= 0x04FF) | 26 || (curChar >= 0x0400 && curChar <= 0x04FF) |
23 || (curChar >= 0x0500 && curChar <= 0x052F) | 27 || (curChar >= 0x0500 && curChar <= 0x052F) |
24 || (curChar >= 0xA640 && curChar <= 0xA69F) | 28 || (curChar >= 0xA640 && curChar <= 0xA69F) |
25 || (curChar >= 0x2DE0 && curChar <= 0x2DFF) | 29 || (curChar >= 0x2DE0 && curChar <= 0x2DFF) |
26 || curChar == 8467 | 30 || curChar == 8467 |
27 || (curChar >= 0x2000 && curChar <= 0x206F)) { | 31 || (curChar >= 0x2000 && curChar <= 0x206F)) { |
28 return FALSE; | 32 return FALSE; |
29 } | 33 } |
30 return TRUE; | 34 return TRUE; |
31 } | 35 } |
| 36 |
| 37 FX_FLOAT _NormalizeThreshold(FX_FLOAT threshold) |
| 38 { |
| 39 if (threshold < 300) { |
| 40 return threshold / 2.0; |
| 41 } else if (threshold < 500) { |
| 42 return threshold / 4.0; |
| 43 } else if (threshold < 700) { |
| 44 return threshold / 5.0; |
| 45 } |
| 46 return threshold / 6.0; |
| 47 } |
| 48 |
| 49 FX_FLOAT _CalculateBaseSpace(const CPDF_TextObject* pTextObj, |
| 50 const CFX_AffineMatrix& matrix) |
| 51 { |
| 52 FX_FLOAT baseSpace = 0.0; |
| 53 const int nItems = pTextObj->CountItems(); |
| 54 if (pTextObj->m_TextState.GetObject()->m_CharSpace && nItems >= 3) { |
| 55 FX_BOOL bAllChar = TRUE; |
| 56 FX_FLOAT spacing = matrix.TransformDistance( |
| 57 pTextObj->m_TextState.GetObject()->m_CharSpace); |
| 58 baseSpace = spacing; |
| 59 for (int i = 0; i < nItems; i++) { |
| 60 CPDF_TextObjectItem item; |
| 61 pTextObj->GetItemInfo(i, &item); |
| 62 if (item.m_CharCode == (FX_DWORD) - 1) { |
| 63 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH(); |
| 64 FX_FLOAT kerning = -fontsize_h * item.m_OriginX / 1000; |
| 65 baseSpace = std::min(baseSpace, kerning + spacing); |
| 66 bAllChar = FALSE; |
| 67 } |
| 68 } |
| 69 if (baseSpace < 0.0 || (nItems == 3 && !bAllChar)) { |
| 70 baseSpace = 0.0; |
| 71 } |
| 72 } |
| 73 return baseSpace; |
| 74 } |
| 75 |
| 76 } // namespace |
| 77 |
32 CPDFText_ParseOptions::CPDFText_ParseOptions() | 78 CPDFText_ParseOptions::CPDFText_ParseOptions() |
33 : m_bGetCharCodeOnly(FALSE), m_bNormalizeObjs(TRUE), m_bOutputHyphen(FALSE) | 79 : m_bGetCharCodeOnly(FALSE), m_bNormalizeObjs(TRUE), m_bOutputHyphen(FALSE) |
34 { | 80 { |
35 } | 81 } |
36 IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage, CPDFText_Pa
rseOptions ParserOptions) | 82 IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage, CPDFText_Pa
rseOptions ParserOptions) |
37 { | 83 { |
38 CPDF_TextPage* pTextPageEx = FX_NEW CPDF_TextPage(pPage, ParserOptions); | 84 CPDF_TextPage* pTextPageEx = FX_NEW CPDF_TextPage(pPage, ParserOptions); |
39 return pTextPageEx; | 85 return pTextPageEx; |
40 } | 86 } |
41 IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage, int flags) | 87 IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage, int flags) |
(...skipping 1593 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1635 } | 1681 } |
1636 if (FPDFTEXT_MC_DELAY == bPreMKC) { | 1682 if (FPDFTEXT_MC_DELAY == bPreMKC) { |
1637 ProcessMarkedContent(Obj); | 1683 ProcessMarkedContent(Obj); |
1638 m_pPreTextObj = pTextObj; | 1684 m_pPreTextObj = pTextObj; |
1639 m_perMatrix.Copy(formMatrix); | 1685 m_perMatrix.Copy(formMatrix); |
1640 return; | 1686 return; |
1641 } | 1687 } |
1642 m_pPreTextObj = pTextObj; | 1688 m_pPreTextObj = pTextObj; |
1643 m_perMatrix.Copy(formMatrix); | 1689 m_perMatrix.Copy(formMatrix); |
1644 int nItems = pTextObj->CountItems(); | 1690 int nItems = pTextObj->CountItems(); |
1645 FX_FLOAT spacing = 0; | 1691 FX_FLOAT baseSpace = _CalculateBaseSpace(pTextObj, matrix); |
1646 FX_FLOAT baseSpace = 0.0; | |
1647 FX_BOOL bAllChar = TRUE; | |
1648 if (pTextObj->m_TextState.GetObject()->m_CharSpace && nItems >= 3) { | |
1649 spacing = matrix.TransformDistance(pTextObj->m_TextState.GetObject()->m_
CharSpace); | |
1650 baseSpace = spacing; | |
1651 for (int i = 0; i < nItems; i++) { | |
1652 CPDF_TextObjectItem item; | |
1653 pTextObj->GetItemInfo(i, &item); | |
1654 if (item.m_CharCode == (FX_DWORD) - 1) { | |
1655 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH(); | |
1656 FX_FLOAT kerning = -fontsize_h * item.m_OriginX / 1000; | |
1657 if(kerning + spacing < baseSpace) { | |
1658 baseSpace = kerning + spacing; | |
1659 } | |
1660 bAllChar = FALSE; | |
1661 } | |
1662 } | |
1663 spacing = 0; | |
1664 if(baseSpace < 0.0 || (nItems == 3 && !bAllChar)) { | |
1665 baseSpace = 0.0; | |
1666 } | |
1667 } | |
1668 | 1692 |
1669 FX_BOOL bIsBidiAndMirrosInverse = FALSE; | 1693 FX_BOOL bIsBidiAndMirrosInverse = FALSE; |
1670 IFX_BidiChar* BidiChar = IFX_BidiChar::Create(); | 1694 IFX_BidiChar* BidiChar = IFX_BidiChar::Create(); |
1671 FX_INT32 nR2L = 0; | 1695 FX_INT32 nR2L = 0; |
1672 FX_INT32 nL2R = 0; | 1696 FX_INT32 nL2R = 0; |
1673 FX_INT32 start = 0, count = 0; | 1697 FX_INT32 start = 0, count = 0; |
1674 CPDF_TextObjectItem item; | 1698 CPDF_TextObjectItem item; |
1675 for (FX_INT32 i = 0; i < nItems; i++) { | 1699 for (FX_INT32 i = 0; i < nItems; i++) { |
1676 pTextObj->GetItemInfo(i, &item); | 1700 pTextObj->GetItemInfo(i, &item); |
1677 if (item.m_CharCode == (FX_DWORD)-1) { | 1701 if (item.m_CharCode == (FX_DWORD)-1) { |
(...skipping 27 matching lines...) Expand all Loading... |
1705 } | 1729 } |
1706 } | 1730 } |
1707 FX_BOOL bR2L = FALSE; | 1731 FX_BOOL bR2L = FALSE; |
1708 if (nR2L > 0 && nR2L >= nL2R) { | 1732 if (nR2L > 0 && nR2L >= nL2R) { |
1709 bR2L = TRUE; | 1733 bR2L = TRUE; |
1710 } | 1734 } |
1711 bIsBidiAndMirrosInverse = bR2L && (matrix.a * matrix.d - matrix.b * matrix.c
) < 0; | 1735 bIsBidiAndMirrosInverse = bR2L && (matrix.a * matrix.d - matrix.b * matrix.c
) < 0; |
1712 FX_INT32 iBufStartAppend = m_TempTextBuf.GetLength(); | 1736 FX_INT32 iBufStartAppend = m_TempTextBuf.GetLength(); |
1713 FX_INT32 iCharListStartAppend = m_TempCharList.GetSize(); | 1737 FX_INT32 iCharListStartAppend = m_TempCharList.GetSize(); |
1714 | 1738 |
| 1739 FX_FLOAT spacing = 0; |
1715 for (int i = 0; i < nItems; i++) { | 1740 for (int i = 0; i < nItems; i++) { |
1716 CPDF_TextObjectItem item; | 1741 CPDF_TextObjectItem item; |
1717 PAGECHAR_INFO charinfo; | 1742 PAGECHAR_INFO charinfo; |
1718 charinfo.m_OriginX = 0; | 1743 charinfo.m_OriginX = 0; |
1719 charinfo.m_OriginY = 0; | 1744 charinfo.m_OriginY = 0; |
1720 pTextObj->GetItemInfo(i, &item); | 1745 pTextObj->GetItemInfo(i, &item); |
1721 if (item.m_CharCode == (FX_DWORD) - 1) { | 1746 if (item.m_CharCode == (FX_DWORD) - 1) { |
1722 CFX_WideString str = m_TempTextBuf.GetWideString(); | 1747 CFX_WideString str = m_TempTextBuf.GetWideString(); |
1723 if(str.IsEmpty()) { | 1748 if(str.IsEmpty()) { |
1724 str = m_TextBuf.GetWideString(); | 1749 str = m_TextBuf.GetWideString(); |
(...skipping 22 matching lines...) Expand all Loading... |
1747 } | 1772 } |
1748 if (threshold > fontsize_h / 3) { | 1773 if (threshold > fontsize_h / 3) { |
1749 threshold = 0; | 1774 threshold = 0; |
1750 } else { | 1775 } else { |
1751 threshold /= 2; | 1776 threshold /= 2; |
1752 } | 1777 } |
1753 if (threshold == 0) { | 1778 if (threshold == 0) { |
1754 threshold = fontsize_h; | 1779 threshold = fontsize_h; |
1755 int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont))
; | 1780 int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont))
; |
1756 threshold = this_width > last_width ? (FX_FLOAT)this_width : (FX
_FLOAT)last_width; | 1781 threshold = this_width > last_width ? (FX_FLOAT)this_width : (FX
_FLOAT)last_width; |
1757 int nDivide = 6; | 1782 threshold = _NormalizeThreshold(threshold); |
1758 if (threshold < 300) { | |
1759 nDivide = 2; | |
1760 } else if (threshold < 500) { | |
1761 nDivide = 4; | |
1762 } else if (threshold < 700) { | |
1763 nDivide = 5; | |
1764 } | |
1765 threshold = threshold / nDivide; | |
1766 threshold = fontsize_h * threshold / 1000; | 1783 threshold = fontsize_h * threshold / 1000; |
1767 } | 1784 } |
1768 if (threshold && (spacing && spacing >= threshold) ) { | 1785 if (threshold && (spacing && spacing >= threshold) ) { |
1769 charinfo.m_Unicode = TEXT_BLANK_CHAR; | 1786 charinfo.m_Unicode = TEXT_BLANK_CHAR; |
1770 charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED; | 1787 charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED; |
1771 charinfo.m_pTextObj = pTextObj; | 1788 charinfo.m_pTextObj = pTextObj; |
1772 charinfo.m_Index = m_TextBuf.GetLength(); | 1789 charinfo.m_Index = m_TextBuf.GetLength(); |
1773 m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR); | 1790 m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR); |
1774 charinfo.m_CharCode = -1; | 1791 charinfo.m_CharCode = -1; |
1775 charinfo.m_Matrix.Copy(formMatrix); | 1792 charinfo.m_Matrix.Copy(formMatrix); |
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1818 } | 1835 } |
1819 matrix.TransformRect(charinfo.m_CharBox); | 1836 matrix.TransformRect(charinfo.m_CharBox); |
1820 charinfo.m_Matrix.Copy(matrix); | 1837 charinfo.m_Matrix.Copy(matrix); |
1821 if (wstrItem.IsEmpty()) { | 1838 if (wstrItem.IsEmpty()) { |
1822 charinfo.m_Unicode = 0; | 1839 charinfo.m_Unicode = 0; |
1823 m_TempCharList.Add(charinfo); | 1840 m_TempCharList.Add(charinfo); |
1824 m_TempTextBuf.AppendChar(0xfffe); | 1841 m_TempTextBuf.AppendChar(0xfffe); |
1825 continue; | 1842 continue; |
1826 } else { | 1843 } else { |
1827 int nTotal = wstrItem.GetLength(); | 1844 int nTotal = wstrItem.GetLength(); |
1828 int n = 0; | |
1829 FX_BOOL bDel = FALSE; | 1845 FX_BOOL bDel = FALSE; |
1830 while (n < m_TempCharList.GetSize() && n < 7) { | 1846 const int count = std::min(m_TempCharList.GetSize(), 7); |
1831 n++; | 1847 for (int n = m_TempCharList.GetSize(); |
1832 PAGECHAR_INFO* charinfo1 = (PAGECHAR_INFO*)m_TempCharList.GetAt(
m_TempCharList.GetSize() - n); | 1848 n > m_TempCharList.GetSize() - count; |
| 1849 n--) { |
| 1850 PAGECHAR_INFO* charinfo1 = (PAGECHAR_INFO*)m_TempCharList.GetAt(
n - 1); |
1833 if(charinfo1->m_CharCode == charinfo.m_CharCode && | 1851 if(charinfo1->m_CharCode == charinfo.m_CharCode && |
1834 charinfo1->m_pTextObj->GetFont() == charinfo.m_pTextObj-
>GetFont() && | 1852 charinfo1->m_pTextObj->GetFont() == charinfo.m_pTextObj-
>GetFont() && |
1835 FXSYS_fabs(charinfo1->m_OriginX - charinfo.m_OriginX) <
TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize() && | 1853 FXSYS_fabs(charinfo1->m_OriginX - charinfo.m_OriginX) <
TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize() && |
1836 FXSYS_fabs(charinfo1->m_OriginY - charinfo.m_OriginY) <
TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize() ) { | 1854 FXSYS_fabs(charinfo1->m_OriginY - charinfo.m_OriginY) <
TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize() ) { |
1837 bDel = TRUE; | 1855 bDel = TRUE; |
1838 break; | 1856 break; |
1839 } | 1857 } |
1840 } | 1858 } |
1841 if(!bDel) { | 1859 if(!bDel) { |
1842 for (int nIndex = 0; nIndex < nTotal; nIndex++) { | 1860 for (int nIndex = 0; nIndex < nTotal; nIndex++) { |
(...skipping 12 matching lines...) Expand all Loading... |
1855 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1); | 1873 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1); |
1856 m_TempCharList.Delete(m_TempCharList.GetSize() - 1); | 1874 m_TempCharList.Delete(m_TempCharList.GetSize() - 1); |
1857 } | 1875 } |
1858 } | 1876 } |
1859 } | 1877 } |
1860 } | 1878 } |
1861 if (bIsBidiAndMirrosInverse) { | 1879 if (bIsBidiAndMirrosInverse) { |
1862 FX_INT32 i, j; | 1880 FX_INT32 i, j; |
1863 i = iCharListStartAppend; | 1881 i = iCharListStartAppend; |
1864 j = m_TempCharList.GetSize() - 1; | 1882 j = m_TempCharList.GetSize() - 1; |
1865 PAGECHAR_INFO tempCharInfo; | |
1866 FX_INT32 tempIndex = 0; | |
1867 for (; i < j; i++, j--) { | 1883 for (; i < j; i++, j--) { |
1868 tempCharInfo = m_TempCharList[i]; | 1884 std::swap(m_TempCharList[i], m_TempCharList[j]); |
1869 m_TempCharList[i] = m_TempCharList[j]; | 1885 std::swap(m_TempCharList[i].m_Index, m_TempCharList[j].m_Index); |
1870 m_TempCharList[j] = tempCharInfo; | |
1871 tempIndex = m_TempCharList[i].m_Index; | |
1872 m_TempCharList[i].m_Index = m_TempCharList[j].m_Index; | |
1873 m_TempCharList[j].m_Index = tempIndex; | |
1874 } | 1886 } |
1875 FX_WCHAR * pTempBuffer = m_TempTextBuf.GetBuffer(); | 1887 FX_WCHAR * pTempBuffer = m_TempTextBuf.GetBuffer(); |
1876 i = iBufStartAppend; | 1888 i = iBufStartAppend; |
1877 j = m_TempTextBuf.GetLength() - 1; | 1889 j = m_TempTextBuf.GetLength() - 1; |
1878 FX_WCHAR wTemp; | 1890 FX_WCHAR wTemp; |
1879 for (; i < j; i++, j--) { | 1891 for (; i < j; i++, j--) { |
1880 wTemp = pTempBuffer[i]; | 1892 std::swap(pTempBuffer[i], pTempBuffer[j]); |
1881 pTempBuffer[i] = pTempBuffer[j]; | |
1882 pTempBuffer[j] = wTemp; | |
1883 } | 1893 } |
1884 } | 1894 } |
1885 } | 1895 } |
1886 FX_INT32 CPDF_TextPage::GetTextObjectWritingMode(const CPDF_TextObject* pTextObj
) | 1896 FX_INT32 CPDF_TextPage::GetTextObjectWritingMode(const CPDF_TextObject* pTextObj
) |
1887 { | 1897 { |
1888 FX_INT32 nChars = pTextObj->CountChars(); | 1898 FX_INT32 nChars = pTextObj->CountChars(); |
1889 if (nChars == 1) { | 1899 if (nChars == 1) { |
1890 return m_TextlineDir; | 1900 return m_TextlineDir; |
1891 } | 1901 } |
1892 CPDF_TextObjectItem first, last; | 1902 CPDF_TextObjectItem first, last; |
(...skipping 937 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2830 if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) { | 2840 if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) { |
2831 return; | 2841 return; |
2832 } | 2842 } |
2833 CPDF_LinkExt* link = NULL; | 2843 CPDF_LinkExt* link = NULL; |
2834 link = m_LinkList.GetAt(index); | 2844 link = m_LinkList.GetAt(index); |
2835 if (!link) { | 2845 if (!link) { |
2836 return ; | 2846 return ; |
2837 } | 2847 } |
2838 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects); | 2848 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects); |
2839 } | 2849 } |
OLD | NEW |