| OLD | NEW |
| 1 // Copyright 2014 PDFium Authors. All rights reserved. | 1 // Copyright 2014 PDFium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com | 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
| 6 | 6 |
| 7 #include "core/src/fpdftext/text_int.h" | 7 #include "core/src/fpdftext/text_int.h" |
| 8 | 8 |
| 9 #include <algorithm> | 9 #include <algorithm> |
| 10 #include <cctype> | 10 #include <cctype> |
| (...skipping 743 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 754 | 754 |
| 755 int32_t CPDF_TextPage::FindTextlineFlowDirection() { | 755 int32_t CPDF_TextPage::FindTextlineFlowDirection() { |
| 756 const int32_t nPageWidth = static_cast<int32_t>(m_pPage->GetPageWidth()); | 756 const int32_t nPageWidth = static_cast<int32_t>(m_pPage->GetPageWidth()); |
| 757 const int32_t nPageHeight = static_cast<int32_t>(m_pPage->GetPageHeight()); | 757 const int32_t nPageHeight = static_cast<int32_t>(m_pPage->GetPageHeight()); |
| 758 std::vector<uint8_t> nHorizontalMask(nPageWidth); | 758 std::vector<uint8_t> nHorizontalMask(nPageWidth); |
| 759 std::vector<uint8_t> nVerticalMask(nPageHeight); | 759 std::vector<uint8_t> nVerticalMask(nPageHeight); |
| 760 uint8_t* pDataH = nHorizontalMask.data(); | 760 uint8_t* pDataH = nHorizontalMask.data(); |
| 761 uint8_t* pDataV = nVerticalMask.data(); | 761 uint8_t* pDataV = nVerticalMask.data(); |
| 762 int32_t index = 0; | 762 int32_t index = 0; |
| 763 FX_FLOAT fLineHeight = 0.0f; | 763 FX_FLOAT fLineHeight = 0.0f; |
| 764 CPDF_PageObject* pPageObj = NULL; | 764 if (m_pPage->GetPageObjectList()->empty()) |
| 765 FX_POSITION pos = NULL; | |
| 766 pos = m_pPage->GetPageObjectList()->GetHeadPosition(); | |
| 767 if (!pos) { | |
| 768 return -1; | 765 return -1; |
| 769 } | 766 |
| 770 while (pos) { | 767 for (auto& pPageObj : *m_pPage->GetPageObjectList()) { |
| 771 pPageObj = m_pPage->GetPageObjectList()->GetNextObject(pos); | 768 if (!pPageObj || pPageObj->m_Type != CPDF_PageObject::TEXT) |
| 772 if (!pPageObj) { | |
| 773 continue; | 769 continue; |
| 774 } | 770 |
| 775 if (CPDF_PageObject::TEXT != pPageObj->m_Type) { | |
| 776 continue; | |
| 777 } | |
| 778 int32_t minH = | 771 int32_t minH = |
| 779 (int32_t)pPageObj->m_Left < 0 ? 0 : (int32_t)pPageObj->m_Left; | 772 (int32_t)pPageObj->m_Left < 0 ? 0 : (int32_t)pPageObj->m_Left; |
| 780 int32_t maxH = (int32_t)pPageObj->m_Right > nPageWidth | 773 int32_t maxH = (int32_t)pPageObj->m_Right > nPageWidth |
| 781 ? nPageWidth | 774 ? nPageWidth |
| 782 : (int32_t)pPageObj->m_Right; | 775 : (int32_t)pPageObj->m_Right; |
| 783 int32_t minV = | 776 int32_t minV = |
| 784 (int32_t)pPageObj->m_Bottom < 0 ? 0 : (int32_t)pPageObj->m_Bottom; | 777 (int32_t)pPageObj->m_Bottom < 0 ? 0 : (int32_t)pPageObj->m_Bottom; |
| 785 int32_t maxV = (int32_t)pPageObj->m_Top > nPageHeight | 778 int32_t maxV = (int32_t)pPageObj->m_Top > nPageHeight |
| 786 ? nPageHeight | 779 ? nPageHeight |
| 787 : (int32_t)pPageObj->m_Top; | 780 : (int32_t)pPageObj->m_Top; |
| 788 if (minH >= maxH || minV >= maxV) { | 781 if (minH >= maxH || minV >= maxV) |
| 789 continue; | 782 continue; |
| 790 } | 783 |
| 791 FXSYS_memset(pDataH + minH, 1, maxH - minH); | 784 FXSYS_memset(pDataH + minH, 1, maxH - minH); |
| 792 FXSYS_memset(pDataV + minV, 1, maxV - minV); | 785 FXSYS_memset(pDataV + minV, 1, maxV - minV); |
| 793 if (fLineHeight <= 0.0f) { | 786 if (fLineHeight <= 0.0f) |
| 794 fLineHeight = pPageObj->m_Top - pPageObj->m_Bottom; | 787 fLineHeight = pPageObj->m_Top - pPageObj->m_Bottom; |
| 795 } | |
| 796 pPageObj = NULL; | |
| 797 } | 788 } |
| 798 int32_t nStartH = 0; | 789 int32_t nStartH = 0; |
| 799 int32_t nEndH = 0; | 790 int32_t nEndH = 0; |
| 800 FX_FLOAT nSumH = 0.0f; | 791 FX_FLOAT nSumH = 0.0f; |
| 801 for (index = 0; index < nPageWidth; index++) | 792 for (index = 0; index < nPageWidth; index++) { |
| 802 if (1 == nHorizontalMask[index]) { | 793 if (1 == nHorizontalMask[index]) |
| 803 break; | 794 break; |
| 804 } | 795 } |
| 805 nStartH = index; | 796 nStartH = index; |
| 806 for (index = nPageWidth; index > 0; index--) | 797 for (index = nPageWidth; index > 0; index--) { |
| 807 if (1 == nHorizontalMask[index - 1]) { | 798 if (1 == nHorizontalMask[index - 1]) |
| 808 break; | 799 break; |
| 809 } | 800 } |
| 810 nEndH = index; | 801 nEndH = index; |
| 811 for (index = nStartH; index < nEndH; index++) { | 802 for (index = nStartH; index < nEndH; index++) { |
| 812 nSumH += nHorizontalMask[index]; | 803 nSumH += nHorizontalMask[index]; |
| 813 } | 804 } |
| 814 nSumH /= nEndH - nStartH; | 805 nSumH /= nEndH - nStartH; |
| 815 int32_t nStartV = 0; | 806 int32_t nStartV = 0; |
| 816 int32_t nEndV = 0; | 807 int32_t nEndV = 0; |
| 817 FX_FLOAT nSumV = 0.0f; | 808 FX_FLOAT nSumV = 0.0f; |
| 818 for (index = 0; index < nPageHeight; index++) | 809 for (index = 0; index < nPageHeight; index++) { |
| 819 if (1 == nVerticalMask[index]) { | 810 if (1 == nVerticalMask[index]) |
| 820 break; | 811 break; |
| 821 } | 812 } |
| 822 nStartV = index; | 813 nStartV = index; |
| 823 for (index = nPageHeight; index > 0; index--) | 814 for (index = nPageHeight; index > 0; index--) { |
| 824 if (1 == nVerticalMask[index - 1]) { | 815 if (1 == nVerticalMask[index - 1]) |
| 825 break; | 816 break; |
| 826 } | 817 } |
| 827 nEndV = index; | 818 nEndV = index; |
| 828 for (index = nStartV; index < nEndV; index++) { | 819 for (index = nStartV; index < nEndV; index++) { |
| 829 nSumV += nVerticalMask[index]; | 820 nSumV += nVerticalMask[index]; |
| 830 } | 821 } |
| 831 nSumV /= nEndV - nStartV; | 822 nSumV /= nEndV - nStartV; |
| 832 if ((nEndV - nStartV) < (int32_t)(2 * fLineHeight)) { | 823 if ((nEndV - nStartV) < (int32_t)(2 * fLineHeight)) { |
| 833 return 0; | 824 return 0; |
| 834 } | 825 } |
| 835 if ((nEndH - nStartH) < (int32_t)(2 * fLineHeight)) { | 826 if ((nEndH - nStartH) < (int32_t)(2 * fLineHeight)) { |
| 836 return 1; | 827 return 1; |
| 837 } | 828 } |
| 838 if (nSumH > 0.8f) { | 829 if (nSumH > 0.8f) { |
| 839 return 0; | 830 return 0; |
| 840 } | 831 } |
| 841 if (nSumH - nSumV > 0.0f) { | 832 if (nSumH - nSumV > 0.0f) { |
| 842 return 0; | 833 return 0; |
| 843 } | 834 } |
| 844 if (nSumV - nSumH > 0.0f) { | 835 if (nSumV - nSumH > 0.0f) { |
| 845 return 1; | 836 return 1; |
| 846 } | 837 } |
| 847 return -1; | 838 return -1; |
| 848 } | 839 } |
| 849 | 840 |
| 850 void CPDF_TextPage::ProcessObject() { | 841 void CPDF_TextPage::ProcessObject() { |
| 851 FX_POSITION pos = m_pPage->GetPageObjectList()->GetHeadPosition(); | 842 if (m_pPage->GetPageObjectList()->empty()) |
| 852 if (!pos) { | |
| 853 return; | 843 return; |
| 854 } | 844 |
| 855 m_TextlineDir = FindTextlineFlowDirection(); | 845 m_TextlineDir = FindTextlineFlowDirection(); |
| 856 int nCount = 0; | 846 const CPDF_PageObjectList* pObjList = m_pPage->GetPageObjectList(); |
| 857 while (pos) { | 847 for (auto it = pObjList->begin(); it != pObjList->end(); ++it) { |
| 858 CPDF_PageObject* pPageObj = | 848 if (CPDF_PageObject* pObj = it->get()) { |
| 859 m_pPage->GetPageObjectList()->GetNextObject(pos); | 849 if (pObj->m_Type == CPDF_PageObject::TEXT) { |
| 860 if (pPageObj) { | |
| 861 if (pPageObj->m_Type == CPDF_PageObject::TEXT) { | |
| 862 CFX_Matrix matrix; | 850 CFX_Matrix matrix; |
| 863 ProcessTextObject((CPDF_TextObject*)pPageObj, matrix, pos); | 851 ProcessTextObject(static_cast<CPDF_TextObject*>(pObj), matrix, pObjList, |
| 864 nCount++; | 852 it); |
| 865 } else if (pPageObj->m_Type == CPDF_PageObject::FORM) { | 853 } else if (pObj->m_Type == CPDF_PageObject::FORM) { |
| 866 CFX_Matrix formMatrix(1, 0, 0, 1, 0, 0); | 854 CFX_Matrix formMatrix(1, 0, 0, 1, 0, 0); |
| 867 ProcessFormObject((CPDF_FormObject*)pPageObj, formMatrix); | 855 ProcessFormObject(static_cast<CPDF_FormObject*>(pObj), formMatrix); |
| 868 } | 856 } |
| 869 } | 857 } |
| 870 } | 858 } |
| 871 int count = m_LineObj.GetSize(); | 859 for (int i = 0; i < m_LineObj.GetSize(); i++) |
| 872 for (int i = 0; i < count; i++) { | |
| 873 ProcessTextObject(m_LineObj.GetAt(i)); | 860 ProcessTextObject(m_LineObj.GetAt(i)); |
| 874 } | 861 |
| 875 m_LineObj.RemoveAll(); | 862 m_LineObj.RemoveAll(); |
| 876 CloseTempLine(); | 863 CloseTempLine(); |
| 877 } | 864 } |
| 878 | 865 |
| 879 void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj, | 866 void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj, |
| 880 const CFX_Matrix& formMatrix) { | 867 const CFX_Matrix& formMatrix) { |
| 881 CPDF_PageObject* pPageObj = NULL; | 868 CPDF_PageObjectList* pObjectList = pFormObj->m_pForm->GetPageObjectList(); |
| 882 FX_POSITION pos; | 869 if (pObjectList->empty()) |
| 883 if (!pFormObj) { | |
| 884 return; | 870 return; |
| 885 } | 871 |
| 886 pos = pFormObj->m_pForm->GetPageObjectList()->GetHeadPosition(); | |
| 887 if (!pos) { | |
| 888 return; | |
| 889 } | |
| 890 CFX_Matrix curFormMatrix; | 872 CFX_Matrix curFormMatrix; |
| 891 curFormMatrix.Copy(pFormObj->m_FormMatrix); | 873 curFormMatrix.Copy(pFormObj->m_FormMatrix); |
| 892 curFormMatrix.Concat(formMatrix); | 874 curFormMatrix.Concat(formMatrix); |
| 893 while (pos) { | 875 |
| 894 pPageObj = pFormObj->m_pForm->GetPageObjectList()->GetNextObject(pos); | 876 for (auto it = pObjectList->begin(); it != pObjectList->end(); ++it) { |
| 895 if (pPageObj) { | 877 if (CPDF_PageObject* pPageObj = it->get()) { |
| 896 if (pPageObj->m_Type == CPDF_PageObject::TEXT) { | 878 if (pPageObj->m_Type == CPDF_PageObject::TEXT) { |
| 897 ProcessTextObject((CPDF_TextObject*)pPageObj, curFormMatrix, pos); | 879 ProcessTextObject(static_cast<CPDF_TextObject*>(pPageObj), |
| 880 curFormMatrix, pObjectList, it); |
| 898 } else if (pPageObj->m_Type == CPDF_PageObject::FORM) { | 881 } else if (pPageObj->m_Type == CPDF_PageObject::FORM) { |
| 899 ProcessFormObject((CPDF_FormObject*)pPageObj, curFormMatrix); | 882 ProcessFormObject(static_cast<CPDF_FormObject*>(pPageObj), |
| 883 curFormMatrix); |
| 900 } | 884 } |
| 901 } | 885 } |
| 902 pPageObj = NULL; | |
| 903 } | 886 } |
| 904 } | 887 } |
| 905 | 888 |
| 906 int CPDF_TextPage::GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const { | 889 int CPDF_TextPage::GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const { |
| 907 if (charCode == -1) { | 890 if (charCode == -1) { |
| 908 return 0; | 891 return 0; |
| 909 } | 892 } |
| 910 int w = pFont->GetCharWidthF(charCode); | 893 int w = pFont->GetCharWidthF(charCode); |
| 911 if (w == 0) { | 894 if (w == 0) { |
| 912 CFX_ByteString str; | 895 CFX_ByteString str; |
| (...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1025 } else { | 1008 } else { |
| 1026 eCurrentDirection = CFX_BidiChar::LEFT; | 1009 eCurrentDirection = CFX_BidiChar::LEFT; |
| 1027 for (int m = segment.start; m < segment.start + segment.count; m++) | 1010 for (int m = segment.start; m < segment.start + segment.count; m++) |
| 1028 AddCharInfoByLRDirection(bidi.CharAt(m), m_TempCharList[m]); | 1011 AddCharInfoByLRDirection(bidi.CharAt(m), m_TempCharList[m]); |
| 1029 } | 1012 } |
| 1030 } | 1013 } |
| 1031 m_TempCharList.clear(); | 1014 m_TempCharList.clear(); |
| 1032 m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength()); | 1015 m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength()); |
| 1033 } | 1016 } |
| 1034 | 1017 |
| 1035 void CPDF_TextPage::ProcessTextObject(CPDF_TextObject* pTextObj, | 1018 void CPDF_TextPage::ProcessTextObject( |
| 1036 const CFX_Matrix& formMatrix, | 1019 CPDF_TextObject* pTextObj, |
| 1037 FX_POSITION ObjPos) { | 1020 const CFX_Matrix& formMatrix, |
| 1021 const CPDF_PageObjectList* pObjList, |
| 1022 CPDF_PageObjectList::const_iterator ObjPos) { |
| 1038 CFX_FloatRect re(pTextObj->m_Left, pTextObj->m_Bottom, pTextObj->m_Right, | 1023 CFX_FloatRect re(pTextObj->m_Left, pTextObj->m_Bottom, pTextObj->m_Right, |
| 1039 pTextObj->m_Top); | 1024 pTextObj->m_Top); |
| 1040 if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f) { | 1025 if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f) { |
| 1041 return; | 1026 return; |
| 1042 } | 1027 } |
| 1043 int count = m_LineObj.GetSize(); | 1028 int count = m_LineObj.GetSize(); |
| 1044 PDFTEXT_Obj Obj; | 1029 PDFTEXT_Obj Obj; |
| 1045 Obj.m_pTextObj = pTextObj; | 1030 Obj.m_pTextObj = pTextObj; |
| 1046 Obj.m_formMatrix = formMatrix; | 1031 Obj.m_formMatrix = formMatrix; |
| 1047 if (count == 0) { | 1032 if (count == 0) { |
| 1048 m_LineObj.Add(Obj); | 1033 m_LineObj.Add(Obj); |
| 1049 return; | 1034 return; |
| 1050 } | 1035 } |
| 1051 if (IsSameAsPreTextObject(pTextObj, ObjPos)) { | 1036 if (IsSameAsPreTextObject(pTextObj, pObjList, ObjPos)) { |
| 1052 return; | 1037 return; |
| 1053 } | 1038 } |
| 1054 PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(count - 1); | 1039 PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(count - 1); |
| 1055 CPDF_TextObjectItem item; | 1040 CPDF_TextObjectItem item; |
| 1056 int nItem = prev_Obj.m_pTextObj->CountItems(); | 1041 int nItem = prev_Obj.m_pTextObj->CountItems(); |
| 1057 prev_Obj.m_pTextObj->GetItemInfo(nItem - 1, &item); | 1042 prev_Obj.m_pTextObj->GetItemInfo(nItem - 1, &item); |
| 1058 FX_FLOAT prev_width = | 1043 FX_FLOAT prev_width = |
| 1059 GetCharWidth(item.m_CharCode, prev_Obj.m_pTextObj->GetFont()) * | 1044 GetCharWidth(item.m_CharCode, prev_Obj.m_pTextObj->GetFont()) * |
| 1060 prev_Obj.m_pTextObj->GetFontSize() / 1000; | 1045 prev_Obj.m_pTextObj->GetFontSize() / 1000; |
| 1061 CFX_Matrix prev_matrix; | 1046 CFX_Matrix prev_matrix; |
| (...skipping 763 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1825 GetCharWidth(itemPer.m_CharCode, pTextObj2->GetFont()) * | 1810 GetCharWidth(itemPer.m_CharCode, pTextObj2->GetFont()) * |
| 1826 pTextObj2->GetFontSize() / 1000 * 0.9 || | 1811 pTextObj2->GetFontSize() / 1000 * 0.9 || |
| 1827 FXSYS_fabs(pTextObj1->GetPosY() - pTextObj2->GetPosY()) > | 1812 FXSYS_fabs(pTextObj1->GetPosY() - pTextObj2->GetPosY()) > |
| 1828 std::max(std::max(rcPreObj.Height(), rcPreObj.Width()), | 1813 std::max(std::max(rcPreObj.Height(), rcPreObj.Width()), |
| 1829 pTextObj2->GetFontSize()) / | 1814 pTextObj2->GetFontSize()) / |
| 1830 8) { | 1815 8) { |
| 1831 return FALSE; | 1816 return FALSE; |
| 1832 } | 1817 } |
| 1833 return TRUE; | 1818 return TRUE; |
| 1834 } | 1819 } |
| 1835 | 1820 FX_BOOL CPDF_TextPage::IsSameAsPreTextObject( |
| 1836 FX_BOOL CPDF_TextPage::IsSameAsPreTextObject(CPDF_TextObject* pTextObj, | 1821 CPDF_TextObject* pTextObj, |
| 1837 FX_POSITION ObjPos) { | 1822 const CPDF_PageObjectList* pObjList, |
| 1838 if (!pTextObj) { | 1823 CPDF_PageObjectList::const_iterator iter) { |
| 1839 return FALSE; | |
| 1840 } | |
| 1841 int i = 0; | 1824 int i = 0; |
| 1842 if (!ObjPos) { | 1825 while (i < 5 && iter != pObjList->begin()) { |
| 1843 ObjPos = m_pPage->GetPageObjectList()->GetTailPosition(); | 1826 --iter; |
| 1844 } | 1827 CPDF_PageObject* pOtherObj = iter->get(); |
| 1845 CPDF_PageObject* pObj = m_pPage->GetPageObjectList()->GetPrevObject(ObjPos); | 1828 if (pOtherObj == pTextObj || pOtherObj->m_Type != CPDF_PageObject::TEXT) |
| 1846 while (i < 5 && ObjPos) { | |
| 1847 pObj = m_pPage->GetPageObjectList()->GetPrevObject(ObjPos); | |
| 1848 if (pObj == pTextObj) { | |
| 1849 continue; | 1829 continue; |
| 1850 } | 1830 if (IsSameTextObject(static_cast<CPDF_TextObject*>(pOtherObj), pTextObj)) |
| 1851 if (pObj->m_Type != CPDF_PageObject::TEXT) { | |
| 1852 continue; | |
| 1853 } | |
| 1854 if (IsSameTextObject((CPDF_TextObject*)pObj, pTextObj)) { | |
| 1855 return TRUE; | 1831 return TRUE; |
| 1856 } | 1832 ++i; |
| 1857 i++; | |
| 1858 } | 1833 } |
| 1859 return FALSE; | 1834 return FALSE; |
| 1860 } | 1835 } |
| 1861 | 1836 |
| 1862 FX_BOOL CPDF_TextPage::GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info) { | 1837 FX_BOOL CPDF_TextPage::GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info) { |
| 1863 const PAGECHAR_INFO* preChar; | 1838 const PAGECHAR_INFO* preChar; |
| 1864 if (!m_TempCharList.empty()) { | 1839 if (!m_TempCharList.empty()) { |
| 1865 preChar = &m_TempCharList.back(); | 1840 preChar = &m_TempCharList.back(); |
| 1866 } else if (!m_CharList.empty()) { | 1841 } else if (!m_CharList.empty()) { |
| 1867 preChar = &m_CharList.back(); | 1842 preChar = &m_CharList.back(); |
| (...skipping 654 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2522 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) { | 2497 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) { |
| 2523 return; | 2498 return; |
| 2524 } | 2499 } |
| 2525 CPDF_LinkExt* link = NULL; | 2500 CPDF_LinkExt* link = NULL; |
| 2526 link = m_LinkList.GetAt(index); | 2501 link = m_LinkList.GetAt(index); |
| 2527 if (!link) { | 2502 if (!link) { |
| 2528 return; | 2503 return; |
| 2529 } | 2504 } |
| 2530 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects); | 2505 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects); |
| 2531 } | 2506 } |
| OLD | NEW |