| OLD | NEW |
| 1 // Copyright 2014 PDFium Authors. All rights reserved. | 1 // Copyright 2014 PDFium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com | 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
| 6 | 6 |
| 7 #include "core/src/fpdftext/text_int.h" | 7 #include "core/src/fpdftext/text_int.h" |
| 8 | 8 |
| 9 #include <algorithm> | 9 #include <algorithm> |
| 10 #include <cctype> | 10 #include <cctype> |
| (...skipping 90 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 101 #define TEXT_BLANK_CHAR L' ' | 101 #define TEXT_BLANK_CHAR L' ' |
| 102 #define TEXT_LINEFEED_CHAR L'\n' | 102 #define TEXT_LINEFEED_CHAR L'\n' |
| 103 #define TEXT_RETURN_CHAR L'\r' | 103 #define TEXT_RETURN_CHAR L'\r' |
| 104 #define TEXT_EMPTY L"" | 104 #define TEXT_EMPTY L"" |
| 105 #define TEXT_BLANK L" " | 105 #define TEXT_BLANK L" " |
| 106 #define TEXT_RETURN_LINEFEED L"\r\n" | 106 #define TEXT_RETURN_LINEFEED L"\r\n" |
| 107 #define TEXT_LINEFEED L"\n" | 107 #define TEXT_LINEFEED L"\n" |
| 108 #define TEXT_CHARRATIO_GAPDELTA 0.070 | 108 #define TEXT_CHARRATIO_GAPDELTA 0.070 |
| 109 | 109 |
| 110 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, int flags) | 110 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, int flags) |
| 111 : m_pPage(pPage), | 111 : m_pPageObjectHolder(pPage), |
| 112 m_parserflag(flags), | 112 m_parserflag(flags), |
| 113 m_pPreTextObj(nullptr), | 113 m_pPreTextObj(nullptr), |
| 114 m_bIsParsed(false), | 114 m_bIsParsed(false), |
| 115 m_TextlineDir(-1), | 115 m_TextlineDir(-1), |
| 116 m_CurlineRect(0, 0, 0, 0) { | 116 m_CurlineRect(0, 0, 0, 0) { |
| 117 m_TextBuf.EstimateSize(0, 10240); | 117 m_TextBuf.EstimateSize(0, 10240); |
| 118 pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int)pPage->GetPageWidth(), | 118 pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int)pPage->GetPageWidth(), |
| 119 (int)pPage->GetPageHeight(), 0); | 119 (int)pPage->GetPageHeight(), 0); |
| 120 } | 120 } |
| 121 | 121 |
| 122 bool CPDF_TextPage::IsControlChar(const PAGECHAR_INFO& charInfo) { | 122 bool CPDF_TextPage::IsControlChar(const PAGECHAR_INFO& charInfo) { |
| 123 switch (charInfo.m_Unicode) { | 123 switch (charInfo.m_Unicode) { |
| 124 case 0x2: | 124 case 0x2: |
| 125 case 0x3: | 125 case 0x3: |
| 126 case 0x93: | 126 case 0x93: |
| 127 case 0x94: | 127 case 0x94: |
| 128 case 0x96: | 128 case 0x96: |
| 129 case 0x97: | 129 case 0x97: |
| 130 case 0x98: | 130 case 0x98: |
| 131 case 0xfffe: | 131 case 0xfffe: |
| 132 return charInfo.m_Flag != FPDFTEXT_CHAR_HYPHEN; | 132 return charInfo.m_Flag != FPDFTEXT_CHAR_HYPHEN; |
| 133 default: | 133 default: |
| 134 return false; | 134 return false; |
| 135 } | 135 } |
| 136 } | 136 } |
| 137 | 137 |
| 138 FX_BOOL CPDF_TextPage::ParseTextPage() { | 138 FX_BOOL CPDF_TextPage::ParseTextPage() { |
| 139 m_bIsParsed = false; | 139 m_bIsParsed = false; |
| 140 if (!m_pPage) | 140 if (!m_pPageObjectHolder) |
| 141 return FALSE; | 141 return FALSE; |
| 142 | 142 |
| 143 m_TextBuf.Clear(); | 143 m_TextBuf.Clear(); |
| 144 m_CharList.clear(); | 144 m_CharList.clear(); |
| 145 m_pPreTextObj = NULL; | 145 m_pPreTextObj = NULL; |
| 146 ProcessObject(); | 146 ProcessObject(); |
| 147 m_bIsParsed = true; | 147 m_bIsParsed = true; |
| 148 m_CharIndex.clear(); | 148 m_CharIndex.clear(); |
| 149 int nCount = pdfium::CollectionSize<int>(m_CharList); | 149 int nCount = pdfium::CollectionSize<int>(m_CharList); |
| 150 if (nCount) { | 150 if (nCount) { |
| (...skipping 598 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 749 } else if (direction == FPDFTEXT_RIGHT) { | 749 } else if (direction == FPDFTEXT_RIGHT) { |
| 750 while (++breakPos < pdfium::CollectionSize<int>(m_CharList)) { | 750 while (++breakPos < pdfium::CollectionSize<int>(m_CharList)) { |
| 751 if (!IsLetter(m_CharList[breakPos].m_Unicode)) | 751 if (!IsLetter(m_CharList[breakPos].m_Unicode)) |
| 752 break; | 752 break; |
| 753 } | 753 } |
| 754 } | 754 } |
| 755 return breakPos; | 755 return breakPos; |
| 756 } | 756 } |
| 757 | 757 |
| 758 int32_t CPDF_TextPage::FindTextlineFlowDirection() { | 758 int32_t CPDF_TextPage::FindTextlineFlowDirection() { |
| 759 if (!m_pPage) { | 759 if (!m_pPageObjectHolder) { |
| 760 return -1; | 760 return -1; |
| 761 } | 761 } |
| 762 const int32_t nPageWidth = (int32_t)((CPDF_Page*)m_pPage)->GetPageWidth(); | 762 const int32_t nPageWidth = static_cast<int32_t>( |
| 763 const int32_t nPageHeight = (int32_t)((CPDF_Page*)m_pPage)->GetPageHeight(); | 763 static_cast<const CPDF_Page*>(m_pPageObjectHolder)->GetPageWidth()); |
| 764 const int32_t nPageHeight = static_cast<int32_t>( |
| 765 static_cast<const CPDF_Page*>(m_pPageObjectHolder)->GetPageHeight()); |
| 764 std::vector<uint8_t> nHorizontalMask(nPageWidth); | 766 std::vector<uint8_t> nHorizontalMask(nPageWidth); |
| 765 std::vector<uint8_t> nVerticalMask(nPageHeight); | 767 std::vector<uint8_t> nVerticalMask(nPageHeight); |
| 766 uint8_t* pDataH = nHorizontalMask.data(); | 768 uint8_t* pDataH = nHorizontalMask.data(); |
| 767 uint8_t* pDataV = nVerticalMask.data(); | 769 uint8_t* pDataV = nVerticalMask.data(); |
| 768 int32_t index = 0; | 770 int32_t index = 0; |
| 769 FX_FLOAT fLineHeight = 0.0f; | 771 FX_FLOAT fLineHeight = 0.0f; |
| 770 CPDF_PageObject* pPageObj = NULL; | 772 CPDF_PageObject* pPageObj = NULL; |
| 771 FX_POSITION pos = NULL; | 773 FX_POSITION pos = NULL; |
| 772 pos = m_pPage->GetFirstObjectPosition(); | 774 pos = m_pPageObjectHolder->GetPageObjectList()->GetHeadPosition(); |
| 773 if (!pos) { | 775 if (!pos) { |
| 774 return -1; | 776 return -1; |
| 775 } | 777 } |
| 776 while (pos) { | 778 while (pos) { |
| 777 pPageObj = m_pPage->GetNextObject(pos); | 779 pPageObj = m_pPageObjectHolder->GetPageObjectList()->GetNextObject(pos); |
| 778 if (NULL == pPageObj) { | 780 if (NULL == pPageObj) { |
| 779 continue; | 781 continue; |
| 780 } | 782 } |
| 781 if (CPDF_PageObject::TEXT != pPageObj->m_Type) { | 783 if (CPDF_PageObject::TEXT != pPageObj->m_Type) { |
| 782 continue; | 784 continue; |
| 783 } | 785 } |
| 784 int32_t minH = | 786 int32_t minH = |
| 785 (int32_t)pPageObj->m_Left < 0 ? 0 : (int32_t)pPageObj->m_Left; | 787 (int32_t)pPageObj->m_Left < 0 ? 0 : (int32_t)pPageObj->m_Left; |
| 786 int32_t maxH = (int32_t)pPageObj->m_Right > nPageWidth | 788 int32_t maxH = (int32_t)pPageObj->m_Right > nPageWidth |
| 787 ? nPageWidth | 789 ? nPageWidth |
| (...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 848 return 0; | 850 return 0; |
| 849 } | 851 } |
| 850 if (nSumV - nSumH > 0.0f) { | 852 if (nSumV - nSumH > 0.0f) { |
| 851 return 1; | 853 return 1; |
| 852 } | 854 } |
| 853 return -1; | 855 return -1; |
| 854 } | 856 } |
| 855 | 857 |
| 856 void CPDF_TextPage::ProcessObject() { | 858 void CPDF_TextPage::ProcessObject() { |
| 857 CPDF_PageObject* pPageObj = NULL; | 859 CPDF_PageObject* pPageObj = NULL; |
| 858 if (!m_pPage) { | 860 if (!m_pPageObjectHolder) { |
| 859 return; | 861 return; |
| 860 } | 862 } |
| 861 FX_POSITION pos; | 863 FX_POSITION pos; |
| 862 pos = m_pPage->GetFirstObjectPosition(); | 864 pos = m_pPageObjectHolder->GetPageObjectList()->GetHeadPosition(); |
| 863 if (!pos) { | 865 if (!pos) { |
| 864 return; | 866 return; |
| 865 } | 867 } |
| 866 m_TextlineDir = FindTextlineFlowDirection(); | 868 m_TextlineDir = FindTextlineFlowDirection(); |
| 867 int nCount = 0; | 869 int nCount = 0; |
| 868 while (pos) { | 870 while (pos) { |
| 869 pPageObj = m_pPage->GetNextObject(pos); | 871 pPageObj = m_pPageObjectHolder->GetPageObjectList()->GetNextObject(pos); |
| 870 if (pPageObj) { | 872 if (pPageObj) { |
| 871 if (pPageObj->m_Type == CPDF_PageObject::TEXT) { | 873 if (pPageObj->m_Type == CPDF_PageObject::TEXT) { |
| 872 CFX_Matrix matrix; | 874 CFX_Matrix matrix; |
| 873 ProcessTextObject((CPDF_TextObject*)pPageObj, matrix, pos); | 875 ProcessTextObject((CPDF_TextObject*)pPageObj, matrix, pos); |
| 874 nCount++; | 876 nCount++; |
| 875 } else if (pPageObj->m_Type == CPDF_PageObject::FORM) { | 877 } else if (pPageObj->m_Type == CPDF_PageObject::FORM) { |
| 876 CFX_Matrix formMatrix(1, 0, 0, 1, 0, 0); | 878 CFX_Matrix formMatrix(1, 0, 0, 1, 0, 0); |
| 877 ProcessFormObject((CPDF_FormObject*)pPageObj, formMatrix); | 879 ProcessFormObject((CPDF_FormObject*)pPageObj, formMatrix); |
| 878 } | 880 } |
| 879 } | 881 } |
| 880 pPageObj = NULL; | 882 pPageObj = NULL; |
| 881 } | 883 } |
| 882 int count = m_LineObj.GetSize(); | 884 int count = m_LineObj.GetSize(); |
| 883 for (int i = 0; i < count; i++) { | 885 for (int i = 0; i < count; i++) { |
| 884 ProcessTextObject(m_LineObj.GetAt(i)); | 886 ProcessTextObject(m_LineObj.GetAt(i)); |
| 885 } | 887 } |
| 886 m_LineObj.RemoveAll(); | 888 m_LineObj.RemoveAll(); |
| 887 CloseTempLine(); | 889 CloseTempLine(); |
| 888 } | 890 } |
| 889 | 891 |
| 890 void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj, | 892 void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj, |
| 891 const CFX_Matrix& formMatrix) { | 893 const CFX_Matrix& formMatrix) { |
| 892 CPDF_PageObject* pPageObj = NULL; | 894 CPDF_PageObject* pPageObj = NULL; |
| 893 FX_POSITION pos; | 895 FX_POSITION pos; |
| 894 if (!pFormObj) { | 896 if (!pFormObj) { |
| 895 return; | 897 return; |
| 896 } | 898 } |
| 897 pos = pFormObj->m_pForm->GetFirstObjectPosition(); | 899 pos = pFormObj->m_pForm->GetPageObjectList()->GetHeadPosition(); |
| 898 if (!pos) { | 900 if (!pos) { |
| 899 return; | 901 return; |
| 900 } | 902 } |
| 901 CFX_Matrix curFormMatrix; | 903 CFX_Matrix curFormMatrix; |
| 902 curFormMatrix.Copy(pFormObj->m_FormMatrix); | 904 curFormMatrix.Copy(pFormObj->m_FormMatrix); |
| 903 curFormMatrix.Concat(formMatrix); | 905 curFormMatrix.Concat(formMatrix); |
| 904 while (pos) { | 906 while (pos) { |
| 905 pPageObj = pFormObj->m_pForm->GetNextObject(pos); | 907 pPageObj = pFormObj->m_pForm->GetPageObjectList()->GetNextObject(pos); |
| 906 if (pPageObj) { | 908 if (pPageObj) { |
| 907 if (pPageObj->m_Type == CPDF_PageObject::TEXT) { | 909 if (pPageObj->m_Type == CPDF_PageObject::TEXT) { |
| 908 ProcessTextObject((CPDF_TextObject*)pPageObj, curFormMatrix, pos); | 910 ProcessTextObject((CPDF_TextObject*)pPageObj, curFormMatrix, pos); |
| 909 } else if (pPageObj->m_Type == CPDF_PageObject::FORM) { | 911 } else if (pPageObj->m_Type == CPDF_PageObject::FORM) { |
| 910 ProcessFormObject((CPDF_FormObject*)pPageObj, curFormMatrix); | 912 ProcessFormObject((CPDF_FormObject*)pPageObj, curFormMatrix); |
| 911 } | 913 } |
| 912 } | 914 } |
| 913 pPageObj = NULL; | 915 pPageObj = NULL; |
| 914 } | 916 } |
| 915 } | 917 } |
| (...skipping 928 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1844 return TRUE; | 1846 return TRUE; |
| 1845 } | 1847 } |
| 1846 | 1848 |
| 1847 FX_BOOL CPDF_TextPage::IsSameAsPreTextObject(CPDF_TextObject* pTextObj, | 1849 FX_BOOL CPDF_TextPage::IsSameAsPreTextObject(CPDF_TextObject* pTextObj, |
| 1848 FX_POSITION ObjPos) { | 1850 FX_POSITION ObjPos) { |
| 1849 if (!pTextObj) { | 1851 if (!pTextObj) { |
| 1850 return FALSE; | 1852 return FALSE; |
| 1851 } | 1853 } |
| 1852 int i = 0; | 1854 int i = 0; |
| 1853 if (!ObjPos) { | 1855 if (!ObjPos) { |
| 1854 ObjPos = m_pPage->GetLastObjectPosition(); | 1856 ObjPos = m_pPageObjectHolder->GetPageObjectList()->GetTailPosition(); |
| 1855 } | 1857 } |
| 1856 CPDF_PageObject* pObj = m_pPage->GetPrevObject(ObjPos); | 1858 CPDF_PageObject* pObj = |
| 1859 m_pPageObjectHolder->GetPageObjectList()->GetPrevObject(ObjPos); |
| 1857 while (i < 5 && ObjPos) { | 1860 while (i < 5 && ObjPos) { |
| 1858 pObj = m_pPage->GetPrevObject(ObjPos); | 1861 pObj = m_pPageObjectHolder->GetPageObjectList()->GetPrevObject(ObjPos); |
| 1859 if (pObj == pTextObj) { | 1862 if (pObj == pTextObj) { |
| 1860 continue; | 1863 continue; |
| 1861 } | 1864 } |
| 1862 if (pObj->m_Type != CPDF_PageObject::TEXT) { | 1865 if (pObj->m_Type != CPDF_PageObject::TEXT) { |
| 1863 continue; | 1866 continue; |
| 1864 } | 1867 } |
| 1865 if (IsSameTextObject((CPDF_TextObject*)pObj, pTextObj)) { | 1868 if (IsSameTextObject((CPDF_TextObject*)pObj, pTextObj)) { |
| 1866 return TRUE; | 1869 return TRUE; |
| 1867 } | 1870 } |
| 1868 i++; | 1871 i++; |
| (...skipping 666 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2535 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) { | 2538 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) { |
| 2536 return; | 2539 return; |
| 2537 } | 2540 } |
| 2538 CPDF_LinkExt* link = NULL; | 2541 CPDF_LinkExt* link = NULL; |
| 2539 link = m_LinkList.GetAt(index); | 2542 link = m_LinkList.GetAt(index); |
| 2540 if (!link) { | 2543 if (!link) { |
| 2541 return; | 2544 return; |
| 2542 } | 2545 } |
| 2543 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects); | 2546 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects); |
| 2544 } | 2547 } |
| OLD | NEW |