OLD | NEW |
---|---|
1 // Copyright 2014 PDFium Authors. All rights reserved. | 1 // Copyright 2014 PDFium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com | 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
6 | 6 |
7 #include "core/include/fpdfapi/fpdf_parser.h" | 7 #include "core/include/fpdfapi/fpdf_parser.h" |
8 | 8 |
9 #include <algorithm> | 9 #include <algorithm> |
10 #include <memory> | 10 #include <memory> |
(...skipping 18 matching lines...) Expand all Loading... | |
29 // A limit on the maximum object number in the xref table. Theoretical limits | 29 // A limit on the maximum object number in the xref table. Theoretical limits |
30 // are higher, but this may be large enough in practice. | 30 // are higher, but this may be large enough in practice. |
31 const FX_DWORD kMaxObjectNumber = 1048576; | 31 const FX_DWORD kMaxObjectNumber = 1048576; |
32 | 32 |
33 struct SearchTagRecord { | 33 struct SearchTagRecord { |
34 const char* m_pTag; | 34 const char* m_pTag; |
35 FX_DWORD m_Len; | 35 FX_DWORD m_Len; |
36 FX_DWORD m_Offset; | 36 FX_DWORD m_Offset; |
37 }; | 37 }; |
38 | 38 |
39 enum class ParserState { | |
Tom Sepez
2016/02/18 19:27:55
Can we make ParserState an enum nested privately i
dsinclair
2016/02/18 19:47:04
Done.
Is there a benefit over having it in the an
Tom Sepez
2016/02/18 19:49:16
Yes, it tells the reader that this state is associ
dsinclair
2016/02/18 19:51:03
Ah, right. Too spoiled by one class per file. Wasn
| |
40 kDefault, | |
41 kComment, | |
Tom Sepez
2016/02/18 19:27:55
Can we keep these in the same order as before so t
dsinclair
2016/02/18 19:47:04
I checked through the code, all uses of status are
Tom Sepez
2016/02/18 19:49:16
Acknowledged.
| |
42 kWhitespace, | |
43 | |
Tom Sepez
2016/02/18 19:27:55
nit: not sure we need blank lines here.
dsinclair
2016/02/18 19:47:04
Done. Was an attempt at grouping and readability.
| |
44 kString, | |
45 kHexString, | |
46 kEscapedString, | |
47 | |
48 kXref, | |
49 kObjNum, | |
50 kPostObjNum, | |
51 kGenNum, | |
52 kPostGenNum, | |
53 | |
54 kTrailer, | |
55 | |
56 kBeginObj, | |
57 kEndObj | |
58 }; | |
59 | |
39 int32_t GetHeaderOffset(IFX_FileRead* pFile) { | 60 int32_t GetHeaderOffset(IFX_FileRead* pFile) { |
40 const FX_DWORD tag = FXDWORD_FROM_LSBFIRST(0x46445025); | 61 const FX_DWORD tag = FXDWORD_FROM_LSBFIRST(0x46445025); |
41 const size_t kBufSize = 4; | 62 const size_t kBufSize = 4; |
42 uint8_t buf[kBufSize]; | 63 uint8_t buf[kBufSize]; |
43 int32_t offset = 0; | 64 int32_t offset = 0; |
44 while (offset <= 1024) { | 65 while (offset <= 1024) { |
45 if (!pFile->ReadBlock(buf, offset, kBufSize)) | 66 if (!pFile->ReadBlock(buf, offset, kBufSize)) |
46 return -1; | 67 return -1; |
47 | 68 |
48 if (*(FX_DWORD*)buf == tag) | 69 if (*(FX_DWORD*)buf == tag) |
(...skipping 526 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
575 return TRUE; | 596 return TRUE; |
576 } | 597 } |
577 | 598 |
578 FX_BOOL CPDF_Parser::RebuildCrossRef() { | 599 FX_BOOL CPDF_Parser::RebuildCrossRef() { |
579 m_ObjectInfo.clear(); | 600 m_ObjectInfo.clear(); |
580 m_SortedOffset.clear(); | 601 m_SortedOffset.clear(); |
581 if (m_pTrailer) { | 602 if (m_pTrailer) { |
582 m_pTrailer->Release(); | 603 m_pTrailer->Release(); |
583 m_pTrailer = NULL; | 604 m_pTrailer = NULL; |
584 } | 605 } |
585 int32_t status = 0; | 606 |
607 enum ParserState status = ParserState::kDefault; | |
Tom Sepez
2016/02/18 19:27:55
nit: is |enum| needed? Also, can we rename this |sta…
dsinclair
2016/02/18 19:47:04
Done.
| |
608 | |
586 int32_t inside_index = 0; | 609 int32_t inside_index = 0; |
587 FX_DWORD objnum = 0; | 610 FX_DWORD objnum = 0; |
588 FX_DWORD gennum = 0; | 611 FX_DWORD gennum = 0; |
589 int32_t depth = 0; | 612 int32_t depth = 0; |
590 const FX_DWORD kBufferSize = 4096; | 613 const FX_DWORD kBufferSize = 4096; |
591 std::vector<uint8_t> buffer(kBufferSize); | 614 std::vector<uint8_t> buffer(kBufferSize); |
592 FX_FILESIZE pos = m_Syntax.m_HeaderOffset; | 615 FX_FILESIZE pos = m_Syntax.m_HeaderOffset; |
593 FX_FILESIZE start_pos = 0; | 616 FX_FILESIZE start_pos = 0; |
594 FX_FILESIZE start_pos1 = 0; | 617 FX_FILESIZE start_pos1 = 0; |
595 FX_FILESIZE last_obj = -1; | 618 FX_FILESIZE last_obj = -1; |
596 FX_FILESIZE last_xref = -1; | 619 FX_FILESIZE last_xref = -1; |
597 FX_FILESIZE last_trailer = -1; | 620 FX_FILESIZE last_trailer = -1; |
598 while (pos < m_Syntax.m_FileLen) { | 621 while (pos < m_Syntax.m_FileLen) { |
599 const FX_FILESIZE saved_pos = pos; | 622 const FX_FILESIZE saved_pos = pos; |
600 bool bOverFlow = false; | 623 bool bOverFlow = false; |
601 FX_DWORD size = std::min((FX_DWORD)(m_Syntax.m_FileLen - pos), kBufferSize); | 624 FX_DWORD size = std::min((FX_DWORD)(m_Syntax.m_FileLen - pos), kBufferSize); |
602 if (!m_Syntax.m_pFileAccess->ReadBlock(buffer.data(), pos, size)) | 625 if (!m_Syntax.m_pFileAccess->ReadBlock(buffer.data(), pos, size)) |
603 break; | 626 break; |
604 | 627 |
605 for (FX_DWORD i = 0; i < size; i++) { | 628 for (FX_DWORD i = 0; i < size; i++) { |
606 uint8_t byte = buffer[i]; | 629 uint8_t byte = buffer[i]; |
607 switch (status) { | 630 switch (status) { |
608 case 0: | 631 case ParserState::kDefault: |
609 if (PDFCharIsWhitespace(byte)) | 632 if (PDFCharIsWhitespace(byte)) |
610 status = 1; | 633 status = ParserState::kWhitespace; |
611 | 634 |
612 if (std::isdigit(byte)) { | 635 if (std::isdigit(byte)) { |
613 --i; | 636 --i; |
614 status = 1; | 637 status = ParserState::kWhitespace; |
615 } | 638 } |
616 | 639 |
617 if (byte == '%') { | 640 if (byte == '%') { |
618 inside_index = 0; | 641 inside_index = 0; |
619 status = 9; | 642 status = ParserState::kComment; |
620 } | 643 } |
621 | 644 |
622 if (byte == '(') { | 645 if (byte == '(') { |
623 status = 10; | 646 status = ParserState::kString; |
624 depth = 1; | 647 depth = 1; |
625 } | 648 } |
626 | 649 |
627 if (byte == '<') { | 650 if (byte == '<') { |
628 inside_index = 1; | 651 inside_index = 1; |
629 status = 11; | 652 status = ParserState::kHexString; |
630 } | 653 } |
631 | 654 |
632 if (byte == '\\') | 655 if (byte == '\\') |
633 status = 13; | 656 status = ParserState::kEscapedString; |
634 | 657 |
635 if (byte == 't') { | 658 if (byte == 't') { |
636 status = 7; | 659 status = ParserState::kTrailer; |
637 inside_index = 1; | 660 inside_index = 1; |
638 } | 661 } |
639 break; | 662 break; |
640 case 1: | 663 |
664 case ParserState::kWhitespace: | |
641 if (PDFCharIsWhitespace(byte)) { | 665 if (PDFCharIsWhitespace(byte)) { |
642 break; | 666 break; |
643 } else if (std::isdigit(byte)) { | 667 } else if (std::isdigit(byte)) { |
644 start_pos = pos + i; | 668 start_pos = pos + i; |
645 status = 2; | 669 status = ParserState::kObjNum; |
646 objnum = FXSYS_toDecimalDigit(byte); | 670 objnum = FXSYS_toDecimalDigit(byte); |
671 | |
647 } else if (byte == 't') { | 672 } else if (byte == 't') { |
648 status = 7; | 673 status = ParserState::kTrailer; |
649 inside_index = 1; | 674 inside_index = 1; |
675 | |
650 } else if (byte == 'x') { | 676 } else if (byte == 'x') { |
651 status = 8; | 677 status = ParserState::kXref; |
652 inside_index = 1; | 678 inside_index = 1; |
679 | |
653 } else { | 680 } else { |
654 --i; | 681 --i; |
655 status = 0; | 682 status = ParserState::kDefault; |
656 } | 683 } |
657 break; | 684 break; |
658 case 2: | 685 |
686 case ParserState::kObjNum: | |
659 if (std::isdigit(byte)) { | 687 if (std::isdigit(byte)) { |
660 objnum = objnum * 10 + FXSYS_toDecimalDigit(byte); | 688 objnum = objnum * 10 + FXSYS_toDecimalDigit(byte); |
661 break; | 689 break; |
662 } else if (PDFCharIsWhitespace(byte)) { | 690 } else if (PDFCharIsWhitespace(byte)) { |
663 status = 3; | 691 status = ParserState::kPostObjNum; |
664 } else { | 692 } else { |
665 --i; | 693 --i; |
666 status = 14; | 694 status = ParserState::kEndObj; |
667 inside_index = 0; | 695 inside_index = 0; |
668 } | 696 } |
669 break; | 697 break; |
670 case 3: | 698 |
699 case ParserState::kPostObjNum: | |
671 if (std::isdigit(byte)) { | 700 if (std::isdigit(byte)) { |
672 start_pos1 = pos + i; | 701 start_pos1 = pos + i; |
673 status = 4; | 702 status = ParserState::kGenNum; |
674 gennum = FXSYS_toDecimalDigit(byte); | 703 gennum = FXSYS_toDecimalDigit(byte); |
675 } else if (PDFCharIsWhitespace(byte)) { | 704 } else if (PDFCharIsWhitespace(byte)) { |
676 break; | 705 break; |
677 } else if (byte == 't') { | 706 } else if (byte == 't') { |
678 status = 7; | 707 status = ParserState::kTrailer; |
679 inside_index = 1; | 708 inside_index = 1; |
680 } else { | 709 } else { |
681 --i; | 710 --i; |
682 status = 0; | 711 status = ParserState::kDefault; |
683 } | 712 } |
684 break; | 713 break; |
685 case 4: | 714 |
715 case ParserState::kGenNum: | |
686 if (std::isdigit(byte)) { | 716 if (std::isdigit(byte)) { |
687 gennum = gennum * 10 + FXSYS_toDecimalDigit(byte); | 717 gennum = gennum * 10 + FXSYS_toDecimalDigit(byte); |
688 break; | 718 break; |
689 } else if (PDFCharIsWhitespace(byte)) { | 719 } else if (PDFCharIsWhitespace(byte)) { |
690 status = 5; | 720 status = ParserState::kPostGenNum; |
691 } else { | 721 } else { |
692 --i; | 722 --i; |
693 status = 0; | 723 status = ParserState::kDefault; |
694 } | 724 } |
695 break; | 725 break; |
696 case 5: | 726 |
727 case ParserState::kPostGenNum: | |
697 if (byte == 'o') { | 728 if (byte == 'o') { |
698 status = 6; | 729 status = ParserState::kBeginObj; |
699 inside_index = 1; | 730 inside_index = 1; |
700 } else if (PDFCharIsWhitespace(byte)) { | 731 } else if (PDFCharIsWhitespace(byte)) { |
701 break; | 732 break; |
702 } else if (std::isdigit(byte)) { | 733 } else if (std::isdigit(byte)) { |
703 objnum = gennum; | 734 objnum = gennum; |
704 gennum = FXSYS_toDecimalDigit(byte); | 735 gennum = FXSYS_toDecimalDigit(byte); |
705 start_pos = start_pos1; | 736 start_pos = start_pos1; |
706 start_pos1 = pos + i; | 737 start_pos1 = pos + i; |
707 status = 4; | 738 status = ParserState::kGenNum; |
708 } else if (byte == 't') { | 739 } else if (byte == 't') { |
709 status = 7; | 740 status = ParserState::kTrailer; |
710 inside_index = 1; | 741 inside_index = 1; |
711 } else { | 742 } else { |
712 --i; | 743 --i; |
713 status = 0; | 744 status = ParserState::kDefault; |
714 } | 745 } |
715 break; | 746 break; |
716 case 6: | 747 |
748 case ParserState::kBeginObj: | |
717 switch (inside_index) { | 749 switch (inside_index) { |
718 case 1: | 750 case 1: |
719 if (byte != 'b') { | 751 if (byte != 'b') { |
720 --i; | 752 --i; |
721 status = 0; | 753 status = ParserState::kDefault; |
722 } else { | 754 } else { |
723 inside_index++; | 755 inside_index++; |
724 } | 756 } |
725 break; | 757 break; |
726 case 2: | 758 case 2: |
727 if (byte != 'j') { | 759 if (byte != 'j') { |
728 --i; | 760 --i; |
729 status = 0; | 761 status = ParserState::kDefault; |
730 } else { | 762 } else { |
731 inside_index++; | 763 inside_index++; |
732 } | 764 } |
733 break; | 765 break; |
734 case 3: | 766 case 3: |
735 if (PDFCharIsWhitespace(byte) || PDFCharIsDelimiter(byte)) { | 767 if (PDFCharIsWhitespace(byte) || PDFCharIsDelimiter(byte)) { |
736 if (objnum > 0x1000000) { | 768 if (objnum > 0x1000000) { |
737 status = 0; | 769 status = ParserState::kDefault; |
738 break; | 770 break; |
739 } | 771 } |
740 FX_FILESIZE obj_pos = start_pos - m_Syntax.m_HeaderOffset; | 772 FX_FILESIZE obj_pos = start_pos - m_Syntax.m_HeaderOffset; |
741 m_SortedOffset.insert(obj_pos); | 773 m_SortedOffset.insert(obj_pos); |
742 last_obj = start_pos; | 774 last_obj = start_pos; |
743 FX_FILESIZE obj_end = 0; | 775 FX_FILESIZE obj_end = 0; |
744 CPDF_Object* pObject = ParseIndirectObjectAtByStrict( | 776 CPDF_Object* pObject = ParseIndirectObjectAtByStrict( |
745 m_pDocument, obj_pos, objnum, &obj_end); | 777 m_pDocument, obj_pos, objnum, &obj_end); |
746 if (CPDF_Stream* pStream = ToStream(pObject)) { | 778 if (CPDF_Stream* pStream = ToStream(pObject)) { |
747 if (CPDF_Dictionary* pDict = pStream->GetDict()) { | 779 if (CPDF_Dictionary* pDict = pStream->GetDict()) { |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
786 } else { | 818 } else { |
787 m_ObjectInfo[objnum].pos = obj_pos; | 819 m_ObjectInfo[objnum].pos = obj_pos; |
788 m_ObjectInfo[objnum].type = 1; | 820 m_ObjectInfo[objnum].type = 1; |
789 m_ObjectInfo[objnum].gennum = gennum; | 821 m_ObjectInfo[objnum].gennum = gennum; |
790 } | 822 } |
791 if (pObject) { | 823 if (pObject) { |
792 pObject->Release(); | 824 pObject->Release(); |
793 } | 825 } |
794 } | 826 } |
795 --i; | 827 --i; |
796 status = 0; | 828 status = ParserState::kDefault; |
797 break; | 829 break; |
798 } | 830 } |
799 break; | 831 break; |
800 case 7: | 832 |
833 case ParserState::kTrailer: | |
801 if (inside_index == 7) { | 834 if (inside_index == 7) { |
802 if (PDFCharIsWhitespace(byte) || PDFCharIsDelimiter(byte)) { | 835 if (PDFCharIsWhitespace(byte) || PDFCharIsDelimiter(byte)) { |
803 last_trailer = pos + i - 7; | 836 last_trailer = pos + i - 7; |
804 m_Syntax.RestorePos(pos + i - m_Syntax.m_HeaderOffset); | 837 m_Syntax.RestorePos(pos + i - m_Syntax.m_HeaderOffset); |
805 CPDF_Object* pObj = m_Syntax.GetObject(m_pDocument, 0, 0, true); | 838 CPDF_Object* pObj = m_Syntax.GetObject(m_pDocument, 0, 0, true); |
806 if (pObj) { | 839 if (pObj) { |
807 if (!pObj->IsDictionary() && !pObj->AsStream()) { | 840 if (!pObj->IsDictionary() && !pObj->AsStream()) { |
808 pObj->Release(); | 841 pObj->Release(); |
809 } else { | 842 } else { |
810 CPDF_Stream* pStream = pObj->AsStream(); | 843 CPDF_Stream* pStream = pObj->AsStream(); |
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
853 } | 886 } |
854 m_Syntax.RestorePos(dwSavePos); | 887 m_Syntax.RestorePos(dwSavePos); |
855 } | 888 } |
856 } else { | 889 } else { |
857 pObj->Release(); | 890 pObj->Release(); |
858 } | 891 } |
859 } | 892 } |
860 } | 893 } |
861 } | 894 } |
862 --i; | 895 --i; |
863 status = 0; | 896 status = ParserState::kDefault; |
864 } else if (byte == "trailer"[inside_index]) { | 897 } else if (byte == "trailer"[inside_index]) { |
865 inside_index++; | 898 inside_index++; |
866 } else { | 899 } else { |
867 --i; | 900 --i; |
868 status = 0; | 901 status = ParserState::kDefault; |
869 } | 902 } |
870 break; | 903 break; |
871 case 8: | 904 |
905 case ParserState::kXref: | |
872 if (inside_index == 4) { | 906 if (inside_index == 4) { |
873 last_xref = pos + i - 4; | 907 last_xref = pos + i - 4; |
874 status = 1; | 908 status = ParserState::kWhitespace; |
875 } else if (byte == "xref"[inside_index]) { | 909 } else if (byte == "xref"[inside_index]) { |
876 inside_index++; | 910 inside_index++; |
877 } else { | 911 } else { |
878 --i; | 912 --i; |
879 status = 0; | 913 status = ParserState::kDefault; |
880 } | 914 } |
881 break; | 915 break; |
882 case 9: | 916 |
917 case ParserState::kComment: | |
883 if (byte == '\r' || byte == '\n') { | 918 if (byte == '\r' || byte == '\n') { |
884 status = 0; | 919 status = ParserState::kDefault; |
885 } | 920 } |
886 break; | 921 break; |
887 case 10: | 922 |
923 case ParserState::kString: | |
888 if (byte == ')') { | 924 if (byte == ')') { |
889 if (depth > 0) { | 925 if (depth > 0) { |
890 depth--; | 926 depth--; |
891 } | 927 } |
892 } else if (byte == '(') { | 928 } else if (byte == '(') { |
893 depth++; | 929 depth++; |
894 } | 930 } |
895 if (!depth) { | 931 if (!depth) { |
896 status = 0; | 932 status = ParserState::kDefault; |
897 } | 933 } |
898 break; | 934 break; |
899 case 11: | 935 |
936 case ParserState::kHexString: | |
900 if (byte == '>' || (byte == '<' && inside_index == 1)) | 937 if (byte == '>' || (byte == '<' && inside_index == 1)) |
901 status = 0; | 938 status = ParserState::kDefault; |
902 inside_index = 0; | 939 inside_index = 0; |
903 break; | 940 break; |
904 case 13: | 941 |
Tom Sepez
2016/02/18 19:27:55
Heh. No 12: the old numeric states jump from 11 straight to 13.
dsinclair
2016/02/18 19:47:04
Acknowledged.
| |
942 case ParserState::kEscapedString: | |
905 if (PDFCharIsDelimiter(byte) || PDFCharIsWhitespace(byte)) { | 943 if (PDFCharIsDelimiter(byte) || PDFCharIsWhitespace(byte)) { |
906 --i; | 944 --i; |
907 status = 0; | 945 status = ParserState::kDefault; |
908 } | 946 } |
909 break; | 947 break; |
910 case 14: | 948 |
949 case ParserState::kEndObj: | |
911 if (PDFCharIsWhitespace(byte)) { | 950 if (PDFCharIsWhitespace(byte)) { |
912 status = 0; | 951 status = ParserState::kDefault; |
913 } else if (byte == '%' || byte == '(' || byte == '<' || | 952 } else if (byte == '%' || byte == '(' || byte == '<' || |
914 byte == '\\') { | 953 byte == '\\') { |
915 status = 0; | 954 status = ParserState::kDefault; |
916 --i; | 955 --i; |
917 } else if (inside_index == 6) { | 956 } else if (inside_index == 6) { |
918 status = 0; | 957 status = ParserState::kDefault; |
919 --i; | 958 --i; |
920 } else if (byte == "endobj"[inside_index]) { | 959 } else if (byte == "endobj"[inside_index]) { |
921 inside_index++; | 960 inside_index++; |
922 } | 961 } |
923 break; | 962 break; |
924 } | 963 } |
925 if (bOverFlow) { | 964 if (bOverFlow) { |
926 size = 0; | 965 size = 0; |
927 break; | 966 break; |
928 } | 967 } |
(...skipping 3793 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
4722 if (!m_pLinearizedDict) | 4761 if (!m_pLinearizedDict) |
4723 return -1; | 4762 return -1; |
4724 CPDF_Array* pRange = m_pLinearizedDict->GetArrayBy("H"); | 4763 CPDF_Array* pRange = m_pLinearizedDict->GetArrayBy("H"); |
4725 if (!pRange) | 4764 if (!pRange) |
4726 return -1; | 4765 return -1; |
4727 CPDF_Object* pStreamLen = pRange->GetElementValue(1); | 4766 CPDF_Object* pStreamLen = pRange->GetElementValue(1); |
4728 if (!pStreamLen) | 4767 if (!pStreamLen) |
4729 return -1; | 4768 return -1; |
4730 return pStreamLen->GetInteger(); | 4769 return pStreamLen->GetInteger(); |
4731 } | 4770 } |
OLD | NEW |