| OLD | NEW |
| 1 /* | 1 /* |
| 2 * HTMLparser.c : an HTML 4.0 non-verifying parser | 2 * HTMLparser.c : an HTML 4.0 non-verifying parser |
| 3 * | 3 * |
| 4 * See Copyright for the status of this software. | 4 * See Copyright for the status of this software. |
| 5 * | 5 * |
| 6 * daniel@veillard.com | 6 * daniel@veillard.com |
| 7 */ | 7 */ |
| 8 | 8 |
| 9 #define IN_LIBXML | 9 #define IN_LIBXML |
| 10 #include "libxml.h" | 10 #include "libxml.h" |
| (...skipping 2930 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2941 */ | 2941 */ |
| 2942 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); | 2942 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); |
| 2943 } else if (ctxt->sax->characters != NULL) { | 2943 } else if (ctxt->sax->characters != NULL) { |
| 2944 ctxt->sax->characters(ctxt->userData, buf, nbchar); | 2944 ctxt->sax->characters(ctxt->userData, buf, nbchar); |
| 2945 } | 2945 } |
| 2946 } | 2946 } |
| 2947 } | 2947 } |
| 2948 | 2948 |
| 2949 | 2949 |
| 2950 /** | 2950 /** |
| 2951 * htmlParseCharData: | 2951 * htmlParseCharDataInternal: |
| 2952 * @ctxt: an HTML parser context | 2952 * @ctxt: an HTML parser context |
| 2953 * @readahead: optional read ahead character in ascii range |
| 2953 * | 2954 * |
| 2954 * parse a CharData section. | 2955 * parse a CharData section. |
| 2955 * if we are within a CDATA section ']]>' marks an end of section. | 2956 * if we are within a CDATA section ']]>' marks an end of section. |
| 2956 * | 2957 * |
| 2957 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) | 2958 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) |
| 2958 */ | 2959 */ |
| 2959 | 2960 |
| 2960 static void | 2961 static void |
| 2961 htmlParseCharData(htmlParserCtxtPtr ctxt) { | 2962 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) { |
| 2962 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; | 2963 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6]; |
| 2963 int nbchar = 0; | 2964 int nbchar = 0; |
| 2964 int cur, l; | 2965 int cur, l; |
| 2965 int chunk = 0; | 2966 int chunk = 0; |
| 2966 | 2967 |
| 2968 if (readahead) |
| 2969 buf[nbchar++] = readahead; |
| 2970 |
| 2967 SHRINK; | 2971 SHRINK; |
| 2968 cur = CUR_CHAR(l); | 2972 cur = CUR_CHAR(l); |
| 2969 while (((cur != '<') || (ctxt->token == '<')) && | 2973 while (((cur != '<') || (ctxt->token == '<')) && |
| 2970 ((cur != '&') || (ctxt->token == '&')) && | 2974 ((cur != '&') || (ctxt->token == '&')) && |
| 2971 (cur != 0)) { | 2975 (cur != 0)) { |
| 2972 if (!(IS_CHAR(cur))) { | 2976 if (!(IS_CHAR(cur))) { |
| 2973 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, | 2977 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, |
| 2974 "Invalid char in CDATA 0x%X\n", cur); | 2978 "Invalid char in CDATA 0x%X\n", cur); |
| 2975 } else { | 2979 } else { |
| 2976 COPY_BUF(l,buf,nbchar,cur); | 2980 COPY_BUF(l,buf,nbchar,cur); |
| (...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3036 } else { | 3040 } else { |
| 3037 /* | 3041 /* |
| 3038 * Loop detection | 3042 * Loop detection |
| 3039 */ | 3043 */ |
| 3040 if (cur == 0) | 3044 if (cur == 0) |
| 3041 ctxt->instate = XML_PARSER_EOF; | 3045 ctxt->instate = XML_PARSER_EOF; |
| 3042 } | 3046 } |
| 3043 } | 3047 } |
| 3044 | 3048 |
| 3045 /** | 3049 /** |
| 3050 * htmlParseCharData: |
| 3051 * @ctxt: an HTML parser context |
| 3052 * |
| 3053 * parse a CharData section. |
| 3054 * if we are within a CDATA section ']]>' marks an end of section. |
| 3055 * |
| 3056 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) |
| 3057 */ |
| 3058 |
| 3059 static void |
| 3060 htmlParseCharData(htmlParserCtxtPtr ctxt) { |
| 3061 htmlParseCharDataInternal(ctxt, 0); |
| 3062 } |
| 3063 |
| 3064 /** |
| 3046 * htmlParseExternalID: | 3065 * htmlParseExternalID: |
| 3047 * @ctxt: an HTML parser context | 3066 * @ctxt: an HTML parser context |
| 3048 * @publicID: a xmlChar** receiving PubidLiteral | 3067 * @publicID: a xmlChar** receiving PubidLiteral |
| 3049 * | 3068 * |
| 3050 * Parse an External ID or a Public ID | 3069 * Parse an External ID or a Public ID |
| 3051 * | 3070 * |
| 3052 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral | 3071 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral |
| 3053 * | 'PUBLIC' S PubidLiteral S SystemLiteral | 3072 * | 'PUBLIC' S PubidLiteral S SystemLiteral |
| 3054 * | 3073 * |
| 3055 * [83] PublicID ::= 'PUBLIC' S PubidLiteral | 3074 * [83] PublicID ::= 'PUBLIC' S PubidLiteral |
| (...skipping 182 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3238 state = ctxt->instate; | 3257 state = ctxt->instate; |
| 3239 ctxt->instate = XML_PARSER_COMMENT; | 3258 ctxt->instate = XML_PARSER_COMMENT; |
| 3240 SHRINK; | 3259 SHRINK; |
| 3241 SKIP(4); | 3260 SKIP(4); |
| 3242 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); | 3261 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); |
| 3243 if (buf == NULL) { | 3262 if (buf == NULL) { |
| 3244 htmlErrMemory(ctxt, "buffer allocation failed\n"); | 3263 htmlErrMemory(ctxt, "buffer allocation failed\n"); |
| 3245 ctxt->instate = state; | 3264 ctxt->instate = state; |
| 3246 return; | 3265 return; |
| 3247 } | 3266 } |
| 3267 len = 0; |
| 3268 buf[len] = 0; |
| 3248 q = CUR_CHAR(ql); | 3269 q = CUR_CHAR(ql); |
| 3270 if (!IS_CHAR(q)) |
| 3271 goto unfinished; |
| 3249 NEXTL(ql); | 3272 NEXTL(ql); |
| 3250 r = CUR_CHAR(rl); | 3273 r = CUR_CHAR(rl); |
| 3274 if (!IS_CHAR(r)) |
| 3275 goto unfinished; |
| 3251 NEXTL(rl); | 3276 NEXTL(rl); |
| 3252 cur = CUR_CHAR(l); | 3277 cur = CUR_CHAR(l); |
| 3253 len = 0; | |
| 3254 while (IS_CHAR(cur) && | 3278 while (IS_CHAR(cur) && |
| 3255 ((cur != '>') || | 3279 ((cur != '>') || |
| 3256 (r != '-') || (q != '-'))) { | 3280 (r != '-') || (q != '-'))) { |
| 3257 if (len + 5 >= size) { | 3281 if (len + 5 >= size) { |
| 3258 xmlChar *tmp; | 3282 xmlChar *tmp; |
| 3259 | 3283 |
| 3260 size *= 2; | 3284 size *= 2; |
| 3261 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); | 3285 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); |
| 3262 if (tmp == NULL) { | 3286 if (tmp == NULL) { |
| 3263 xmlFree(buf); | 3287 xmlFree(buf); |
| (...skipping 10 matching lines...) Expand all Loading... |
| 3274 rl = l; | 3298 rl = l; |
| 3275 NEXTL(l); | 3299 NEXTL(l); |
| 3276 cur = CUR_CHAR(l); | 3300 cur = CUR_CHAR(l); |
| 3277 if (cur == 0) { | 3301 if (cur == 0) { |
| 3278 SHRINK; | 3302 SHRINK; |
| 3279 GROW; | 3303 GROW; |
| 3280 cur = CUR_CHAR(l); | 3304 cur = CUR_CHAR(l); |
| 3281 } | 3305 } |
| 3282 } | 3306 } |
| 3283 buf[len] = 0; | 3307 buf[len] = 0; |
| 3284 if (!IS_CHAR(cur)) { | 3308 if (IS_CHAR(cur)) { |
| 3285 » htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, | |
| 3286 » "Comment not terminated \n<!--%.50s\n", buf, NULL); | |
| 3287 » xmlFree(buf); | |
| 3288 } else { | |
| 3289 NEXT; | 3309 NEXT; |
| 3290 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && | 3310 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && |
| 3291 (!ctxt->disableSAX)) | 3311 (!ctxt->disableSAX)) |
| 3292 ctxt->sax->comment(ctxt->userData, buf); | 3312 ctxt->sax->comment(ctxt->userData, buf); |
| 3293 xmlFree(buf); | 3313 xmlFree(buf); |
| 3314 ctxt->instate = state; |
| 3315 return; |
| 3294 } | 3316 } |
| 3295 ctxt->instate = state; | 3317 |
| 3318 unfinished: |
| 3319 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, |
| 3320 » » "Comment not terminated \n<!--%.50s\n", buf, NULL); |
| 3321 xmlFree(buf); |
| 3296 } | 3322 } |
| 3297 | 3323 |
| 3298 /** | 3324 /** |
| 3299 * htmlParseCharRef: | 3325 * htmlParseCharRef: |
| 3300 * @ctxt: an HTML parser context | 3326 * @ctxt: an HTML parser context |
| 3301 * | 3327 * |
| 3302 * parse Reference declarations | 3328 * parse Reference declarations |
| 3303 * | 3329 * |
| 3304 * [66] CharRef ::= '&#' [0-9]+ ';' | | 3330 * [66] CharRef ::= '&#' [0-9]+ ';' | |
| 3305 * '&#x' [0-9a-fA-F]+ ';' | 3331 * '&#x' [0-9a-fA-F]+ ';' |
| (...skipping 377 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3683 | 3709 |
| 3684 atts = ctxt->atts; | 3710 atts = ctxt->atts; |
| 3685 maxatts = ctxt->maxatts; | 3711 maxatts = ctxt->maxatts; |
| 3686 | 3712 |
| 3687 GROW; | 3713 GROW; |
| 3688 name = htmlParseHTMLName(ctxt); | 3714 name = htmlParseHTMLName(ctxt); |
| 3689 if (name == NULL) { | 3715 if (name == NULL) { |
| 3690 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, | 3716 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, |
| 3691 "htmlParseStartTag: invalid element name\n", | 3717 "htmlParseStartTag: invalid element name\n", |
| 3692 NULL, NULL); | 3718 NULL, NULL); |
| 3719 /* if recover preserve text on classic misconstructs */ |
| 3720 if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') || |
| 3721 (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) { |
| 3722 htmlParseCharDataInternal(ctxt, '<'); |
| 3723 return(-1); |
| 3724 } |
| 3725 |
| 3726 |
| 3693 /* Dump the bogus tag like browsers do */ | 3727 /* Dump the bogus tag like browsers do */ |
| 3694 while ((IS_CHAR_CH(CUR)) && (CUR != '>') && | 3728 while ((IS_CHAR_CH(CUR)) && (CUR != '>') && |
| 3695 (ctxt->instate != XML_PARSER_EOF)) | 3729 (ctxt->instate != XML_PARSER_EOF)) |
| 3696 NEXT; | 3730 NEXT; |
| 3697 return -1; | 3731 return -1; |
| 3698 } | 3732 } |
| 3699 if (xmlStrEqual(name, BAD_CAST"meta")) | 3733 if (xmlStrEqual(name, BAD_CAST"meta")) |
| 3700 meta = 1; | 3734 meta = 1; |
| 3701 | 3735 |
| 3702 /* | 3736 /* |
| (...skipping 1991 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5694 ctxt->checkIndex = 0; | 5728 ctxt->checkIndex = 0; |
| 5695 } | 5729 } |
| 5696 if ((avail == 1) && (terminate)) { | 5730 if ((avail == 1) && (terminate)) { |
| 5697 cur = in->cur[0]; | 5731 cur = in->cur[0]; |
| 5698 if ((cur != '<') && (cur != '&')) { | 5732 if ((cur != '<') && (cur != '&')) { |
| 5699 if (ctxt->sax != NULL) { | 5733 if (ctxt->sax != NULL) { |
| 5700 if (IS_BLANK_CH(cur)) { | 5734 if (IS_BLANK_CH(cur)) { |
| 5701 if (ctxt->keepBlanks) { | 5735 if (ctxt->keepBlanks) { |
| 5702 if (ctxt->sax->characters != NULL) | 5736 if (ctxt->sax->characters != NULL) |
| 5703 ctxt->sax->characters( | 5737 ctxt->sax->characters( |
| 5704 » » » » » » ctxt->userData, &cur, 1); | 5738 » » » » » » ctxt->userData, &in->cur[0], 1); |
| 5705 } else { | 5739 } else { |
| 5706 if (ctxt->sax->ignorableWhitespace != NULL) | 5740 if (ctxt->sax->ignorableWhitespace != NULL) |
| 5707 ctxt->sax->ignorableWhitespace( | 5741 ctxt->sax->ignorableWhitespace( |
| 5708 » » » » » » ctxt->userData, &cur, 1); | 5742 » » » » » » ctxt->userData, &in->cur[0], 1); |
| 5709 } | 5743 } |
| 5710 } else { | 5744 } else { |
| 5711 htmlCheckParagraph(ctxt); | 5745 htmlCheckParagraph(ctxt); |
| 5712 if (ctxt->sax->characters != NULL) | 5746 if (ctxt->sax->characters != NULL) |
| 5713 ctxt->sax->characters( | 5747 ctxt->sax->characters( |
| 5714 » » » » » ctxt->userData, &cur, 1); | 5748 » » » » » ctxt->userData, &in->cur[0], 1); |
| 5715 } | 5749 } |
| 5716 } | 5750 } |
| 5717 ctxt->token = 0; | 5751 ctxt->token = 0; |
| 5718 ctxt->checkIndex = 0; | 5752 ctxt->checkIndex = 0; |
| 5719 in->cur++; | 5753 in->cur++; |
| 5720 break; | 5754 break; |
| 5721 } | 5755 } |
| 5722 } | 5756 } |
| 5723 if (avail < 2) | 5757 if (avail < 2) |
| 5724 goto done; | 5758 goto done; |
| (...skipping 1353 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 7078 xmlFreeParserInputBuffer(input); | 7112 xmlFreeParserInputBuffer(input); |
| 7079 return (NULL); | 7113 return (NULL); |
| 7080 } | 7114 } |
| 7081 inputPush(ctxt, stream); | 7115 inputPush(ctxt, stream); |
| 7082 return (htmlDoRead(ctxt, URL, encoding, options, 1)); | 7116 return (htmlDoRead(ctxt, URL, encoding, options, 1)); |
| 7083 } | 7117 } |
| 7084 | 7118 |
| 7085 #define bottom_HTMLparser | 7119 #define bottom_HTMLparser |
| 7086 #include "elfgcchack.h" | 7120 #include "elfgcchack.h" |
| 7087 #endif /* LIBXML_HTML_ENABLED */ | 7121 #endif /* LIBXML_HTML_ENABLED */ |
| OLD | NEW |