OLD | NEW |
1 /* | 1 /* |
2 * HTMLparser.c : an HTML 4.0 non-verifying parser | 2 * HTMLparser.c : an HTML 4.0 non-verifying parser |
3 * | 3 * |
4 * See Copyright for the status of this software. | 4 * See Copyright for the status of this software. |
5 * | 5 * |
6 * daniel@veillard.com | 6 * daniel@veillard.com |
7 */ | 7 */ |
8 | 8 |
9 #define IN_LIBXML | 9 #define IN_LIBXML |
10 #include "libxml.h" | 10 #include "libxml.h" |
(...skipping 87 matching lines...) |
98 /** | 98 /** |
99 * htmlParseErr: | 99 * htmlParseErr: |
100 * @ctxt: an HTML parser context | 100 * @ctxt: an HTML parser context |
101 * @error: the error number | 101 * @error: the error number |
102 * @msg: the error message | 102 * @msg: the error message |
103 * @str1: string info | 103 * @str1: string info |
104 * @str2: string info | 104 * @str2: string info |
105 * | 105 * |
106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints | 106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints |
107 */ | 107 */ |
108 static void | 108 static void LIBXML_ATTR_FORMAT(3,0) |
109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, | 109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, |
110 const char *msg, const xmlChar *str1, const xmlChar *str2) | 110 const char *msg, const xmlChar *str1, const xmlChar *str2) |
111 { | 111 { |
112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && | 112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && |
113 (ctxt->instate == XML_PARSER_EOF)) | 113 (ctxt->instate == XML_PARSER_EOF)) |
114 return; | 114 return; |
115 if (ctxt != NULL) | 115 if (ctxt != NULL) |
116 ctxt->errNo = error; | 116 ctxt->errNo = error; |
117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, | 117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, |
118 XML_ERR_ERROR, NULL, 0, | 118 XML_ERR_ERROR, NULL, 0, |
119 (const char *) str1, (const char *) str2, | 119 (const char *) str1, (const char *) str2, |
120 NULL, 0, 0, | 120 NULL, 0, 0, |
121 msg, str1, str2); | 121 msg, str1, str2); |
122 if (ctxt != NULL) | 122 if (ctxt != NULL) |
123 ctxt->wellFormed = 0; | 123 ctxt->wellFormed = 0; |
124 } | 124 } |
125 | 125 |
126 /** | 126 /** |
127 * htmlParseErrInt: | 127 * htmlParseErrInt: |
128 * @ctxt: an HTML parser context | 128 * @ctxt: an HTML parser context |
129 * @error: the error number | 129 * @error: the error number |
130 * @msg: the error message | 130 * @msg: the error message |
131 * @val: integer info | 131 * @val: integer info |
132 * | 132 * |
133 * Handle a fatal parser error, i.e. violating Well-Formedness constraints | 133 * Handle a fatal parser error, i.e. violating Well-Formedness constraints |
134 */ | 134 */ |
135 static void | 135 static void LIBXML_ATTR_FORMAT(3,0) |
136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, | 136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, |
137 const char *msg, int val) | 137 const char *msg, int val) |
138 { | 138 { |
139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && | 139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && |
140 (ctxt->instate == XML_PARSER_EOF)) | 140 (ctxt->instate == XML_PARSER_EOF)) |
141 return; | 141 return; |
142 if (ctxt != NULL) | 142 if (ctxt != NULL) |
143 ctxt->errNo = error; | 143 ctxt->errNo = error; |
144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, | 144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, |
145 XML_ERR_ERROR, NULL, 0, NULL, NULL, | 145 XML_ERR_ERROR, NULL, 0, NULL, NULL, |
(...skipping 150 matching lines...) |
296 | 296 |
297 #define UPPER (toupper(*ctxt->input->cur)) | 297 #define UPPER (toupper(*ctxt->input->cur)) |
298 | 298 |
299 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val) | 299 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val) |
300 | 300 |
301 #define NXT(val) ctxt->input->cur[(val)] | 301 #define NXT(val) ctxt->input->cur[(val)] |
302 | 302 |
303 #define UPP(val) (toupper(ctxt->input->cur[(val)])) | 303 #define UPP(val) (toupper(ctxt->input->cur[(val)])) |
304 | 304 |
305 #define CUR_PTR ctxt->input->cur | 305 #define CUR_PTR ctxt->input->cur |
| 306 #define BASE_PTR ctxt->input->base |
306 | 307 |
307 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \ | 308 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \ |
308 (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \ | 309 (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \ |
309 xmlParserInputShrink(ctxt->input) | 310 xmlParserInputShrink(ctxt->input) |
310 | 311 |
311 #define GROW if ((ctxt->progressive == 0) && \ | 312 #define GROW if ((ctxt->progressive == 0) && \ |
312 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \ | 313 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \ |
313 xmlParserInputGrow(ctxt->input, INPUT_CHUNK) | 314 xmlParserInputGrow(ctxt->input, INPUT_CHUNK) |
314 | 315 |
315 #define CURRENT ((int) (*ctxt->input->cur)) | 316 #define CURRENT ((int) (*ctxt->input->cur)) |
(...skipping 2148 matching lines...) |
2464 if (((*in >= 0x61) && (*in <= 0x7A)) || | 2465 if (((*in >= 0x61) && (*in <= 0x7A)) || |
2465 ((*in >= 0x41) && (*in <= 0x5A)) || | 2466 ((*in >= 0x41) && (*in <= 0x5A)) || |
2466 (*in == '_') || (*in == ':')) { | 2467 (*in == '_') || (*in == ':')) { |
2467 in++; | 2468 in++; |
2468 while (((*in >= 0x61) && (*in <= 0x7A)) || | 2469 while (((*in >= 0x61) && (*in <= 0x7A)) || |
2469 ((*in >= 0x41) && (*in <= 0x5A)) || | 2470 ((*in >= 0x41) && (*in <= 0x5A)) || |
2470 ((*in >= 0x30) && (*in <= 0x39)) || | 2471 ((*in >= 0x30) && (*in <= 0x39)) || |
2471 (*in == '_') || (*in == '-') || | 2472 (*in == '_') || (*in == '-') || |
2472 (*in == ':') || (*in == '.')) | 2473 (*in == ':') || (*in == '.')) |
2473 in++; | 2474 in++; |
| 2475 |
| 2476 if (in == ctxt->input->end) |
| 2477 return(NULL); |
| 2478 |
2474 if ((*in > 0) && (*in < 0x80)) { | 2479 if ((*in > 0) && (*in < 0x80)) { |
2475 count = in - ctxt->input->cur; | 2480 count = in - ctxt->input->cur; |
2476 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count); | 2481 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count); |
2477 ctxt->input->cur = in; | 2482 ctxt->input->cur = in; |
2478 ctxt->nbChars += count; | 2483 ctxt->nbChars += count; |
2479 ctxt->input->col += count; | 2484 ctxt->input->col += count; |
2480 return(ret); | 2485 return(ret); |
2481 } | 2486 } |
2482 } | 2487 } |
2483 return(htmlParseNameComplex(ctxt)); | 2488 return(htmlParseNameComplex(ctxt)); |
2484 } | 2489 } |
2485 | 2490 |
2486 static const xmlChar * | 2491 static const xmlChar * |
2487 htmlParseNameComplex(xmlParserCtxtPtr ctxt) { | 2492 htmlParseNameComplex(xmlParserCtxtPtr ctxt) { |
2488 int len = 0, l; | 2493 int len = 0, l; |
2489 int c; | 2494 int c; |
2490 int count = 0; | 2495 int count = 0; |
| 2496 const xmlChar *base = ctxt->input->base; |
2491 | 2497 |
2492 /* | 2498 /* |
2493 * Handler for more complex cases | 2499 * Handler for more complex cases |
2494 */ | 2500 */ |
2495 GROW; | 2501 GROW; |
2496 c = CUR_CHAR(l); | 2502 c = CUR_CHAR(l); |
2497 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ | 2503 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ |
2498 (!IS_LETTER(c) && (c != '_') && | 2504 (!IS_LETTER(c) && (c != '_') && |
2499 (c != ':'))) { | 2505 (c != ':'))) { |
2500 return(NULL); | 2506 return(NULL); |
2501 } | 2507 } |
2502 | 2508 |
2503 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ | 2509 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ |
2504 ((IS_LETTER(c)) || (IS_DIGIT(c)) || | 2510 ((IS_LETTER(c)) || (IS_DIGIT(c)) || |
2505 (c == '.') || (c == '-') || | 2511 (c == '.') || (c == '-') || |
2506 (c == '_') || (c == ':') || | 2512 (c == '_') || (c == ':') || |
2507 (IS_COMBINING(c)) || | 2513 (IS_COMBINING(c)) || |
2508 (IS_EXTENDER(c)))) { | 2514 (IS_EXTENDER(c)))) { |
2509 if (count++ > 100) { | 2515 if (count++ > 100) { |
2510 count = 0; | 2516 count = 0; |
2511 GROW; | 2517 GROW; |
2512 } | 2518 } |
2513 len += l; | 2519 len += l; |
2514 NEXTL(l); | 2520 NEXTL(l); |
2515 c = CUR_CHAR(l); | 2521 c = CUR_CHAR(l); |
| 2522 if (ctxt->input->base != base) { |
| 2523 /* |
| 2524 * We changed encoding from an unknown encoding |
| 2525 * Input buffer changed location, so we better start again |
| 2526 */ |
| 2527 return(htmlParseNameComplex(ctxt)); |
| 2528 } |
2516 } | 2529 } |
| 2530 |
| 2531 if (ctxt->input->base > ctxt->input->cur - len) |
| 2532 return(NULL); |
| 2533 |
2517 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len)); | 2534 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len)); |
2518 } | 2535 } |
2519 | 2536 |
2520 | 2537 |
2521 /** | 2538 /** |
2522 * htmlParseHTMLAttribute: | 2539 * htmlParseHTMLAttribute: |
2523 * @ctxt: an HTML parser context | 2540 * @ctxt: an HTML parser context |
2524 * @stop: a char stop value | 2541 * @stop: a char stop value |
2525 * | 2542 * |
2526 * parse an HTML attribute value till the stop (quote), if | 2543 * parse an HTML attribute value till the stop (quote), if |
(...skipping 231 matching lines...) |
2758 * | 2775 * |
2759 * parse an HTML Literal | 2776 * parse an HTML Literal |
2760 * | 2777 * |
2761 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") | 2778 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") |
2762 * | 2779 * |
2763 * Returns the SystemLiteral parsed or NULL | 2780 * Returns the SystemLiteral parsed or NULL |
2764 */ | 2781 */ |
2765 | 2782 |
2766 static xmlChar * | 2783 static xmlChar * |
2767 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { | 2784 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { |
2768 const xmlChar *q; | 2785 size_t len = 0, startPosition = 0; |
2769 xmlChar *ret = NULL; | 2786 xmlChar *ret = NULL; |
2770 | 2787 |
2771 if (CUR == '"') { | 2788 if (CUR == '"') { |
2772 NEXT; | 2789 NEXT; |
2773 » q = CUR_PTR; | 2790 |
2774 » while ((IS_CHAR_CH(CUR)) && (CUR != '"')) | 2791 if (CUR_PTR < BASE_PTR) |
| 2792 return(ret); |
| 2793 startPosition = CUR_PTR - BASE_PTR; |
| 2794 |
| 2795 » while ((IS_CHAR_CH(CUR)) && (CUR != '"')) { |
2775 NEXT; | 2796 NEXT; |
| 2797 len++; |
| 2798 } |
2776 if (!IS_CHAR_CH(CUR)) { | 2799 if (!IS_CHAR_CH(CUR)) { |
2777 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, | 2800 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, |
2778 "Unfinished SystemLiteral\n", NULL, NULL); | 2801 "Unfinished SystemLiteral\n", NULL, NULL); |
2779 } else { | 2802 } else { |
2780 » ret = xmlStrndup(q, CUR_PTR - q); | 2803 » ret = xmlStrndup((BASE_PTR+startPosition), len); |
2781 NEXT; | 2804 NEXT; |
2782 } | 2805 } |
2783 } else if (CUR == '\'') { | 2806 } else if (CUR == '\'') { |
2784 NEXT; | 2807 NEXT; |
2785 » q = CUR_PTR; | 2808 |
2786 » while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) | 2809 if (CUR_PTR < BASE_PTR) |
| 2810 return(ret); |
| 2811 startPosition = CUR_PTR - BASE_PTR; |
| 2812 |
| 2813 » while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) { |
2787 NEXT; | 2814 NEXT; |
| 2815 len++; |
| 2816 } |
2788 if (!IS_CHAR_CH(CUR)) { | 2817 if (!IS_CHAR_CH(CUR)) { |
2789 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, | 2818 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, |
2790 "Unfinished SystemLiteral\n", NULL, NULL); | 2819 "Unfinished SystemLiteral\n", NULL, NULL); |
2791 } else { | 2820 } else { |
2792 » ret = xmlStrndup(q, CUR_PTR - q); | 2821 » ret = xmlStrndup((BASE_PTR+startPosition), len); |
2793 NEXT; | 2822 NEXT; |
2794 } | 2823 } |
2795 } else { | 2824 } else { |
2796 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, | 2825 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, |
2797 " or ' expected\n", NULL, NULL); | 2826 " or ' expected\n", NULL, NULL); |
2798 } | 2827 } |
2799 | 2828 |
2800 return(ret); | 2829 return(ret); |
2801 } | 2830 } |
2802 | 2831 |
2803 /** | 2832 /** |
2804 * htmlParsePubidLiteral: | 2833 * htmlParsePubidLiteral: |
2805 * @ctxt: an HTML parser context | 2834 * @ctxt: an HTML parser context |
2806 * | 2835 * |
2807 * parse an HTML public literal | 2836 * parse an HTML public literal |
2808 * | 2837 * |
2809 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" | 2838 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" |
2810 * | 2839 * |
2811 * Returns the PubidLiteral parsed or NULL. | 2840 * Returns the PubidLiteral parsed or NULL. |
2812 */ | 2841 */ |
2813 | 2842 |
2814 static xmlChar * | 2843 static xmlChar * |
2815 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { | 2844 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { |
2816 const xmlChar *q; | 2845 size_t len = 0, startPosition = 0; |
2817 xmlChar *ret = NULL; | 2846 xmlChar *ret = NULL; |
2818 /* | 2847 /* |
2819 * Name ::= (Letter | '_') (NameChar)* | 2848 * Name ::= (Letter | '_') (NameChar)* |
2820 */ | 2849 */ |
2821 if (CUR == '"') { | 2850 if (CUR == '"') { |
2822 NEXT; | 2851 NEXT; |
2823 » q = CUR_PTR; | 2852 |
2824 » while (IS_PUBIDCHAR_CH(CUR)) NEXT; | 2853 if (CUR_PTR < BASE_PTR) |
| 2854 return(ret); |
| 2855 startPosition = CUR_PTR - BASE_PTR; |
| 2856 |
| 2857 while (IS_PUBIDCHAR_CH(CUR)) { |
| 2858 len++; |
| 2859 NEXT; |
| 2860 } |
| 2861 |
2825 if (CUR != '"') { | 2862 if (CUR != '"') { |
2826 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, | 2863 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, |
2827 "Unfinished PubidLiteral\n", NULL, NULL); | 2864 "Unfinished PubidLiteral\n", NULL, NULL); |
2828 } else { | 2865 } else { |
2829 » ret = xmlStrndup(q, CUR_PTR - q); | 2866 » ret = xmlStrndup((BASE_PTR + startPosition), len); |
2830 NEXT; | 2867 NEXT; |
2831 } | 2868 } |
2832 } else if (CUR == '\'') { | 2869 } else if (CUR == '\'') { |
2833 NEXT; | 2870 NEXT; |
2834 » q = CUR_PTR; | 2871 |
2835 » while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')) | 2872 if (CUR_PTR < BASE_PTR) |
2836 » NEXT; | 2873 return(ret); |
| 2874 startPosition = CUR_PTR - BASE_PTR; |
| 2875 |
| 2876 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){ |
| 2877 len++; |
| 2878 NEXT; |
| 2879 } |
| 2880 |
2837 if (CUR != '\'') { | 2881 if (CUR != '\'') { |
2838 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, | 2882 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, |
2839 "Unfinished PubidLiteral\n", NULL, NULL); | 2883 "Unfinished PubidLiteral\n", NULL, NULL); |
2840 } else { | 2884 } else { |
2841 » ret = xmlStrndup(q, CUR_PTR - q); | 2885 » ret = xmlStrndup((BASE_PTR + startPosition), len); |
2842 NEXT; | 2886 NEXT; |
2843 } | 2887 } |
2844 } else { | 2888 } else { |
2845 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, | 2889 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, |
2846 "PubidLiteral \" or ' expected\n", NULL, NULL); | 2890 "PubidLiteral \" or ' expected\n", NULL, NULL); |
2847 } | 2891 } |
2848 | 2892 |
2849 return(ret); | 2893 return(ret); |
2850 } | 2894 } |
2851 | 2895 |
(...skipping 4260 matching lines...) |
7112 xmlFreeParserInputBuffer(input); | 7156 xmlFreeParserInputBuffer(input); |
7113 return (NULL); | 7157 return (NULL); |
7114 } | 7158 } |
7115 inputPush(ctxt, stream); | 7159 inputPush(ctxt, stream); |
7116 return (htmlDoRead(ctxt, URL, encoding, options, 1)); | 7160 return (htmlDoRead(ctxt, URL, encoding, options, 1)); |
7117 } | 7161 } |
7118 | 7162 |
7119 #define bottom_HTMLparser | 7163 #define bottom_HTMLparser |
7120 #include "elfgcchack.h" | 7164 #include "elfgcchack.h" |
7121 #endif /* LIBXML_HTML_ENABLED */ | 7165 #endif /* LIBXML_HTML_ENABLED */ |
OLD | NEW |