| OLD | NEW |
| 1 /* | 1 /* |
| 2 * HTMLparser.c : an HTML 4.0 non-verifying parser | 2 * HTMLparser.c : an HTML 4.0 non-verifying parser |
| 3 * | 3 * |
| 4 * See Copyright for the status of this software. | 4 * See Copyright for the status of this software. |
| 5 * | 5 * |
| 6 * daniel@veillard.com | 6 * daniel@veillard.com |
| 7 */ | 7 */ |
| 8 | 8 |
| 9 #define IN_LIBXML | 9 #define IN_LIBXML |
| 10 #include "libxml.h" | 10 #include "libxml.h" |
| (...skipping 87 matching lines...) | (...skipping 87 matching lines...) |
| 98 /** | 98 /** |
| 99 * htmlParseErr: | 99 * htmlParseErr: |
| 100 * @ctxt: an HTML parser context | 100 * @ctxt: an HTML parser context |
| 101 * @error: the error number | 101 * @error: the error number |
| 102 * @msg: the error message | 102 * @msg: the error message |
| 103 * @str1: string infor | 103 * @str1: string infor |
| 104 * @str2: string infor | 104 * @str2: string infor |
| 105 * | 105 * |
| 106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints | 106 * Handle a fatal parser error, i.e. violating Well-Formedness constraints |
| 107 */ | 107 */ |
| 108 static void | 108 static void LIBXML_ATTR_FORMAT(3,0) |
| 109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, | 109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, |
| 110 const char *msg, const xmlChar *str1, const xmlChar *str2) | 110 const char *msg, const xmlChar *str1, const xmlChar *str2) |
| 111 { | 111 { |
| 112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && | 112 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && |
| 113 (ctxt->instate == XML_PARSER_EOF)) | 113 (ctxt->instate == XML_PARSER_EOF)) |
| 114 return; | 114 return; |
| 115 if (ctxt != NULL) | 115 if (ctxt != NULL) |
| 116 ctxt->errNo = error; | 116 ctxt->errNo = error; |
| 117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, | 117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, |
| 118 XML_ERR_ERROR, NULL, 0, | 118 XML_ERR_ERROR, NULL, 0, |
| 119 (const char *) str1, (const char *) str2, | 119 (const char *) str1, (const char *) str2, |
| 120 NULL, 0, 0, | 120 NULL, 0, 0, |
| 121 msg, str1, str2); | 121 msg, str1, str2); |
| 122 if (ctxt != NULL) | 122 if (ctxt != NULL) |
| 123 ctxt->wellFormed = 0; | 123 ctxt->wellFormed = 0; |
| 124 } | 124 } |
| 125 | 125 |
| 126 /** | 126 /** |
| 127 * htmlParseErrInt: | 127 * htmlParseErrInt: |
| 128 * @ctxt: an HTML parser context | 128 * @ctxt: an HTML parser context |
| 129 * @error: the error number | 129 * @error: the error number |
| 130 * @msg: the error message | 130 * @msg: the error message |
| 131 * @val: integer info | 131 * @val: integer info |
| 132 * | 132 * |
| 133 * Handle a fatal parser error, i.e. violating Well-Formedness constraints | 133 * Handle a fatal parser error, i.e. violating Well-Formedness constraints |
| 134 */ | 134 */ |
| 135 static void | 135 static void LIBXML_ATTR_FORMAT(3,0) |
| 136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, | 136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, |
| 137 const char *msg, int val) | 137 const char *msg, int val) |
| 138 { | 138 { |
| 139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && | 139 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && |
| 140 (ctxt->instate == XML_PARSER_EOF)) | 140 (ctxt->instate == XML_PARSER_EOF)) |
| 141 return; | 141 return; |
| 142 if (ctxt != NULL) | 142 if (ctxt != NULL) |
| 143 ctxt->errNo = error; | 143 ctxt->errNo = error; |
| 144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, | 144 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, |
| 145 XML_ERR_ERROR, NULL, 0, NULL, NULL, | 145 XML_ERR_ERROR, NULL, 0, NULL, NULL, |
| (...skipping 150 matching lines...) | (...skipping 150 matching lines...) |
| 296 | 296 |
| 297 #define UPPER (toupper(*ctxt->input->cur)) | 297 #define UPPER (toupper(*ctxt->input->cur)) |
| 298 | 298 |
| 299 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val) | 299 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val) |
| 300 | 300 |
| 301 #define NXT(val) ctxt->input->cur[(val)] | 301 #define NXT(val) ctxt->input->cur[(val)] |
| 302 | 302 |
| 303 #define UPP(val) (toupper(ctxt->input->cur[(val)])) | 303 #define UPP(val) (toupper(ctxt->input->cur[(val)])) |
| 304 | 304 |
| 305 #define CUR_PTR ctxt->input->cur | 305 #define CUR_PTR ctxt->input->cur |
| | 306 #define BASE_PTR ctxt->input->base |
| 306 | 307 |
| 307 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \ | 308 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \ |
| 308 (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \ | 309 (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \ |
| 309 xmlParserInputShrink(ctxt->input) | 310 xmlParserInputShrink(ctxt->input) |
| 310 | 311 |
| 311 #define GROW if ((ctxt->progressive == 0) && \ | 312 #define GROW if ((ctxt->progressive == 0) && \ |
| 312 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \ | 313 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \ |
| 313 xmlParserInputGrow(ctxt->input, INPUT_CHUNK) | 314 xmlParserInputGrow(ctxt->input, INPUT_CHUNK) |
| 314 | 315 |
| 315 #define CURRENT ((int) (*ctxt->input->cur)) | 316 #define CURRENT ((int) (*ctxt->input->cur)) |
| (...skipping 2148 matching lines...) | (...skipping 2148 matching lines...) |
| 2464 if (((*in >= 0x61) && (*in <= 0x7A)) || | 2465 if (((*in >= 0x61) && (*in <= 0x7A)) || |
| 2465 ((*in >= 0x41) && (*in <= 0x5A)) || | 2466 ((*in >= 0x41) && (*in <= 0x5A)) || |
| 2466 (*in == '_') || (*in == ':')) { | 2467 (*in == '_') || (*in == ':')) { |
| 2467 in++; | 2468 in++; |
| 2468 while (((*in >= 0x61) && (*in <= 0x7A)) || | 2469 while (((*in >= 0x61) && (*in <= 0x7A)) || |
| 2469 ((*in >= 0x41) && (*in <= 0x5A)) || | 2470 ((*in >= 0x41) && (*in <= 0x5A)) || |
| 2470 ((*in >= 0x30) && (*in <= 0x39)) || | 2471 ((*in >= 0x30) && (*in <= 0x39)) || |
| 2471 (*in == '_') || (*in == '-') || | 2472 (*in == '_') || (*in == '-') || |
| 2472 (*in == ':') || (*in == '.')) | 2473 (*in == ':') || (*in == '.')) |
| 2473 in++; | 2474 in++; |
| | 2475 |
| | 2476 if (in == ctxt->input->end) |
| | 2477 return(NULL); |
| | 2478 |
| 2474 if ((*in > 0) && (*in < 0x80)) { | 2479 if ((*in > 0) && (*in < 0x80)) { |
| 2475 count = in - ctxt->input->cur; | 2480 count = in - ctxt->input->cur; |
| 2476 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count); | 2481 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count); |
| 2477 ctxt->input->cur = in; | 2482 ctxt->input->cur = in; |
| 2478 ctxt->nbChars += count; | 2483 ctxt->nbChars += count; |
| 2479 ctxt->input->col += count; | 2484 ctxt->input->col += count; |
| 2480 return(ret); | 2485 return(ret); |
| 2481 } | 2486 } |
| 2482 } | 2487 } |
| 2483 return(htmlParseNameComplex(ctxt)); | 2488 return(htmlParseNameComplex(ctxt)); |
| 2484 } | 2489 } |
| 2485 | 2490 |
| 2486 static const xmlChar * | 2491 static const xmlChar * |
| 2487 htmlParseNameComplex(xmlParserCtxtPtr ctxt) { | 2492 htmlParseNameComplex(xmlParserCtxtPtr ctxt) { |
| 2488 int len = 0, l; | 2493 int len = 0, l; |
| 2489 int c; | 2494 int c; |
| 2490 int count = 0; | 2495 int count = 0; |
| | 2496 const xmlChar *base = ctxt->input->base; |
| 2491 | 2497 |
| 2492 /* | 2498 /* |
| 2493 * Handler for more complex cases | 2499 * Handler for more complex cases |
| 2494 */ | 2500 */ |
| 2495 GROW; | 2501 GROW; |
| 2496 c = CUR_CHAR(l); | 2502 c = CUR_CHAR(l); |
| 2497 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ | 2503 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ |
| 2498 (!IS_LETTER(c) && (c != '_') && | 2504 (!IS_LETTER(c) && (c != '_') && |
| 2499 (c != ':'))) { | 2505 (c != ':'))) { |
| 2500 return(NULL); | 2506 return(NULL); |
| 2501 } | 2507 } |
| 2502 | 2508 |
| 2503 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ | 2509 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ |
| 2504 ((IS_LETTER(c)) || (IS_DIGIT(c)) || | 2510 ((IS_LETTER(c)) || (IS_DIGIT(c)) || |
| 2505 (c == '.') || (c == '-') || | 2511 (c == '.') || (c == '-') || |
| 2506 (c == '_') || (c == ':') || | 2512 (c == '_') || (c == ':') || |
| 2507 (IS_COMBINING(c)) || | 2513 (IS_COMBINING(c)) || |
| 2508 (IS_EXTENDER(c)))) { | 2514 (IS_EXTENDER(c)))) { |
| 2509 if (count++ > 100) { | 2515 if (count++ > 100) { |
| 2510 count = 0; | 2516 count = 0; |
| 2511 GROW; | 2517 GROW; |
| 2512 } | 2518 } |
| 2513 len += l; | 2519 len += l; |
| 2514 NEXTL(l); | 2520 NEXTL(l); |
| 2515 c = CUR_CHAR(l); | 2521 c = CUR_CHAR(l); |
| | 2522 if (ctxt->input->base != base) { |
| | 2523 /* |
| | 2524 * We changed encoding from an unknown encoding |
| | 2525 * Input buffer changed location, so we better start again |
| | 2526 */ |
| | 2527 return(htmlParseNameComplex(ctxt)); |
| | 2528 } |
| 2516 } | 2529 } |
| | 2530 |
| | 2531 if (ctxt->input->base > ctxt->input->cur - len) |
| | 2532 return(NULL); |
| | 2533 |
| 2517 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len)); | 2534 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len)); |
| 2518 } | 2535 } |
| 2519 | 2536 |
| 2520 | 2537 |
| 2521 /** | 2538 /** |
| 2522 * htmlParseHTMLAttribute: | 2539 * htmlParseHTMLAttribute: |
| 2523 * @ctxt: an HTML parser context | 2540 * @ctxt: an HTML parser context |
| 2524 * @stop: a char stop value | 2541 * @stop: a char stop value |
| 2525 * | 2542 * |
| 2526 * parse an HTML attribute value till the stop (quote), if | 2543 * parse an HTML attribute value till the stop (quote), if |
| (...skipping 231 matching lines...) | (...skipping 231 matching lines...) |
| 2758 * | 2775 * |
| 2759 * parse an HTML Literal | 2776 * parse an HTML Literal |
| 2760 * | 2777 * |
| 2761 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") | 2778 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") |
| 2762 * | 2779 * |
| 2763 * Returns the SystemLiteral parsed or NULL | 2780 * Returns the SystemLiteral parsed or NULL |
| 2764 */ | 2781 */ |
| 2765 | 2782 |
| 2766 static xmlChar * | 2783 static xmlChar * |
| 2767 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { | 2784 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { |
| 2768 const xmlChar *q; | 2785 size_t len = 0, startPosition = 0; |
| 2769 xmlChar *ret = NULL; | 2786 xmlChar *ret = NULL; |
| 2770 | 2787 |
| 2771 if (CUR == '"') { | 2788 if (CUR == '"') { |
| 2772 NEXT; | 2789 NEXT; |
| 2773 » q = CUR_PTR; | 2790 |
| 2774 » while ((IS_CHAR_CH(CUR)) && (CUR != '"')) | 2791 if (CUR_PTR < BASE_PTR) |
| | 2792 return(ret); |
| | 2793 startPosition = CUR_PTR - BASE_PTR; |
| | 2794 |
| | 2795 » while ((IS_CHAR_CH(CUR)) && (CUR != '"')) { |
| 2775 NEXT; | 2796 NEXT; |
| | 2797 len++; |
| | 2798 } |
| 2776 if (!IS_CHAR_CH(CUR)) { | 2799 if (!IS_CHAR_CH(CUR)) { |
| 2777 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, | 2800 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, |
| 2778 "Unfinished SystemLiteral\n", NULL, NULL); | 2801 "Unfinished SystemLiteral\n", NULL, NULL); |
| 2779 } else { | 2802 } else { |
| 2780 » ret = xmlStrndup(q, CUR_PTR - q); | 2803 » ret = xmlStrndup((BASE_PTR+startPosition), len); |
| 2781 NEXT; | 2804 NEXT; |
| 2782 } | 2805 } |
| 2783 } else if (CUR == '\'') { | 2806 } else if (CUR == '\'') { |
| 2784 NEXT; | 2807 NEXT; |
| 2785 » q = CUR_PTR; | 2808 |
| 2786 » while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) | 2809 if (CUR_PTR < BASE_PTR) |
| | 2810 return(ret); |
| | 2811 startPosition = CUR_PTR - BASE_PTR; |
| | 2812 |
| | 2813 » while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) { |
| 2787 NEXT; | 2814 NEXT; |
| | 2815 len++; |
| | 2816 } |
| 2788 if (!IS_CHAR_CH(CUR)) { | 2817 if (!IS_CHAR_CH(CUR)) { |
| 2789 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, | 2818 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, |
| 2790 "Unfinished SystemLiteral\n", NULL, NULL); | 2819 "Unfinished SystemLiteral\n", NULL, NULL); |
| 2791 } else { | 2820 } else { |
| 2792 » ret = xmlStrndup(q, CUR_PTR - q); | 2821 » ret = xmlStrndup((BASE_PTR+startPosition), len); |
| 2793 NEXT; | 2822 NEXT; |
| 2794 } | 2823 } |
| 2795 } else { | 2824 } else { |
| 2796 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, | 2825 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, |
| 2797 " or ' expected\n", NULL, NULL); | 2826 " or ' expected\n", NULL, NULL); |
| 2798 } | 2827 } |
| 2799 | 2828 |
| 2800 return(ret); | 2829 return(ret); |
| 2801 } | 2830 } |
| 2802 | 2831 |
| 2803 /** | 2832 /** |
| 2804 * htmlParsePubidLiteral: | 2833 * htmlParsePubidLiteral: |
| 2805 * @ctxt: an HTML parser context | 2834 * @ctxt: an HTML parser context |
| 2806 * | 2835 * |
| 2807 * parse an HTML public literal | 2836 * parse an HTML public literal |
| 2808 * | 2837 * |
| 2809 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" | 2838 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" |
| 2810 * | 2839 * |
| 2811 * Returns the PubidLiteral parsed or NULL. | 2840 * Returns the PubidLiteral parsed or NULL. |
| 2812 */ | 2841 */ |
| 2813 | 2842 |
| 2814 static xmlChar * | 2843 static xmlChar * |
| 2815 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { | 2844 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { |
| 2816 const xmlChar *q; | 2845 size_t len = 0, startPosition = 0; |
| 2817 xmlChar *ret = NULL; | 2846 xmlChar *ret = NULL; |
| 2818 /* | 2847 /* |
| 2819 * Name ::= (Letter | '_') (NameChar)* | 2848 * Name ::= (Letter | '_') (NameChar)* |
| 2820 */ | 2849 */ |
| 2821 if (CUR == '"') { | 2850 if (CUR == '"') { |
| 2822 NEXT; | 2851 NEXT; |
| 2823 » q = CUR_PTR; | 2852 |
| 2824 » while (IS_PUBIDCHAR_CH(CUR)) NEXT; | 2853 if (CUR_PTR < BASE_PTR) |
| | 2854 return(ret); |
| | 2855 startPosition = CUR_PTR - BASE_PTR; |
| | 2856 |
| | 2857 while (IS_PUBIDCHAR_CH(CUR)) { |
| | 2858 len++; |
| | 2859 NEXT; |
| | 2860 } |
| | 2861 |
| 2825 if (CUR != '"') { | 2862 if (CUR != '"') { |
| 2826 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, | 2863 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, |
| 2827 "Unfinished PubidLiteral\n", NULL, NULL); | 2864 "Unfinished PubidLiteral\n", NULL, NULL); |
| 2828 } else { | 2865 } else { |
| 2829 » ret = xmlStrndup(q, CUR_PTR - q); | 2866 » ret = xmlStrndup((BASE_PTR + startPosition), len); |
| 2830 NEXT; | 2867 NEXT; |
| 2831 } | 2868 } |
| 2832 } else if (CUR == '\'') { | 2869 } else if (CUR == '\'') { |
| 2833 NEXT; | 2870 NEXT; |
| 2834 » q = CUR_PTR; | 2871 |
| 2835 » while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')) | 2872 if (CUR_PTR < BASE_PTR) |
| 2836 » NEXT; | 2873 return(ret); |
| | 2874 startPosition = CUR_PTR - BASE_PTR; |
| | 2875 |
| | 2876 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){ |
| | 2877 len++; |
| | 2878 NEXT; |
| | 2879 } |
| | 2880 |
| 2837 if (CUR != '\'') { | 2881 if (CUR != '\'') { |
| 2838 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, | 2882 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, |
| 2839 "Unfinished PubidLiteral\n", NULL, NULL); | 2883 "Unfinished PubidLiteral\n", NULL, NULL); |
| 2840 } else { | 2884 } else { |
| 2841 » ret = xmlStrndup(q, CUR_PTR - q); | 2885 » ret = xmlStrndup((BASE_PTR + startPosition), len); |
| 2842 NEXT; | 2886 NEXT; |
| 2843 } | 2887 } |
| 2844 } else { | 2888 } else { |
| 2845 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, | 2889 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, |
| 2846 "PubidLiteral \" or ' expected\n", NULL, NULL); | 2890 "PubidLiteral \" or ' expected\n", NULL, NULL); |
| 2847 } | 2891 } |
| 2848 | 2892 |
| 2849 return(ret); | 2893 return(ret); |
| 2850 } | 2894 } |
| 2851 | 2895 |
| (...skipping 4260 matching lines...) | (...skipping 4260 matching lines...) |
| 7112 xmlFreeParserInputBuffer(input); | 7156 xmlFreeParserInputBuffer(input); |
| 7113 return (NULL); | 7157 return (NULL); |
| 7114 } | 7158 } |
| 7115 inputPush(ctxt, stream); | 7159 inputPush(ctxt, stream); |
| 7116 return (htmlDoRead(ctxt, URL, encoding, options, 1)); | 7160 return (htmlDoRead(ctxt, URL, encoding, options, 1)); |
| 7117 } | 7161 } |
| 7118 | 7162 |
| 7119 #define bottom_HTMLparser | 7163 #define bottom_HTMLparser |
| 7120 #include "elfgcchack.h" | 7164 #include "elfgcchack.h" |
| 7121 #endif /* LIBXML_HTML_ENABLED */ | 7165 #endif /* LIBXML_HTML_ENABLED */ |
| OLD | NEW |