| Index: third_party/libxml/HTMLparser.c
|
| diff --git a/third_party/libxml/HTMLparser.c b/third_party/libxml/HTMLparser.c
|
| index 92503a1a4d92a3568c3a1b01b53f7568cda1f031..42dc776ae587e03d38416dafe072e563d4040683 100644
|
| --- a/third_party/libxml/HTMLparser.c
|
| +++ b/third_party/libxml/HTMLparser.c
|
| @@ -59,7 +59,7 @@ static void htmlParseComment(htmlParserCtxtPtr ctxt);
|
|
|
| /************************************************************************
|
| * *
|
| - * Some factorized error routines *
|
| + * Some factorized error routines *
|
| * *
|
| ************************************************************************/
|
|
|
| @@ -147,7 +147,7 @@ htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
|
|
|
| /************************************************************************
|
| * *
|
| - * Parser stacks related functions and macros *
|
| + * Parser stacks related functions and macros *
|
| * *
|
| ************************************************************************/
|
|
|
| @@ -163,6 +163,10 @@ htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
|
| static int
|
| htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
|
| {
|
| + if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
|
| + ctxt->html = 3;
|
| + if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
|
| + ctxt->html = 10;
|
| if (ctxt->nameNr >= ctxt->nameMax) {
|
| ctxt->nameMax *= 2;
|
| ctxt->nameTab = (const xmlChar * *)
|
| @@ -205,6 +209,59 @@ htmlnamePop(htmlParserCtxtPtr ctxt)
|
| return (ret);
|
| }
|
|
|
| +/**
|
| + * htmlNodeInfoPush:
|
| + * @ctxt: an HTML parser context
|
| + * @value: the node info
|
| + *
|
| + * Pushes a copy of @value on top of the node info stack and points ctxt->nodeInfo at the stored copy
|
| + *
|
| + * Returns 0 in case of error, the index in the stack otherwise (the first successful push also returns index 0)
|
| + */
|
| +static int
|
| +htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
|
| +{
|
| + if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) { /* stack is full: grow it */
|
| + if (ctxt->nodeInfoMax == 0) /* nothing allocated yet: seed the capacity */
|
| + ctxt->nodeInfoMax = 5;
|
| + ctxt->nodeInfoMax *= 2; /* double the capacity (5 -> 10 on first use) */
|
| + ctxt->nodeInfoTab = (htmlParserNodeInfo *)
|
| + xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
|
| + ctxt->nodeInfoMax *
|
| + sizeof(ctxt->nodeInfoTab[0]));
|
| + if (ctxt->nodeInfoTab == NULL) { /* NOTE(review): previous table pointer was overwritten above and nodeInfoMax already grown - confirm this cannot leak */
|
| + htmlErrMemory(ctxt, NULL);
|
| + return (0);
|
| + }
|
| + }
|
| + ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value; /* struct copy into the table */
|
| + ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; /* current node info is the new top */
|
| + return (ctxt->nodeInfoNr++); /* post-increment: yields the index just filled */
|
| +}
|
| +
|
| +/**
|
| + * htmlNodeInfoPop:
|
| + * @ctxt: an HTML parser context
|
| + *
|
| + * Pops the top entry from the node info stack; ctxt->nodeInfo is moved to the new top, or set to NULL when the stack becomes empty
|
| + *
|
| + * Returns NULL if the stack is empty, otherwise a pointer to the popped entry (still stored inside ctxt->nodeInfoTab)
|
| + */
|
| +static htmlParserNodeInfo *
|
| +htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
|
| +{
|
| + if (ctxt->nodeInfoNr <= 0) /* nothing to pop */
|
| + return (NULL);
|
| + ctxt->nodeInfoNr--;
|
| + if (ctxt->nodeInfoNr < 0) /* cannot trigger: nodeInfoNr was > 0 before the decrement */
|
| + return (NULL);
|
| + if (ctxt->nodeInfoNr > 0)
|
| + ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1]; /* entry below the popped one */
|
| + else
|
| + ctxt->nodeInfo = NULL;
|
| + return &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; /* slot that was just popped */
|
| +}
|
| +
|
| /*
|
| * Macros for accessing the content. Those should be used only by the parser,
|
| * and not exported.
|
| @@ -263,8 +320,6 @@ htmlnamePop(htmlParserCtxtPtr ctxt)
|
| #define NEXT xmlNextChar(ctxt)
|
|
|
| #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
|
| -#define NXT(val) ctxt->input->cur[(val)]
|
| -#define CUR_PTR ctxt->input->cur
|
|
|
|
|
| #define NEXTL(l) do { \
|
| @@ -273,7 +328,7 @@ htmlnamePop(htmlParserCtxtPtr ctxt)
|
| } else ctxt->input->col++; \
|
| ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
|
| } while (0)
|
| -
|
| +
|
| /************
|
| \
|
| if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
|
| @@ -288,6 +343,58 @@ htmlnamePop(htmlParserCtxtPtr ctxt)
|
| else i += xmlCopyChar(l,&b[i],v)
|
|
|
| /**
|
| + * htmlFindEncoding:
|
| + * @ctxt: the HTML parser context
|
| + *
|
| + * Try to find an encoding in the current data available in the input
|
| + * buffer; this is needed to try to switch to the proper encoding when
|
| + * one faces a character error.
|
| + * This is a heuristic: since it operates outside of parsing it could
|
| + * try to use a meta which had been commented out, that's the reason it
|
| + * should only be used in case of error, not as a default.
|
| + *
|
| + * Returns an encoding string or NULL if not found, the string needs to
|
| + * be freed
|
| + */
|
| +static xmlChar *
|
| +htmlFindEncoding(xmlParserCtxtPtr ctxt) {
|
| + const xmlChar *start, *cur, *end;
|
| +
|
| + if ((ctxt == NULL) || (ctxt->input == NULL) ||
|
| + (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
|
| + (ctxt->input->buf->encoder != NULL))
|
| + return(NULL); /* no usable input, or an encoding/encoder is already set */
|
| + if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
|
| + return(NULL);
|
| +
|
| + start = ctxt->input->cur;
|
| + end = ctxt->input->end;
|
| + /* we also expect the input buffer to be zero terminated */
|
| + if (*end != 0)
|
| + return(NULL);
|
| +
|
| + cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV"); /* each search below resumes from the previous match */
|
| + if (cur == NULL)
|
| + return(NULL);
|
| + cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
|
| + if (cur == NULL)
|
| + return(NULL);
|
| + cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
|
| + if (cur == NULL)
|
| + return(NULL);
|
| + cur += 8; /* skip over "CHARSET=" (8 characters) */
|
| + start = cur; /* start of the candidate encoding name */
|
| + while (((*cur >= 'A') && (*cur <= 'Z')) ||
|
| + ((*cur >= 'a') && (*cur <= 'z')) ||
|
| + ((*cur >= '0') && (*cur <= '9')) ||
|
| + (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
|
| + cur++; /* consume ASCII letters, digits and '-', '_', ':', '/' */
|
| + if (cur == start) /* nothing usable followed "CHARSET=" */
|
| + return(NULL);
|
| + return(xmlStrndup(start, cur - start));
|
| +}
|
| +
|
| +/**
|
| * htmlCurrentChar:
|
| * @ctxt: the HTML parser context
|
| * @len: pointer to the length of the char read
|
| @@ -309,7 +416,7 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
|
| if (ctxt->token != 0) {
|
| *len = 0;
|
| return(ctxt->token);
|
| - }
|
| + }
|
| if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
|
| /*
|
| * We are supposed to handle UTF8, check it's valid
|
| @@ -318,7 +425,7 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
|
| * UCS-4 range (hex.) UTF-8 octet sequence (binary)
|
| * 0000 0000-0000 007F 0xxxxxxx
|
| * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
|
| - * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
|
| + * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
|
| *
|
| * Check for the 0x110000 limit too
|
| */
|
| @@ -328,19 +435,25 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
|
|
|
| c = *cur;
|
| if (c & 0x80) {
|
| - if (cur[1] == 0)
|
| + if (cur[1] == 0) {
|
| xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
|
| + cur = ctxt->input->cur;
|
| + }
|
| if ((cur[1] & 0xc0) != 0x80)
|
| goto encoding_error;
|
| if ((c & 0xe0) == 0xe0) {
|
|
|
| - if (cur[2] == 0)
|
| + if (cur[2] == 0) {
|
| xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
|
| + cur = ctxt->input->cur;
|
| + }
|
| if ((cur[2] & 0xc0) != 0x80)
|
| goto encoding_error;
|
| if ((c & 0xf0) == 0xf0) {
|
| - if (cur[3] == 0)
|
| + if (cur[3] == 0) {
|
| xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
|
| + cur = ctxt->input->cur;
|
| + }
|
| if (((c & 0xf8) != 0xf0) ||
|
| ((cur[3] & 0xc0) != 0x80))
|
| goto encoding_error;
|
| @@ -366,9 +479,16 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
|
| if (!IS_CHAR(val)) {
|
| htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
|
| "Char 0x%X out of allowed range\n", val);
|
| - }
|
| + }
|
| return(val);
|
| } else {
|
| + if ((*ctxt->input->cur == 0) &&
|
| + (ctxt->input->cur < ctxt->input->end)) {
|
| + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
|
| + "Char 0x%X out of allowed range\n", 0);
|
| + *len = 1;
|
| + return(' ');
|
| + }
|
| /* 1-byte code */
|
| *len = 1;
|
| return((int) *ctxt->input->cur);
|
| @@ -386,8 +506,28 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
|
| /*
|
| * Humm this is bad, do an automatic flow conversion
|
| */
|
| - xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
|
| - ctxt->charset = XML_CHAR_ENCODING_UTF8;
|
| + {
|
| + xmlChar * guess;
|
| + xmlCharEncodingHandlerPtr handler;
|
| +
|
| + guess = htmlFindEncoding(ctxt);
|
| + if (guess == NULL) {
|
| + xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
|
| + } else {
|
| + if (ctxt->input->encoding != NULL)
|
| + xmlFree((xmlChar *) ctxt->input->encoding);
|
| + ctxt->input->encoding = guess;
|
| + handler = xmlFindCharEncodingHandler((const char *) guess);
|
| + if (handler != NULL) {
|
| + xmlSwitchToEncoding(ctxt, handler);
|
| + } else {
|
| + htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
|
| + "Unsupported encoding %s", guess, NULL);
|
| + }
|
| + }
|
| + ctxt->charset = XML_CHAR_ENCODING_UTF8;
|
| + }
|
| +
|
| return(xmlCurrentChar(ctxt, len));
|
|
|
| encoding_error:
|
| @@ -413,7 +553,7 @@ encoding_error:
|
| BAD_CAST buffer, NULL);
|
| }
|
|
|
| - ctxt->charset = XML_CHAR_ENCODING_8859_1;
|
| + ctxt->charset = XML_CHAR_ENCODING_8859_1;
|
| *len = 1;
|
| return((int) *ctxt->input->cur);
|
| }
|
| @@ -453,7 +593,7 @@ htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
|
|
|
| /************************************************************************
|
| * *
|
| - * The list of HTML elements and their properties *
|
| + * The list of HTML elements and their properties *
|
| * *
|
| ************************************************************************/
|
|
|
| @@ -478,9 +618,9 @@ htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
|
| #define NB_PHRASE 10
|
| #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
|
| #define NB_SPECIAL 16
|
| -#define INLINE PCDATA FONTSTYLE PHRASE SPECIAL FORMCTRL
|
| +#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
|
| #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
|
| -#define BLOCK HEADING, LIST "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
|
| +#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
|
| #define NB_BLOCK NB_HEADING + NB_LIST + 14
|
| #define FORMCTRL "input", "select", "textarea", "label", "button"
|
| #define NB_FORMCTRL 5
|
| @@ -606,7 +746,7 @@ static const char* const language_attr[] = { "language", NULL } ;
|
| static const char* const select_content[] = { "optgroup", "option", NULL } ;
|
| static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
|
| static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
|
| -static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
|
| +static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
|
| static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
|
| static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
|
| static const char* const tr_elt[] = { "tr", NULL } ;
|
| @@ -938,7 +1078,7 @@ static const char * const htmlStartClose[] = {
|
| "listing", "xmp", NULL,
|
| "ol", "p", "head", "ul", NULL,
|
| "menu", "p", "head", "ul", NULL,
|
| -"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
|
| +"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
|
| "div", "p", "head", NULL,
|
| "noscript", "p", "head", NULL,
|
| "center", "font", "b", "i", "p", "head", NULL,
|
| @@ -949,7 +1089,7 @@ static const char * const htmlStartClose[] = {
|
| "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
|
| "listing", "xmp", "a", NULL,
|
| "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
|
| -"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
|
| +"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
|
| "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
|
| "thead", "caption", "col", "colgroup", NULL,
|
| "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
|
| @@ -1008,7 +1148,7 @@ static const char *const htmlScriptAttributes[] = {
|
| * elements the parser can decide how to handle extra endtags.
|
| * Endtags are only allowed to close elements with lower or equal
|
| * priority.
|
| - */
|
| + */
|
|
|
| typedef struct {
|
| const char *name;
|
| @@ -1035,7 +1175,7 @@ static int htmlStartCloseIndexinitialized = 0;
|
|
|
| /************************************************************************
|
| * *
|
| - * functions to handle HTML specific data *
|
| + * functions to handle HTML specific data *
|
| * *
|
| ************************************************************************/
|
|
|
| @@ -1085,7 +1225,7 @@ htmlTagLookup(const xmlChar *tag) {
|
| /**
|
| * htmlGetEndPriority:
|
| * @name: The name of the element to look up the priority for.
|
| - *
|
| + *
|
| * Return value: The "endtag" priority.
|
| **/
|
| static int
|
| @@ -1164,7 +1304,7 @@ htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
|
| * A missplaced endtag can only close elements with lower
|
| * or equal priority, so if we find an element with higher
|
| * priority before we find an element with
|
| - * matching name, we just ignore this endtag
|
| + * matching name, we just ignore this endtag
|
| */
|
| if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
|
| return;
|
| @@ -1215,7 +1355,7 @@ htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
|
| * called when a new tag has been detected and generates the
|
| * appropriates closes if possible/needed.
|
| * If newtag is NULL this mean we are at the end of the resource
|
| - * and we should check
|
| + * and we should check
|
| */
|
| static void
|
| htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
|
| @@ -1303,6 +1443,10 @@ htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
|
| */
|
| static void
|
| htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
|
| + int i;
|
| +
|
| + if (ctxt->options & HTML_PARSE_NOIMPLIED)
|
| + return;
|
| if (!htmlOmittedDefaultValue)
|
| return;
|
| if (xmlStrEqual(newtag, BAD_CAST"html"))
|
| @@ -1314,24 +1458,31 @@ htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
|
| }
|
| if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
|
| return;
|
| - if ((ctxt->nameNr <= 1) &&
|
| + if ((ctxt->nameNr <= 1) &&
|
| ((xmlStrEqual(newtag, BAD_CAST"script")) ||
|
| (xmlStrEqual(newtag, BAD_CAST"style")) ||
|
| (xmlStrEqual(newtag, BAD_CAST"meta")) ||
|
| (xmlStrEqual(newtag, BAD_CAST"link")) ||
|
| (xmlStrEqual(newtag, BAD_CAST"title")) ||
|
| (xmlStrEqual(newtag, BAD_CAST"base")))) {
|
| - /*
|
| - * dropped OBJECT ... i you put it first BODY will be
|
| - * assumed !
|
| - */
|
| - htmlnamePush(ctxt, BAD_CAST"head");
|
| - if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
|
| - ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
|
| + if (ctxt->html >= 3) {
|
| + /* we already saw or generated a <head> before */
|
| + return;
|
| + }
|
| + /*
|
| + * dropped OBJECT ... if you put it first BODY will be
|
| + * assumed !
|
| + */
|
| + htmlnamePush(ctxt, BAD_CAST"head");
|
| + if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
|
| + ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
|
| } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
|
| (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
|
| (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
|
| - int i;
|
| + if (ctxt->html >= 10) {
|
| + /* we already saw or generated a <body> before */
|
| + return;
|
| + }
|
| for (i = 0;i < ctxt->nameNr;i++) {
|
| if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
|
| return;
|
| @@ -1340,7 +1491,7 @@ htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
|
| return;
|
| }
|
| }
|
| -
|
| +
|
| htmlnamePush(ctxt, BAD_CAST"body");
|
| if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
|
| ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
|
| @@ -1402,12 +1553,12 @@ htmlIsScriptAttribute(const xmlChar *name) {
|
| unsigned int i;
|
|
|
| if (name == NULL)
|
| - return(0);
|
| + return(0);
|
| /*
|
| * all script attributes start with 'on'
|
| */
|
| if ((name[0] != 'o') || (name[1] != 'n'))
|
| - return(0);
|
| + return(0);
|
| for (i = 0;
|
| i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
|
| i++) {
|
| @@ -1419,7 +1570,7 @@ htmlIsScriptAttribute(const xmlChar *name) {
|
|
|
| /************************************************************************
|
| * *
|
| - * The list of HTML predefined entities *
|
| + * The list of HTML predefined entities *
|
| * *
|
| ************************************************************************/
|
|
|
| @@ -1833,7 +1984,7 @@ UTF8ToHtml(unsigned char* out, int *outlen,
|
|
|
| if (inend - in < trailing) {
|
| break;
|
| - }
|
| + }
|
|
|
| for ( ; trailing; trailing--) {
|
| if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
|
| @@ -2023,7 +2174,7 @@ htmlNewInputStream(htmlParserCtxtPtr ctxt) {
|
| * *
|
| ************************************************************************/
|
| /*
|
| - * all tags allowing pc data from the html 4.01 loose dtd
|
| + * all tags allowing pc data from the html 4.01 loose dtd
|
| * NOTE: it might be more apropriate to integrate this information
|
| * into the html40ElementTable array but I don't want to risk any
|
| * binary incomptibility
|
| @@ -2083,7 +2234,7 @@ static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
|
| if (lastChild == NULL) {
|
| if ((ctxt->node->type != XML_ELEMENT_NODE) &&
|
| (ctxt->node->content != NULL)) return(0);
|
| - /* keep ws in constructs like ...<b> </b>...
|
| + /* keep ws in constructs like ...<b> </b>...
|
| for all tags "b" allowing PCDATA */
|
| for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
|
| if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
|
| @@ -2093,7 +2244,7 @@ static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
|
| } else if (xmlNodeIsText(lastChild)) {
|
| return(0);
|
| } else {
|
| - /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
|
| + /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
|
| for all tags "p" allowing PCDATA */
|
| for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
|
| if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
|
| @@ -2133,7 +2284,7 @@ htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
|
| cur->intSubset = NULL;
|
| cur->doc = cur;
|
| cur->name = NULL;
|
| - cur->children = NULL;
|
| + cur->children = NULL;
|
| cur->extSubset = NULL;
|
| cur->oldNs = NULL;
|
| cur->encoding = NULL;
|
| @@ -2143,6 +2294,7 @@ htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
|
| cur->refs = NULL;
|
| cur->_private = NULL;
|
| cur->charset = XML_CHAR_ENCODING_UTF8;
|
| + cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
|
| if ((ExternalID != NULL) ||
|
| (URI != NULL))
|
| xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
|
| @@ -2200,18 +2352,19 @@ htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
|
| xmlChar loc[HTML_PARSER_BUFFER_SIZE];
|
|
|
| if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
|
| - (CUR != ':')) return(NULL);
|
| + (CUR != ':') && (CUR != '.')) return(NULL);
|
|
|
| while ((i < HTML_PARSER_BUFFER_SIZE) &&
|
| ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
|
| - (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
|
| + (CUR == ':') || (CUR == '-') || (CUR == '_') ||
|
| + (CUR == '.'))) {
|
| if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
|
| else loc[i] = CUR;
|
| i++;
|
| -
|
| +
|
| NEXT;
|
| }
|
| -
|
| +
|
| return(xmlDictLookup(ctxt->dict, loc, i));
|
| }
|
|
|
| @@ -2234,7 +2387,7 @@ htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
|
|
|
| if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
|
| (NXT(1) != ':')) return(NULL);
|
| -
|
| +
|
| while ((i < HTML_PARSER_BUFFER_SIZE) &&
|
| ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
|
| (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
|
| @@ -2242,7 +2395,7 @@ htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
|
| else loc[i] = NXT(1+i);
|
| i++;
|
| }
|
| -
|
| +
|
| return(xmlDictLookup(ctxt->dict, loc, i));
|
| }
|
|
|
| @@ -2310,7 +2463,7 @@ htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
|
| while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
|
| ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
|
| (c == '.') || (c == '-') ||
|
| - (c == '_') || (c == ':') ||
|
| + (c == '_') || (c == ':') ||
|
| (IS_COMBINING(c)) ||
|
| (IS_EXTENDER(c)))) {
|
| if (count++ > 100) {
|
| @@ -2329,7 +2482,7 @@ htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
|
| * htmlParseHTMLAttribute:
|
| * @ctxt: an HTML parser context
|
| * @stop: a char stop value
|
| - *
|
| + *
|
| * parse an HTML attribute value till the stop (quote), if
|
| * stop is 0 then it stops at the first space
|
| *
|
| @@ -2374,13 +2527,13 @@ htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
|
| { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
|
| else if (c < 0x10000)
|
| { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
|
| - else
|
| + else
|
| { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
|
| -
|
| +
|
| for ( ; bits >= 0; bits-= 6) {
|
| *out++ = ((c >> bits) & 0x3F) | 0x80;
|
| }
|
| -
|
| +
|
| if (out - buffer > buffer_size - 100) {
|
| int indx = out - buffer;
|
|
|
| @@ -2426,9 +2579,9 @@ htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
|
| { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
|
| else if (c < 0x10000)
|
| { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
|
| - else
|
| + else
|
| { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
|
| -
|
| +
|
| for ( ; bits >= 0; bits-= 6) {
|
| *out++ = ((c >> bits) & 0x3F) | 0x80;
|
| }
|
| @@ -2451,16 +2604,16 @@ htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
|
| { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
|
| else if (c < 0x10000)
|
| { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
|
| - else
|
| + else
|
| { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
|
| -
|
| +
|
| for ( ; bits >= 0; bits-= 6) {
|
| *out++ = ((c >> bits) & 0x3F) | 0x80;
|
| }
|
| NEXT;
|
| }
|
| }
|
| - *out++ = 0;
|
| + *out = 0;
|
| return(buffer);
|
| }
|
|
|
| @@ -2521,7 +2674,7 @@ htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
|
| * parse a value for an attribute
|
| * Note: the parser won't do substitution of entities here, this
|
| * will be handled later in xmlStringGetNodeList, unless it was
|
| - * asked for ctxt->replaceEntities != 0
|
| + * asked for ctxt->replaceEntities != 0
|
| *
|
| * Returns the AttValue parsed or NULL.
|
| */
|
| @@ -2562,7 +2715,7 @@ htmlParseAttValue(htmlParserCtxtPtr ctxt) {
|
| /**
|
| * htmlParseSystemLiteral:
|
| * @ctxt: an HTML parser context
|
| - *
|
| + *
|
| * parse an HTML Literal
|
| *
|
| * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
|
| @@ -2603,7 +2756,7 @@ htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
|
| htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
|
| " or ' expected\n", NULL, NULL);
|
| }
|
| -
|
| +
|
| return(ret);
|
| }
|
|
|
| @@ -2652,7 +2805,7 @@ htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
|
| htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
|
| "PubidLiteral \" or ' expected\n", NULL, NULL);
|
| }
|
| -
|
| +
|
| return(ret);
|
| }
|
|
|
| @@ -2699,8 +2852,8 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
|
| * CDATA.
|
| */
|
| if (ctxt->recovery) {
|
| - if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
|
| - xmlStrlen(ctxt->name)) == 0)
|
| + if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
|
| + xmlStrlen(ctxt->name)) == 0)
|
| {
|
| break; /* while */
|
| } else {
|
| @@ -2710,7 +2863,7 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
|
| }
|
| } else {
|
| if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
|
| - ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
|
| + ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
|
| {
|
| break; /* while */
|
| }
|
| @@ -2767,11 +2920,12 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
|
| xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
|
| int nbchar = 0;
|
| int cur, l;
|
| + int chunk = 0;
|
|
|
| SHRINK;
|
| cur = CUR_CHAR(l);
|
| while (((cur != '<') || (ctxt->token == '<')) &&
|
| - ((cur != '&') || (ctxt->token == '&')) &&
|
| + ((cur != '&') || (ctxt->token == '&')) &&
|
| (cur != 0)) {
|
| if (!(IS_CHAR(cur))) {
|
| htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
|
| @@ -2797,6 +2951,12 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
|
| nbchar = 0;
|
| }
|
| NEXTL(l);
|
| + chunk++;
|
| + if (chunk > HTML_PARSER_BUFFER_SIZE) {
|
| + chunk = 0;
|
| + SHRINK;
|
| + GROW;
|
| + }
|
| cur = CUR_CHAR(l);
|
| if (cur == 0) {
|
| SHRINK;
|
| @@ -2991,7 +3151,7 @@ htmlParsePI(htmlParserCtxtPtr ctxt) {
|
| }
|
| xmlFree(buf);
|
| } else {
|
| - htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
|
| + htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
|
| "PI is not started correctly", NULL, NULL);
|
| }
|
| ctxt->instate = state;
|
| @@ -3107,7 +3267,7 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) {
|
| ((NXT(2) == 'x') || NXT(2) == 'X')) {
|
| SKIP(3);
|
| while (CUR != ';') {
|
| - if ((CUR >= '0') && (CUR <= '9'))
|
| + if ((CUR >= '0') && (CUR <= '9'))
|
| val = val * 16 + (CUR - '0');
|
| else if ((CUR >= 'a') && (CUR <= 'f'))
|
| val = val * 16 + (CUR - 'a') + 10;
|
| @@ -3126,7 +3286,7 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) {
|
| } else if ((CUR == '&') && (NXT(1) == '#')) {
|
| SKIP(2);
|
| while (CUR != ';') {
|
| - if ((CUR >= '0') && (CUR <= '9'))
|
| + if ((CUR >= '0') && (CUR <= '9'))
|
| val = val * 10 + (CUR - '0');
|
| else {
|
| htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
|
| @@ -3162,7 +3322,7 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) {
|
| *
|
| * parse a DOCTYPE declaration
|
| *
|
| - * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
|
| + * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
|
| * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
|
| */
|
|
|
| @@ -3266,11 +3426,6 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
|
| NEXT;
|
| SKIP_BLANKS;
|
| val = htmlParseAttValue(ctxt);
|
| - } else if (htmlIsBooleanAttr(name)) {
|
| - /*
|
| - * assume a minimized attribute
|
| - */
|
| - val = xmlStrdup(name);
|
| }
|
|
|
| *value = val;
|
| @@ -3294,7 +3449,7 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
|
| if ((ctxt == NULL) || (attvalue == NULL))
|
| return;
|
|
|
| - /* do not change encoding */
|
| + /* do not change encoding */
|
| if (ctxt->input->encoding != NULL)
|
| return;
|
|
|
| @@ -3321,7 +3476,7 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
|
| * registered set of known encodings
|
| */
|
| if (enc != XML_CHAR_ENCODING_ERROR) {
|
| - if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
|
| + if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
|
| (enc == XML_CHAR_ENCODING_UTF16BE) ||
|
| (enc == XML_CHAR_ENCODING_UCS4LE) ||
|
| (enc == XML_CHAR_ENCODING_UCS4BE)) &&
|
| @@ -3369,6 +3524,8 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
|
| }
|
| ctxt->input->base =
|
| ctxt->input->cur = ctxt->input->buf->buffer->content;
|
| + ctxt->input->end =
|
| + &ctxt->input->base[ctxt->input->buf->buffer->use];
|
| }
|
| }
|
| }
|
| @@ -3409,7 +3566,7 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
|
| /**
|
| * htmlParseStartTag:
|
| * @ctxt: an HTML parser context
|
| - *
|
| + *
|
| * parse a start of tag either for rule element or
|
| * EmptyElement. In both case we don't parse the tag closing chars.
|
| *
|
| @@ -3438,6 +3595,8 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
|
| int i;
|
| int discardtag = 0;
|
|
|
| + if (ctxt->instate == XML_PARSER_EOF)
|
| + return(-1);
|
| if ((ctxt == NULL) || (ctxt->input == NULL)) {
|
| htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
|
| "htmlParseStartTag: context error\n", NULL, NULL);
|
| @@ -3456,7 +3615,8 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
|
| "htmlParseStartTag: invalid element name\n",
|
| NULL, NULL);
|
| /* Dump the bogus tag like browsers do */
|
| - while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
|
| + while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
|
| + (ctxt->instate != XML_PARSER_EOF))
|
| NEXT;
|
| return -1;
|
| }
|
| @@ -3482,13 +3642,15 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
|
| "htmlParseStartTag: misplaced <html> tag\n",
|
| name, NULL);
|
| discardtag = 1;
|
| + ctxt->depth++;
|
| }
|
| - if ((ctxt->nameNr != 1) &&
|
| + if ((ctxt->nameNr != 1) &&
|
| (xmlStrEqual(name, BAD_CAST"head"))) {
|
| htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
|
| "htmlParseStartTag: misplaced <head> tag\n",
|
| name, NULL);
|
| discardtag = 1;
|
| + ctxt->depth++;
|
| }
|
| if (xmlStrEqual(name, BAD_CAST"body")) {
|
| int indx;
|
| @@ -3498,6 +3660,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
|
| "htmlParseStartTag: misplaced <body> tag\n",
|
| name, NULL);
|
| discardtag = 1;
|
| + ctxt->depth++;
|
| }
|
| }
|
| }
|
| @@ -3509,7 +3672,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
|
| */
|
| SKIP_BLANKS;
|
| while ((IS_CHAR_CH(CUR)) &&
|
| - (CUR != '>') &&
|
| + (CUR != '>') &&
|
| ((CUR != '/') || (NXT(1) != '>'))) {
|
| long cons = ctxt->nbChars;
|
|
|
| @@ -3648,7 +3811,6 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
|
| name = htmlParseHTMLName(ctxt);
|
| if (name == NULL)
|
| return (0);
|
| -
|
| /*
|
| * We should definitely be at the ending "S? '>'" part
|
| */
|
| @@ -3669,6 +3831,18 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
|
| NEXT;
|
|
|
| /*
|
| + * if we ignored misplaced tags in htmlParseStartTag don't pop them
|
| + * out now.
|
| + */
|
| + if ((ctxt->depth > 0) &&
|
| + (xmlStrEqual(name, BAD_CAST "html") ||
|
| + xmlStrEqual(name, BAD_CAST "body") ||
|
| + xmlStrEqual(name, BAD_CAST "head"))) {
|
| + ctxt->depth--;
|
| + return (0);
|
| + }
|
| +
|
| + /*
|
| * If the name read is not one of the element in the parsing stack
|
| * then return, it's just an error.
|
| */
|
| @@ -3722,7 +3896,7 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
|
| /**
|
| * htmlParseReference:
|
| * @ctxt: an HTML parser context
|
| - *
|
| + *
|
| * parse and handle entity references in content,
|
| * this will end-up in a call to character() since this is either a
|
| * CharRef, or a predefined entity.
|
| @@ -3746,7 +3920,7 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
|
| else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
|
| else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
|
| else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
|
| -
|
| +
|
| for ( ; bits >= 0; bits-= 6) {
|
| out[i++]= ((c >> bits) & 0x3F) | 0x80;
|
| }
|
| @@ -3781,9 +3955,9 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
|
| { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
|
| else if (c < 0x10000)
|
| { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
|
| - else
|
| + else
|
| { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
|
| -
|
| +
|
| for ( ; bits >= 0; bits-= 6) {
|
| out[i++]= ((c >> bits) & 0x3F) | 0x80;
|
| }
|
| @@ -3801,6 +3975,7 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
|
| * @ctxt: an HTML parser context
|
| *
|
| * Parse a content: comment, sub-element, reference or text.
|
| + * Kept for compatibility with old code
|
| */
|
|
|
| static void
|
| @@ -3815,6 +3990,10 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
|
| long cons = ctxt->nbChars;
|
|
|
| GROW;
|
| +
|
| + if (ctxt->instate == XML_PARSER_EOF)
|
| + break;
|
| +
|
| /*
|
| * Our tag or one of it's parent or children is ending.
|
| */
|
| @@ -3837,7 +4016,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
|
| "htmlParseStartTag: invalid element name\n",
|
| NULL, NULL);
|
| /* Dump the bogus tag like browsers do */
|
| - while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
|
| + while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
|
| NEXT;
|
|
|
| if (currentNode != NULL)
|
| @@ -3850,7 +4029,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
|
| htmlAutoClose(ctxt, name);
|
| continue;
|
| }
|
| - }
|
| + }
|
| }
|
|
|
| /*
|
| @@ -3909,7 +4088,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
|
|
|
| /*
|
| * Fourth case : a reference. If if has not been resolved,
|
| - * parsing returns it's Name, create the node
|
| + * parsing returns it's Name, create the node
|
| */
|
| else if (CUR == '&') {
|
| htmlParseReference(ctxt);
|
| @@ -3945,23 +4124,11 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
|
| }
|
|
|
| /**
|
| - * htmlParseContent:
|
| - * @ctxt: an HTML parser context
|
| - *
|
| - * Parse a content: comment, sub-element, reference or text.
|
| - */
|
| -
|
| -void
|
| -__htmlParseContent(void *ctxt) {
|
| - if (ctxt != NULL)
|
| - htmlParseContent((htmlParserCtxtPtr) ctxt);
|
| -}
|
| -
|
| -/**
|
| * htmlParseElement:
|
| * @ctxt: an HTML parser context
|
| *
|
| * parse an HTML element, this is highly recursive
|
| + * this is kept for compatibility with previous code versions
|
| *
|
| * [39] element ::= EmptyElemTag | STag content ETag
|
| *
|
| @@ -3983,6 +4150,10 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
|
| "htmlParseElement: context error\n", NULL, NULL);
|
| return;
|
| }
|
| +
|
| + if (ctxt->instate == XML_PARSER_EOF)
|
| + return;
|
| +
|
| /* Capture start position */
|
| if (ctxt->record_info) {
|
| node_info.begin_pos = ctxt->input->consumed +
|
| @@ -4027,10 +4198,10 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
|
| /*
|
| * end of parsing of this node.
|
| */
|
| - if (xmlStrEqual(name, ctxt->name)) {
|
| + if (xmlStrEqual(name, ctxt->name)) {
|
| nodePop(ctxt);
|
| htmlnamePop(ctxt);
|
| - }
|
| + }
|
|
|
| /*
|
| * Capture end position and add node
|
| @@ -4064,8 +4235,8 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
|
| oldptr = ctxt->input->cur;
|
| htmlParseContent(ctxt);
|
| if (oldptr==ctxt->input->cur) break;
|
| - if (ctxt->nameNr < depth) break;
|
| - }
|
| + if (ctxt->nameNr < depth) break;
|
| + }
|
|
|
| /*
|
| * Capture end position and add node
|
| @@ -4085,10 +4256,305 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
|
| xmlFree(currentNode);
|
| }
|
|
|
| +static void
|
| +htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
|
| + /*
|
| + * Capture end position and add node
|
| + */
|
| + if ( ctxt->node != NULL && ctxt->record_info ) {
|
| + ctxt->nodeInfo->end_pos = ctxt->input->consumed +
|
| + (CUR_PTR - ctxt->input->base);
|
| + ctxt->nodeInfo->end_line = ctxt->input->line;
|
| + ctxt->nodeInfo->node = ctxt->node;
|
| + xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
|
| + htmlNodeInfoPop(ctxt);
|
| + }
|
| + if (!IS_CHAR_CH(CUR)) {
|
| + htmlAutoCloseOnEnd(ctxt);
|
| + }
|
| +}
|
| +
|
| +/**
|
| + * htmlParseElementInternal:
|
| + * @ctxt: an HTML parser context
|
| + *
|
| + * parse an HTML element, new version, non recursive
|
| + *
|
| + * [39] element ::= EmptyElemTag | STag content ETag
|
| + *
|
| + * [41] Attribute ::= Name Eq AttValue
|
| + */
|
| +
|
| +static void
|
| +htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
|
| + const xmlChar *name;
|
| + const htmlElemDesc * info;
|
| + htmlParserNodeInfo node_info;
|
| + int failed;
|
| +
|
| + if ((ctxt == NULL) || (ctxt->input == NULL)) {
|
| + htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
|
| + "htmlParseElementInternal: context error\n", NULL, NULL);
|
| + return;
|
| + }
|
| +
|
| + if (ctxt->instate == XML_PARSER_EOF)
|
| + return;
|
| +
|
| + /* Capture start position */
|
| + if (ctxt->record_info) {
|
| + node_info.begin_pos = ctxt->input->consumed +
|
| + (CUR_PTR - ctxt->input->base);
|
| + node_info.begin_line = ctxt->input->line;
|
| + }
|
| +
|
| + failed = htmlParseStartTag(ctxt);
|
| + name = ctxt->name;
|
| + if ((failed == -1) || (name == NULL)) {
|
| + if (CUR == '>')
|
| + NEXT;
|
| + return;
|
| + }
|
| +
|
| + /*
|
| + * Lookup the info for that element.
|
| + */
|
| + info = htmlTagLookup(name);
|
| + if (info == NULL) {
|
| + htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
|
| + "Tag %s invalid\n", name, NULL);
|
| + }
|
| +
|
| + /*
|
| + * Check for an Empty Element labeled the XML/SGML way
|
| + */
|
| + if ((CUR == '/') && (NXT(1) == '>')) {
|
| + SKIP(2);
|
| + if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
| + ctxt->sax->endElement(ctxt->userData, name);
|
| + htmlnamePop(ctxt);
|
| + return;
|
| + }
|
| +
|
| + if (CUR == '>') {
|
| + NEXT;
|
| + } else {
|
| + htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
|
| + "Couldn't find end of Start Tag %s\n", name, NULL);
|
| +
|
| + /*
|
| + * end of parsing of this node.
|
| + */
|
| + if (xmlStrEqual(name, ctxt->name)) {
|
| + nodePop(ctxt);
|
| + htmlnamePop(ctxt);
|
| + }
|
| +
|
| + if (ctxt->record_info)
|
| + htmlNodeInfoPush(ctxt, &node_info);
|
| + htmlParserFinishElementParsing(ctxt);
|
| + return;
|
| + }
|
| +
|
| + /*
|
| + * Check for an Empty Element from DTD definition
|
| + */
|
| + if ((info != NULL) && (info->empty)) {
|
| + if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
| + ctxt->sax->endElement(ctxt->userData, name);
|
| + htmlnamePop(ctxt);
|
| + return;
|
| + }
|
| +
|
| + if (ctxt->record_info)
|
| + htmlNodeInfoPush(ctxt, &node_info);
|
| +}
|
| +
|
| +/**
|
| + * htmlParseContentInternal:
|
| + * @ctxt: an HTML parser context
|
| + *
|
| + * Parse a content: comment, sub-element, reference or text.
|
| + * New version for non recursive htmlParseElementInternal
|
| + */
|
| +
|
| +static void
|
| +htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
|
| + xmlChar *currentNode;
|
| + int depth;
|
| + const xmlChar *name;
|
| +
|
| + currentNode = xmlStrdup(ctxt->name);
|
| + depth = ctxt->nameNr;
|
| + while (1) {
|
| + long cons = ctxt->nbChars;
|
| +
|
| + GROW;
|
| +
|
| + if (ctxt->instate == XML_PARSER_EOF)
|
| + break;
|
| +
|
| + /*
|
| + * Our tag or one of its parents or children is ending.
|
| + */
|
| + if ((CUR == '<') && (NXT(1) == '/')) {
|
| + if (htmlParseEndTag(ctxt) &&
|
| + ((currentNode != NULL) || (ctxt->nameNr == 0))) {
|
| + if (currentNode != NULL)
|
| + xmlFree(currentNode);
|
| +
|
| + currentNode = xmlStrdup(ctxt->name);
|
| + depth = ctxt->nameNr;
|
| + }
|
| + continue; /* while */
|
| + }
|
| +
|
| + else if ((CUR == '<') &&
|
| + ((IS_ASCII_LETTER(NXT(1))) ||
|
| + (NXT(1) == '_') || (NXT(1) == ':'))) {
|
| + name = htmlParseHTMLName_nonInvasive(ctxt);
|
| + if (name == NULL) {
|
| + htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
|
| + "htmlParseStartTag: invalid element name\n",
|
| + NULL, NULL);
|
| + /* Dump the bogus tag like browsers do */
|
| + while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
|
| + NEXT;
|
| +
|
| + htmlParserFinishElementParsing(ctxt);
|
| + if (currentNode != NULL)
|
| + xmlFree(currentNode);
|
| +
|
| + currentNode = xmlStrdup(ctxt->name);
|
| + depth = ctxt->nameNr;
|
| + continue;
|
| + }
|
| +
|
| + if (ctxt->name != NULL) {
|
| + if (htmlCheckAutoClose(name, ctxt->name) == 1) {
|
| + htmlAutoClose(ctxt, name);
|
| + continue;
|
| + }
|
| + }
|
| + }
|
| +
|
| + /*
|
| + * Has this node been popped out during parsing of
|
| + * the next element
|
| + */
|
| + if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
|
| + (!xmlStrEqual(currentNode, ctxt->name)))
|
| + {
|
| + htmlParserFinishElementParsing(ctxt);
|
| + if (currentNode != NULL) xmlFree(currentNode);
|
| +
|
| + currentNode = xmlStrdup(ctxt->name);
|
| + depth = ctxt->nameNr;
|
| + continue;
|
| + }
|
| +
|
| + if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
|
| + (xmlStrEqual(currentNode, BAD_CAST"style")))) {
|
| + /*
|
| + * Handle SCRIPT/STYLE separately
|
| + */
|
| + htmlParseScript(ctxt);
|
| + } else {
|
| + /*
|
| + * Sometimes DOCTYPE arrives in the middle of the document
|
| + */
|
| + if ((CUR == '<') && (NXT(1) == '!') &&
|
| + (UPP(2) == 'D') && (UPP(3) == 'O') &&
|
| + (UPP(4) == 'C') && (UPP(5) == 'T') &&
|
| + (UPP(6) == 'Y') && (UPP(7) == 'P') &&
|
| + (UPP(8) == 'E')) {
|
| + htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
|
| + "Misplaced DOCTYPE declaration\n",
|
| + BAD_CAST "DOCTYPE" , NULL);
|
| + htmlParseDocTypeDecl(ctxt);
|
| + }
|
| +
|
| + /*
|
| + * First case : a comment
|
| + */
|
| + if ((CUR == '<') && (NXT(1) == '!') &&
|
| + (NXT(2) == '-') && (NXT(3) == '-')) {
|
| + htmlParseComment(ctxt);
|
| + }
|
| +
|
| + /*
|
| + * Second case : a Processing Instruction.
|
| + */
|
| + else if ((CUR == '<') && (NXT(1) == '?')) {
|
| + htmlParsePI(ctxt);
|
| + }
|
| +
|
| + /*
|
| + * Third case : a sub-element.
|
| + */
|
| + else if (CUR == '<') {
|
| + htmlParseElementInternal(ctxt);
|
| + if (currentNode != NULL) xmlFree(currentNode);
|
| +
|
| + currentNode = xmlStrdup(ctxt->name);
|
| + depth = ctxt->nameNr;
|
| + }
|
| +
|
| + /*
|
| + * Fourth case : a reference. If it has not been resolved,
|
| + * parsing returns its Name, create the node
|
| + */
|
| + else if (CUR == '&') {
|
| + htmlParseReference(ctxt);
|
| + }
|
| +
|
| + /*
|
| + * Fifth case : end of the resource
|
| + */
|
| + else if (CUR == 0) {
|
| + htmlAutoCloseOnEnd(ctxt);
|
| + break;
|
| + }
|
| +
|
| + /*
|
| + * Last case, text. Note that References are handled directly.
|
| + */
|
| + else {
|
| + htmlParseCharData(ctxt);
|
| + }
|
| +
|
| + if (cons == ctxt->nbChars) {
|
| + if (ctxt->node != NULL) {
|
| + htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
|
| + "detected an error in element content\n",
|
| + NULL, NULL);
|
| + }
|
| + break;
|
| + }
|
| + }
|
| + GROW;
|
| + }
|
| + if (currentNode != NULL) xmlFree(currentNode);
|
| +}
|
| +
|
| +/**
|
| + * __htmlParseContent:
|
| + * @ctxt: an HTML parser context
|
| + *
|
| + * Parse a content: comment, sub-element, reference or text.
|
| + * This is the entry point when called from parser.c
|
| + */
|
| +
|
| +void
|
| +__htmlParseContent(void *ctxt) {
|
| + if (ctxt != NULL)
|
| + htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
|
| +}
|
| +
|
| /**
|
| * htmlParseDocument:
|
| * @ctxt: an HTML parser context
|
| - *
|
| + *
|
| * parse an HTML document (and build a tree if using the standard SAX
|
| * interface).
|
| *
|
| @@ -4098,6 +4564,8 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
|
|
|
| int
|
| htmlParseDocument(htmlParserCtxtPtr ctxt) {
|
| + xmlChar start[4];
|
| + xmlCharEncoding enc;
|
| xmlDtdPtr dtd;
|
|
|
| xmlInitParser();
|
| @@ -4110,6 +4578,7 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
|
| return(XML_ERR_INTERNAL_ERROR);
|
| }
|
| ctxt->html = 1;
|
| + ctxt->linenumbers = 1;
|
| GROW;
|
| /*
|
| * SAX: beginning of the document processing.
|
| @@ -4117,12 +4586,29 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
|
| if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
|
| ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
|
|
|
| + if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
|
| + ((ctxt->input->end - ctxt->input->cur) >= 4)) {
|
| + /*
|
| + * Get the first 4 bytes and detect the charset;
|
| + * if enc != XML_CHAR_ENCODING_NONE,
|
| + * plug in the encoding conversion routines.
|
| + */
|
| + start[0] = RAW;
|
| + start[1] = NXT(1);
|
| + start[2] = NXT(2);
|
| + start[3] = NXT(3);
|
| + enc = xmlDetectCharEncoding(&start[0], 4);
|
| + if (enc != XML_CHAR_ENCODING_NONE) {
|
| + xmlSwitchEncoding(ctxt, enc);
|
| + }
|
| + }
|
| +
|
| /*
|
| * Wipe out everything which is before the first '<'
|
| */
|
| SKIP_BLANKS;
|
| if (CUR == 0) {
|
| - htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
|
| + htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
|
| "Document is empty\n", NULL, NULL);
|
| }
|
|
|
| @@ -4136,10 +4622,10 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
|
| while (((CUR == '<') && (NXT(1) == '!') &&
|
| (NXT(2) == '-') && (NXT(3) == '-')) ||
|
| ((CUR == '<') && (NXT(1) == '?'))) {
|
| - htmlParseComment(ctxt);
|
| - htmlParsePI(ctxt);
|
| + htmlParseComment(ctxt);
|
| + htmlParsePI(ctxt);
|
| SKIP_BLANKS;
|
| - }
|
| + }
|
|
|
|
|
| /*
|
| @@ -4161,15 +4647,15 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
|
| while (((CUR == '<') && (NXT(1) == '!') &&
|
| (NXT(2) == '-') && (NXT(3) == '-')) ||
|
| ((CUR == '<') && (NXT(1) == '?'))) {
|
| - htmlParseComment(ctxt);
|
| - htmlParsePI(ctxt);
|
| + htmlParseComment(ctxt);
|
| + htmlParsePI(ctxt);
|
| SKIP_BLANKS;
|
| - }
|
| + }
|
|
|
| /*
|
| * Time to start parsing the tree itself
|
| */
|
| - htmlParseContent(ctxt);
|
| + htmlParseContentInternal(ctxt);
|
|
|
| /*
|
| * autoclose
|
| @@ -4187,8 +4673,8 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
|
| if (ctxt->myDoc != NULL) {
|
| dtd = xmlGetIntSubset(ctxt->myDoc);
|
| if (dtd == NULL)
|
| - ctxt->myDoc->intSubset =
|
| - xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
|
| + ctxt->myDoc->intSubset =
|
| + xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
|
| BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
|
| BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
|
| }
|
| @@ -4234,7 +4720,7 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
|
| memset(sax, 0, sizeof(htmlSAXHandler));
|
|
|
| /* Allocate the Input stack */
|
| - ctxt->inputTab = (htmlParserInputPtr *)
|
| + ctxt->inputTab = (htmlParserInputPtr *)
|
| xmlMalloc(5 * sizeof(htmlParserInputPtr));
|
| if (ctxt->inputTab == NULL) {
|
| htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
|
| @@ -4272,7 +4758,7 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
|
| if (ctxt->nameTab == NULL) {
|
| htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
|
| ctxt->nameNr = 0;
|
| - ctxt->nameMax = 10;
|
| + ctxt->nameMax = 0;
|
| ctxt->name = NULL;
|
| ctxt->nodeNr = 0;
|
| ctxt->nodeMax = 0;
|
| @@ -4286,6 +4772,10 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
|
| ctxt->nameMax = 10;
|
| ctxt->name = NULL;
|
|
|
| + ctxt->nodeInfoTab = NULL;
|
| + ctxt->nodeInfoNr = 0;
|
| + ctxt->nodeInfoMax = 0;
|
| +
|
| if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
|
| else {
|
| ctxt->sax = sax;
|
| @@ -4432,7 +4922,7 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
|
| xmlSwitchEncoding(ctxt, enc);
|
| if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
|
| htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
|
| - "Unsupported encoding %s\n",
|
| + "Unsupported encoding %s\n",
|
| (const xmlChar *) encoding, NULL);
|
| }
|
| } else {
|
| @@ -4455,7 +4945,7 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
|
| #ifdef LIBXML_PUSH_ENABLED
|
| /************************************************************************
|
| * *
|
| - * Progressive parsing interfaces *
|
| + * Progressive parsing interfaces *
|
| * *
|
| ************************************************************************/
|
|
|
| @@ -4479,85 +4969,190 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
|
| */
|
| static int
|
| htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
|
| - xmlChar next, xmlChar third, int iscomment) {
|
| + xmlChar next, xmlChar third, int iscomment,
|
| + int ignoreattrval)
|
| +{
|
| int base, len;
|
| htmlParserInputPtr in;
|
| const xmlChar *buf;
|
| int incomment = 0;
|
| + int invalue = 0;
|
| + char valdellim = 0x0;
|
|
|
| in = ctxt->input;
|
| - if (in == NULL) return(-1);
|
| + if (in == NULL)
|
| + return (-1);
|
| +
|
| base = in->cur - in->base;
|
| - if (base < 0) return(-1);
|
| + if (base < 0)
|
| + return (-1);
|
| +
|
| if (ctxt->checkIndex > base)
|
| base = ctxt->checkIndex;
|
| +
|
| if (in->buf == NULL) {
|
| - buf = in->base;
|
| - len = in->length;
|
| + buf = in->base;
|
| + len = in->length;
|
| } else {
|
| - buf = in->buf->buffer->content;
|
| - len = in->buf->buffer->use;
|
| + buf = in->buf->buffer->content;
|
| + len = in->buf->buffer->use;
|
| }
|
| +
|
| /* take into account the sequence length */
|
| - if (third) len -= 2;
|
| - else if (next) len --;
|
| - for (;base < len;base++) {
|
| - if (!incomment && (base + 4 < len) && !iscomment) {
|
| - if ((buf[base] == '<') && (buf[base + 1] == '!') &&
|
| - (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
|
| - incomment = 1;
|
| - /* do not increment past <! - some people use <!--> */
|
| - base += 2;
|
| - }
|
| - }
|
| - if (incomment) {
|
| - if (base + 3 > len)
|
| - return(-1);
|
| - if ((buf[base] == '-') && (buf[base + 1] == '-') &&
|
| - (buf[base + 2] == '>')) {
|
| - incomment = 0;
|
| - base += 2;
|
| - }
|
| - continue;
|
| - }
|
| + if (third)
|
| + len -= 2;
|
| + else if (next)
|
| + len--;
|
| + for (; base < len; base++) {
|
| + if ((!incomment) && (base + 4 < len) && (!iscomment)) {
|
| + if ((buf[base] == '<') && (buf[base + 1] == '!') &&
|
| + (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
|
| + incomment = 1;
|
| + /* do not increment past <! - some people use <!--> */
|
| + base += 2;
|
| + }
|
| + }
|
| + if (ignoreattrval) {
|
| + if (buf[base] == '"' || buf[base] == '\'') {
|
| + if (invalue) {
|
| + if (buf[base] == valdellim) {
|
| + invalue = 0;
|
| + continue;
|
| + }
|
| + } else {
|
| + valdellim = buf[base];
|
| + invalue = 1;
|
| + continue;
|
| + }
|
| + } else if (invalue) {
|
| + continue;
|
| + }
|
| + }
|
| + if (incomment) {
|
| + if (base + 3 > len)
|
| + return (-1);
|
| + if ((buf[base] == '-') && (buf[base + 1] == '-') &&
|
| + (buf[base + 2] == '>')) {
|
| + incomment = 0;
|
| + base += 2;
|
| + }
|
| + continue;
|
| + }
|
| if (buf[base] == first) {
|
| - if (third != 0) {
|
| - if ((buf[base + 1] != next) ||
|
| - (buf[base + 2] != third)) continue;
|
| - } else if (next != 0) {
|
| - if (buf[base + 1] != next) continue;
|
| - }
|
| - ctxt->checkIndex = 0;
|
| + if (third != 0) {
|
| + if ((buf[base + 1] != next) || (buf[base + 2] != third))
|
| + continue;
|
| + } else if (next != 0) {
|
| + if (buf[base + 1] != next)
|
| + continue;
|
| + }
|
| + ctxt->checkIndex = 0;
|
| #ifdef DEBUG_PUSH
|
| - if (next == 0)
|
| - xmlGenericError(xmlGenericErrorContext,
|
| - "HPP: lookup '%c' found at %d\n",
|
| - first, base);
|
| - else if (third == 0)
|
| - xmlGenericError(xmlGenericErrorContext,
|
| - "HPP: lookup '%c%c' found at %d\n",
|
| - first, next, base);
|
| - else
|
| - xmlGenericError(xmlGenericErrorContext,
|
| - "HPP: lookup '%c%c%c' found at %d\n",
|
| - first, next, third, base);
|
| + if (next == 0)
|
| + xmlGenericError(xmlGenericErrorContext,
|
| + "HPP: lookup '%c' found at %d\n",
|
| + first, base);
|
| + else if (third == 0)
|
| + xmlGenericError(xmlGenericErrorContext,
|
| + "HPP: lookup '%c%c' found at %d\n",
|
| + first, next, base);
|
| + else
|
| + xmlGenericError(xmlGenericErrorContext,
|
| + "HPP: lookup '%c%c%c' found at %d\n",
|
| + first, next, third, base);
|
| #endif
|
| - return(base - (in->cur - in->base));
|
| - }
|
| + return (base - (in->cur - in->base));
|
| + }
|
| }
|
| - ctxt->checkIndex = base;
|
| + if ((!incomment) && (!invalue))
|
| + ctxt->checkIndex = base;
|
| #ifdef DEBUG_PUSH
|
| if (next == 0)
|
| - xmlGenericError(xmlGenericErrorContext,
|
| - "HPP: lookup '%c' failed\n", first);
|
| + xmlGenericError(xmlGenericErrorContext,
|
| + "HPP: lookup '%c' failed\n", first);
|
| else if (third == 0)
|
| - xmlGenericError(xmlGenericErrorContext,
|
| - "HPP: lookup '%c%c' failed\n", first, next);
|
| - else
|
| - xmlGenericError(xmlGenericErrorContext,
|
| - "HPP: lookup '%c%c%c' failed\n", first, next, third);
|
| + xmlGenericError(xmlGenericErrorContext,
|
| + "HPP: lookup '%c%c' failed\n", first, next);
|
| + else
|
| + xmlGenericError(xmlGenericErrorContext,
|
| + "HPP: lookup '%c%c%c' failed\n", first, next,
|
| + third);
|
| #endif
|
| - return(-1);
|
| + return (-1);
|
| +}
|
| +
|
| +/**
|
| + * htmlParseLookupChars:
|
| + * @ctxt: an HTML parser context
|
| + * @stop: Array of chars, which stop the lookup.
|
| + * @stopLen: Length of stop-Array
|
| + *
|
| + * Try to find if any char of the stop-Array is available in the input
|
| + * stream.
|
| + * This function has a side effect of (possibly) incrementing ctxt->checkIndex
|
| + * to avoid rescanning sequences of bytes; it DOES change the state of the
|
| + * parser, do not use liberally.
|
| + *
|
| + * Returns the offset from the current parsing point of the first stop
|
| + * char found, -1 otherwise.
|
| + */
|
| +static int
|
| +htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
|
| + int stopLen)
|
| +{
|
| + int base, len;
|
| + htmlParserInputPtr in;
|
| + const xmlChar *buf;
|
| + int incomment = 0;
|
| + int i;
|
| +
|
| + in = ctxt->input;
|
| + if (in == NULL)
|
| + return (-1);
|
| +
|
| + base = in->cur - in->base;
|
| + if (base < 0)
|
| + return (-1);
|
| +
|
| + if (ctxt->checkIndex > base)
|
| + base = ctxt->checkIndex;
|
| +
|
| + if (in->buf == NULL) {
|
| + buf = in->base;
|
| + len = in->length;
|
| + } else {
|
| + buf = in->buf->buffer->content;
|
| + len = in->buf->buffer->use;
|
| + }
|
| +
|
| + for (; base < len; base++) {
|
| + if (!incomment && (base + 4 < len)) {
|
| + if ((buf[base] == '<') && (buf[base + 1] == '!') &&
|
| + (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
|
| + incomment = 1;
|
| + /* do not increment past <! - some people use <!--> */
|
| + base += 2;
|
| + }
|
| + }
|
| + if (incomment) {
|
| + if (base + 3 > len)
|
| + return (-1);
|
| + if ((buf[base] == '-') && (buf[base + 1] == '-') &&
|
| + (buf[base + 2] == '>')) {
|
| + incomment = 0;
|
| + base += 2;
|
| + }
|
| + continue;
|
| + }
|
| + for (i = 0; i < stopLen; ++i) {
|
| + if (buf[base] == stop[i]) {
|
| + ctxt->checkIndex = 0;
|
| + return (base - (in->cur - in->base));
|
| + }
|
| + }
|
| + }
|
| + ctxt->checkIndex = base;
|
| + return (-1);
|
| }
|
|
|
| /**
|
| @@ -4639,7 +5234,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
| avail = in->buf->buffer->use - (in->cur - in->base);
|
| if ((avail == 0) && (terminate)) {
|
| htmlAutoCloseOnEnd(ctxt);
|
| - if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
|
| + if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
|
| /*
|
| * SAX: end of the document processing.
|
| */
|
| @@ -4689,7 +5284,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
| (UPP(6) == 'Y') && (UPP(7) == 'P') &&
|
| (UPP(8) == 'E')) {
|
| if ((!terminate) &&
|
| - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
|
| + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
|
| goto done;
|
| #ifdef DEBUG_PUSH
|
| xmlGenericError(xmlGenericErrorContext,
|
| @@ -4722,7 +5317,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
| if ((cur == '<') && (next == '!') &&
|
| (in->cur[2] == '-') && (in->cur[3] == '-')) {
|
| if ((!terminate) &&
|
| - (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
|
| + (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
|
| goto done;
|
| #ifdef DEBUG_PUSH
|
| xmlGenericError(xmlGenericErrorContext,
|
| @@ -4732,7 +5327,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
| ctxt->instate = XML_PARSER_MISC;
|
| } else if ((cur == '<') && (next == '?')) {
|
| if ((!terminate) &&
|
| - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
|
| + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
|
| goto done;
|
| #ifdef DEBUG_PUSH
|
| xmlGenericError(xmlGenericErrorContext,
|
| @@ -4746,7 +5341,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
| (UPP(6) == 'Y') && (UPP(7) == 'P') &&
|
| (UPP(8) == 'E')) {
|
| if ((!terminate) &&
|
| - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
|
| + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
|
| goto done;
|
| #ifdef DEBUG_PUSH
|
| xmlGenericError(xmlGenericErrorContext,
|
| @@ -4775,14 +5370,14 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
| avail = in->length - (in->cur - in->base);
|
| else
|
| avail = in->buf->buffer->use - (in->cur - in->base);
|
| - if (avail < 2)
|
| + if (avail < 2)
|
| goto done;
|
| cur = in->cur[0];
|
| next = in->cur[1];
|
| if ((cur == '<') && (next == '!') &&
|
| (in->cur[2] == '-') && (in->cur[3] == '-')) {
|
| if ((!terminate) &&
|
| - (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
|
| + (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
|
| goto done;
|
| #ifdef DEBUG_PUSH
|
| xmlGenericError(xmlGenericErrorContext,
|
| @@ -4792,7 +5387,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
| ctxt->instate = XML_PARSER_PROLOG;
|
| } else if ((cur == '<') && (next == '?')) {
|
| if ((!terminate) &&
|
| - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
|
| + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
|
| goto done;
|
| #ifdef DEBUG_PUSH
|
| xmlGenericError(xmlGenericErrorContext,
|
| @@ -4829,7 +5424,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
| if ((cur == '<') && (next == '!') &&
|
| (in->cur[2] == '-') && (in->cur[3] == '-')) {
|
| if ((!terminate) &&
|
| - (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
|
| + (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
|
| goto done;
|
| #ifdef DEBUG_PUSH
|
| xmlGenericError(xmlGenericErrorContext,
|
| @@ -4839,7 +5434,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
| ctxt->instate = XML_PARSER_EPILOG;
|
| } else if ((cur == '<') && (next == '?')) {
|
| if ((!terminate) &&
|
| - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
|
| + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
|
| goto done;
|
| #ifdef DEBUG_PUSH
|
| xmlGenericError(xmlGenericErrorContext,
|
| @@ -4889,7 +5484,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
| break;
|
| }
|
| if ((!terminate) &&
|
| - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
|
| + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
|
| goto done;
|
|
|
| failed = htmlParseStartTag(ctxt);
|
| @@ -4936,10 +5531,10 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
| /*
|
| * end of parsing of this node.
|
| */
|
| - if (xmlStrEqual(name, ctxt->name)) {
|
| + if (xmlStrEqual(name, ctxt->name)) {
|
| nodePop(ctxt);
|
| htmlnamePop(ctxt);
|
| - }
|
| + }
|
|
|
| ctxt->instate = XML_PARSER_CONTENT;
|
| #ifdef DEBUG_PUSH
|
| @@ -5014,7 +5609,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
| int idx;
|
| xmlChar val;
|
|
|
| - idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
|
| + idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
|
| if (idx < 0)
|
| goto done;
|
| val = in->cur[idx + 2];
|
| @@ -5041,7 +5636,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
| (UPP(6) == 'Y') && (UPP(7) == 'P') &&
|
| (UPP(8) == 'E')) {
|
| if ((!terminate) &&
|
| - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
|
| + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
|
| goto done;
|
| htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
|
| "Misplaced DOCTYPE declaration\n",
|
| @@ -5051,7 +5646,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
| (in->cur[2] == '-') && (in->cur[3] == '-')) {
|
| if ((!terminate) &&
|
| (htmlParseLookupSequence(
|
| - ctxt, '-', '-', '>', 1) < 0))
|
| + ctxt, '-', '-', '>', 1, 1) < 0))
|
| goto done;
|
| #ifdef DEBUG_PUSH
|
| xmlGenericError(xmlGenericErrorContext,
|
| @@ -5061,7 +5656,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
| ctxt->instate = XML_PARSER_CONTENT;
|
| } else if ((cur == '<') && (next == '?')) {
|
| if ((!terminate) &&
|
| - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
|
| + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
|
| goto done;
|
| #ifdef DEBUG_PUSH
|
| xmlGenericError(xmlGenericErrorContext,
|
| @@ -5089,7 +5684,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
| break;
|
| } else if (cur == '&') {
|
| if ((!terminate) &&
|
| - (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
|
| + (htmlParseLookupChars(ctxt,
|
| + BAD_CAST "; >/", 4) < 0))
|
| goto done;
|
| #ifdef DEBUG_PUSH
|
| xmlGenericError(xmlGenericErrorContext,
|
| @@ -5105,7 +5701,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
| * data detection.
|
| */
|
| if ((!terminate) &&
|
| - (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
|
| + (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
|
| goto done;
|
| ctxt->checkIndex = 0;
|
| #ifdef DEBUG_PUSH
|
| @@ -5131,7 +5727,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
| if (avail < 2)
|
| goto done;
|
| if ((!terminate) &&
|
| - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
|
| + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
|
| goto done;
|
| htmlParseEndTag(ctxt);
|
| if (ctxt->nameNr == 0) {
|
| @@ -5258,10 +5854,10 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|
|
| }
|
| }
|
| -done:
|
| +done:
|
| if ((avail == 0) && (terminate)) {
|
| htmlAutoCloseOnEnd(ctxt);
|
| - if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
|
| + if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
|
| /*
|
| * SAX: end of the document processing.
|
| */
|
| @@ -5276,8 +5872,8 @@ done:
|
| xmlDtdPtr dtd;
|
| dtd = xmlGetIntSubset(ctxt->myDoc);
|
| if (dtd == NULL)
|
| - ctxt->myDoc->intSubset =
|
| - xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
|
| + ctxt->myDoc->intSubset =
|
| + xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
|
| BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
|
| BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
|
| }
|
| @@ -5311,8 +5907,8 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
|
| int base = ctxt->input->base - ctxt->input->buf->buffer->content;
|
| int cur = ctxt->input->cur - ctxt->input->base;
|
| int res;
|
| -
|
| - res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
|
| +
|
| + res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
|
| if (res < 0) {
|
| ctxt->errNo = XML_PARSER_EOF;
|
| ctxt->disableSAX = 1;
|
| @@ -5336,7 +5932,7 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
|
| if ((in->encoder != NULL) && (in->buffer != NULL) &&
|
| (in->raw != NULL)) {
|
| int nbchars;
|
| -
|
| +
|
| nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
|
| if (nbchars < 0) {
|
| htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
|
| @@ -5353,14 +5949,14 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
|
| (ctxt->instate != XML_PARSER_MISC)) {
|
| ctxt->errNo = XML_ERR_DOCUMENT_END;
|
| ctxt->wellFormed = 0;
|
| - }
|
| + }
|
| if (ctxt->instate != XML_PARSER_EOF) {
|
| if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
|
| ctxt->sax->endDocument(ctxt->userData);
|
| }
|
| ctxt->instate = XML_PARSER_EOF;
|
| }
|
| - return((xmlParserErrors) ctxt->errNo);
|
| + return((xmlParserErrors) ctxt->errNo);
|
| }
|
|
|
| /************************************************************************
|
| @@ -5385,7 +5981,7 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
|
| * Returns the new parser context or NULL
|
| */
|
| htmlParserCtxtPtr
|
| -htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
|
| +htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
|
| const char *chunk, int size, const char *filename,
|
| xmlCharEncoding enc) {
|
| htmlParserCtxtPtr ctxt;
|
| @@ -5416,7 +6012,7 @@ htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
|
| memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
|
| if (user_data != NULL)
|
| ctxt->userData = user_data;
|
| - }
|
| + }
|
| if (filename == NULL) {
|
| ctxt->directory = NULL;
|
| } else {
|
| @@ -5438,17 +6034,17 @@ htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
|
| inputStream->buf = buf;
|
| inputStream->base = inputStream->buf->buffer->content;
|
| inputStream->cur = inputStream->buf->buffer->content;
|
| - inputStream->end =
|
| + inputStream->end =
|
| &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
|
|
|
| inputPush(ctxt, inputStream);
|
|
|
| if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
|
| - (ctxt->input->buf != NULL)) {
|
| + (ctxt->input->buf != NULL)) {
|
| int base = ctxt->input->base - ctxt->input->buf->buffer->content;
|
| int cur = ctxt->input->cur - ctxt->input->base;
|
|
|
| - xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
|
| + xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
|
|
|
| ctxt->input->base = ctxt->input->buf->buffer->content + base;
|
| ctxt->input->cur = ctxt->input->base + cur;
|
| @@ -5469,12 +6065,12 @@ htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
|
| * @cur: a pointer to an array of xmlChar
|
| * @encoding: a free form C string describing the HTML document encoding, or NULL
|
| * @sax: the SAX handler block
|
| - * @userData: if using SAX, this pointer will be provided on callbacks.
|
| + * @userData: if using SAX, this pointer will be provided on callbacks.
|
| *
|
| * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
|
| * to handle parse events. If sax is NULL, fallback to the default DOM
|
| * behavior and return a tree.
|
| - *
|
| + *
|
| * Returns the resulting document tree unless SAX is NULL or the document is
|
| * not well formed.
|
| */
|
| @@ -5491,7 +6087,7 @@ htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void
|
|
|
| ctxt = htmlCreateDocParserCtxt(cur, encoding);
|
| if (ctxt == NULL) return(NULL);
|
| - if (sax != NULL) {
|
| + if (sax != NULL) {
|
| if (ctxt->sax != NULL) xmlFree (ctxt->sax);
|
| ctxt->sax = sax;
|
| ctxt->userData = userData;
|
| @@ -5504,7 +6100,7 @@ htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void
|
| ctxt->userData = NULL;
|
| }
|
| htmlFreeParserCtxt(ctxt);
|
| -
|
| +
|
| return(ret);
|
| }
|
|
|
| @@ -5514,7 +6110,7 @@ htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void
|
| * @encoding: a free form C string describing the HTML document encoding, or NULL
|
| *
|
| * parse an HTML in-memory document and build a tree.
|
| - *
|
| + *
|
| * Returns the resulting document tree
|
| */
|
|
|
| @@ -5529,7 +6125,7 @@ htmlParseDoc(xmlChar *cur, const char *encoding) {
|
| * @filename: the filename
|
| * @encoding: a free form C string describing the HTML document encoding, or NULL
|
| *
|
| - * Create a parser context for a file content.
|
| + * Create a parser context for a file content.
|
| * Automatic support for ZLIB/Compress compressed document is provided
|
| * by default if found at compile-time.
|
| *
|
| @@ -5561,7 +6157,7 @@ htmlCreateFileParserCtxt(const char *filename, const char *encoding)
|
| xmlFreeParserCtxt(ctxt);
|
| return(NULL);
|
| }
|
| -
|
| +
|
| inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
|
| xmlFree(canonicFilename);
|
| if (inputStream == NULL) {
|
| @@ -5574,14 +6170,14 @@ htmlCreateFileParserCtxt(const char *filename, const char *encoding)
|
| /* set encoding */
|
| if (encoding) {
|
| content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
|
| - if (content) {
|
| + if (content) {
|
| strcpy ((char *)content, (char *)content_line);
|
| strcat ((char *)content, (char *)encoding);
|
| htmlCheckEncoding (ctxt, content);
|
| xmlFree (content);
|
| }
|
| }
|
| -
|
| +
|
| return(ctxt);
|
| }
|
|
|
| @@ -5590,7 +6186,7 @@ htmlCreateFileParserCtxt(const char *filename, const char *encoding)
|
| * @filename: the filename
|
| * @encoding: a free form C string describing the HTML document encoding, or NULL
|
| * @sax: the SAX handler block
|
| - * @userData: if using SAX, this pointer will be provided on callbacks.
|
| + * @userData: if using SAX, this pointer will be provided on callbacks.
|
| *
|
| * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
|
| * compressed document is provided by default if found at compile-time.
|
| @@ -5602,7 +6198,7 @@ htmlCreateFileParserCtxt(const char *filename, const char *encoding)
|
| */
|
|
|
| htmlDocPtr
|
| -htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
|
| +htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
|
| void *userData) {
|
| htmlDocPtr ret;
|
| htmlParserCtxtPtr ctxt;
|
| @@ -5626,7 +6222,7 @@ htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr s
|
| ctxt->userData = NULL;
|
| }
|
| htmlFreeParserCtxt(ctxt);
|
| -
|
| +
|
| return(ret);
|
| }
|
|
|
| @@ -5648,7 +6244,7 @@ htmlParseFile(const char *filename, const char *encoding) {
|
|
|
| /**
|
| * htmlHandleOmittedElem:
|
| - * @val: int 0 or 1
|
| + * @val: int 0 or 1
|
| *
|
| * Set and return the previous value for handling HTML omitted tags.
|
| *
|
| @@ -5788,7 +6384,7 @@ htmlNodeStatus(const htmlNodePtr node, int legacy) {
|
| * current scope
|
| */
|
| #define DICT_FREE(str) \
|
| - if ((str) && ((!dict) || \
|
| + if ((str) && ((!dict) || \
|
| (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \
|
| xmlFree((char *)(str));
|
|
|
| @@ -5803,7 +6399,7 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt)
|
| {
|
| xmlParserInputPtr input;
|
| xmlDictPtr dict;
|
| -
|
| +
|
| if (ctxt == NULL)
|
| return;
|
|
|
| @@ -5930,6 +6526,10 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
|
| ctxt->options |= HTML_PARSE_COMPACT;
|
| options -= HTML_PARSE_COMPACT;
|
| }
|
| + if (options & XML_PARSE_HUGE) {
|
| + ctxt->options |= XML_PARSE_HUGE;
|
| + options -= XML_PARSE_HUGE;
|
| + }
|
| ctxt->dictNames = 0;
|
| return (options);
|
| }
|
| @@ -5943,7 +6543,7 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
|
| * @reuse: keep the context for reuse
|
| *
|
| * Common front-end for the htmlRead functions
|
| - *
|
| + *
|
| * Returns the resulting document tree or NULL
|
| */
|
| static htmlDocPtr
|
| @@ -5951,15 +6551,19 @@ htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
|
| int options, int reuse)
|
| {
|
| htmlDocPtr ret;
|
| -
|
| +
|
| htmlCtxtUseOptions(ctxt, options);
|
| ctxt->html = 1;
|
| if (encoding != NULL) {
|
| xmlCharEncodingHandlerPtr hdlr;
|
|
|
| hdlr = xmlFindCharEncodingHandler(encoding);
|
| - if (hdlr != NULL)
|
| + if (hdlr != NULL) {
|
| xmlSwitchToEncoding(ctxt, hdlr);
|
| + if (ctxt->input->encoding != NULL)
|
| + xmlFree((xmlChar *) ctxt->input->encoding);
|
| + ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
|
| + }
|
| }
|
| if ((URL != NULL) && (ctxt->input != NULL) &&
|
| (ctxt->input->filename == NULL))
|
| @@ -5985,7 +6589,7 @@ htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
|
| * @options: a combination of htmlParserOption(s)
|
| *
|
| * parse an XML in-memory document and build a tree.
|
| - *
|
| + *
|
| * Returns the resulting document tree
|
| */
|
| htmlDocPtr
|
| @@ -6010,7 +6614,7 @@ htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int opti
|
| * @options: a combination of htmlParserOption(s)
|
| *
|
| * parse an XML file from the filesystem or the network.
|
| - *
|
| + *
|
| * Returns the resulting document tree
|
| */
|
| htmlDocPtr
|
| @@ -6034,7 +6638,7 @@ htmlReadFile(const char *filename, const char *encoding, int options)
|
| * @options: a combination of htmlParserOption(s)
|
| *
|
| * parse an XML in-memory document and build a tree.
|
| - *
|
| + *
|
| * Returns the resulting document tree
|
| */
|
| htmlDocPtr
|
| @@ -6060,7 +6664,7 @@ htmlReadMemory(const char *buffer, int size, const char *URL, const char *encodi
|
| * @options: a combination of htmlParserOption(s)
|
| *
|
| * parse an XML from a file descriptor and build a tree.
|
| - *
|
| + *
|
| * Returns the resulting document tree
|
| */
|
| htmlDocPtr
|
| @@ -6102,7 +6706,7 @@ htmlReadFd(int fd, const char *URL, const char *encoding, int options)
|
| * @options: a combination of htmlParserOption(s)
|
| *
|
| * parse an HTML document from I/O functions and source and build a tree.
|
| - *
|
| + *
|
| * Returns the resulting document tree
|
| */
|
| htmlDocPtr
|
| @@ -6146,7 +6750,7 @@ htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
|
| *
|
| * parse an XML in-memory document and build a tree.
|
| * This reuses the existing @ctxt parser context
|
| - *
|
| + *
|
| * Returns the resulting document tree
|
| */
|
| htmlDocPtr
|
| @@ -6179,7 +6783,7 @@ htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
|
| *
|
| * parse an XML file from the filesystem or the network.
|
| * This reuses the existing @ctxt parser context
|
| - *
|
| + *
|
| * Returns the resulting document tree
|
| */
|
| htmlDocPtr
|
| @@ -6214,7 +6818,7 @@ htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
|
| *
|
| * parse an XML in-memory document and build a tree.
|
| * This reuses the existing @ctxt parser context
|
| - *
|
| + *
|
| * Returns the resulting document tree
|
| */
|
| htmlDocPtr
|
| @@ -6256,7 +6860,7 @@ htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
|
| *
|
| * parse an XML from a file descriptor and build a tree.
|
| * This reuses the existing @ctxt parser context
|
| - *
|
| + *
|
| * Returns the resulting document tree
|
| */
|
| htmlDocPtr
|
| @@ -6298,7 +6902,7 @@ htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
|
| *
|
| * parse an HTML document from I/O functions and source and build a tree.
|
| * This reuses the existing @ctxt parser context
|
| - *
|
| + *
|
| * Returns the resulting document tree
|
| */
|
| htmlDocPtr
|
|
|