third_party/libxml/HTMLparser.c - Issue 2951008: Update libxml to 2.7.7.

Unified Diff: third_party/libxml/HTMLparser.c

Issue 2951008: Update libxml to 2.7.7. (Closed) Base URL: http://src.chromium.org/git/chromium.git

Patch Set: Created 10 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: third_party/libxml/HTMLparser.c

diff --git a/third_party/libxml/HTMLparser.c b/third_party/libxml/HTMLparser.c

index 92503a1a4d92a3568c3a1b01b53f7568cda1f031..42dc776ae587e03d38416dafe072e563d4040683 100644

--- a/third_party/libxml/HTMLparser.c

+++ b/third_party/libxml/HTMLparser.c

@@ -59,7 +59,7 @@ static void htmlParseComment(htmlParserCtxtPtr ctxt);

/************************************************************************

* *

- * Some factorized error routines *

+ * Some factorized error routines *

* *

************************************************************************/

@@ -147,7 +147,7 @@ htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,

/************************************************************************

* *

- * Parser stacks related functions and macros *

+ * Parser stacks related functions and macros *

* *

************************************************************************/

@@ -163,6 +163,10 @@ htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,

static int

htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)

{

+ if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))

+ ctxt->html = 3;

+ if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))

+ ctxt->html = 10;

if (ctxt->nameNr >= ctxt->nameMax) {

ctxt->nameMax *= 2;

ctxt->nameTab = (const xmlChar * *)

@@ -205,6 +209,59 @@ htmlnamePop(htmlParserCtxtPtr ctxt)

return (ret);

}

+/**

+ * htmlNodeInfoPush:

+ * @ctxt: an HTML parser context

+ * @value: the node info

+ *

+ * Pushes a new node info on top of the node info stack

+ *

+ * Returns 0 in case of error, the index in the stack otherwise

+ */

+static int

+htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)

+ if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {

+ if (ctxt->nodeInfoMax == 0)

+ ctxt->nodeInfoMax = 5;

+ ctxt->nodeInfoMax *= 2;

+ ctxt->nodeInfoTab = (htmlParserNodeInfo *)

+ xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,

+ ctxt->nodeInfoMax *

+ sizeof(ctxt->nodeInfoTab[0]));

+ if (ctxt->nodeInfoTab == NULL) {

+ htmlErrMemory(ctxt, NULL);

+ return (0);

+ }

+ ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;

+ ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];

+ return (ctxt->nodeInfoNr++);

+/**

+ * htmlNodeInfoPop:

+ * @ctxt: an HTML parser context

+ *

+ * Pops the top node info from the node info stack

+ *

+ * Returns NULL in case of error, the pointer to NodeInfo otherwise

+ */

+static htmlParserNodeInfo *

+htmlNodeInfoPop(htmlParserCtxtPtr ctxt)

+ if (ctxt->nodeInfoNr <= 0)

+ return (NULL);

+ ctxt->nodeInfoNr--;

+ if (ctxt->nodeInfoNr < 0)

+ return (NULL);

+ if (ctxt->nodeInfoNr > 0)

+ ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];

+ else

+ ctxt->nodeInfo = NULL;

+ return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];

* Macros for accessing the content. Those should be used only by the parser,

* and not exported.

@@ -263,8 +320,6 @@ htmlnamePop(htmlParserCtxtPtr ctxt)

#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

-#define NXT(val) ctxt->input->cur[(val)]

-#define CUR_PTR ctxt->input->cur

#define NEXTL(l) do { \

@@ -273,7 +328,7 @@ htmlnamePop(htmlParserCtxtPtr ctxt)

} else ctxt->input->col++; \

ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \

} while (0)

/************

if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \

@@ -288,6 +343,58 @@ htmlnamePop(htmlParserCtxtPtr ctxt)

else i += xmlCopyChar(l,&b[i],v)

/**

+ * htmlFindEncoding:

+ * @ctxt: the HTML parser context

+ *

+ * Try to find an encoding in the current data available in the input

+ * buffer. This is needed to try to switch to the proper encoding when

+ * one faces a character error.

+ * This is a heuristic: since it operates outside of parsing, it could

+ * try to use a meta which had been commented out; that's the reason it

+ * should only be used in case of error, not as a default.

+ *

+ * Returns an encoding string or NULL if not found; the string needs to

+ * be freed

+ */

+static xmlChar *

+htmlFindEncoding(xmlParserCtxtPtr ctxt) {

+ const xmlChar *start, *cur, *end;

+ if ((ctxt == NULL) || (ctxt->input == NULL) ||

+ (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||

+ (ctxt->input->buf->encoder != NULL))

+ return(NULL);

+ if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))

+ return(NULL);

+ start = ctxt->input->cur;

+ end = ctxt->input->end;

+ /* we also expect the input buffer to be zero terminated */

+ if (*end != 0)

+ return(NULL);

+ cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");

+ if (cur == NULL)

+ return(NULL);

+ cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");

+ if (cur == NULL)

+ return(NULL);

+ cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");

+ if (cur == NULL)

+ return(NULL);

+ cur += 8;

+ start = cur;

+ while (((*cur >= 'A') && (*cur <= 'Z')) ||

+ ((*cur >= 'a') && (*cur <= 'z')) ||

+ ((*cur >= '0') && (*cur <= '9')) ||

+ (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))

+ cur++;

+ if (cur == start)

+ return(NULL);

+ return(xmlStrndup(start, cur - start));

+/**

* htmlCurrentChar:

* @ctxt: the HTML parser context

* @len: pointer to the length of the char read

@@ -309,7 +416,7 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {

if (ctxt->token != 0) {

*len = 0;

return(ctxt->token);

- }

+ }

if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {

* We are supposed to handle UTF8, check it's valid

@@ -318,7 +425,7 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {

* UCS-4 range (hex.) UTF-8 octet sequence (binary)

* 0000 0000-0000 007F 0xxxxxxx

* 0000 0080-0000 07FF 110xxxxx 10xxxxxx

- * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx

+ * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx

* Check for the 0x110000 limit too

@@ -328,19 +435,25 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {

c = *cur;

if (c & 0x80) {

- if (cur[1] == 0)

+ if (cur[1] == 0) {

xmlParserInputGrow(ctxt->input, INPUT_CHUNK);

+ cur = ctxt->input->cur;

+ }

if ((cur[1] & 0xc0) != 0x80)

goto encoding_error;

if ((c & 0xe0) == 0xe0) {

- if (cur[2] == 0)

+ if (cur[2] == 0) {

xmlParserInputGrow(ctxt->input, INPUT_CHUNK);

+ cur = ctxt->input->cur;

+ }

if ((cur[2] & 0xc0) != 0x80)

goto encoding_error;

if ((c & 0xf0) == 0xf0) {

- if (cur[3] == 0)

+ if (cur[3] == 0) {

xmlParserInputGrow(ctxt->input, INPUT_CHUNK);

+ cur = ctxt->input->cur;

+ }

if (((c & 0xf8) != 0xf0) ||

((cur[3] & 0xc0) != 0x80))

goto encoding_error;

@@ -366,9 +479,16 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {

if (!IS_CHAR(val)) {

htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,

"Char 0x%X out of allowed range\n", val);

- }

+ }

return(val);

} else {

+ if ((*ctxt->input->cur == 0) &&

+ (ctxt->input->cur < ctxt->input->end)) {

+ htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,

+ "Char 0x%X out of allowed range\n", 0);

+ *len = 1;

+ return(' ');

+ }

/* 1-byte code */

*len = 1;

return((int) *ctxt->input->cur);

@@ -386,8 +506,28 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {

* Humm this is bad, do an automatic flow conversion

- xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);

- ctxt->charset = XML_CHAR_ENCODING_UTF8;

+ {

+ xmlChar * guess;

+ xmlCharEncodingHandlerPtr handler;

+ guess = htmlFindEncoding(ctxt);

+ if (guess == NULL) {

+ xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);

+ } else {

+ if (ctxt->input->encoding != NULL)

+ xmlFree((xmlChar *) ctxt->input->encoding);

+ ctxt->input->encoding = guess;

+ handler = xmlFindCharEncodingHandler((const char *) guess);

+ if (handler != NULL) {

+ xmlSwitchToEncoding(ctxt, handler);

+ } else {

+ htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,

+ "Unsupported encoding %s", guess, NULL);

+ }

+ ctxt->charset = XML_CHAR_ENCODING_UTF8;

+ }

return(xmlCurrentChar(ctxt, len));

encoding_error:

@@ -413,7 +553,7 @@ encoding_error:

BAD_CAST buffer, NULL);

}

- ctxt->charset = XML_CHAR_ENCODING_8859_1;

+ ctxt->charset = XML_CHAR_ENCODING_8859_1;

*len = 1;

return((int) *ctxt->input->cur);

}

@@ -453,7 +593,7 @@ htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {

/************************************************************************

* *

- * The list of HTML elements and their properties *

+ * The list of HTML elements and their properties *

* *

************************************************************************/

@@ -478,9 +618,9 @@ htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {

#define NB_PHRASE 10

#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"

#define NB_SPECIAL 16

-#define INLINE PCDATA FONTSTYLE PHRASE SPECIAL FORMCTRL

+#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL

#define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL

-#define BLOCK HEADING, LIST "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"

+#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"

#define NB_BLOCK NB_HEADING + NB_LIST + 14

#define FORMCTRL "input", "select", "textarea", "label", "button"

#define NB_FORMCTRL 5

@@ -606,7 +746,7 @@ static const char* const language_attr[] = { "language", NULL } ;

static const char* const select_content[] = { "optgroup", "option", NULL } ;

static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;

static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;

-static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;

+static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;

static const char* const table_depr[] = { "align", "bgcolor", NULL } ;

static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;

static const char* const tr_elt[] = { "tr", NULL } ;

@@ -938,7 +1078,7 @@ static const char * const htmlStartClose[] = {

"listing", "xmp", NULL,

"ol", "p", "head", "ul", NULL,

"menu", "p", "head", "ul", NULL,

-"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,

+"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,

"div", "p", "head", NULL,

"noscript", "p", "head", NULL,

"center", "font", "b", "i", "p", "head", NULL,

@@ -949,7 +1089,7 @@ static const char * const htmlStartClose[] = {

"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",

"listing", "xmp", "a", NULL,

"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,

-"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,

+"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,

"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,

"thead", "caption", "col", "colgroup", NULL,

"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",

@@ -1008,7 +1148,7 @@ static const char *const htmlScriptAttributes[] = {

* elements the parser can decide how to handle extra endtags.

* Endtags are only allowed to close elements with lower or equal

* priority.

- */

+ */

typedef struct {

const char *name;

@@ -1035,7 +1175,7 @@ static int htmlStartCloseIndexinitialized = 0;

/************************************************************************

* *

- * functions to handle HTML specific data *

+ * functions to handle HTML specific data *

* *

************************************************************************/

@@ -1085,7 +1225,7 @@ htmlTagLookup(const xmlChar *tag) {

/**

* htmlGetEndPriority:

* @name: The name of the element to look up the priority for.

- *

+ *

* Return value: The "endtag" priority.

**/

static int

@@ -1164,7 +1304,7 @@ htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)

* A missplaced endtag can only close elements with lower

* or equal priority, so if we find an element with higher

* priority before we find an element with

- * matching name, we just ignore this endtag

+ * matching name, we just ignore this endtag

if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)

return;

@@ -1215,7 +1355,7 @@ htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)

* called when a new tag has been detected and generates the

* appropriates closes if possible/needed.

* If newtag is NULL this mean we are at the end of the resource

- * and we should check

+ * and we should check

static void

htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)

@@ -1303,6 +1443,10 @@ htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {

static void

htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {

+ int i;

+ if (ctxt->options & HTML_PARSE_NOIMPLIED)

+ return;

if (!htmlOmittedDefaultValue)

return;

if (xmlStrEqual(newtag, BAD_CAST"html"))

@@ -1314,24 +1458,31 @@ htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {

}

if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))

return;

- if ((ctxt->nameNr <= 1) &&

+ if ((ctxt->nameNr <= 1) &&

((xmlStrEqual(newtag, BAD_CAST"script")) ||

(xmlStrEqual(newtag, BAD_CAST"style")) ||

(xmlStrEqual(newtag, BAD_CAST"meta")) ||

(xmlStrEqual(newtag, BAD_CAST"link")) ||

(xmlStrEqual(newtag, BAD_CAST"title")) ||

(xmlStrEqual(newtag, BAD_CAST"base")))) {

- /*

- * dropped OBJECT ... i you put it first BODY will be

- * assumed !

- */

- htmlnamePush(ctxt, BAD_CAST"head");

- if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))

- ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);

+ if (ctxt->html >= 3) {

+ /* we already saw or generated a <head> before */

+ return;

+ }

+ /*

+ * dropped OBJECT ... if you put it first BODY will be

+ * assumed !

+ */

+ htmlnamePush(ctxt, BAD_CAST"head");

+ if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))

+ ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);

} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&

(!xmlStrEqual(newtag, BAD_CAST"frame")) &&

(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {

- int i;

+ if (ctxt->html >= 10) {

+ /* we already saw or generated a <body> before */

+ return;

+ }

for (i = 0;i < ctxt->nameNr;i++) {

if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {

return;

@@ -1340,7 +1491,7 @@ htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {

return;

}

htmlnamePush(ctxt, BAD_CAST"body");

if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))

ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);

@@ -1402,12 +1553,12 @@ htmlIsScriptAttribute(const xmlChar *name) {

unsigned int i;

if (name == NULL)

- return(0);

+ return(0);

* all script attributes start with 'on'

if ((name[0] != 'o') || (name[1] != 'n'))

- return(0);

+ return(0);

for (i = 0;

i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);

i++) {

@@ -1419,7 +1570,7 @@ htmlIsScriptAttribute(const xmlChar *name) {

/************************************************************************

* *

- * The list of HTML predefined entities *

+ * The list of HTML predefined entities *

* *

************************************************************************/

@@ -1833,7 +1984,7 @@ UTF8ToHtml(unsigned char* out, int *outlen,

if (inend - in < trailing) {

break;

- }

+ }

for ( ; trailing; trailing--) {

if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))

@@ -2023,7 +2174,7 @@ htmlNewInputStream(htmlParserCtxtPtr ctxt) {

* *

************************************************************************/

- * all tags allowing pc data from the html 4.01 loose dtd

+ * all tags allowing pc data from the html 4.01 loose dtd

* NOTE: it might be more apropriate to integrate this information

* into the html40ElementTable array but I don't want to risk any

* binary incomptibility

@@ -2083,7 +2234,7 @@ static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {

if (lastChild == NULL) {

if ((ctxt->node->type != XML_ELEMENT_NODE) &&

(ctxt->node->content != NULL)) return(0);

- /* keep ws in constructs like ... ...

+ /* keep ws in constructs like ... ...

for all tags "b" allowing PCDATA */

for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {

if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {

@@ -2093,7 +2244,7 @@ static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {

} else if (xmlNodeIsText(lastChild)) {

return(0);

} else {

- /* keep ws in constructs like xy z

+ /* keep ws in constructs like xy z

for all tags "p" allowing PCDATA */

for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {

if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {

@@ -2133,7 +2284,7 @@ htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {

cur->intSubset = NULL;

cur->doc = cur;

cur->name = NULL;

- cur->children = NULL;

+ cur->children = NULL;

cur->extSubset = NULL;

cur->oldNs = NULL;

cur->encoding = NULL;

@@ -2143,6 +2294,7 @@ htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {

cur->refs = NULL;

cur->_private = NULL;

cur->charset = XML_CHAR_ENCODING_UTF8;

+ cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;

if ((ExternalID != NULL) ||

(URI != NULL))

xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);

@@ -2200,18 +2352,19 @@ htmlParseHTMLName(htmlParserCtxtPtr ctxt) {

xmlChar loc[HTML_PARSER_BUFFER_SIZE];

if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&

- (CUR != ':')) return(NULL);

+ (CUR != ':') && (CUR != '.')) return(NULL);

while ((i < HTML_PARSER_BUFFER_SIZE) &&

((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||

- (CUR == ':') || (CUR == '-') || (CUR == '_'))) {

+ (CUR == ':') || (CUR == '-') || (CUR == '_') ||

+ (CUR == '.'))) {

if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;

else loc[i] = CUR;

i++;

NEXT;

}

return(xmlDictLookup(ctxt->dict, loc, i));

}

@@ -2234,7 +2387,7 @@ htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {

if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&

(NXT(1) != ':')) return(NULL);

while ((i < HTML_PARSER_BUFFER_SIZE) &&

((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||

(NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {

@@ -2242,7 +2395,7 @@ htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {

else loc[i] = NXT(1+i);

i++;

}

return(xmlDictLookup(ctxt->dict, loc, i));

}

@@ -2310,7 +2463,7 @@ htmlParseNameComplex(xmlParserCtxtPtr ctxt) {

while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */

((IS_LETTER(c)) || (IS_DIGIT(c)) ||

(c == '.') || (c == '-') ||

- (c == '_') || (c == ':') ||

+ (c == '_') || (c == ':') ||

(IS_COMBINING(c)) ||

(IS_EXTENDER(c)))) {

if (count++ > 100) {

@@ -2329,7 +2482,7 @@ htmlParseNameComplex(xmlParserCtxtPtr ctxt) {

* htmlParseHTMLAttribute:

* @ctxt: an HTML parser context

* @stop: a char stop value

- *

+ *

* parse an HTML attribute value till the stop (quote), if

* stop is 0 then it stops at the first space

@@ -2374,13 +2527,13 @@ htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {

{ *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }

else if (c < 0x10000)

{ *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }

- else

+ else

{ *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }

for ( ; bits >= 0; bits-= 6) {

*out++ = ((c >> bits) & 0x3F) | 0x80;

}

if (out - buffer > buffer_size - 100) {

int indx = out - buffer;

@@ -2426,9 +2579,9 @@ htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {

{ *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }

else if (c < 0x10000)

{ *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }

- else

+ else

{ *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }

for ( ; bits >= 0; bits-= 6) {

*out++ = ((c >> bits) & 0x3F) | 0x80;

}

@@ -2451,16 +2604,16 @@ htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {

{ *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }

else if (c < 0x10000)

{ *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }

- else

+ else

{ *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }

for ( ; bits >= 0; bits-= 6) {

*out++ = ((c >> bits) & 0x3F) | 0x80;

}

NEXT;

}

- *out++ = 0;

+ *out = 0;

return(buffer);

}

@@ -2521,7 +2674,7 @@ htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {

* parse a value for an attribute

* Note: the parser won't do substitution of entities here, this

* will be handled later in xmlStringGetNodeList, unless it was

- * asked for ctxt->replaceEntities != 0

+ * asked for ctxt->replaceEntities != 0

* Returns the AttValue parsed or NULL.

@@ -2562,7 +2715,7 @@ htmlParseAttValue(htmlParserCtxtPtr ctxt) {

/**

* htmlParseSystemLiteral:

* @ctxt: an HTML parser context

- *

+ *

* parse an HTML Literal

* [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")

@@ -2603,7 +2756,7 @@ htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {

htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,

" or ' expected\n", NULL, NULL);

}

return(ret);

}

@@ -2652,7 +2805,7 @@ htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {

htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,

"PubidLiteral \" or ' expected\n", NULL, NULL);

}

return(ret);

}

@@ -2699,8 +2852,8 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {

* CDATA.

if (ctxt->recovery) {

- if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,

- xmlStrlen(ctxt->name)) == 0)

+ if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,

+ xmlStrlen(ctxt->name)) == 0)

{

break; /* while */

} else {

@@ -2710,7 +2863,7 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {

}

} else {

if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||

- ((NXT(2) >= 'a') && (NXT(2) <= 'z')))

+ ((NXT(2) >= 'a') && (NXT(2) <= 'z')))

{

break; /* while */

}

@@ -2767,11 +2920,12 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {

xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];

int nbchar = 0;

int cur, l;

+ int chunk = 0;

SHRINK;

cur = CUR_CHAR(l);

while (((cur != '<') || (ctxt->token == '<')) &&

- ((cur != '&') || (ctxt->token == '&')) &&

+ ((cur != '&') || (ctxt->token == '&')) &&

(cur != 0)) {

if (!(IS_CHAR(cur))) {

htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,

@@ -2797,6 +2951,12 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {

nbchar = 0;

}

NEXTL(l);

+ chunk++;

+ if (chunk > HTML_PARSER_BUFFER_SIZE) {

+ chunk = 0;

+ SHRINK;

+ GROW;

+ }

cur = CUR_CHAR(l);

if (cur == 0) {

SHRINK;

@@ -2991,7 +3151,7 @@ htmlParsePI(htmlParserCtxtPtr ctxt) {

}

xmlFree(buf);

} else {

- htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,

+ htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,

"PI is not started correctly", NULL, NULL);

}

ctxt->instate = state;

@@ -3107,7 +3267,7 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) {

((NXT(2) == 'x') || NXT(2) == 'X')) {

SKIP(3);

while (CUR != ';') {

- if ((CUR >= '0') && (CUR <= '9'))

+ if ((CUR >= '0') && (CUR <= '9'))

val = val * 16 + (CUR - '0');

else if ((CUR >= 'a') && (CUR <= 'f'))

val = val * 16 + (CUR - 'a') + 10;

@@ -3126,7 +3286,7 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) {

} else if ((CUR == '&') && (NXT(1) == '#')) {

SKIP(2);

while (CUR != ';') {

- if ((CUR >= '0') && (CUR <= '9'))

+ if ((CUR >= '0') && (CUR <= '9'))

val = val * 10 + (CUR - '0');

else {

htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,

@@ -3162,7 +3322,7 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) {

* parse a DOCTYPE declaration

- * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?

+ * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?

* ('[' (markupdecl | PEReference | S)* ']' S?)? '>'

@@ -3266,11 +3426,6 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {

NEXT;

SKIP_BLANKS;

val = htmlParseAttValue(ctxt);

- } else if (htmlIsBooleanAttr(name)) {

- /*

- * assume a minimized attribute

- */

- val = xmlStrdup(name);

}

*value = val;

@@ -3294,7 +3449,7 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {

if ((ctxt == NULL) || (attvalue == NULL))

return;

- /* do not change encoding */

+ /* do not change encoding */

if (ctxt->input->encoding != NULL)

return;

@@ -3321,7 +3476,7 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {

* registered set of known encodings

if (enc != XML_CHAR_ENCODING_ERROR) {

- if (((enc == XML_CHAR_ENCODING_UTF16LE) ||

+ if (((enc == XML_CHAR_ENCODING_UTF16LE) ||

(enc == XML_CHAR_ENCODING_UTF16BE) ||

(enc == XML_CHAR_ENCODING_UCS4LE) ||

(enc == XML_CHAR_ENCODING_UCS4BE)) &&

@@ -3369,6 +3524,8 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {

}

ctxt->input->base =

ctxt->input->cur = ctxt->input->buf->buffer->content;

+ ctxt->input->end =

+ &ctxt->input->base[ctxt->input->buf->buffer->use];

}

@@ -3409,7 +3566,7 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {

/**

* htmlParseStartTag:

* @ctxt: an HTML parser context

- *

+ *

* parse a start of tag either for rule element or

* EmptyElement. In both case we don't parse the tag closing chars.

@@ -3438,6 +3595,8 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {

int i;

int discardtag = 0;

+ if (ctxt->instate == XML_PARSER_EOF)

+ return(-1);

if ((ctxt == NULL) || (ctxt->input == NULL)) {

htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

"htmlParseStartTag: context error\n", NULL, NULL);

@@ -3456,7 +3615,8 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {

"htmlParseStartTag: invalid element name\n",

NULL, NULL);

/* Dump the bogus tag like browsers do */

- while ((IS_CHAR_CH(CUR)) && (CUR != '>'))

+ while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&

+ (ctxt->instate != XML_PARSER_EOF))

NEXT;

return -1;

}

@@ -3482,13 +3642,15 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {

"htmlParseStartTag: misplaced <html> tag\n",

name, NULL);

discardtag = 1;

+ ctxt->depth++;

}

- if ((ctxt->nameNr != 1) &&

+ if ((ctxt->nameNr != 1) &&

(xmlStrEqual(name, BAD_CAST"head"))) {

htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,

"htmlParseStartTag: misplaced <head> tag\n",

name, NULL);

discardtag = 1;

+ ctxt->depth++;

}

if (xmlStrEqual(name, BAD_CAST"body")) {

int indx;

@@ -3498,6 +3660,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {

"htmlParseStartTag: misplaced <body> tag\n",

name, NULL);

discardtag = 1;

+ ctxt->depth++;

}

@@ -3509,7 +3672,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {

SKIP_BLANKS;

while ((IS_CHAR_CH(CUR)) &&

- (CUR != '>') &&

+ (CUR != '>') &&

((CUR != '/') || (NXT(1) != '>'))) {

long cons = ctxt->nbChars;

@@ -3648,7 +3811,6 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)

name = htmlParseHTMLName(ctxt);

if (name == NULL)

return (0);

* We should definitely be at the ending "S? '>'" part

@@ -3669,6 +3831,18 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)

NEXT;

+ * if we ignored misplaced tags in htmlParseStartTag don't pop them

+ * out now.

+ */

+ if ((ctxt->depth > 0) &&

+ (xmlStrEqual(name, BAD_CAST "html") ||

+ xmlStrEqual(name, BAD_CAST "body") ||

+ xmlStrEqual(name, BAD_CAST "head"))) {

+ ctxt->depth--;

+ return (0);

+ }

+ /*

* If the name read is not one of the element in the parsing stack

* then return, it's just an error.

@@ -3722,7 +3896,7 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)

/**

* htmlParseReference:

* @ctxt: an HTML parser context

- *

+ *

* parse and handle entity references in content,

* this will end-up in a call to character() since this is either a

* CharRef, or a predefined entity.

@@ -3746,7 +3920,7 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {

else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }

else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }

else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }

for ( ; bits >= 0; bits-= 6) {

out[i++]= ((c >> bits) & 0x3F) | 0x80;

}

@@ -3781,9 +3955,9 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {

{ out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }

else if (c < 0x10000)

{ out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }

- else

+ else

{ out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }

for ( ; bits >= 0; bits-= 6) {

out[i++]= ((c >> bits) & 0x3F) | 0x80;

}

@@ -3801,6 +3975,7 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {

* @ctxt: an HTML parser context

* Parse a content: comment, sub-element, reference or text.

+ * Kept for compatibility with old code

static void

@@ -3815,6 +3990,10 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {

long cons = ctxt->nbChars;

GROW;

+ if (ctxt->instate == XML_PARSER_EOF)

+ break;

* Our tag or one of it's parent or children is ending.

@@ -3837,7 +4016,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {

"htmlParseStartTag: invalid element name\n",

NULL, NULL);

/* Dump the bogus tag like browsers do */

- while ((IS_CHAR_CH(CUR)) && (CUR != '>'))

+ while ((IS_CHAR_CH(CUR)) && (CUR != '>'))

NEXT;

if (currentNode != NULL)

@@ -3850,7 +4029,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {

htmlAutoClose(ctxt, name);

continue;

}

- }

+ }

}

@@ -3909,7 +4088,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {

* Fourth case : a reference. If if has not been resolved,

- * parsing returns it's Name, create the node

+ * parsing returns it's Name, create the node

else if (CUR == '&') {

htmlParseReference(ctxt);

@@ -3945,23 +4124,11 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {

}

/**

- * htmlParseContent:

- * @ctxt: an HTML parser context

- *

- * Parse a content: comment, sub-element, reference or text.

- */

-void

-__htmlParseContent(void *ctxt) {

- if (ctxt != NULL)

- htmlParseContent((htmlParserCtxtPtr) ctxt);

-/**

* htmlParseElement:

* @ctxt: an HTML parser context

* parse an HTML element, this is highly recursive

+ * this is kept for compatibility with previous code versions

* [39] element ::= EmptyElemTag | STag content ETag

@@ -3983,6 +4150,10 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {

"htmlParseElement: context error\n", NULL, NULL);

return;

}

+ if (ctxt->instate == XML_PARSER_EOF)

+ return;

/* Capture start position */

if (ctxt->record_info) {

node_info.begin_pos = ctxt->input->consumed +

@@ -4027,10 +4198,10 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {

* end of parsing of this node.

- if (xmlStrEqual(name, ctxt->name)) {

+ if (xmlStrEqual(name, ctxt->name)) {

nodePop(ctxt);

htmlnamePop(ctxt);

- }

+ }

* Capture end position and add node

@@ -4064,8 +4235,8 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {

oldptr = ctxt->input->cur;

htmlParseContent(ctxt);

if (oldptr==ctxt->input->cur) break;

- if (ctxt->nameNr < depth) break;

- }

+ if (ctxt->nameNr < depth) break;

+ }

* Capture end position and add node

@@ -4085,10 +4256,305 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {

xmlFree(currentNode);

}

+static void

+htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {

+ /*

+ * Capture end position and add node

+ */

+ if ( ctxt->node != NULL && ctxt->record_info ) {

+ ctxt->nodeInfo->end_pos = ctxt->input->consumed +

+ (CUR_PTR - ctxt->input->base);

+ ctxt->nodeInfo->end_line = ctxt->input->line;

+ ctxt->nodeInfo->node = ctxt->node;

+ xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);

+ htmlNodeInfoPop(ctxt);

+ }

+ if (!IS_CHAR_CH(CUR)) {

+ htmlAutoCloseOnEnd(ctxt);

+ }

+/**

+ * htmlParseElementInternal:

+ * @ctxt: an HTML parser context

+ *

+ * parse an HTML element, new version, non recursive

+ *

+ * [39] element ::= EmptyElemTag | STag content ETag

+ *

+ * [41] Attribute ::= Name Eq AttValue

+ */

+static void

+htmlParseElementInternal(htmlParserCtxtPtr ctxt) {

+ const xmlChar *name;

+ const htmlElemDesc * info;

+ htmlParserNodeInfo node_info;

+ int failed;

+ if ((ctxt == NULL) || (ctxt->input == NULL)) {

+ htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

+ "htmlParseElementInternal: context error\n", NULL, NULL);

+ return;

+ }

+ if (ctxt->instate == XML_PARSER_EOF)

+ return;

+ /* Capture start position */

+ if (ctxt->record_info) {

+ node_info.begin_pos = ctxt->input->consumed +

+ (CUR_PTR - ctxt->input->base);

+ node_info.begin_line = ctxt->input->line;

+ }

+ failed = htmlParseStartTag(ctxt);

+ name = ctxt->name;

+ if ((failed == -1) || (name == NULL)) {

+ if (CUR == '>')

+ NEXT;

+ return;

+ }

+ /*

+ * Lookup the info for that element.

+ */

+ info = htmlTagLookup(name);

+ if (info == NULL) {

+ htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,

+ "Tag %s invalid\n", name, NULL);

+ }

+ /*

+ * Check for an Empty Element labeled the XML/SGML way

+ */

+ if ((CUR == '/') && (NXT(1) == '>')) {

+ SKIP(2);

+ if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))

+ ctxt->sax->endElement(ctxt->userData, name);

+ htmlnamePop(ctxt);

+ return;

+ }

+ if (CUR == '>') {

+ NEXT;

+ } else {

+ htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,

+ "Couldn't find end of Start Tag %s\n", name, NULL);

+ /*

+ * end of parsing of this node.

+ */

+ if (xmlStrEqual(name, ctxt->name)) {

+ nodePop(ctxt);

+ htmlnamePop(ctxt);

+ }

+ if (ctxt->record_info)

+ htmlNodeInfoPush(ctxt, &node_info);

+ htmlParserFinishElementParsing(ctxt);

+ return;

+ }

+ /*

+ * Check for an Empty Element from DTD definition

+ */

+ if ((info != NULL) && (info->empty)) {

+ if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))

+ ctxt->sax->endElement(ctxt->userData, name);

+ htmlnamePop(ctxt);

+ return;

+ }

+ if (ctxt->record_info)

+ htmlNodeInfoPush(ctxt, &node_info);

+/**

+ * htmlParseContentInternal:

+ * @ctxt: an HTML parser context

+ *

+ * Parse a content: comment, sub-element, reference or text.

+ * New version for non recursive htmlParseElementInternal

+ */

+static void

+htmlParseContentInternal(htmlParserCtxtPtr ctxt) {

+ xmlChar *currentNode;

+ int depth;

+ const xmlChar *name;

+ currentNode = xmlStrdup(ctxt->name);

+ depth = ctxt->nameNr;

+ while (1) {

+ long cons = ctxt->nbChars;

+ GROW;

+ if (ctxt->instate == XML_PARSER_EOF)

+ break;

+ /*

+ * Our tag or one of its parents or children is ending.

+ */

+ if ((CUR == '<') && (NXT(1) == '/')) {

+ if (htmlParseEndTag(ctxt) &&

+ ((currentNode != NULL) || (ctxt->nameNr == 0))) {

+ if (currentNode != NULL)

+ xmlFree(currentNode);

+ currentNode = xmlStrdup(ctxt->name);

+ depth = ctxt->nameNr;

+ }

+ continue; /* while */

+ }

+ else if ((CUR == '<') &&

+ ((IS_ASCII_LETTER(NXT(1))) ||

+ (NXT(1) == '_') || (NXT(1) == ':'))) {

+ name = htmlParseHTMLName_nonInvasive(ctxt);

+ if (name == NULL) {

+ htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,

+ "htmlParseStartTag: invalid element name\n",

+ NULL, NULL);

+ /* Dump the bogus tag like browsers do */

+ while ((IS_CHAR_CH(CUR)) && (CUR != '>'))

+ NEXT;

+ htmlParserFinishElementParsing(ctxt);

+ if (currentNode != NULL)

+ xmlFree(currentNode);

+ currentNode = xmlStrdup(ctxt->name);

+ depth = ctxt->nameNr;

+ continue;

+ }

+ if (ctxt->name != NULL) {

+ if (htmlCheckAutoClose(name, ctxt->name) == 1) {

+ htmlAutoClose(ctxt, name);

+ continue;

+ }

+ /*

+ * Has this node been popped out during parsing of

+ * the next element

+ */

+ if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&

+ (!xmlStrEqual(currentNode, ctxt->name)))

+ {

+ htmlParserFinishElementParsing(ctxt);

+ if (currentNode != NULL) xmlFree(currentNode);

+ currentNode = xmlStrdup(ctxt->name);

+ depth = ctxt->nameNr;

+ continue;

+ }

+ if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||

+ (xmlStrEqual(currentNode, BAD_CAST"style")))) {

+ /*

+ * Handle SCRIPT/STYLE separately

+ */

+ htmlParseScript(ctxt);

+ } else {

+ /*

+ * Sometimes DOCTYPE arrives in the middle of the document

+ */

+ if ((CUR == '<') && (NXT(1) == '!') &&

+ (UPP(2) == 'D') && (UPP(3) == 'O') &&

+ (UPP(4) == 'C') && (UPP(5) == 'T') &&

+ (UPP(6) == 'Y') && (UPP(7) == 'P') &&

+ (UPP(8) == 'E')) {

+ htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,

+ "Misplaced DOCTYPE declaration\n",

+ BAD_CAST "DOCTYPE" , NULL);

+ htmlParseDocTypeDecl(ctxt);

+ }

+ /*

+ * First case : a comment

+ */

+ if ((CUR == '<') && (NXT(1) == '!') &&

+ (NXT(2) == '-') && (NXT(3) == '-')) {

+ htmlParseComment(ctxt);

+ }

+ /*

+ * Second case : a Processing Instruction.

+ */

+ else if ((CUR == '<') && (NXT(1) == '?')) {

+ htmlParsePI(ctxt);

+ }

+ /*

+ * Third case : a sub-element.

+ */

+ else if (CUR == '<') {

+ htmlParseElementInternal(ctxt);

+ if (currentNode != NULL) xmlFree(currentNode);

+ currentNode = xmlStrdup(ctxt->name);

+ depth = ctxt->nameNr;

+ }

+ /*

+ * Fourth case : a reference. If it has not been resolved,

+ * parsing returns its Name, create the node

+ */

+ else if (CUR == '&') {

+ htmlParseReference(ctxt);

+ }

+ /*

+ * Fifth case : end of the resource

+ */

+ else if (CUR == 0) {

+ htmlAutoCloseOnEnd(ctxt);

+ break;

+ }

+ /*

+ * Last case, text. Note that References are handled directly.

+ */

+ else {

+ htmlParseCharData(ctxt);

+ }

+ if (cons == ctxt->nbChars) {

+ if (ctxt->node != NULL) {

+ htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,

+ "detected an error in element content\n",

+ NULL, NULL);

+ }

+ break;

+ }

+ GROW;

+ }

+ if (currentNode != NULL) xmlFree(currentNode);

+/**

+ * htmlParseContent:

+ * @ctxt: an HTML parser context

+ *

+ * Parse a content: comment, sub-element, reference or text.

+ * This is the entry point when called from parser.c

+ */

+void

+__htmlParseContent(void *ctxt) {

+ if (ctxt != NULL)

+ htmlParseContentInternal((htmlParserCtxtPtr) ctxt);

/**

* htmlParseDocument:

* @ctxt: an HTML parser context

- *

+ *

* parse an HTML document (and build a tree if using the standard SAX

* interface).

@@ -4098,6 +4564,8 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {

int

htmlParseDocument(htmlParserCtxtPtr ctxt) {

+ xmlChar start[4];

+ xmlCharEncoding enc;

xmlDtdPtr dtd;

xmlInitParser();

@@ -4110,6 +4578,7 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {

return(XML_ERR_INTERNAL_ERROR);

}

ctxt->html = 1;

+ ctxt->linenumbers = 1;

GROW;

* SAX: beginning of the document processing.

@@ -4117,12 +4586,29 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {

if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))

ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);

+ if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&

+ ((ctxt->input->end - ctxt->input->cur) >= 4)) {

+ /*

+ * Get the 4 first bytes and decode the charset

+ * if enc != XML_CHAR_ENCODING_NONE

+ * plug some encoding conversion routines.

+ */

+ start[0] = RAW;

+ start[1] = NXT(1);

+ start[2] = NXT(2);

+ start[3] = NXT(3);

+ enc = xmlDetectCharEncoding(&start[0], 4);

+ if (enc != XML_CHAR_ENCODING_NONE) {

+ xmlSwitchEncoding(ctxt, enc);

+ }

* Wipe out everything which is before the first '<'

SKIP_BLANKS;

if (CUR == 0) {

- htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,

+ htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,

"Document is empty\n", NULL, NULL);

}

@@ -4136,10 +4622,10 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {

while (((CUR == '<') && (NXT(1) == '!') &&

(NXT(2) == '-') && (NXT(3) == '-')) ||

((CUR == '<') && (NXT(1) == '?'))) {

- htmlParseComment(ctxt);

- htmlParsePI(ctxt);

+ htmlParseComment(ctxt);

+ htmlParsePI(ctxt);

SKIP_BLANKS;

- }

+ }

@@ -4161,15 +4647,15 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {

while (((CUR == '<') && (NXT(1) == '!') &&

(NXT(2) == '-') && (NXT(3) == '-')) ||

((CUR == '<') && (NXT(1) == '?'))) {

- htmlParseComment(ctxt);

- htmlParsePI(ctxt);

+ htmlParseComment(ctxt);

+ htmlParsePI(ctxt);

SKIP_BLANKS;

- }

+ }

* Time to start parsing the tree itself

- htmlParseContent(ctxt);

+ htmlParseContentInternal(ctxt);

* autoclose

@@ -4187,8 +4673,8 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {

if (ctxt->myDoc != NULL) {

dtd = xmlGetIntSubset(ctxt->myDoc);

if (dtd == NULL)

- ctxt->myDoc->intSubset =

- xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",

+ ctxt->myDoc->intSubset =

+ xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",

BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",

BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");

}

@@ -4234,7 +4720,7 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt)

memset(sax, 0, sizeof(htmlSAXHandler));

/* Allocate the Input stack */

- ctxt->inputTab = (htmlParserInputPtr *)

+ ctxt->inputTab = (htmlParserInputPtr *)

xmlMalloc(5 * sizeof(htmlParserInputPtr));

if (ctxt->inputTab == NULL) {

htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");

@@ -4272,7 +4758,7 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt)

if (ctxt->nameTab == NULL) {

htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");

ctxt->nameNr = 0;

- ctxt->nameMax = 10;

+ ctxt->nameMax = 0;

ctxt->name = NULL;

ctxt->nodeNr = 0;

ctxt->nodeMax = 0;

@@ -4286,6 +4772,10 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt)

ctxt->nameMax = 10;

ctxt->name = NULL;

+ ctxt->nodeInfoTab = NULL;

+ ctxt->nodeInfoNr = 0;

+ ctxt->nodeInfoMax = 0;

if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;

else {

ctxt->sax = sax;

@@ -4432,7 +4922,7 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {

xmlSwitchEncoding(ctxt, enc);

if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {

htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,

- "Unsupported encoding %s\n",

+ "Unsupported encoding %s\n",

(const xmlChar *) encoding, NULL);

}

} else {

@@ -4455,7 +4945,7 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {

#ifdef LIBXML_PUSH_ENABLED

/************************************************************************

* *

- * Progressive parsing interfaces *

+ * Progressive parsing interfaces *

* *

************************************************************************/

@@ -4479,85 +4969,190 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {

static int

htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,

- xmlChar next, xmlChar third, int iscomment) {

+ xmlChar next, xmlChar third, int iscomment,

+ int ignoreattrval)

int base, len;

htmlParserInputPtr in;

const xmlChar *buf;

int incomment = 0;

+ int invalue = 0;

+ char valdellim = 0x0;

in = ctxt->input;

- if (in == NULL) return(-1);

+ if (in == NULL)

+ return (-1);

base = in->cur - in->base;

- if (base < 0) return(-1);

+ if (base < 0)

+ return (-1);

if (ctxt->checkIndex > base)

base = ctxt->checkIndex;

if (in->buf == NULL) {

- buf = in->base;

- len = in->length;

+ buf = in->base;

+ len = in->length;

} else {

- buf = in->buf->buffer->content;

- len = in->buf->buffer->use;

+ buf = in->buf->buffer->content;

+ len = in->buf->buffer->use;

}

/* take into account the sequence length */

- if (third) len -= 2;

- else if (next) len --;

- for (;base < len;base++) {

- if (!incomment && (base + 4 < len) && !iscomment) {

- if ((buf[base] == '<') && (buf[base + 1] == '!') &&

- (buf[base + 2] == '-') && (buf[base + 3] == '-')) {

- incomment = 1;

- /* do not increment past <! - some people use <!--> */

- base += 2;

- }

- if (incomment) {

- if (base + 3 > len)

- return(-1);

- if ((buf[base] == '-') && (buf[base + 1] == '-') &&

- (buf[base + 2] == '>')) {

- incomment = 0;

- base += 2;

- }

- continue;

- }

+ if (third)

+ len -= 2;

+ else if (next)

+ len--;

+ for (; base < len; base++) {

+ if ((!incomment) && (base + 4 < len) && (!iscomment)) {

+ if ((buf[base] == '<') && (buf[base + 1] == '!') &&

+ (buf[base + 2] == '-') && (buf[base + 3] == '-')) {

+ incomment = 1;

+ /* do not increment past <! - some people use <!--> */

+ base += 2;

+ }

+ if (ignoreattrval) {

+ if (buf[base] == '"' || buf[base] == '\'') {

+ if (invalue) {

+ if (buf[base] == valdellim) {

+ invalue = 0;

+ continue;

+ }

+ } else {

+ valdellim = buf[base];

+ invalue = 1;

+ continue;

+ }

+ } else if (invalue) {

+ continue;

+ }

+ if (incomment) {

+ if (base + 3 > len)

+ return (-1);

+ if ((buf[base] == '-') && (buf[base + 1] == '-') &&

+ (buf[base + 2] == '>')) {

+ incomment = 0;

+ base += 2;

+ }

+ continue;

+ }

if (buf[base] == first) {

- if (third != 0) {

- if ((buf[base + 1] != next) ||

- (buf[base + 2] != third)) continue;

- } else if (next != 0) {

- if (buf[base + 1] != next) continue;

- }

- ctxt->checkIndex = 0;

+ if (third != 0) {

+ if ((buf[base + 1] != next) || (buf[base + 2] != third))

+ continue;

+ } else if (next != 0) {

+ if (buf[base + 1] != next)

+ continue;

+ }

+ ctxt->checkIndex = 0;

#ifdef DEBUG_PUSH

- if (next == 0)

- xmlGenericError(xmlGenericErrorContext,

- "HPP: lookup '%c' found at %d\n",

- first, base);

- else if (third == 0)

- xmlGenericError(xmlGenericErrorContext,

- "HPP: lookup '%c%c' found at %d\n",

- first, next, base);

- else

- xmlGenericError(xmlGenericErrorContext,

- "HPP: lookup '%c%c%c' found at %d\n",

- first, next, third, base);

+ if (next == 0)

+ xmlGenericError(xmlGenericErrorContext,

+ "HPP: lookup '%c' found at %d\n",

+ first, base);

+ else if (third == 0)

+ xmlGenericError(xmlGenericErrorContext,

+ "HPP: lookup '%c%c' found at %d\n",

+ first, next, base);

+ else

+ xmlGenericError(xmlGenericErrorContext,

+ "HPP: lookup '%c%c%c' found at %d\n",

+ first, next, third, base);

#endif

- return(base - (in->cur - in->base));

- }

+ return (base - (in->cur - in->base));

+ }

}

- ctxt->checkIndex = base;

+ if ((!incomment) && (!invalue))

+ ctxt->checkIndex = base;

#ifdef DEBUG_PUSH

if (next == 0)

- xmlGenericError(xmlGenericErrorContext,

- "HPP: lookup '%c' failed\n", first);

+ xmlGenericError(xmlGenericErrorContext,

+ "HPP: lookup '%c' failed\n", first);

else if (third == 0)

- xmlGenericError(xmlGenericErrorContext,

- "HPP: lookup '%c%c' failed\n", first, next);

- else

- xmlGenericError(xmlGenericErrorContext,

- "HPP: lookup '%c%c%c' failed\n", first, next, third);

+ xmlGenericError(xmlGenericErrorContext,

+ "HPP: lookup '%c%c' failed\n", first, next);

+ else

+ xmlGenericError(xmlGenericErrorContext,

+ "HPP: lookup '%c%c%c' failed\n", first, next,

+ third);

#endif

- return(-1);

+ return (-1);

+/**

+ * htmlParseLookupChars:

+ * @ctxt: an HTML parser context

+ * @stop: Array of chars, which stop the lookup.

+ * @stopLen: Length of stop-Array

+ *

+ * Try to find if any char of the stop-Array is available in the input

+ * stream.

+ * This function has a side effect of (possibly) incrementing ctxt->checkIndex

+ * to avoid rescanning sequences of bytes; it DOES change the state of the

+ * parser, so do not use it liberally.

+ *

+ * Returns the index to the current parsing point if a stopChar

+ * is available, -1 otherwise.

+ */

+static int

+htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,

+ int stopLen)

+ int base, len;

+ htmlParserInputPtr in;

+ const xmlChar *buf;

+ int incomment = 0;

+ int i;

+ in = ctxt->input;

+ if (in == NULL)

+ return (-1);

+ base = in->cur - in->base;

+ if (base < 0)

+ return (-1);

+ if (ctxt->checkIndex > base)

+ base = ctxt->checkIndex;

+ if (in->buf == NULL) {

+ buf = in->base;

+ len = in->length;

+ } else {

+ buf = in->buf->buffer->content;

+ len = in->buf->buffer->use;

+ }

+ for (; base < len; base++) {

+ if (!incomment && (base + 4 < len)) {

+ if ((buf[base] == '<') && (buf[base + 1] == '!') &&

+ (buf[base + 2] == '-') && (buf[base + 3] == '-')) {

+ incomment = 1;

+ /* do not increment past <! - some people use <!--> */

+ base += 2;

+ }

+ if (incomment) {

+ if (base + 3 > len)

+ return (-1);

+ if ((buf[base] == '-') && (buf[base + 1] == '-') &&

+ (buf[base + 2] == '>')) {

+ incomment = 0;

+ base += 2;

+ }

+ continue;

+ }

+ for (i = 0; i < stopLen; ++i) {

+ if (buf[base] == stop[i]) {

+ ctxt->checkIndex = 0;

+ return (base - (in->cur - in->base));

+ }

+ ctxt->checkIndex = base;

+ return (-1);

}

/**

@@ -4639,7 +5234,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

avail = in->buf->buffer->use - (in->cur - in->base);

if ((avail == 0) && (terminate)) {

htmlAutoCloseOnEnd(ctxt);

- if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {

+ if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {

* SAX: end of the document processing.

@@ -4689,7 +5284,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

(UPP(6) == 'Y') && (UPP(7) == 'P') &&

(UPP(8) == 'E')) {

if ((!terminate) &&

- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))

goto done;

#ifdef DEBUG_PUSH

xmlGenericError(xmlGenericErrorContext,

@@ -4722,7 +5317,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

if ((cur == '<') && (next == '!') &&

(in->cur[2] == '-') && (in->cur[3] == '-')) {

if ((!terminate) &&

- (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))

+ (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))

goto done;

#ifdef DEBUG_PUSH

xmlGenericError(xmlGenericErrorContext,

@@ -4732,7 +5327,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

ctxt->instate = XML_PARSER_MISC;

} else if ((cur == '<') && (next == '?')) {

if ((!terminate) &&

- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))

goto done;

#ifdef DEBUG_PUSH

xmlGenericError(xmlGenericErrorContext,

@@ -4746,7 +5341,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

(UPP(6) == 'Y') && (UPP(7) == 'P') &&

(UPP(8) == 'E')) {

if ((!terminate) &&

- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))

goto done;

#ifdef DEBUG_PUSH

xmlGenericError(xmlGenericErrorContext,

@@ -4775,14 +5370,14 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

avail = in->length - (in->cur - in->base);

else

avail = in->buf->buffer->use - (in->cur - in->base);

- if (avail < 2)

+ if (avail < 2)

goto done;

cur = in->cur[0];

next = in->cur[1];

if ((cur == '<') && (next == '!') &&

(in->cur[2] == '-') && (in->cur[3] == '-')) {

if ((!terminate) &&

- (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))

+ (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))

goto done;

#ifdef DEBUG_PUSH

xmlGenericError(xmlGenericErrorContext,

@@ -4792,7 +5387,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

ctxt->instate = XML_PARSER_PROLOG;

} else if ((cur == '<') && (next == '?')) {

if ((!terminate) &&

- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))

goto done;

#ifdef DEBUG_PUSH

xmlGenericError(xmlGenericErrorContext,

@@ -4829,7 +5424,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

if ((cur == '<') && (next == '!') &&

(in->cur[2] == '-') && (in->cur[3] == '-')) {

if ((!terminate) &&

- (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))

+ (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))

goto done;

#ifdef DEBUG_PUSH

xmlGenericError(xmlGenericErrorContext,

@@ -4839,7 +5434,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

ctxt->instate = XML_PARSER_EPILOG;

} else if ((cur == '<') && (next == '?')) {

if ((!terminate) &&

- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))

goto done;

#ifdef DEBUG_PUSH

xmlGenericError(xmlGenericErrorContext,

@@ -4889,7 +5484,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

break;

}

if ((!terminate) &&

- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))

goto done;

failed = htmlParseStartTag(ctxt);

@@ -4936,10 +5531,10 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

* end of parsing of this node.

- if (xmlStrEqual(name, ctxt->name)) {

+ if (xmlStrEqual(name, ctxt->name)) {

nodePop(ctxt);

htmlnamePop(ctxt);

- }

+ }

ctxt->instate = XML_PARSER_CONTENT;

#ifdef DEBUG_PUSH

@@ -5014,7 +5609,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

int idx;

xmlChar val;

- idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);

+ idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);

if (idx < 0)

goto done;

val = in->cur[idx + 2];

@@ -5041,7 +5636,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

(UPP(6) == 'Y') && (UPP(7) == 'P') &&

(UPP(8) == 'E')) {

if ((!terminate) &&

- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))

goto done;

htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,

"Misplaced DOCTYPE declaration\n",

@@ -5051,7 +5646,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

(in->cur[2] == '-') && (in->cur[3] == '-')) {

if ((!terminate) &&

(htmlParseLookupSequence(

- ctxt, '-', '-', '>', 1) < 0))

+ ctxt, '-', '-', '>', 1, 1) < 0))

goto done;

#ifdef DEBUG_PUSH

xmlGenericError(xmlGenericErrorContext,

@@ -5061,7 +5656,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

ctxt->instate = XML_PARSER_CONTENT;

} else if ((cur == '<') && (next == '?')) {

if ((!terminate) &&

- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))

goto done;

#ifdef DEBUG_PUSH

xmlGenericError(xmlGenericErrorContext,

@@ -5089,7 +5684,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

break;

} else if (cur == '&') {

if ((!terminate) &&

- (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))

+ (htmlParseLookupChars(ctxt,

+ BAD_CAST "; >/", 4) < 0))

goto done;

#ifdef DEBUG_PUSH

xmlGenericError(xmlGenericErrorContext,

@@ -5105,7 +5701,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

* data detection.

if ((!terminate) &&

- (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))

+ (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))

goto done;

ctxt->checkIndex = 0;

#ifdef DEBUG_PUSH

@@ -5131,7 +5727,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

if (avail < 2)

goto done;

if ((!terminate) &&

- (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

+ (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))

goto done;

htmlParseEndTag(ctxt);

if (ctxt->nameNr == 0) {

@@ -5258,10 +5854,10 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {

}

-done:

+done:

if ((avail == 0) && (terminate)) {

htmlAutoCloseOnEnd(ctxt);

- if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {

+ if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {

* SAX: end of the document processing.

@@ -5276,8 +5872,8 @@ done:

xmlDtdPtr dtd;

dtd = xmlGetIntSubset(ctxt->myDoc);

if (dtd == NULL)

- ctxt->myDoc->intSubset =

- xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",

+ ctxt->myDoc->intSubset =

+ xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",

BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",

BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");

}

@@ -5311,8 +5907,8 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,

int base = ctxt->input->base - ctxt->input->buf->buffer->content;

int cur = ctxt->input->cur - ctxt->input->base;

int res;

- res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);

+ res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);

if (res < 0) {

ctxt->errNo = XML_PARSER_EOF;

ctxt->disableSAX = 1;

@@ -5336,7 +5932,7 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,

if ((in->encoder != NULL) && (in->buffer != NULL) &&

(in->raw != NULL)) {

int nbchars;

nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);

if (nbchars < 0) {

htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,

@@ -5353,14 +5949,14 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,

(ctxt->instate != XML_PARSER_MISC)) {

ctxt->errNo = XML_ERR_DOCUMENT_END;

ctxt->wellFormed = 0;

- }

+ }

if (ctxt->instate != XML_PARSER_EOF) {

if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))

ctxt->sax->endDocument(ctxt->userData);

}

ctxt->instate = XML_PARSER_EOF;

}

- return((xmlParserErrors) ctxt->errNo);

+ return((xmlParserErrors) ctxt->errNo);

}

/************************************************************************

@@ -5385,7 +5981,7 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,

* Returns the new parser context or NULL

htmlParserCtxtPtr

-htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,

+htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,

const char *chunk, int size, const char *filename,

xmlCharEncoding enc) {

htmlParserCtxtPtr ctxt;

@@ -5416,7 +6012,7 @@ htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,

memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));

if (user_data != NULL)

ctxt->userData = user_data;

- }

+ }

if (filename == NULL) {

ctxt->directory = NULL;

} else {

@@ -5438,17 +6034,17 @@ htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,

inputStream->buf = buf;

inputStream->base = inputStream->buf->buffer->content;

inputStream->cur = inputStream->buf->buffer->content;

- inputStream->end =

+ inputStream->end =

&inputStream->buf->buffer->content[inputStream->buf->buffer->use];

inputPush(ctxt, inputStream);

if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&

- (ctxt->input->buf != NULL)) {

+ (ctxt->input->buf != NULL)) {

int base = ctxt->input->base - ctxt->input->buf->buffer->content;

int cur = ctxt->input->cur - ctxt->input->base;

- xmlParserInputBufferPush(ctxt->input->buf, size, chunk);

+ xmlParserInputBufferPush(ctxt->input->buf, size, chunk);

ctxt->input->base = ctxt->input->buf->buffer->content + base;

ctxt->input->cur = ctxt->input->base + cur;

@@ -5469,12 +6065,12 @@ htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,

* @cur: a pointer to an array of xmlChar

* @encoding: a free form C string describing the HTML document encoding, or NULL

* @sax: the SAX handler block

- * @userData: if using SAX, this pointer will be provided on callbacks.

+ * @userData: if using SAX, this pointer will be provided on callbacks.

* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks

* to handle parse events. If sax is NULL, fallback to the default DOM

* behavior and return a tree.

- *

+ *

* Returns the resulting document tree unless SAX is NULL or the document is

* not well formed.

@@ -5491,7 +6087,7 @@ htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void

ctxt = htmlCreateDocParserCtxt(cur, encoding);

if (ctxt == NULL) return(NULL);

- if (sax != NULL) {

+ if (sax != NULL) {

if (ctxt->sax != NULL) xmlFree (ctxt->sax);

ctxt->sax = sax;

ctxt->userData = userData;

@@ -5504,7 +6100,7 @@ htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void

ctxt->userData = NULL;

}

htmlFreeParserCtxt(ctxt);

return(ret);

}

@@ -5514,7 +6110,7 @@ htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void

* @encoding: a free form C string describing the HTML document encoding, or NULL

* parse an HTML in-memory document and build a tree.

- *

+ *

* Returns the resulting document tree

@@ -5529,7 +6125,7 @@ htmlParseDoc(xmlChar *cur, const char *encoding) {

* @filename: the filename

* @encoding: a free form C string describing the HTML document encoding, or NULL

- * Create a parser context for a file content.

+ * Create a parser context for a file content.

* Automatic support for ZLIB/Compress compressed document is provided

* by default if found at compile-time.

@@ -5561,7 +6157,7 @@ htmlCreateFileParserCtxt(const char *filename, const char *encoding)

xmlFreeParserCtxt(ctxt);

return(NULL);

}

inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);

xmlFree(canonicFilename);

if (inputStream == NULL) {

@@ -5574,14 +6170,14 @@ htmlCreateFileParserCtxt(const char *filename, const char *encoding)

/* set encoding */

if (encoding) {

content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);

- if (content) {

+ if (content) {

strcpy ((char *)content, (char *)content_line);

strcat ((char *)content, (char *)encoding);

htmlCheckEncoding (ctxt, content);

xmlFree (content);

}

return(ctxt);

}

@@ -5590,7 +6186,7 @@ htmlCreateFileParserCtxt(const char *filename, const char *encoding)

* @filename: the filename

* @encoding: a free form C string describing the HTML document encoding, or NULL

* @sax: the SAX handler block

- * @userData: if using SAX, this pointer will be provided on callbacks.

+ * @userData: if using SAX, this pointer will be provided on callbacks.

* parse an HTML file and build a tree. Automatic support for ZLIB/Compress

* compressed document is provided by default if found at compile-time.

@@ -5602,7 +6198,7 @@ htmlCreateFileParserCtxt(const char *filename, const char *encoding)

htmlDocPtr

-htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,

+htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,

void *userData) {

htmlDocPtr ret;

htmlParserCtxtPtr ctxt;

@@ -5626,7 +6222,7 @@ htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr s

ctxt->userData = NULL;

}

htmlFreeParserCtxt(ctxt);

return(ret);

}

@@ -5648,7 +6244,7 @@ htmlParseFile(const char *filename, const char *encoding) {

/**

* htmlHandleOmittedElem:

- * @val: int 0 or 1

+ * @val: int 0 or 1

* Set and return the previous value for handling HTML omitted tags.

@@ -5788,7 +6384,7 @@ htmlNodeStatus(const htmlNodePtr node, int legacy) {

* current scope

#define DICT_FREE(str) \

- if ((str) && ((!dict) || \

+ if ((str) && ((!dict) || \

(xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \

xmlFree((char *)(str));

@@ -5803,7 +6399,7 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt)

{

xmlParserInputPtr input;

xmlDictPtr dict;

if (ctxt == NULL)

return;

@@ -5930,6 +6526,10 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)

ctxt->options |= HTML_PARSE_COMPACT;

options -= HTML_PARSE_COMPACT;

}

+ if (options & XML_PARSE_HUGE) {

+ ctxt->options |= XML_PARSE_HUGE;

+ options -= XML_PARSE_HUGE;

+ }

ctxt->dictNames = 0;

return (options);

}

@@ -5943,7 +6543,7 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)

* @reuse: keep the context for reuse

* Common front-end for the htmlRead functions

- *

+ *

* Returns the resulting document tree or NULL

static htmlDocPtr

@@ -5951,15 +6551,19 @@ htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,

int options, int reuse)

{

htmlDocPtr ret;

htmlCtxtUseOptions(ctxt, options);

ctxt->html = 1;

if (encoding != NULL) {

xmlCharEncodingHandlerPtr hdlr;

hdlr = xmlFindCharEncodingHandler(encoding);

- if (hdlr != NULL)

+ if (hdlr != NULL) {

xmlSwitchToEncoding(ctxt, hdlr);

+ if (ctxt->input->encoding != NULL)

+ xmlFree((xmlChar *) ctxt->input->encoding);

+ ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);

+ }

}

if ((URL != NULL) && (ctxt->input != NULL) &&

(ctxt->input->filename == NULL))

@@ -5985,7 +6589,7 @@ htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,

* @options: a combination of htmlParserOption(s)

* parse an XML in-memory document and build a tree.

- *

+ *

* Returns the resulting document tree

htmlDocPtr

@@ -6010,7 +6614,7 @@ htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int opti

* @options: a combination of htmlParserOption(s)

* parse an XML file from the filesystem or the network.

- *

+ *

* Returns the resulting document tree

htmlDocPtr

@@ -6034,7 +6638,7 @@ htmlReadFile(const char *filename, const char *encoding, int options)

* @options: a combination of htmlParserOption(s)

* parse an XML in-memory document and build a tree.

- *

+ *

* Returns the resulting document tree

htmlDocPtr

@@ -6060,7 +6664,7 @@ htmlReadMemory(const char *buffer, int size, const char *URL, const char *encodi

* @options: a combination of htmlParserOption(s)

* parse an XML from a file descriptor and build a tree.

- *

+ *

* Returns the resulting document tree

htmlDocPtr

@@ -6102,7 +6706,7 @@ htmlReadFd(int fd, const char *URL, const char *encoding, int options)

* @options: a combination of htmlParserOption(s)

* parse an HTML document from I/O functions and source and build a tree.

- *

+ *

* Returns the resulting document tree

htmlDocPtr

@@ -6146,7 +6750,7 @@ htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,

* parse an XML in-memory document and build a tree.

* This reuses the existing @ctxt parser context

- *

+ *

* Returns the resulting document tree

htmlDocPtr

@@ -6179,7 +6783,7 @@ htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,

* parse an XML file from the filesystem or the network.

* This reuses the existing @ctxt parser context

- *

+ *

* Returns the resulting document tree

htmlDocPtr

@@ -6214,7 +6818,7 @@ htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,

* parse an XML in-memory document and build a tree.

* This reuses the existing @ctxt parser context

- *

+ *

* Returns the resulting document tree

htmlDocPtr

@@ -6256,7 +6860,7 @@ htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,

* parse an XML from a file descriptor and build a tree.

* This reuses the existing @ctxt parser context

- *

+ *

* Returns the resulting document tree

htmlDocPtr

@@ -6298,7 +6902,7 @@ htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,

* parse an HTML document from I/O functions and source and build a tree.

* This reuses the existing @ctxt parser context

- *

+ *

* Returns the resulting document tree

htmlDocPtr

« no previous file with comments | « third_party/libxml/ChangeLog ('k') | third_party/libxml/HTMLtree.c » ('j') | third_party/libxml/patches/icu » ('J')