Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(107)

Side by Side Diff: third_party/libxml/HTMLparser.c

Issue 2951008: Update libxml to 2.7.7. (Closed) Base URL: http://src.chromium.org/git/chromium.git
Patch Set: Created 10 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser 2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 * 3 *
4 * See Copyright for the status of this software. 4 * See Copyright for the status of this software.
5 * 5 *
6 * daniel@veillard.com 6 * daniel@veillard.com
7 */ 7 */
8 8
9 #define IN_LIBXML 9 #define IN_LIBXML
10 #include "libxml.h" 10 #include "libxml.h"
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
52 /* #define DEBUG_PUSH */ 52 /* #define DEBUG_PUSH */
53 53
54 static int htmlOmittedDefaultValue = 1; 54 static int htmlOmittedDefaultValue = 1;
55 55
56 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, 56 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57 xmlChar end, xmlChar end2, xmlChar end3); 57 xmlChar end, xmlChar end2, xmlChar end3);
58 static void htmlParseComment(htmlParserCtxtPtr ctxt); 58 static void htmlParseComment(htmlParserCtxtPtr ctxt);
59 59
60 /************************************************************************ 60 /************************************************************************
61 * * 61 * *
62 * » » Some factorized error routines» » » » * 62 *» » Some factorized error routines» » » » *
63 * * 63 * *
64 ************************************************************************/ 64 ************************************************************************/
65 65
66 /** 66 /**
67 * htmlErrMemory: 67 * htmlErrMemory:
68 * @ctxt: an HTML parser context 68 * @ctxt: an HTML parser context
69 * @extra: extra informations 69 * @extra: extra informations
70 * 70 *
71 * Handle a redefinition of attribute error 71 * Handle a redefinition of attribute error
72 */ 72 */
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
140 ctxt->errNo = error; 140 ctxt->errNo = error;
141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 141 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
142 XML_ERR_ERROR, NULL, 0, NULL, NULL, 142 XML_ERR_ERROR, NULL, 0, NULL, NULL,
143 NULL, val, 0, msg, val); 143 NULL, val, 0, msg, val);
144 if (ctxt != NULL) 144 if (ctxt != NULL)
145 ctxt->wellFormed = 0; 145 ctxt->wellFormed = 0;
146 } 146 }
147 147
148 /************************************************************************ 148 /************************************************************************
149 * * 149 * *
150 * » » Parser stacks related functions and macros» » * 150 *» Parser stacks related functions and macros» » *
151 * * 151 * *
152 ************************************************************************/ 152 ************************************************************************/
153 153
154 /** 154 /**
155 * htmlnamePush: 155 * htmlnamePush:
156 * @ctxt: an HTML parser context 156 * @ctxt: an HTML parser context
157 * @value: the element name 157 * @value: the element name
158 * 158 *
159 * Pushes a new element name on top of the name stack 159 * Pushes a new element name on top of the name stack
160 * 160 *
161 * Returns 0 in case of error, the index in the stack otherwise 161 * Returns 0 in case of error, the index in the stack otherwise
162 */ 162 */
163 static int 163 static int
164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value) 164 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
165 { 165 {
166 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
167 ctxt->html = 3;
168 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
169 ctxt->html = 10;
166 if (ctxt->nameNr >= ctxt->nameMax) { 170 if (ctxt->nameNr >= ctxt->nameMax) {
167 ctxt->nameMax *= 2; 171 ctxt->nameMax *= 2;
168 ctxt->nameTab = (const xmlChar * *) 172 ctxt->nameTab = (const xmlChar * *)
169 xmlRealloc((xmlChar * *)ctxt->nameTab, 173 xmlRealloc((xmlChar * *)ctxt->nameTab,
170 ctxt->nameMax * 174 ctxt->nameMax *
171 sizeof(ctxt->nameTab[0])); 175 sizeof(ctxt->nameTab[0]));
172 if (ctxt->nameTab == NULL) { 176 if (ctxt->nameTab == NULL) {
173 htmlErrMemory(ctxt, NULL); 177 htmlErrMemory(ctxt, NULL);
174 return (0); 178 return (0);
175 } 179 }
(...skipping 22 matching lines...) Expand all
198 return (NULL); 202 return (NULL);
199 if (ctxt->nameNr > 0) 203 if (ctxt->nameNr > 0)
200 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1]; 204 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201 else 205 else
202 ctxt->name = NULL; 206 ctxt->name = NULL;
203 ret = ctxt->nameTab[ctxt->nameNr]; 207 ret = ctxt->nameTab[ctxt->nameNr];
204 ctxt->nameTab[ctxt->nameNr] = NULL; 208 ctxt->nameTab[ctxt->nameNr] = NULL;
205 return (ret); 209 return (ret);
206 } 210 }
207 211
212 /**
213 * htmlNodeInfoPush:
214 * @ctxt: an HTML parser context
215 * @value: the node info
216 *
217 * Pushes a new node info entry on top of the node info stack
218 *
219 * Returns 0 in case of error, the index in the stack otherwise
220 */
221 static int
222 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
223 {
224 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
225 if (ctxt->nodeInfoMax == 0)
226 ctxt->nodeInfoMax = 5;
227 ctxt->nodeInfoMax *= 2;
228 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
229 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
230 ctxt->nodeInfoMax *
231 sizeof(ctxt->nodeInfoTab[0]));
232 if (ctxt->nodeInfoTab == NULL) {
233 htmlErrMemory(ctxt, NULL);
234 return (0);
235 }
236 }
237 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
238 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
239 return (ctxt->nodeInfoNr++);
240 }
241
242 /**
243 * htmlNodeInfoPop:
244 * @ctxt: an HTML parser context
245 *
246 * Pops the top node info entry from the node info stack
247 *
248 * Returns NULL in case of error, the pointer to NodeInfo otherwise
249 */
250 static htmlParserNodeInfo *
251 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
252 {
253 if (ctxt->nodeInfoNr <= 0)
254 return (NULL);
255 ctxt->nodeInfoNr--;
256 if (ctxt->nodeInfoNr < 0)
257 return (NULL);
258 if (ctxt->nodeInfoNr > 0)
259 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
260 else
261 ctxt->nodeInfo = NULL;
262 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
263 }
264
208 /* 265 /*
209 * Macros for accessing the content. Those should be used only by the parser, 266 * Macros for accessing the content. Those should be used only by the parser,
210 * and not exported. 267 * and not exported.
211 * 268 *
212 * Dirty macros, i.e. one need to make assumption on the context to use them 269 * Dirty macros, i.e. one need to make assumption on the context to use them
213 * 270 *
214 * CUR_PTR return the current pointer to the xmlChar to be parsed. 271 * CUR_PTR return the current pointer to the xmlChar to be parsed.
215 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled 272 * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
216 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled 273 * in ISO-Latin or UTF-8, and the current 16 bit value if compiled
217 * in UNICODE mode. This should be used internally by the parser 274 * in UNICODE mode. This should be used internally by the parser
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
256 313
257 #define SKIP_BLANKS htmlSkipBlankChars(ctxt) 314 #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
258 315
259 /* Imported from XML */ 316 /* Imported from XML */
260 317
261 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */ 318 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
262 #define CUR ((int) (*ctxt->input->cur)) 319 #define CUR ((int) (*ctxt->input->cur))
263 #define NEXT xmlNextChar(ctxt) 320 #define NEXT xmlNextChar(ctxt)
264 321
265 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur)) 322 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
266 #define NXT(val) ctxt->input->cur[(val)]
267 #define CUR_PTR ctxt->input->cur
268 323
269 324
270 #define NEXTL(l) do { \ 325 #define NEXTL(l) do { \
271 if (*(ctxt->input->cur) == '\n') { \ 326 if (*(ctxt->input->cur) == '\n') { \
272 ctxt->input->line++; ctxt->input->col = 1; \ 327 ctxt->input->line++; ctxt->input->col = 1; \
273 } else ctxt->input->col++; \ 328 } else ctxt->input->col++; \
274 ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \ 329 ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
275 } while (0) 330 } while (0)
276 331
277 /************ 332 /************
278 \ 333 \
279 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \ 334 if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
280 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt); 335 if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
281 ************/ 336 ************/
282 337
283 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l) 338 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
284 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l) 339 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
285 340
286 #define COPY_BUF(l,b,i,v) \ 341 #define COPY_BUF(l,b,i,v) \
287 if (l == 1) b[i++] = (xmlChar) v; \ 342 if (l == 1) b[i++] = (xmlChar) v; \
288 else i += xmlCopyChar(l,&b[i],v) 343 else i += xmlCopyChar(l,&b[i],v)
289 344
290 /** 345 /**
346 * htmlFindEncoding:
347 * @ctxt: the HTML parser context
348 *
349 * Try to find an encoding in the current data available in the input
350 * buffer; this is needed to try to switch to the proper encoding when
351 * one faces a character error.
352 * This is a heuristic: since it operates outside of parsing, it could
353 * try to use a meta which had been commented out; that's the reason it
354 * should only be used in case of error, not as a default.
355 *
356 * Returns an encoding string or NULL if not found; the string needs to
357 * be freed
358 */
359 static xmlChar *
360 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
361 const xmlChar *start, *cur, *end;
362
363 if ((ctxt == NULL) || (ctxt->input == NULL) ||
364 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
365 (ctxt->input->buf->encoder != NULL))
366 return(NULL);
367 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
368 return(NULL);
369
370 start = ctxt->input->cur;
371 end = ctxt->input->end;
372 /* we also expect the input buffer to be zero terminated */
373 if (*end != 0)
374 return(NULL);
375
376 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
377 if (cur == NULL)
378 return(NULL);
379 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
380 if (cur == NULL)
381 return(NULL);
382 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
383 if (cur == NULL)
384 return(NULL);
385 cur += 8;
386 start = cur;
387 while (((*cur >= 'A') && (*cur <= 'Z')) ||
388 ((*cur >= 'a') && (*cur <= 'z')) ||
389 ((*cur >= '0') && (*cur <= '9')) ||
390 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
391 cur++;
392 if (cur == start)
393 return(NULL);
394 return(xmlStrndup(start, cur - start));
395 }
396
397 /**
291 * htmlCurrentChar: 398 * htmlCurrentChar:
292 * @ctxt: the HTML parser context 399 * @ctxt: the HTML parser context
293 * @len: pointer to the length of the char read 400 * @len: pointer to the length of the char read
294 * 401 *
295 * The current char value, if using UTF-8 this may actually span multiple 402 * The current char value, if using UTF-8 this may actually span multiple
296 * bytes in the input buffer. Implement the end of line normalization: 403 * bytes in the input buffer. Implement the end of line normalization:
297 * 2.11 End-of-Line Handling 404 * 2.11 End-of-Line Handling
298 * If the encoding is unspecified, in the case we find an ISO-Latin-1 405 * If the encoding is unspecified, in the case we find an ISO-Latin-1
299 * char, then the encoding converter is plugged in automatically. 406 * char, then the encoding converter is plugged in automatically.
300 * 407 *
301 * Returns the current char value and its length 408 * Returns the current char value and its length
302 */ 409 */
303 410
304 static int 411 static int
305 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { 412 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306 if (ctxt->instate == XML_PARSER_EOF) 413 if (ctxt->instate == XML_PARSER_EOF)
307 return(0); 414 return(0);
308 415
309 if (ctxt->token != 0) { 416 if (ctxt->token != 0) {
310 *len = 0; 417 *len = 0;
311 return(ctxt->token); 418 return(ctxt->token);
312 }» 419 }
313 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { 420 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
314 /* 421 /*
315 * We are supposed to handle UTF8, check it's valid 422 * We are supposed to handle UTF8, check it's valid
316 * From rfc2044: encoding of the Unicode values on UTF-8: 423 * From rfc2044: encoding of the Unicode values on UTF-8:
317 * 424 *
318 * UCS-4 range (hex.) UTF-8 octet sequence (binary) 425 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
319 * 0000 0000-0000 007F 0xxxxxxx 426 * 0000 0000-0000 007F 0xxxxxxx
320 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx 427 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
321 » * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 428 » * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
322 * 429 *
323 * Check for the 0x110000 limit too 430 * Check for the 0x110000 limit too
324 */ 431 */
325 const unsigned char *cur = ctxt->input->cur; 432 const unsigned char *cur = ctxt->input->cur;
326 unsigned char c; 433 unsigned char c;
327 unsigned int val; 434 unsigned int val;
328 435
329 c = *cur; 436 c = *cur;
330 if (c & 0x80) { 437 if (c & 0x80) {
331 » if (cur[1] == 0) 438 » if (cur[1] == 0) {
332 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 439 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
440 cur = ctxt->input->cur;
441 }
333 if ((cur[1] & 0xc0) != 0x80) 442 if ((cur[1] & 0xc0) != 0x80)
334 goto encoding_error; 443 goto encoding_error;
335 if ((c & 0xe0) == 0xe0) { 444 if ((c & 0xe0) == 0xe0) {
336 445
337 » » if (cur[2] == 0) 446 » » if (cur[2] == 0) {
338 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 447 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
448 cur = ctxt->input->cur;
449 }
339 if ((cur[2] & 0xc0) != 0x80) 450 if ((cur[2] & 0xc0) != 0x80)
340 goto encoding_error; 451 goto encoding_error;
341 if ((c & 0xf0) == 0xf0) { 452 if ((c & 0xf0) == 0xf0) {
342 » » if (cur[3] == 0) 453 » » if (cur[3] == 0) {
343 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 454 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
455 cur = ctxt->input->cur;
456 }
344 if (((c & 0xf8) != 0xf0) || 457 if (((c & 0xf8) != 0xf0) ||
345 ((cur[3] & 0xc0) != 0x80)) 458 ((cur[3] & 0xc0) != 0x80))
346 goto encoding_error; 459 goto encoding_error;
347 /* 4-byte code */ 460 /* 4-byte code */
348 *len = 4; 461 *len = 4;
349 val = (cur[0] & 0x7) << 18; 462 val = (cur[0] & 0x7) << 18;
350 val |= (cur[1] & 0x3f) << 12; 463 val |= (cur[1] & 0x3f) << 12;
351 val |= (cur[2] & 0x3f) << 6; 464 val |= (cur[2] & 0x3f) << 6;
352 val |= cur[3] & 0x3f; 465 val |= cur[3] & 0x3f;
353 } else { 466 } else {
354 /* 3-byte code */ 467 /* 3-byte code */
355 *len = 3; 468 *len = 3;
356 val = (cur[0] & 0xf) << 12; 469 val = (cur[0] & 0xf) << 12;
357 val |= (cur[1] & 0x3f) << 6; 470 val |= (cur[1] & 0x3f) << 6;
358 val |= cur[2] & 0x3f; 471 val |= cur[2] & 0x3f;
359 } 472 }
360 } else { 473 } else {
361 /* 2-byte code */ 474 /* 2-byte code */
362 *len = 2; 475 *len = 2;
363 val = (cur[0] & 0x1f) << 6; 476 val = (cur[0] & 0x1f) << 6;
364 val |= cur[1] & 0x3f; 477 val |= cur[1] & 0x3f;
365 } 478 }
366 if (!IS_CHAR(val)) { 479 if (!IS_CHAR(val)) {
367 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 480 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368 "Char 0x%X out of allowed range\n", val); 481 "Char 0x%X out of allowed range\n", val);
369 » } 482 » }
370 return(val); 483 return(val);
371 } else { 484 } else {
485 if ((*ctxt->input->cur == 0) &&
486 (ctxt->input->cur < ctxt->input->end)) {
487 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
488 "Char 0x%X out of allowed range\n", 0);
489 *len = 1;
490 return(' ');
491 }
372 /* 1-byte code */ 492 /* 1-byte code */
373 *len = 1; 493 *len = 1;
374 return((int) *ctxt->input->cur); 494 return((int) *ctxt->input->cur);
375 } 495 }
376 } 496 }
377 /* 497 /*
378 * Assume it's a fixed length encoding (1) with 498 * Assume it's a fixed length encoding (1) with
379 * a compatible encoding for the ASCII set, since 499 * a compatible encoding for the ASCII set, since
380 * XML constructs only use < 128 chars 500 * XML constructs only use < 128 chars
381 */ 501 */
382 *len = 1; 502 *len = 1;
383 if ((int) *ctxt->input->cur < 0x80) 503 if ((int) *ctxt->input->cur < 0x80)
384 return((int) *ctxt->input->cur); 504 return((int) *ctxt->input->cur);
385 505
386 /* 506 /*
387 * Humm this is bad, do an automatic flow conversion 507 * Humm this is bad, do an automatic flow conversion
388 */ 508 */
389 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); 509 {
390 ctxt->charset = XML_CHAR_ENCODING_UTF8; 510 xmlChar * guess;
511 xmlCharEncodingHandlerPtr handler;
512
513 guess = htmlFindEncoding(ctxt);
514 if (guess == NULL) {
515 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
516 } else {
517 if (ctxt->input->encoding != NULL)
518 xmlFree((xmlChar *) ctxt->input->encoding);
519 ctxt->input->encoding = guess;
520 handler = xmlFindCharEncodingHandler((const char *) guess);
521 if (handler != NULL) {
522 xmlSwitchToEncoding(ctxt, handler);
523 } else {
524 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
525 "Unsupported encoding %s", guess, NULL);
526 }
527 }
528 ctxt->charset = XML_CHAR_ENCODING_UTF8;
529 }
530
391 return(xmlCurrentChar(ctxt, len)); 531 return(xmlCurrentChar(ctxt, len));
392 532
393 encoding_error: 533 encoding_error:
394 /* 534 /*
395 * If we detect an UTF8 error that probably mean that the 535 * If we detect an UTF8 error that probably mean that the
396 * input encoding didn't get properly advertized in the 536 * input encoding didn't get properly advertized in the
397 * declaration header. Report the error and switch the encoding 537 * declaration header. Report the error and switch the encoding
398 * to ISO-Latin-1 (if you don't like this policy, just declare the 538 * to ISO-Latin-1 (if you don't like this policy, just declare the
399 * encoding !) 539 * encoding !)
400 */ 540 */
401 { 541 {
402 char buffer[150]; 542 char buffer[150];
403 543
404 if (ctxt->input->end - ctxt->input->cur >= 4) { 544 if (ctxt->input->end - ctxt->input->cur >= 4) {
405 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", 545 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
406 ctxt->input->cur[0], ctxt->input->cur[1], 546 ctxt->input->cur[0], ctxt->input->cur[1],
407 ctxt->input->cur[2], ctxt->input->cur[3]); 547 ctxt->input->cur[2], ctxt->input->cur[3]);
408 } else { 548 } else {
409 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]); 549 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
410 } 550 }
411 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 551 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
412 "Input is not proper UTF-8, indicate encoding !\n", 552 "Input is not proper UTF-8, indicate encoding !\n",
413 BAD_CAST buffer, NULL); 553 BAD_CAST buffer, NULL);
414 } 554 }
415 555
416 ctxt->charset = XML_CHAR_ENCODING_8859_1; 556 ctxt->charset = XML_CHAR_ENCODING_8859_1;
417 *len = 1; 557 *len = 1;
418 return((int) *ctxt->input->cur); 558 return((int) *ctxt->input->cur);
419 } 559 }
420 560
421 /** 561 /**
422 * htmlSkipBlankChars: 562 * htmlSkipBlankChars:
423 * @ctxt: the HTML parser context 563 * @ctxt: the HTML parser context
424 * 564 *
425 * skip all blanks character found at that point in the input streams. 565 * skip all blanks character found at that point in the input streams.
426 * 566 *
(...skipping 19 matching lines...) Expand all
446 } 586 }
447 res++; 587 res++;
448 } 588 }
449 return(res); 589 return(res);
450 } 590 }
451 591
452 592
453 593
454 /************************************************************************ 594 /************************************************************************
455 * * 595 * *
456 * » » The list of HTML elements and their properties» » * 596 *» The list of HTML elements and their properties» » *
457 * * 597 * *
458 ************************************************************************/ 598 ************************************************************************/
459 599
460 /* 600 /*
461 * Start Tag: 1 means the start tag can be omitted 601 * Start Tag: 1 means the start tag can be omitted
462 * End Tag: 1 means the end tag can be omitted 602 * End Tag: 1 means the end tag can be omitted
463 * 2 means it's forbidden (empty elements) 603 * 2 means it's forbidden (empty elements)
464 * 3 means the tag is stylistic and should be closed easily 604 * 3 means the tag is stylistic and should be closed easily
465 * Depr: this element is deprecated 605 * Depr: this element is deprecated
466 * DTD: 1 means that this element is valid only in the Loose DTD 606 * DTD: 1 means that this element is valid only in the Loose DTD
467 * 2 means that this element is valid only in the Frameset DTD 607 * 2 means that this element is valid only in the Frameset DTD
468 * 608 *
469 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description 609 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
470 , subElements , impliedsubelt , Attributes, userdata 610 , subElements , impliedsubelt , Attributes, userdata
471 */ 611 */
472 612
473 /* Definitions and a couple of vars for HTML Elements */ 613 /* Definitions and a couple of vars for HTML Elements */
474 614
475 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small" 615 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
476 #define NB_FONTSTYLE 8 616 #define NB_FONTSTYLE 8
477 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abb r", "acronym" 617 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abb r", "acronym"
478 #define NB_PHRASE 10 618 #define NB_PHRASE 10
479 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br ", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe" 619 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br ", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
480 #define NB_SPECIAL 16 620 #define NB_SPECIAL 16
481 #define INLINE PCDATA FONTSTYLE PHRASE SPECIAL FORMCTRL 621 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
482 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTR L 622 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTR L
483 #define BLOCK HEADING, LIST "pre", "p", "dl", "div", "center", "noscript", "nofr ames", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address" 623 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "nof rames", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
484 #define NB_BLOCK NB_HEADING + NB_LIST + 14 624 #define NB_BLOCK NB_HEADING + NB_LIST + 14
485 #define FORMCTRL "input", "select", "textarea", "label", "button" 625 #define FORMCTRL "input", "select", "textarea", "label", "button"
486 #define NB_FORMCTRL 5 626 #define NB_FORMCTRL 5
487 #define PCDATA 627 #define PCDATA
488 #define NB_PCDATA 0 628 #define NB_PCDATA 0
489 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6" 629 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
490 #define NB_HEADING 6 630 #define NB_HEADING 6
491 #define LIST "ul", "ol", "dir", "menu" 631 #define LIST "ul", "ol", "dir", "menu"
492 #define NB_LIST 4 632 #define NB_LIST 4
493 #define MODIFIER 633 #define MODIFIER
(...skipping 105 matching lines...) Expand 10 before | Expand all | Expand 10 after
599 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ; 739 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
600 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selecte d", "value", NULL } ; 740 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selecte d", "value", NULL } ;
601 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", N ULL } ; 741 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", N ULL } ;
602 static const char* const width_attr[] = { "width", NULL } ; 742 static const char* const width_attr[] = { "width", NULL } ;
603 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "st rike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ; 743 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "st rike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
604 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ; 744 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
605 static const char* const language_attr[] = { "language", NULL } ; 745 static const char* const language_attr[] = { "language", NULL } ;
606 static const char* const select_content[] = { "optgroup", "option", NULL } ; 746 static const char* const select_content[] = { "optgroup", "option", NULL } ;
607 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", " disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ; 747 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", " disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
608 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ; 748 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
609 static const char* const table_attrs[] = { ATTRS "summary", "width", "border", " frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ; 749 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
610 static const char* const table_depr[] = { "align", "bgcolor", NULL } ; 750 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
611 static const char* const table_contents[] = { "caption", "col", "colgroup", "the ad", "tfoot", "tbody", "tr", NULL} ; 751 static const char* const table_contents[] = { "caption", "col", "colgroup", "the ad", "tfoot", "tbody", "tr", NULL} ;
612 static const char* const tr_elt[] = { "tr", NULL } ; 752 static const char* const tr_elt[] = { "tr", NULL } ;
613 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ; 753 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
614 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height" , NULL } ; 754 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height" , NULL } ;
615 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "sco pe", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ; 755 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "sco pe", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
616 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readon ly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ; 756 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readon ly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
617 static const char* const tr_contents[] = { "th", "td", NULL } ; 757 static const char* const tr_contents[] = { "th", "td", NULL } ;
618 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ; 758 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
619 static const char* const li_elt[] = { "li", NULL } ; 759 static const char* const li_elt[] = { "li", NULL } ;
(...skipping 311 matching lines...) Expand 10 before | Expand all | Expand 10 after
931 "dl", "p", "dt", "menu", "dir", "address", "pre", "listing", 1071 "dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
932 "xmp", "head", NULL, 1072 "xmp", "head", NULL,
933 "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp", 1073 "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
934 "head", "dd", NULL, 1074 "head", "dd", NULL,
935 "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp", 1075 "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
936 "head", "dt", NULL, 1076 "head", "dt", NULL,
937 "ul", "p", "head", "ol", "menu", "dir", "address", "pre", 1077 "ul", "p", "head", "ol", "menu", "dir", "address", "pre",
938 "listing", "xmp", NULL, 1078 "listing", "xmp", NULL,
939 "ol", "p", "head", "ul", NULL, 1079 "ol", "p", "head", "ul", NULL,
940 "menu", "p", "head", "ul", NULL, 1080 "menu", "p", "head", "ul", NULL,
941 "p",» » "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL, 1081 "p",» » "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL ,
942 "div", "p", "head", NULL, 1082 "div", "p", "head", NULL,
943 "noscript", "p", "head", NULL, 1083 "noscript", "p", "head", NULL,
944 "center", "font", "b", "i", "p", "head", NULL, 1084 "center", "font", "b", "i", "p", "head", NULL,
945 "a", "a", NULL, 1085 "a", "a", NULL,
946 "caption", "p", NULL, 1086 "caption", "p", NULL,
947 "colgroup", "caption", "colgroup", "col", "p", NULL, 1087 "colgroup", "caption", "colgroup", "col", "p", NULL,
948 "col", "caption", "col", "p", NULL, 1088 "col", "caption", "col", "p", NULL,
949 "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", 1089 "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
950 "listing", "xmp", "a", NULL, 1090 "listing", "xmp", "a", NULL,
951 "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 1091 "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
952 "td",» » "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 1092 "td",» » "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
953 "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL, 1093 "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
954 "thead", "caption", "col", "colgroup", NULL, 1094 "thead", "caption", "col", "colgroup", NULL,
955 "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead", 1095 "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
956 "tbody", "p", NULL, 1096 "tbody", "p", NULL,
957 "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead", 1097 "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
958 "tfoot", "tbody", "p", NULL, 1098 "tfoot", "tbody", "p", NULL,
959 "optgroup", "option", NULL, 1099 "optgroup", "option", NULL,
960 "option", "option", NULL, 1100 "option", "option", NULL,
961 "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", 1101 "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
962 "pre", "listing", "xmp", "a", NULL, 1102 "pre", "listing", "xmp", "a", NULL,
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
1001 "onchange", 1141 "onchange",
1002 "onselect" 1142 "onselect"
1003 }; 1143 };
1004 1144
1005 /* 1145 /*
1006 * This table is used by the htmlparser to know what to do with 1146 * This table is used by the htmlparser to know what to do with
1007 * broken html pages. By assigning different priorities to different 1147 * broken html pages. By assigning different priorities to different
1008 * elements the parser can decide how to handle extra endtags. 1148 * elements the parser can decide how to handle extra endtags.
1009 * Endtags are only allowed to close elements with lower or equal 1149 * Endtags are only allowed to close elements with lower or equal
1010 * priority. 1150 * priority.
1011 */ 1151 */
1012 1152
1013 typedef struct { 1153 typedef struct {
1014 const char *name; 1154 const char *name;
1015 int priority; 1155 int priority;
1016 } elementPriority; 1156 } elementPriority;
1017 1157
1018 static const elementPriority htmlEndPriority[] = { 1158 static const elementPriority htmlEndPriority[] = {
1019 {"div", 150}, 1159 {"div", 150},
1020 {"td", 160}, 1160 {"td", 160},
1021 {"th", 160}, 1161 {"th", 160},
1022 {"tr", 170}, 1162 {"tr", 170},
1023 {"thead", 180}, 1163 {"thead", 180},
1024 {"tbody", 180}, 1164 {"tbody", 180},
1025 {"tfoot", 180}, 1165 {"tfoot", 180},
1026 {"table", 190}, 1166 {"table", 190},
1027 {"head", 200}, 1167 {"head", 200},
1028 {"body", 200}, 1168 {"body", 200},
1029 {"html", 220}, 1169 {"html", 220},
1030 {NULL, 100} /* Default priority */ 1170 {NULL, 100} /* Default priority */
1031 }; 1171 };
1032 1172
1033 static const char** htmlStartCloseIndex[100]; 1173 static const char** htmlStartCloseIndex[100];
1034 static int htmlStartCloseIndexinitialized = 0; 1174 static int htmlStartCloseIndexinitialized = 0;
1035 1175
1036 /************************************************************************ 1176 /************************************************************************
1037 * * 1177 * *
1038 * » » functions to handle HTML specific data» » » * 1178 *» functions to handle HTML specific data» » » *
1039 * * 1179 * *
1040 ************************************************************************/ 1180 ************************************************************************/
1041 1181
1042 /** 1182 /**
1043 * htmlInitAutoClose: 1183 * htmlInitAutoClose:
1044 * 1184 *
1045 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1185 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1046 * This is not reentrant. Call xmlInitParser() once before processing in 1186 * This is not reentrant. Call xmlInitParser() once before processing in
1047 * case of use in multithreaded programs. 1187 * case of use in multithreaded programs.
1048 */ 1188 */
(...skipping 29 matching lines...) Expand all
1078 sizeof(html40ElementTable[0]));i++) { 1218 sizeof(html40ElementTable[0]));i++) {
1079 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name)) 1219 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1080 return((htmlElemDescPtr) &html40ElementTable[i]); 1220 return((htmlElemDescPtr) &html40ElementTable[i]);
1081 } 1221 }
1082 return(NULL); 1222 return(NULL);
1083 } 1223 }
1084 1224
1085 /** 1225 /**
1086 * htmlGetEndPriority: 1226 * htmlGetEndPriority:
1087 * @name: The name of the element to look up the priority for. 1227 * @name: The name of the element to look up the priority for.
1088 * 1228 *
1089 * Return value: The "endtag" priority. 1229 * Return value: The "endtag" priority.
1090 **/ 1230 **/
1091 static int 1231 static int
1092 htmlGetEndPriority (const xmlChar *name) { 1232 htmlGetEndPriority (const xmlChar *name) {
1093 int i = 0; 1233 int i = 0;
1094 1234
1095 while ((htmlEndPriority[i].name != NULL) && 1235 while ((htmlEndPriority[i].name != NULL) &&
1096 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name))) 1236 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1097 i++; 1237 i++;
1098 1238
(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after
1157 priority = htmlGetEndPriority(newtag); 1297 priority = htmlGetEndPriority(newtag);
1158 1298
1159 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1299 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1160 1300
1161 if (xmlStrEqual(newtag, ctxt->nameTab[i])) 1301 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1162 break; 1302 break;
1163 /* 1303 /*
1164 * A misplaced endtag can only close elements with lower 1304 * A misplaced endtag can only close elements with lower
1165 * or equal priority, so if we find an element with higher 1305 * or equal priority, so if we find an element with higher
1166 * priority before we find an element with 1306 * priority before we find an element with
1167 * matching name, we just ignore this endtag 1307 * matching name, we just ignore this endtag
1168 */ 1308 */
1169 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority) 1309 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1170 return; 1310 return;
1171 } 1311 }
1172 if (i < 0) 1312 if (i < 0)
1173 return; 1313 return;
1174 1314
1175 while (!xmlStrEqual(newtag, ctxt->name)) { 1315 while (!xmlStrEqual(newtag, ctxt->name)) {
1176 info = htmlTagLookup(ctxt->name); 1316 info = htmlTagLookup(ctxt->name);
1177 if ((info != NULL) && (info->endTag == 3)) { 1317 if ((info != NULL) && (info->endTag == 3)) {
(...skipping 30 matching lines...) Expand all
1208 /** 1348 /**
1209 * htmlAutoClose: 1349 * htmlAutoClose:
1210 * @ctxt: an HTML parser context 1350 * @ctxt: an HTML parser context
1211 * @newtag: The new tag name or NULL 1351 * @newtag: The new tag name or NULL
1212 * 1352 *
1213 * The HTML DTD allows a tag to implicitly close other tags. 1353 * The HTML DTD allows a tag to implicitly close other tags.
1214 * The list is kept in htmlStartClose array. This function is 1354 * The list is kept in htmlStartClose array. This function is
1215 * called when a new tag has been detected and generates the 1355 * called when a new tag has been detected and generates the
1216 * appropriate closes if possible/needed. 1356 * appropriate closes if possible/needed.
1217 * If newtag is NULL this means we are at the end of the resource 1357 * If newtag is NULL this means we are at the end of the resource
1218 * and we should check 1358 * and we should check
1219 */ 1359 */
1220 static void 1360 static void
1221 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1361 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1222 { 1362 {
1223 while ((newtag != NULL) && (ctxt->name != NULL) && 1363 while ((newtag != NULL) && (ctxt->name != NULL) &&
1224 (htmlCheckAutoClose(newtag, ctxt->name))) { 1364 (htmlCheckAutoClose(newtag, ctxt->name))) {
1225 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1365 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1226 ctxt->sax->endElement(ctxt->userData, ctxt->name); 1366 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1227 htmlnamePop(ctxt); 1367 htmlnamePop(ctxt);
1228 } 1368 }
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
1296 * htmlCheckImplied: 1436 * htmlCheckImplied:
1297 * @ctxt: an HTML parser context 1437 * @ctxt: an HTML parser context
1298 * @newtag: The new tag name 1438 * @newtag: The new tag name
1299 * 1439 *
1300 * The HTML DTD allows a tag to exist only implicitly 1440 * The HTML DTD allows a tag to exist only implicitly
1301 * called when a new tag has been detected and generates the 1441 * called when a new tag has been detected and generates the
1302 * appropriate implicit tags if missing 1442 * appropriate implicit tags if missing
1303 */ 1443 */
1304 static void 1444 static void
1305 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { 1445 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1446 int i;
1447
1448 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1449 return;
1306 if (!htmlOmittedDefaultValue) 1450 if (!htmlOmittedDefaultValue)
1307 return; 1451 return;
1308 if (xmlStrEqual(newtag, BAD_CAST"html")) 1452 if (xmlStrEqual(newtag, BAD_CAST"html"))
1309 return; 1453 return;
1310 if (ctxt->nameNr <= 0) { 1454 if (ctxt->nameNr <= 0) {
1311 htmlnamePush(ctxt, BAD_CAST"html"); 1455 htmlnamePush(ctxt, BAD_CAST"html");
1312 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1456 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1313 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL); 1457 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1314 } 1458 }
1315 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head"))) 1459 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1316 return; 1460 return;
1317 if ((ctxt->nameNr <= 1) && 1461 if ((ctxt->nameNr <= 1) &&
1318 ((xmlStrEqual(newtag, BAD_CAST"script")) || 1462 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1319 (xmlStrEqual(newtag, BAD_CAST"style")) || 1463 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1320 (xmlStrEqual(newtag, BAD_CAST"meta")) || 1464 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1321 (xmlStrEqual(newtag, BAD_CAST"link")) || 1465 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1322 (xmlStrEqual(newtag, BAD_CAST"title")) || 1466 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1323 (xmlStrEqual(newtag, BAD_CAST"base")))) { 1467 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1324 » /* 1468 if (ctxt->html >= 3) {
1325 » * dropped OBJECT ... if you put it first BODY will be 1469 /* we already saw or generated an <head> before */
1326 » * assumed ! 1470 return;
1327 » */ 1471 }
1328 » htmlnamePush(ctxt, BAD_CAST"head"); 1472 /*
1329 » if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1473 * dropped OBJECT ... if you put it first BODY will be
1330 » » ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL); 1474 * assumed !
1475 */
1476 htmlnamePush(ctxt, BAD_CAST"head");
1477 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1478 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1331 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) && 1479 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1332 (!xmlStrEqual(newtag, BAD_CAST"frame")) && 1480 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1333 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) { 1481 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1334 » int i; 1482 if (ctxt->html >= 10) {
1483 /* we already saw or generated a <body> before */
1484 return;
1485 }
1335 for (i = 0;i < ctxt->nameNr;i++) { 1486 for (i = 0;i < ctxt->nameNr;i++) {
1336 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) { 1487 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1337 return; 1488 return;
1338 } 1489 }
1339 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) { 1490 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1340 return; 1491 return;
1341 } 1492 }
1342 } 1493 }
1343 » 1494
1344 htmlnamePush(ctxt, BAD_CAST"body"); 1495 htmlnamePush(ctxt, BAD_CAST"body");
1345 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1496 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1346 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL); 1497 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1347 } 1498 }
1348 } 1499 }
1349 1500
1350 /** 1501 /**
1351 * htmlCheckParagraph 1502 * htmlCheckParagraph
1352 * @ctxt: an HTML parser context 1503 * @ctxt: an HTML parser context
1353 * 1504 *
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
1395 * 1546 *
1396 * Check if an attribute is of content type Script 1547 * Check if an attribute is of content type Script
1397 * 1548 *
1398 * Returns 1 if the attribute is a script, 0 otherwise 1549 * Returns 1 if the attribute is a script, 0 otherwise
1399 */ 1550 */
1400 int 1551 int
1401 htmlIsScriptAttribute(const xmlChar *name) { 1552 htmlIsScriptAttribute(const xmlChar *name) {
1402 unsigned int i; 1553 unsigned int i;
1403 1554
1404 if (name == NULL) 1555 if (name == NULL)
1405 »return(0); 1556 return(0);
1406 /* 1557 /*
1407 * all script attributes start with 'on' 1558 * all script attributes start with 'on'
1408 */ 1559 */
1409 if ((name[0] != 'o') || (name[1] != 'n')) 1560 if ((name[0] != 'o') || (name[1] != 'n'))
1410 »return(0); 1561 return(0);
1411 for (i = 0; 1562 for (i = 0;
1412 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]); 1563 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1413 i++) { 1564 i++) {
1414 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i])) 1565 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1415 return(1); 1566 return(1);
1416 } 1567 }
1417 return(0); 1568 return(0);
1418 } 1569 }
1419 1570
1420 /************************************************************************ 1571 /************************************************************************
1421 * * 1572 * *
1422 * » » The list of HTML predefined entities» » » * 1573 *» The list of HTML predefined entities» » » *
1423 * * 1574 * *
1424 ************************************************************************/ 1575 ************************************************************************/
1425 1576
1426 1577
1427 static const htmlEntityDesc html40EntitiesTable[] = { 1578 static const htmlEntityDesc html40EntitiesTable[] = {
1428 /* 1579 /*
1429 * the 4 absolute ones, plus apostrophe. 1580 * the 4 absolute ones, plus apostrophe.
1430 */ 1581 */
1431 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" }, 1582 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1432 { 38, "amp", "ampersand, U+0026 ISOnum" }, 1583 { 38, "amp", "ampersand, U+0026 ISOnum" },
(...skipping 393 matching lines...) Expand 10 before | Expand all | Expand 10 after
1826 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 1977 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1827 else { 1978 else {
1828 /* no chance for this in Ascii */ 1979 /* no chance for this in Ascii */
1829 *outlen = out - outstart; 1980 *outlen = out - outstart;
1830 *inlen = processed - instart; 1981 *inlen = processed - instart;
1831 return(-2); 1982 return(-2);
1832 } 1983 }
1833 1984
1834 if (inend - in < trailing) { 1985 if (inend - in < trailing) {
1835 break; 1986 break;
1836 » } 1987 » }
1837 1988
1838 for ( ; trailing; trailing--) { 1989 for ( ; trailing; trailing--) {
1839 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) 1990 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1840 break; 1991 break;
1841 c <<= 6; 1992 c <<= 6;
1842 c |= d & 0x3F; 1993 c |= d & 0x3F;
1843 } 1994 }
1844 1995
1845 /* assertion: c is a single UCS-4 value */ 1996 /* assertion: c is a single UCS-4 value */
1846 if (c < 0x80) { 1997 if (c < 0x80) {
(...skipping 169 matching lines...) Expand 10 before | Expand all | Expand 10 after
2016 return(input); 2167 return(input);
2017 } 2168 }
2018 2169
2019 2170
2020 /************************************************************************ 2171 /************************************************************************
2021 * * 2172 * *
2022 * Commodity functions, cleanup needed ? * 2173 * Commodity functions, cleanup needed ? *
2023 * * 2174 * *
2024 ************************************************************************/ 2175 ************************************************************************/
2025 /* 2176 /*
2026 * all tags allowing pc data from the html 4.01 loose dtd 2177 * all tags allowing pc data from the html 4.01 loose dtd
2027 * NOTE: it might be more appropriate to integrate this information 2178 * NOTE: it might be more appropriate to integrate this information
2028 * into the html40ElementTable array but I don't want to risk any 2179 * into the html40ElementTable array but I don't want to risk any
2029 * binary incompatibility 2180 * binary incompatibility
2030 */ 2181 */
2031 static const char *allowPCData[] = { 2182 static const char *allowPCData[] = {
2032 "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big", 2183 "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
2033 "blockquote", "body", "button", "caption", "center", "cite", "code", 2184 "blockquote", "body", "button", "caption", "center", "cite", "code",
2034 "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2", 2185 "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
2035 "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend", 2186 "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
2036 "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp", 2187 "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after
2076 } 2227 }
2077 } 2228 }
2078 2229
2079 if (ctxt->node == NULL) return(0); 2230 if (ctxt->node == NULL) return(0);
2080 lastChild = xmlGetLastChild(ctxt->node); 2231 lastChild = xmlGetLastChild(ctxt->node);
2081 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE)) 2232 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2082 lastChild = lastChild->prev; 2233 lastChild = lastChild->prev;
2083 if (lastChild == NULL) { 2234 if (lastChild == NULL) {
2084 if ((ctxt->node->type != XML_ELEMENT_NODE) && 2235 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2085 (ctxt->node->content != NULL)) return(0); 2236 (ctxt->node->content != NULL)) return(0);
2086 » /* keep ws in constructs like ...<b> </b>... 2237 » /* keep ws in constructs like ...<b> </b>...
2087 for all tags "b" allowing PCDATA */ 2238 for all tags "b" allowing PCDATA */
2088 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2239 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2089 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) { 2240 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2090 return(0); 2241 return(0);
2091 } 2242 }
2092 } 2243 }
2093 } else if (xmlNodeIsText(lastChild)) { 2244 } else if (xmlNodeIsText(lastChild)) {
2094 return(0); 2245 return(0);
2095 } else { 2246 } else {
2096 » /* keep ws in constructs like <p><b>xy</b> <i>z</i><p> 2247 » /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2097 for all tags "p" allowing PCDATA */ 2248 for all tags "p" allowing PCDATA */
2098 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2249 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2099 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) { 2250 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2100 return(0); 2251 return(0);
2101 } 2252 }
2102 } 2253 }
2103 } 2254 }
2104 return(1); 2255 return(1);
2105 } 2256 }
2106 2257
(...skipping 19 matching lines...) Expand all
2126 htmlErrMemory(NULL, "HTML document creation failed\n"); 2277 htmlErrMemory(NULL, "HTML document creation failed\n");
2127 return(NULL); 2278 return(NULL);
2128 } 2279 }
2129 memset(cur, 0, sizeof(xmlDoc)); 2280 memset(cur, 0, sizeof(xmlDoc));
2130 2281
2131 cur->type = XML_HTML_DOCUMENT_NODE; 2282 cur->type = XML_HTML_DOCUMENT_NODE;
2132 cur->version = NULL; 2283 cur->version = NULL;
2133 cur->intSubset = NULL; 2284 cur->intSubset = NULL;
2134 cur->doc = cur; 2285 cur->doc = cur;
2135 cur->name = NULL; 2286 cur->name = NULL;
2136 cur->children = NULL; 2287 cur->children = NULL;
2137 cur->extSubset = NULL; 2288 cur->extSubset = NULL;
2138 cur->oldNs = NULL; 2289 cur->oldNs = NULL;
2139 cur->encoding = NULL; 2290 cur->encoding = NULL;
2140 cur->standalone = 1; 2291 cur->standalone = 1;
2141 cur->compression = 0; 2292 cur->compression = 0;
2142 cur->ids = NULL; 2293 cur->ids = NULL;
2143 cur->refs = NULL; 2294 cur->refs = NULL;
2144 cur->_private = NULL; 2295 cur->_private = NULL;
2145 cur->charset = XML_CHAR_ENCODING_UTF8; 2296 cur->charset = XML_CHAR_ENCODING_UTF8;
2297 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2146 if ((ExternalID != NULL) || 2298 if ((ExternalID != NULL) ||
2147 (URI != NULL)) 2299 (URI != NULL))
2148 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI); 2300 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2149 return(cur); 2301 return(cur);
2150 } 2302 }
2151 2303
2152 /** 2304 /**
2153 * htmlNewDoc: 2305 * htmlNewDoc:
2154 * @URI: URI for the dtd, or NULL 2306 * @URI: URI for the dtd, or NULL
2155 * @ExternalID: the external ID of the DTD, or NULL 2307 * @ExternalID: the external ID of the DTD, or NULL
(...skipping 37 matching lines...) Expand 10 before | Expand all | Expand 10 after
2193 * 2345 *
2194 * Returns the Tag Name parsed or NULL 2346 * Returns the Tag Name parsed or NULL
2195 */ 2347 */
2196 2348
2197 static const xmlChar * 2349 static const xmlChar *
2198 htmlParseHTMLName(htmlParserCtxtPtr ctxt) { 2350 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2199 int i = 0; 2351 int i = 0;
2200 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2352 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2201 2353
2202 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') && 2354 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2203 (CUR != ':')) return(NULL); 2355 (CUR != ':') && (CUR != '.')) return(NULL);
2204 2356
2205 while ((i < HTML_PARSER_BUFFER_SIZE) && 2357 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2206 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) || 2358 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2207 » (CUR == ':') || (CUR == '-') || (CUR == '_'))) { 2359 » (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2360 (CUR == '.'))) {
2208 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; 2361 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2209 else loc[i] = CUR; 2362 else loc[i] = CUR;
2210 i++; 2363 i++;
2211 » 2364
2212 NEXT; 2365 NEXT;
2213 } 2366 }
2214 2367
2215 return(xmlDictLookup(ctxt->dict, loc, i)); 2368 return(xmlDictLookup(ctxt->dict, loc, i));
2216 } 2369 }
2217 2370
2218 2371
2219 /** 2372 /**
2220 * htmlParseHTMLName_nonInvasive: 2373 * htmlParseHTMLName_nonInvasive:
2221 * @ctxt: an HTML parser context 2374 * @ctxt: an HTML parser context
2222 * 2375 *
2223 * parse an HTML tag or attribute name, note that we convert it to lowercase 2376 * parse an HTML tag or attribute name, note that we convert it to lowercase
2224 * since HTML names are not case-sensitive, this doesn't consume the data 2377 * since HTML names are not case-sensitive, this doesn't consume the data
2225 * from the stream, it's a look-ahead 2378 * from the stream, it's a look-ahead
2226 * 2379 *
2227 * Returns the Tag Name parsed or NULL 2380 * Returns the Tag Name parsed or NULL
2228 */ 2381 */
2229 2382
2230 static const xmlChar * 2383 static const xmlChar *
2231 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) { 2384 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2232 int i = 0; 2385 int i = 0;
2233 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2386 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2234 2387
2235 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') && 2388 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2236 (NXT(1) != ':')) return(NULL); 2389 (NXT(1) != ':')) return(NULL);
2237 2390
2238 while ((i < HTML_PARSER_BUFFER_SIZE) && 2391 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2239 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) || 2392 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2240 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) { 2393 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2241 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20; 2394 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2242 else loc[i] = NXT(1+i); 2395 else loc[i] = NXT(1+i);
2243 i++; 2396 i++;
2244 } 2397 }
2245 2398
2246 return(xmlDictLookup(ctxt->dict, loc, i)); 2399 return(xmlDictLookup(ctxt->dict, loc, i));
2247 } 2400 }
2248 2401
2249 2402
2250 /** 2403 /**
2251 * htmlParseName: 2404 * htmlParseName:
2252 * @ctxt: an HTML parser context 2405 * @ctxt: an HTML parser context
2253 * 2406 *
2254 * parse an HTML name, this routine is case sensitive. 2407 * parse an HTML name, this routine is case sensitive.
2255 * 2408 *
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after
2303 c = CUR_CHAR(l); 2456 c = CUR_CHAR(l);
2304 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ 2457 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2305 (!IS_LETTER(c) && (c != '_') && 2458 (!IS_LETTER(c) && (c != '_') &&
2306 (c != ':'))) { 2459 (c != ':'))) {
2307 return(NULL); 2460 return(NULL);
2308 } 2461 }
2309 2462
2310 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ 2463 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2311 ((IS_LETTER(c)) || (IS_DIGIT(c)) || 2464 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2312 (c == '.') || (c == '-') || 2465 (c == '.') || (c == '-') ||
2313 » (c == '_') || (c == ':') || 2466 » (c == '_') || (c == ':') ||
2314 (IS_COMBINING(c)) || 2467 (IS_COMBINING(c)) ||
2315 (IS_EXTENDER(c)))) { 2468 (IS_EXTENDER(c)))) {
2316 if (count++ > 100) { 2469 if (count++ > 100) {
2317 count = 0; 2470 count = 0;
2318 GROW; 2471 GROW;
2319 } 2472 }
2320 len += l; 2473 len += l;
2321 NEXTL(l); 2474 NEXTL(l);
2322 c = CUR_CHAR(l); 2475 c = CUR_CHAR(l);
2323 } 2476 }
2324 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len)); 2477 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2325 } 2478 }
2326 2479
2327 2480
2328 /** 2481 /**
2329 * htmlParseHTMLAttribute: 2482 * htmlParseHTMLAttribute:
2330 * @ctxt: an HTML parser context 2483 * @ctxt: an HTML parser context
2331 * @stop: a char stop value 2484 * @stop: a char stop value
2332 * 2485 *
2333 * parse an HTML attribute value till the stop (quote), if 2486 * parse an HTML attribute value till the stop (quote), if
2334 * stop is 0 then it stops at the first space 2487 * stop is 0 then it stops at the first space
2335 * 2488 *
2336 * Returns the attribute parsed or NULL 2489 * Returns the attribute parsed or NULL
2337 */ 2490 */
2338 2491
2339 static xmlChar * 2492 static xmlChar *
2340 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { 2493 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2341 xmlChar *buffer = NULL; 2494 xmlChar *buffer = NULL;
2342 int buffer_size = 0; 2495 int buffer_size = 0;
(...skipping 24 matching lines...) Expand all
2367 unsigned int c; 2520 unsigned int c;
2368 int bits; 2521 int bits;
2369 2522
2370 c = htmlParseCharRef(ctxt); 2523 c = htmlParseCharRef(ctxt);
2371 if (c < 0x80) 2524 if (c < 0x80)
2372 { *out++ = c; bits= -6; } 2525 { *out++ = c; bits= -6; }
2373 else if (c < 0x800) 2526 else if (c < 0x800)
2374 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2527 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2375 else if (c < 0x10000) 2528 else if (c < 0x10000)
2376 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2529 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2377 » » else 2530 » » else
2378 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2531 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2379 » 2532
2380 for ( ; bits >= 0; bits-= 6) { 2533 for ( ; bits >= 0; bits-= 6) {
2381 *out++ = ((c >> bits) & 0x3F) | 0x80; 2534 *out++ = ((c >> bits) & 0x3F) | 0x80;
2382 } 2535 }
2383 » » 2536
2384 if (out - buffer > buffer_size - 100) { 2537 if (out - buffer > buffer_size - 100) {
2385 int indx = out - buffer; 2538 int indx = out - buffer;
2386 2539
2387 growBuffer(buffer); 2540 growBuffer(buffer);
2388 out = &buffer[indx]; 2541 out = &buffer[indx];
2389 } 2542 }
2390 } else { 2543 } else {
2391 ent = htmlParseEntityRef(ctxt, &name); 2544 ent = htmlParseEntityRef(ctxt, &name);
2392 if (name == NULL) { 2545 if (name == NULL) {
2393 *out++ = '&'; 2546 *out++ = '&';
(...skipping 25 matching lines...) Expand all
2419 growBuffer(buffer); 2572 growBuffer(buffer);
2420 out = &buffer[indx]; 2573 out = &buffer[indx];
2421 } 2574 }
2422 c = ent->value; 2575 c = ent->value;
2423 if (c < 0x80) 2576 if (c < 0x80)
2424 { *out++ = c; bits= -6; } 2577 { *out++ = c; bits= -6; }
2425 else if (c < 0x800) 2578 else if (c < 0x800)
2426 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2579 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2427 else if (c < 0x10000) 2580 else if (c < 0x10000)
2428 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2581 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2429 » » else 2582 » » else
2430 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2583 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2431 » 2584
2432 for ( ; bits >= 0; bits-= 6) { 2585 for ( ; bits >= 0; bits-= 6) {
2433 *out++ = ((c >> bits) & 0x3F) | 0x80; 2586 *out++ = ((c >> bits) & 0x3F) | 0x80;
2434 } 2587 }
2435 } 2588 }
2436 } 2589 }
2437 } else { 2590 } else {
2438 unsigned int c; 2591 unsigned int c;
2439 int bits, l; 2592 int bits, l;
2440 2593
2441 if (out - buffer > buffer_size - 100) { 2594 if (out - buffer > buffer_size - 100) {
2442 int indx = out - buffer; 2595 int indx = out - buffer;
2443 2596
2444 growBuffer(buffer); 2597 growBuffer(buffer);
2445 out = &buffer[indx]; 2598 out = &buffer[indx];
2446 } 2599 }
2447 c = CUR_CHAR(l); 2600 c = CUR_CHAR(l);
2448 if (c < 0x80) 2601 if (c < 0x80)
2449 { *out++ = c; bits= -6; } 2602 { *out++ = c; bits= -6; }
2450 else if (c < 0x800) 2603 else if (c < 0x800)
2451 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2604 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2452 else if (c < 0x10000) 2605 else if (c < 0x10000)
2453 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2606 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2454 » else 2607 » else
2455 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2608 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2456 2609
2457 for ( ; bits >= 0; bits-= 6) { 2610 for ( ; bits >= 0; bits-= 6) {
2458 *out++ = ((c >> bits) & 0x3F) | 0x80; 2611 *out++ = ((c >> bits) & 0x3F) | 0x80;
2459 } 2612 }
2460 NEXT; 2613 NEXT;
2461 } 2614 }
2462 } 2615 }
2463 *out++ = 0; 2616 *out = 0;
2464 return(buffer); 2617 return(buffer);
2465 } 2618 }
2466 2619
2467 /** 2620 /**
2468 * htmlParseEntityRef: 2621 * htmlParseEntityRef:
2469 * @ctxt: an HTML parser context 2622 * @ctxt: an HTML parser context
2470 * @str: location to store the entity name 2623 * @str: location to store the entity name
2471 * 2624 *
2472 * parse an HTML ENTITY reference 2625 * parse an HTML ENTITY reference
2473 * 2626 *
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
2514 return(ent); 2667 return(ent);
2515 } 2668 }
2516 2669
2517 /** 2670 /**
2518 * htmlParseAttValue: 2671 * htmlParseAttValue:
2519 * @ctxt: an HTML parser context 2672 * @ctxt: an HTML parser context
2520 * 2673 *
2521 * parse a value for an attribute 2674 * parse a value for an attribute
2522 * Note: the parser won't do substitution of entities here, this 2675 * Note: the parser won't do substitution of entities here, this
2523 * will be handled later in xmlStringGetNodeList, unless it was 2676 * will be handled later in xmlStringGetNodeList, unless it was
2524 * asked for ctxt->replaceEntities != 0 2677 * asked for ctxt->replaceEntities != 0
2525 * 2678 *
2526 * Returns the AttValue parsed or NULL. 2679 * Returns the AttValue parsed or NULL.
2527 */ 2680 */
2528 2681
2529 static xmlChar * 2682 static xmlChar *
2530 htmlParseAttValue(htmlParserCtxtPtr ctxt) { 2683 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2531 xmlChar *ret = NULL; 2684 xmlChar *ret = NULL;
2532 2685
2533 if (CUR == '"') { 2686 if (CUR == '"') {
2534 NEXT; 2687 NEXT;
(...skipping 20 matching lines...) Expand all
2555 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE, 2708 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2556 "AttValue: no value found\n", NULL, NULL); 2709 "AttValue: no value found\n", NULL, NULL);
2557 } 2710 }
2558 } 2711 }
2559 return(ret); 2712 return(ret);
2560 } 2713 }
2561 2714
2562 /** 2715 /**
2563 * htmlParseSystemLiteral: 2716 * htmlParseSystemLiteral:
2564 * @ctxt: an HTML parser context 2717 * @ctxt: an HTML parser context
2565 * 2718 *
2566 * parse an HTML Literal 2719 * parse an HTML Literal
2567 * 2720 *
2568 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") 2721 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2569 * 2722 *
2570 * Returns the SystemLiteral parsed or NULL 2723 * Returns the SystemLiteral parsed or NULL
2571 */ 2724 */
2572 2725
2573 static xmlChar * 2726 static xmlChar *
2574 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { 2727 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2575 const xmlChar *q; 2728 const xmlChar *q;
(...skipping 20 matching lines...) Expand all
2596 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2749 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2597 "Unfinished SystemLiteral\n", NULL, NULL); 2750 "Unfinished SystemLiteral\n", NULL, NULL);
2598 } else { 2751 } else {
2599 ret = xmlStrndup(q, CUR_PTR - q); 2752 ret = xmlStrndup(q, CUR_PTR - q);
2600 NEXT; 2753 NEXT;
2601 } 2754 }
2602 } else { 2755 } else {
2603 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2756 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2604 " or ' expected\n", NULL, NULL); 2757 " or ' expected\n", NULL, NULL);
2605 } 2758 }
2606 2759
2607 return(ret); 2760 return(ret);
2608 } 2761 }
2609 2762
2610 /** 2763 /**
2611 * htmlParsePubidLiteral: 2764 * htmlParsePubidLiteral:
2612 * @ctxt: an HTML parser context 2765 * @ctxt: an HTML parser context
2613 * 2766 *
2614 * parse an HTML public literal 2767 * parse an HTML public literal
2615 * 2768 *
2616 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" 2769 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
(...skipping 28 matching lines...) Expand all
2645 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2798 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2646 "Unfinished PubidLiteral\n", NULL, NULL); 2799 "Unfinished PubidLiteral\n", NULL, NULL);
2647 } else { 2800 } else {
2648 ret = xmlStrndup(q, CUR_PTR - q); 2801 ret = xmlStrndup(q, CUR_PTR - q);
2649 NEXT; 2802 NEXT;
2650 } 2803 }
2651 } else { 2804 } else {
2652 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2805 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2653 "PubidLiteral \" or ' expected\n", NULL, NULL); 2806 "PubidLiteral \" or ' expected\n", NULL, NULL);
2654 } 2807 }
2655 2808
2656 return(ret); 2809 return(ret);
2657 } 2810 }
2658 2811
2659 /** 2812 /**
2660 * htmlParseScript: 2813 * htmlParseScript:
2661 * @ctxt: an HTML parser context 2814 * @ctxt: an HTML parser context
2662 * 2815 *
2663 * parse the content of an HTML SCRIPT or STYLE element 2816 * parse the content of an HTML SCRIPT or STYLE element
2664 * http://www.w3.org/TR/html4/sgml/dtd.html#Script 2817 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2665 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet 2818 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
(...skipping 26 matching lines...) Expand all
2692 * Authors should therefore escape "</" within the content. 2845 * Authors should therefore escape "</" within the content.
2693 * Escape mechanisms are specific to each scripting or 2846 * Escape mechanisms are specific to each scripting or
2694 * style sheet language. 2847 * style sheet language.
2695 * 2848 *
2696 * In recovery mode, only break if end tag match the 2849 * In recovery mode, only break if end tag match the
2697 * current tag, effectively ignoring all tags inside the 2850 * current tag, effectively ignoring all tags inside the
2698 * script/style block and treating the entire block as 2851 * script/style block and treating the entire block as
2699 * CDATA. 2852 * CDATA.
2700 */ 2853 */
2701 if (ctxt->recovery) { 2854 if (ctxt->recovery) {
2702 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2, 2855 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2703 » » » » xmlStrlen(ctxt->name)) == 0) 2856 » » » » xmlStrlen(ctxt->name)) == 0)
2704 { 2857 {
2705 break; /* while */ 2858 break; /* while */
2706 } else { 2859 } else {
2707 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 2860 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2708 "Element %s embeds close tag\n", 2861 "Element %s embeds close tag\n",
2709 ctxt->name, NULL); 2862 ctxt->name, NULL);
2710 } 2863 }
2711 } else { 2864 } else {
2712 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || 2865 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2713 ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) 2866 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2714 { 2867 {
2715 break; /* while */ 2868 break; /* while */
2716 } 2869 }
2717 } 2870 }
2718 } 2871 }
2719 COPY_BUF(l,buf,nbchar,cur); 2872 COPY_BUF(l,buf,nbchar,cur);
2720 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2873 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2721 if (ctxt->sax->cdataBlock!= NULL) { 2874 if (ctxt->sax->cdataBlock!= NULL) {
2722 /* 2875 /*
2723 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2876 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after
2760 * if we are within a CDATA section ']]>' marks an end of section. 2913 * if we are within a CDATA section ']]>' marks an end of section.
2761 * 2914 *
2762 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 2915 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2763 */ 2916 */
2764 2917
2765 static void 2918 static void
2766 htmlParseCharData(htmlParserCtxtPtr ctxt) { 2919 htmlParseCharData(htmlParserCtxtPtr ctxt) {
2767 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2920 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2768 int nbchar = 0; 2921 int nbchar = 0;
2769 int cur, l; 2922 int cur, l;
2923 int chunk = 0;
2770 2924
2771 SHRINK; 2925 SHRINK;
2772 cur = CUR_CHAR(l); 2926 cur = CUR_CHAR(l);
2773 while (((cur != '<') || (ctxt->token == '<')) && 2927 while (((cur != '<') || (ctxt->token == '<')) &&
2774 ((cur != '&') || (ctxt->token == '&')) && 2928 ((cur != '&') || (ctxt->token == '&')) &&
2775 (cur != 0)) { 2929 (cur != 0)) {
2776 if (!(IS_CHAR(cur))) { 2930 if (!(IS_CHAR(cur))) {
2777 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 2931 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2778 "Invalid char in CDATA 0x%X\n", cur); 2932 "Invalid char in CDATA 0x%X\n", cur);
2779 } else { 2933 } else {
2780 COPY_BUF(l,buf,nbchar,cur); 2934 COPY_BUF(l,buf,nbchar,cur);
2781 } 2935 }
2782 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2936 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2783 /* 2937 /*
2784 * Ok the segment is to be consumed as chars. 2938 * Ok the segment is to be consumed as chars.
2785 */ 2939 */
2786 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2940 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2787 if (areBlanks(ctxt, buf, nbchar)) { 2941 if (areBlanks(ctxt, buf, nbchar)) {
2788 if (ctxt->sax->ignorableWhitespace != NULL) 2942 if (ctxt->sax->ignorableWhitespace != NULL)
2789 ctxt->sax->ignorableWhitespace(ctxt->userData, 2943 ctxt->sax->ignorableWhitespace(ctxt->userData,
2790 buf, nbchar); 2944 buf, nbchar);
2791 } else { 2945 } else {
2792 htmlCheckParagraph(ctxt); 2946 htmlCheckParagraph(ctxt);
2793 if (ctxt->sax->characters != NULL) 2947 if (ctxt->sax->characters != NULL)
2794 ctxt->sax->characters(ctxt->userData, buf, nbchar); 2948 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2795 } 2949 }
2796 } 2950 }
2797 nbchar = 0; 2951 nbchar = 0;
2798 } 2952 }
2799 NEXTL(l); 2953 NEXTL(l);
2954 chunk++;
2955 if (chunk > HTML_PARSER_BUFFER_SIZE) {
2956 chunk = 0;
2957 SHRINK;
2958 GROW;
2959 }
2800 cur = CUR_CHAR(l); 2960 cur = CUR_CHAR(l);
2801 if (cur == 0) { 2961 if (cur == 0) {
2802 SHRINK; 2962 SHRINK;
2803 GROW; 2963 GROW;
2804 cur = CUR_CHAR(l); 2964 cur = CUR_CHAR(l);
2805 } 2965 }
2806 } 2966 }
2807 if (nbchar != 0) { 2967 if (nbchar != 0) {
2808 buf[nbchar] = 0; 2968 buf[nbchar] = 0;
2809 2969
(...skipping 174 matching lines...) Expand 10 before | Expand all | Expand 10 after
2984 /* 3144 /*
2985 * SAX: PI detected. 3145 * SAX: PI detected.
2986 */ 3146 */
2987 if ((ctxt->sax) && (!ctxt->disableSAX) && 3147 if ((ctxt->sax) && (!ctxt->disableSAX) &&
2988 (ctxt->sax->processingInstruction != NULL)) 3148 (ctxt->sax->processingInstruction != NULL))
2989 ctxt->sax->processingInstruction(ctxt->userData, 3149 ctxt->sax->processingInstruction(ctxt->userData,
2990 target, buf); 3150 target, buf);
2991 } 3151 }
2992 xmlFree(buf); 3152 xmlFree(buf);
2993 } else { 3153 } else {
2994 » htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED, 3154 » htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
2995 "PI is not started correctly", NULL, NULL); 3155 "PI is not started correctly", NULL, NULL);
2996 } 3156 }
2997 ctxt->instate = state; 3157 ctxt->instate = state;
2998 } 3158 }
2999 } 3159 }
3000 3160
3001 /** 3161 /**
3002 * htmlParseComment: 3162 * htmlParseComment:
3003 * @ctxt: an HTML parser context 3163 * @ctxt: an HTML parser context
3004 * 3164 *
(...skipping 95 matching lines...) Expand 10 before | Expand all | Expand 10 after
3100 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3260 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3101 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3261 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3102 "htmlParseCharRef: context error\n", 3262 "htmlParseCharRef: context error\n",
3103 NULL, NULL); 3263 NULL, NULL);
3104 return(0); 3264 return(0);
3105 } 3265 }
3106 if ((CUR == '&') && (NXT(1) == '#') && 3266 if ((CUR == '&') && (NXT(1) == '#') &&
3107 ((NXT(2) == 'x') || NXT(2) == 'X')) { 3267 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3108 SKIP(3); 3268 SKIP(3);
3109 while (CUR != ';') { 3269 while (CUR != ';') {
3110 » if ((CUR >= '0') && (CUR <= '9')) 3270 » if ((CUR >= '0') && (CUR <= '9'))
3111 val = val * 16 + (CUR - '0'); 3271 val = val * 16 + (CUR - '0');
3112 else if ((CUR >= 'a') && (CUR <= 'f')) 3272 else if ((CUR >= 'a') && (CUR <= 'f'))
3113 val = val * 16 + (CUR - 'a') + 10; 3273 val = val * 16 + (CUR - 'a') + 10;
3114 else if ((CUR >= 'A') && (CUR <= 'F')) 3274 else if ((CUR >= 'A') && (CUR <= 'F'))
3115 val = val * 16 + (CUR - 'A') + 10; 3275 val = val * 16 + (CUR - 'A') + 10;
3116 else { 3276 else {
3117 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF, 3277 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3118 "htmlParseCharRef: missing semicolumn\n", 3278 "htmlParseCharRef: missing semicolumn\n",
3119 NULL, NULL); 3279 NULL, NULL);
3120 break; 3280 break;
3121 } 3281 }
3122 NEXT; 3282 NEXT;
3123 } 3283 }
3124 if (CUR == ';') 3284 if (CUR == ';')
3125 NEXT; 3285 NEXT;
3126 } else if ((CUR == '&') && (NXT(1) == '#')) { 3286 } else if ((CUR == '&') && (NXT(1) == '#')) {
3127 SKIP(2); 3287 SKIP(2);
3128 while (CUR != ';') { 3288 while (CUR != ';') {
3129 » if ((CUR >= '0') && (CUR <= '9')) 3289 » if ((CUR >= '0') && (CUR <= '9'))
3130 val = val * 10 + (CUR - '0'); 3290 val = val * 10 + (CUR - '0');
3131 else { 3291 else {
3132 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF, 3292 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3133 "htmlParseCharRef: missing semicolumn\n", 3293 "htmlParseCharRef: missing semicolumn\n",
3134 NULL, NULL); 3294 NULL, NULL);
3135 break; 3295 break;
3136 } 3296 }
3137 NEXT; 3297 NEXT;
3138 } 3298 }
3139 if (CUR == ';') 3299 if (CUR == ';')
(...skipping 15 matching lines...) Expand all
3155 return(0); 3315 return(0);
3156 } 3316 }
3157 3317
3158 3318
3159 /** 3319 /**
3160 * htmlParseDocTypeDecl: 3320 * htmlParseDocTypeDecl:
3161 * @ctxt: an HTML parser context 3321 * @ctxt: an HTML parser context
3162 * 3322 *
3163 * parse a DOCTYPE declaration 3323 * parse a DOCTYPE declaration
3164 * 3324 *
3165 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? 3325 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3166 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' 3326 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3167 */ 3327 */
3168 3328
3169 static void 3329 static void
3170 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { 3330 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3171 const xmlChar *name; 3331 const xmlChar *name;
3172 xmlChar *ExternalID = NULL; 3332 xmlChar *ExternalID = NULL;
3173 xmlChar *URI = NULL; 3333 xmlChar *URI = NULL;
3174 3334
3175 /* 3335 /*
(...skipping 83 matching lines...) Expand 10 before | Expand all | Expand 10 after
3259 } 3419 }
3260 3420
3261 /* 3421 /*
3262 * read the value 3422 * read the value
3263 */ 3423 */
3264 SKIP_BLANKS; 3424 SKIP_BLANKS;
3265 if (CUR == '=') { 3425 if (CUR == '=') {
3266 NEXT; 3426 NEXT;
3267 SKIP_BLANKS; 3427 SKIP_BLANKS;
3268 val = htmlParseAttValue(ctxt); 3428 val = htmlParseAttValue(ctxt);
3269 } else if (htmlIsBooleanAttr(name)) {
3270 /*
3271 * assume a minimized attribute
3272 */
3273 val = xmlStrdup(name);
3274 } 3429 }
3275 3430
3276 *value = val; 3431 *value = val;
3277 return(name); 3432 return(name);
3278 } 3433 }
3279 3434
3280 /** 3435 /**
3281 * htmlCheckEncoding: 3436 * htmlCheckEncoding:
3282 * @ctxt: an HTML parser context 3437 * @ctxt: an HTML parser context
3283 * @attvalue: the attribute value 3438 * @attvalue: the attribute value
3284 * 3439 *
3285 * Checks an http-equiv attribute from a Meta tag to detect 3440 * Checks an http-equiv attribute from a Meta tag to detect
3286 * the encoding 3441 * the encoding
3287 * If a new encoding is detected the parser is switched to decode 3442 * If a new encoding is detected the parser is switched to decode
3288 * it and pass UTF8 3443 * it and pass UTF8
3289 */ 3444 */
3290 static void 3445 static void
3291 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { 3446 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3292 const xmlChar *encoding; 3447 const xmlChar *encoding;
3293 3448
3294 if ((ctxt == NULL) || (attvalue == NULL)) 3449 if ((ctxt == NULL) || (attvalue == NULL))
3295 return; 3450 return;
3296 3451
3297 /* do not change encoding */» 3452 /* do not change encoding */
3298 if (ctxt->input->encoding != NULL) 3453 if (ctxt->input->encoding != NULL)
3299 return; 3454 return;
3300 3455
3301 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset="); 3456 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3302 if (encoding != NULL) { 3457 if (encoding != NULL) {
3303 encoding += 8; 3458 encoding += 8;
3304 } else { 3459 } else {
3305 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset ="); 3460 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3306 if (encoding != NULL) 3461 if (encoding != NULL)
3307 encoding += 9; 3462 encoding += 9;
3308 } 3463 }
3309 if (encoding != NULL) { 3464 if (encoding != NULL) {
3310 xmlCharEncoding enc; 3465 xmlCharEncoding enc;
3311 xmlCharEncodingHandlerPtr handler; 3466 xmlCharEncodingHandlerPtr handler;
3312 3467
3313 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 3468 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3314 3469
3315 if (ctxt->input->encoding != NULL) 3470 if (ctxt->input->encoding != NULL)
3316 xmlFree((xmlChar *) ctxt->input->encoding); 3471 xmlFree((xmlChar *) ctxt->input->encoding);
3317 ctxt->input->encoding = xmlStrdup(encoding); 3472 ctxt->input->encoding = xmlStrdup(encoding);
3318 3473
3319 enc = xmlParseCharEncoding((const char *) encoding); 3474 enc = xmlParseCharEncoding((const char *) encoding);
3320 /* 3475 /*
3321 * registered set of known encodings 3476 * registered set of known encodings
3322 */ 3477 */
3323 if (enc != XML_CHAR_ENCODING_ERROR) { 3478 if (enc != XML_CHAR_ENCODING_ERROR) {
3324 » if (((enc == XML_CHAR_ENCODING_UTF16LE) || 3479 » if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3325 (enc == XML_CHAR_ENCODING_UTF16BE) || 3480 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3326 (enc == XML_CHAR_ENCODING_UCS4LE) || 3481 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3327 (enc == XML_CHAR_ENCODING_UCS4BE)) && 3482 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3328 (ctxt->input->buf != NULL) && 3483 (ctxt->input->buf != NULL) &&
3329 (ctxt->input->buf->encoder == NULL)) { 3484 (ctxt->input->buf->encoder == NULL)) {
3330 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 3485 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3331 "htmlCheckEncoding: wrong encoding meta\n", 3486 "htmlCheckEncoding: wrong encoding meta\n",
3332 NULL, NULL); 3487 NULL, NULL);
3333 } else { 3488 } else {
3334 xmlSwitchEncoding(ctxt, enc); 3489 xmlSwitchEncoding(ctxt, enc);
(...skipping 27 matching lines...) Expand all
3362 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder, 3517 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3363 ctxt->input->buf->buffer, 3518 ctxt->input->buf->buffer,
3364 ctxt->input->buf->raw); 3519 ctxt->input->buf->raw);
3365 if (nbchars < 0) { 3520 if (nbchars < 0) {
3366 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 3521 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3367 "htmlCheckEncoding: encoder error\n", 3522 "htmlCheckEncoding: encoder error\n",
3368 NULL, NULL); 3523 NULL, NULL);
3369 } 3524 }
3370 ctxt->input->base = 3525 ctxt->input->base =
3371 ctxt->input->cur = ctxt->input->buf->buffer->content; 3526 ctxt->input->cur = ctxt->input->buf->buffer->content;
3527 ctxt->input->end =
3528 &ctxt->input->base[ctxt->input->buf->buffer->use];
3372 } 3529 }
3373 } 3530 }
3374 } 3531 }
3375 3532
3376 /** 3533 /**
3377 * htmlCheckMeta: 3534 * htmlCheckMeta:
3378 * @ctxt: an HTML parser context 3535 * @ctxt: an HTML parser context
3379 * @atts: the attributes values 3536 * @atts: the attributes values
3380 * 3537 *
3381 * Checks an attributes from a Meta tag 3538 * Checks an attributes from a Meta tag
(...skipping 20 matching lines...) Expand all
3402 att = atts[i++]; 3559 att = atts[i++];
3403 } 3560 }
3404 if ((http) && (content != NULL)) 3561 if ((http) && (content != NULL))
3405 htmlCheckEncoding(ctxt, content); 3562 htmlCheckEncoding(ctxt, content);
3406 3563
3407 } 3564 }
3408 3565
3409 /** 3566 /**
3410 * htmlParseStartTag: 3567 * htmlParseStartTag:
3411 * @ctxt: an HTML parser context 3568 * @ctxt: an HTML parser context
3412 * 3569 *
3413 * parse a start of tag either for rule element or 3570 * parse a start of tag either for rule element or
3414 * EmptyElement. In both case we don't parse the tag closing chars. 3571 * EmptyElement. In both case we don't parse the tag closing chars.
3415 * 3572 *
3416 * [40] STag ::= '<' Name (S Attribute)* S? '>' 3573 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3417 * 3574 *
3418 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' 3575 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3419 * 3576 *
3420 * With namespace: 3577 * With namespace:
3421 * 3578 *
3422 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>' 3579 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3423 * 3580 *
3424 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>' 3581 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3425 * 3582 *
3426 * Returns 0 in case of success, -1 in case of error and 1 if discarded 3583 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3427 */ 3584 */
3428 3585
3429 static int 3586 static int
3430 htmlParseStartTag(htmlParserCtxtPtr ctxt) { 3587 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3431 const xmlChar *name; 3588 const xmlChar *name;
3432 const xmlChar *attname; 3589 const xmlChar *attname;
3433 xmlChar *attvalue; 3590 xmlChar *attvalue;
3434 const xmlChar **atts; 3591 const xmlChar **atts;
3435 int nbatts = 0; 3592 int nbatts = 0;
3436 int maxatts; 3593 int maxatts;
3437 int meta = 0; 3594 int meta = 0;
3438 int i; 3595 int i;
3439 int discardtag = 0; 3596 int discardtag = 0;
3440 3597
3598 if (ctxt->instate == XML_PARSER_EOF)
3599 return(-1);
3441 if ((ctxt == NULL) || (ctxt->input == NULL)) { 3600 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3442 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 3601 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3443 "htmlParseStartTag: context error\n", NULL, NULL); 3602 "htmlParseStartTag: context error\n", NULL, NULL);
3444 return -1; 3603 return -1;
3445 } 3604 }
3446 if (CUR != '<') return -1; 3605 if (CUR != '<') return -1;
3447 NEXT; 3606 NEXT;
3448 3607
3449 atts = ctxt->atts; 3608 atts = ctxt->atts;
3450 maxatts = ctxt->maxatts; 3609 maxatts = ctxt->maxatts;
3451 3610
3452 GROW; 3611 GROW;
3453 name = htmlParseHTMLName(ctxt); 3612 name = htmlParseHTMLName(ctxt);
3454 if (name == NULL) { 3613 if (name == NULL) {
3455 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 3614 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3456 "htmlParseStartTag: invalid element name\n", 3615 "htmlParseStartTag: invalid element name\n",
3457 NULL, NULL); 3616 NULL, NULL);
3458 /* Dump the bogus tag like browsers do */ 3617 /* Dump the bogus tag like browsers do */
3459 » while ((IS_CHAR_CH(CUR)) && (CUR != '>')) 3618 » while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3619 (ctxt->instate != XML_PARSER_EOF))
3460 NEXT; 3620 NEXT;
3461 return -1; 3621 return -1;
3462 } 3622 }
3463 if (xmlStrEqual(name, BAD_CAST"meta")) 3623 if (xmlStrEqual(name, BAD_CAST"meta"))
3464 meta = 1; 3624 meta = 1;
3465 3625
3466 /* 3626 /*
3467 * Check for auto-closure of HTML elements. 3627 * Check for auto-closure of HTML elements.
3468 */ 3628 */
3469 htmlAutoClose(ctxt, name); 3629 htmlAutoClose(ctxt, name);
3470 3630
3471 /* 3631 /*
3472 * Check for implied HTML elements. 3632 * Check for implied HTML elements.
3473 */ 3633 */
3474 htmlCheckImplied(ctxt, name); 3634 htmlCheckImplied(ctxt, name);
3475 3635
3476 /* 3636 /*
3477 * Avoid html at any level > 0, head at any level != 1 3637 * Avoid html at any level > 0, head at any level != 1
3478 * or any attempt to recurse body 3638 * or any attempt to recurse body
3479 */ 3639 */
3480 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) { 3640 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3481 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3641 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3482 "htmlParseStartTag: misplaced <html> tag\n", 3642 "htmlParseStartTag: misplaced <html> tag\n",
3483 name, NULL); 3643 name, NULL);
3484 discardtag = 1; 3644 discardtag = 1;
3645 ctxt->depth++;
3485 } 3646 }
3486 if ((ctxt->nameNr != 1) && 3647 if ((ctxt->nameNr != 1) &&
3487 (xmlStrEqual(name, BAD_CAST"head"))) { 3648 (xmlStrEqual(name, BAD_CAST"head"))) {
3488 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3649 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3489 "htmlParseStartTag: misplaced <head> tag\n", 3650 "htmlParseStartTag: misplaced <head> tag\n",
3490 name, NULL); 3651 name, NULL);
3491 discardtag = 1; 3652 discardtag = 1;
3653 ctxt->depth++;
3492 } 3654 }
3493 if (xmlStrEqual(name, BAD_CAST"body")) { 3655 if (xmlStrEqual(name, BAD_CAST"body")) {
3494 int indx; 3656 int indx;
3495 for (indx = 0;indx < ctxt->nameNr;indx++) { 3657 for (indx = 0;indx < ctxt->nameNr;indx++) {
3496 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) { 3658 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3497 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 3659 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3498 "htmlParseStartTag: misplaced <body> tag\n", 3660 "htmlParseStartTag: misplaced <body> tag\n",
3499 name, NULL); 3661 name, NULL);
3500 discardtag = 1; 3662 discardtag = 1;
3663 ctxt->depth++;
3501 } 3664 }
3502 } 3665 }
3503 } 3666 }
3504 3667
3505 /* 3668 /*
3506 * Now parse the attributes, it ends up with the ending 3669 * Now parse the attributes, it ends up with the ending
3507 * 3670 *
3508 * (S Attribute)* S? 3671 * (S Attribute)* S?
3509 */ 3672 */
3510 SKIP_BLANKS; 3673 SKIP_BLANKS;
3511 while ((IS_CHAR_CH(CUR)) && 3674 while ((IS_CHAR_CH(CUR)) &&
3512 (CUR != '>') && 3675 (CUR != '>') &&
3513 ((CUR != '/') || (NXT(1) != '>'))) { 3676 ((CUR != '/') || (NXT(1) != '>'))) {
3514 long cons = ctxt->nbChars; 3677 long cons = ctxt->nbChars;
3515 3678
3516 GROW; 3679 GROW;
3517 attname = htmlParseAttribute(ctxt, &attvalue); 3680 attname = htmlParseAttribute(ctxt, &attvalue);
3518 if (attname != NULL) { 3681 if (attname != NULL) {
3519 3682
3520 /* 3683 /*
3521 * Well formedness requires at most one declaration of an attribute 3684 * Well formedness requires at most one declaration of an attribute
3522 */ 3685 */
(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after
3641 if ((CUR != '<') || (NXT(1) != '/')) { 3804 if ((CUR != '<') || (NXT(1) != '/')) {
3642 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED, 3805 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3643 "htmlParseEndTag: '</' not found\n", NULL, NULL); 3806 "htmlParseEndTag: '</' not found\n", NULL, NULL);
3644 return (0); 3807 return (0);
3645 } 3808 }
3646 SKIP(2); 3809 SKIP(2);
3647 3810
3648 name = htmlParseHTMLName(ctxt); 3811 name = htmlParseHTMLName(ctxt);
3649 if (name == NULL) 3812 if (name == NULL)
3650 return (0); 3813 return (0);
3651
3652 /* 3814 /*
3653 * We should definitely be at the ending "S? '>'" part 3815 * We should definitely be at the ending "S? '>'" part
3654 */ 3816 */
3655 SKIP_BLANKS; 3817 SKIP_BLANKS;
3656 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) { 3818 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3657 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 3819 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3658 "End tag : expected '>'\n", NULL, NULL); 3820 "End tag : expected '>'\n", NULL, NULL);
3659 if (ctxt->recovery) { 3821 if (ctxt->recovery) {
3660 /* 3822 /*
3661 * We're not at the ending > !! 3823 * We're not at the ending > !!
3662 * Error, unless in recover mode where we search forwards 3824 * Error, unless in recover mode where we search forwards
3663 * until we find a > 3825 * until we find a >
3664 */ 3826 */
3665 while (CUR != '\0' && CUR != '>') NEXT; 3827 while (CUR != '\0' && CUR != '>') NEXT;
3666 NEXT; 3828 NEXT;
3667 } 3829 }
3668 } else 3830 } else
3669 NEXT; 3831 NEXT;
3670 3832
3671 /* 3833 /*
3834 * if we ignored misplaced tags in htmlParseStartTag don't pop them
3835 * out now.
3836 */
3837 if ((ctxt->depth > 0) &&
3838 (xmlStrEqual(name, BAD_CAST "html") ||
3839 xmlStrEqual(name, BAD_CAST "body") ||
3840 xmlStrEqual(name, BAD_CAST "head"))) {
3841 ctxt->depth--;
3842 return (0);
3843 }
3844
3845 /*
3672 * If the name read is not one of the element in the parsing stack 3846 * If the name read is not one of the element in the parsing stack
3673 * then return, it's just an error. 3847 * then return, it's just an error.
3674 */ 3848 */
3675 for (i = (ctxt->nameNr - 1); i >= 0; i--) { 3849 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3676 if (xmlStrEqual(name, ctxt->nameTab[i])) 3850 if (xmlStrEqual(name, ctxt->nameTab[i]))
3677 break; 3851 break;
3678 } 3852 }
3679 if (i < 0) { 3853 if (i < 0) {
3680 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 3854 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3681 "Unexpected end tag : %s\n", name, NULL); 3855 "Unexpected end tag : %s\n", name, NULL);
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
3715 ret = 0; 3889 ret = 0;
3716 } 3890 }
3717 3891
3718 return (ret); 3892 return (ret);
3719 } 3893 }
3720 3894
3721 3895
3722 /** 3896 /**
3723 * htmlParseReference: 3897 * htmlParseReference:
3724 * @ctxt: an HTML parser context 3898 * @ctxt: an HTML parser context
3725 * 3899 *
3726 * parse and handle entity references in content, 3900 * parse and handle entity references in content,
3727 * this will end-up in a call to character() since this is either a 3901 * this will end-up in a call to character() since this is either a
3728 * CharRef, or a predefined entity. 3902 * CharRef, or a predefined entity.
3729 */ 3903 */
3730 static void 3904 static void
3731 htmlParseReference(htmlParserCtxtPtr ctxt) { 3905 htmlParseReference(htmlParserCtxtPtr ctxt) {
3732 const htmlEntityDesc * ent; 3906 const htmlEntityDesc * ent;
3733 xmlChar out[6]; 3907 xmlChar out[6];
3734 const xmlChar *name; 3908 const xmlChar *name;
3735 if (CUR != '&') return; 3909 if (CUR != '&') return;
3736 3910
3737 if (NXT(1) == '#') { 3911 if (NXT(1) == '#') {
3738 unsigned int c; 3912 unsigned int c;
3739 int bits, i = 0; 3913 int bits, i = 0;
3740 3914
3741 c = htmlParseCharRef(ctxt); 3915 c = htmlParseCharRef(ctxt);
3742 if (c == 0) 3916 if (c == 0)
3743 return; 3917 return;
3744 3918
3745 if (c < 0x80) { out[i++]= c; bits= -6; } 3919 if (c < 0x80) { out[i++]= c; bits= -6; }
3746 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } 3920 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3747 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } 3921 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3748 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } 3922 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3749 3923
3750 for ( ; bits >= 0; bits-= 6) { 3924 for ( ; bits >= 0; bits-= 6) {
3751 out[i++]= ((c >> bits) & 0x3F) | 0x80; 3925 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3752 } 3926 }
3753 out[i] = 0; 3927 out[i] = 0;
3754 3928
3755 htmlCheckParagraph(ctxt); 3929 htmlCheckParagraph(ctxt);
3756 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 3930 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3757 ctxt->sax->characters(ctxt->userData, out, i); 3931 ctxt->sax->characters(ctxt->userData, out, i);
3758 } else { 3932 } else {
3759 ent = htmlParseEntityRef(ctxt, &name); 3933 ent = htmlParseEntityRef(ctxt, &name);
(...skipping 14 matching lines...) Expand all
3774 unsigned int c; 3948 unsigned int c;
3775 int bits, i = 0; 3949 int bits, i = 0;
3776 3950
3777 c = ent->value; 3951 c = ent->value;
3778 if (c < 0x80) 3952 if (c < 0x80)
3779 { out[i++]= c; bits= -6; } 3953 { out[i++]= c; bits= -6; }
3780 else if (c < 0x800) 3954 else if (c < 0x800)
3781 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } 3955 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
3782 else if (c < 0x10000) 3956 else if (c < 0x10000)
3783 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } 3957 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
3784 » else 3958 » else
3785 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } 3959 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
3786 3960
3787 for ( ; bits >= 0; bits-= 6) { 3961 for ( ; bits >= 0; bits-= 6) {
3788 out[i++]= ((c >> bits) & 0x3F) | 0x80; 3962 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3789 } 3963 }
3790 out[i] = 0; 3964 out[i] = 0;
3791 3965
3792 htmlCheckParagraph(ctxt); 3966 htmlCheckParagraph(ctxt);
3793 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 3967 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3794 ctxt->sax->characters(ctxt->userData, out, i); 3968 ctxt->sax->characters(ctxt->userData, out, i);
3795 } 3969 }
3796 } 3970 }
3797 } 3971 }
3798 3972
3799 /** 3973 /**
3800 * htmlParseContent: 3974 * htmlParseContent:
3801 * @ctxt: an HTML parser context 3975 * @ctxt: an HTML parser context
3802 * 3976 *
3803 * Parse a content: comment, sub-element, reference or text. 3977 * Parse a content: comment, sub-element, reference or text.
3978 * Kept for compatibility with old code
3804 */ 3979 */
3805 3980
3806 static void 3981 static void
3807 htmlParseContent(htmlParserCtxtPtr ctxt) { 3982 htmlParseContent(htmlParserCtxtPtr ctxt) {
3808 xmlChar *currentNode; 3983 xmlChar *currentNode;
3809 int depth; 3984 int depth;
3810 const xmlChar *name; 3985 const xmlChar *name;
3811 3986
3812 currentNode = xmlStrdup(ctxt->name); 3987 currentNode = xmlStrdup(ctxt->name);
3813 depth = ctxt->nameNr; 3988 depth = ctxt->nameNr;
3814 while (1) { 3989 while (1) {
3815 long cons = ctxt->nbChars; 3990 long cons = ctxt->nbChars;
3816 3991
3817 GROW; 3992 GROW;
3993
3994 if (ctxt->instate == XML_PARSER_EOF)
3995 break;
3996
3818 /* 3997 /*
3819 * Our tag or one of it's parent or children is ending. 3998 * Our tag or one of it's parent or children is ending.
3820 */ 3999 */
3821 if ((CUR == '<') && (NXT(1) == '/')) { 4000 if ((CUR == '<') && (NXT(1) == '/')) {
3822 if (htmlParseEndTag(ctxt) && 4001 if (htmlParseEndTag(ctxt) &&
3823 ((currentNode != NULL) || (ctxt->nameNr == 0))) { 4002 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3824 if (currentNode != NULL) 4003 if (currentNode != NULL)
3825 xmlFree(currentNode); 4004 xmlFree(currentNode);
3826 return; 4005 return;
3827 } 4006 }
3828 continue; /* while */ 4007 continue; /* while */
3829 } 4008 }
3830 4009
3831 else if ((CUR == '<') && 4010 else if ((CUR == '<') &&
3832 ((IS_ASCII_LETTER(NXT(1))) || 4011 ((IS_ASCII_LETTER(NXT(1))) ||
3833 (NXT(1) == '_') || (NXT(1) == ':'))) { 4012 (NXT(1) == '_') || (NXT(1) == ':'))) {
3834 name = htmlParseHTMLName_nonInvasive(ctxt); 4013 name = htmlParseHTMLName_nonInvasive(ctxt);
3835 if (name == NULL) { 4014 if (name == NULL) {
3836 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 4015 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3837 "htmlParseStartTag: invalid element name\n", 4016 "htmlParseStartTag: invalid element name\n",
3838 NULL, NULL); 4017 NULL, NULL);
3839 /* Dump the bogus tag like browsers do */ 4018 /* Dump the bogus tag like browsers do */
3840 » while ((IS_CHAR_CH(CUR)) && (CUR != '>')) 4019 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3841 NEXT; 4020 NEXT;
3842 4021
3843 if (currentNode != NULL) 4022 if (currentNode != NULL)
3844 xmlFree(currentNode); 4023 xmlFree(currentNode);
3845 return; 4024 return;
3846 } 4025 }
3847 4026
3848 if (ctxt->name != NULL) { 4027 if (ctxt->name != NULL) {
3849 if (htmlCheckAutoClose(name, ctxt->name) == 1) { 4028 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3850 htmlAutoClose(ctxt, name); 4029 htmlAutoClose(ctxt, name);
3851 continue; 4030 continue;
3852 } 4031 }
3853 » }» 4032 » }
3854 } 4033 }
3855 4034
3856 /* 4035 /*
3857 * Has this node been popped out during parsing of 4036 * Has this node been popped out during parsing of
3858 * the next element 4037 * the next element
3859 */ 4038 */
3860 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) && 4039 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3861 (!xmlStrEqual(currentNode, ctxt->name))) 4040 (!xmlStrEqual(currentNode, ctxt->name)))
3862 { 4041 {
3863 if (currentNode != NULL) xmlFree(currentNode); 4042 if (currentNode != NULL) xmlFree(currentNode);
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
3902 4081
3903 /* 4082 /*
3904 * Third case : a sub-element. 4083 * Third case : a sub-element.
3905 */ 4084 */
3906 else if (CUR == '<') { 4085 else if (CUR == '<') {
3907 htmlParseElement(ctxt); 4086 htmlParseElement(ctxt);
3908 } 4087 }
3909 4088
3910 /* 4089 /*
3911 * Fourth case : a reference. If if has not been resolved, 4090 * Fourth case : a reference. If if has not been resolved,
3912 » * parsing returns it's Name, create the node 4091 » * parsing returns it's Name, create the node
3913 */ 4092 */
3914 else if (CUR == '&') { 4093 else if (CUR == '&') {
3915 htmlParseReference(ctxt); 4094 htmlParseReference(ctxt);
3916 } 4095 }
3917 4096
3918 /* 4097 /*
3919 * Fifth case : end of the resource 4098 * Fifth case : end of the resource
3920 */ 4099 */
3921 else if (CUR == 0) { 4100 else if (CUR == 0) {
3922 htmlAutoCloseOnEnd(ctxt); 4101 htmlAutoCloseOnEnd(ctxt);
(...skipping 15 matching lines...) Expand all
3938 } 4117 }
3939 break; 4118 break;
3940 } 4119 }
3941 } 4120 }
3942 GROW; 4121 GROW;
3943 } 4122 }
3944 if (currentNode != NULL) xmlFree(currentNode); 4123 if (currentNode != NULL) xmlFree(currentNode);
3945 } 4124 }
3946 4125
3947 /** 4126 /**
3948 * htmlParseContent:
3949 * @ctxt: an HTML parser context
3950 *
3951 * Parse a content: comment, sub-element, reference or text.
3952 */
3953
3954 void
3955 __htmlParseContent(void *ctxt) {
3956 if (ctxt != NULL)
3957 htmlParseContent((htmlParserCtxtPtr) ctxt);
3958 }
3959
3960 /**
3961 * htmlParseElement: 4127 * htmlParseElement:
3962 * @ctxt: an HTML parser context 4128 * @ctxt: an HTML parser context
3963 * 4129 *
3964 * parse an HTML element, this is highly recursive 4130 * parse an HTML element, this is highly recursive
4131 * this is kept for compatibility with previous code versions
3965 * 4132 *
3966 * [39] element ::= EmptyElemTag | STag content ETag 4133 * [39] element ::= EmptyElemTag | STag content ETag
3967 * 4134 *
3968 * [41] Attribute ::= Name Eq AttValue 4135 * [41] Attribute ::= Name Eq AttValue
3969 */ 4136 */
3970 4137
3971 void 4138 void
3972 htmlParseElement(htmlParserCtxtPtr ctxt) { 4139 htmlParseElement(htmlParserCtxtPtr ctxt) {
3973 const xmlChar *name; 4140 const xmlChar *name;
3974 xmlChar *currentNode = NULL; 4141 xmlChar *currentNode = NULL;
3975 const htmlElemDesc * info; 4142 const htmlElemDesc * info;
3976 htmlParserNodeInfo node_info; 4143 htmlParserNodeInfo node_info;
3977 int failed; 4144 int failed;
3978 int depth; 4145 int depth;
3979 const xmlChar *oldptr; 4146 const xmlChar *oldptr;
3980 4147
3981 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4148 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3982 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4149 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3983 "htmlParseElement: context error\n", NULL, NULL); 4150 "htmlParseElement: context error\n", NULL, NULL);
3984 return; 4151 return;
3985 } 4152 }
4153
4154 if (ctxt->instate == XML_PARSER_EOF)
4155 return;
4156
3986 /* Capture start position */ 4157 /* Capture start position */
3987 if (ctxt->record_info) { 4158 if (ctxt->record_info) {
3988 node_info.begin_pos = ctxt->input->consumed + 4159 node_info.begin_pos = ctxt->input->consumed +
3989 (CUR_PTR - ctxt->input->base); 4160 (CUR_PTR - ctxt->input->base);
3990 node_info.begin_line = ctxt->input->line; 4161 node_info.begin_line = ctxt->input->line;
3991 } 4162 }
3992 4163
3993 failed = htmlParseStartTag(ctxt); 4164 failed = htmlParseStartTag(ctxt);
3994 name = ctxt->name; 4165 name = ctxt->name;
3995 if ((failed == -1) || (name == NULL)) { 4166 if ((failed == -1) || (name == NULL)) {
(...skipping 24 matching lines...) Expand all
4020 4191
4021 if (CUR == '>') { 4192 if (CUR == '>') {
4022 NEXT; 4193 NEXT;
4023 } else { 4194 } else {
4024 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 4195 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4025 "Couldn't find end of Start Tag %s\n", name, NULL); 4196 "Couldn't find end of Start Tag %s\n", name, NULL);
4026 4197
4027 /* 4198 /*
4028 * end of parsing of this node. 4199 * end of parsing of this node.
4029 */ 4200 */
4030 » if (xmlStrEqual(name, ctxt->name)) { 4201 » if (xmlStrEqual(name, ctxt->name)) {
4031 nodePop(ctxt); 4202 nodePop(ctxt);
4032 htmlnamePop(ctxt); 4203 htmlnamePop(ctxt);
4033 » } 4204 » }
4034 4205
4035 /* 4206 /*
4036 * Capture end position and add node 4207 * Capture end position and add node
4037 */ 4208 */
4038 if (ctxt->record_info) { 4209 if (ctxt->record_info) {
4039 node_info.end_pos = ctxt->input->consumed + 4210 node_info.end_pos = ctxt->input->consumed +
4040 (CUR_PTR - ctxt->input->base); 4211 (CUR_PTR - ctxt->input->base);
4041 node_info.end_line = ctxt->input->line; 4212 node_info.end_line = ctxt->input->line;
4042 node_info.node = ctxt->node; 4213 node_info.node = ctxt->node;
4043 xmlParserAddNodeInfo(ctxt, &node_info); 4214 xmlParserAddNodeInfo(ctxt, &node_info);
(...skipping 13 matching lines...) Expand all
4057 4228
4058 /* 4229 /*
4059 * Parse the content of the element: 4230 * Parse the content of the element:
4060 */ 4231 */
4061 currentNode = xmlStrdup(ctxt->name); 4232 currentNode = xmlStrdup(ctxt->name);
4062 depth = ctxt->nameNr; 4233 depth = ctxt->nameNr;
4063 while (IS_CHAR_CH(CUR)) { 4234 while (IS_CHAR_CH(CUR)) {
4064 oldptr = ctxt->input->cur; 4235 oldptr = ctxt->input->cur;
4065 htmlParseContent(ctxt); 4236 htmlParseContent(ctxt);
4066 if (oldptr==ctxt->input->cur) break; 4237 if (oldptr==ctxt->input->cur) break;
4067 » if (ctxt->nameNr < depth) break; 4238 » if (ctxt->nameNr < depth) break;
4068 }» 4239 }
4069 4240
4070 /* 4241 /*
4071 * Capture end position and add node 4242 * Capture end position and add node
4072 */ 4243 */
4073 if ( currentNode != NULL && ctxt->record_info ) { 4244 if ( currentNode != NULL && ctxt->record_info ) {
4074 node_info.end_pos = ctxt->input->consumed + 4245 node_info.end_pos = ctxt->input->consumed +
4075 (CUR_PTR - ctxt->input->base); 4246 (CUR_PTR - ctxt->input->base);
4076 node_info.end_line = ctxt->input->line; 4247 node_info.end_line = ctxt->input->line;
4077 node_info.node = ctxt->node; 4248 node_info.node = ctxt->node;
4078 xmlParserAddNodeInfo(ctxt, &node_info); 4249 xmlParserAddNodeInfo(ctxt, &node_info);
4079 } 4250 }
4080 if (!IS_CHAR_CH(CUR)) { 4251 if (!IS_CHAR_CH(CUR)) {
4081 htmlAutoCloseOnEnd(ctxt); 4252 htmlAutoCloseOnEnd(ctxt);
4082 } 4253 }
4083 4254
4084 if (currentNode != NULL) 4255 if (currentNode != NULL)
4085 xmlFree(currentNode); 4256 xmlFree(currentNode);
4086 } 4257 }
4087 4258
4259 static void
4260 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4261 /*
4262 * Capture end position and add node
4263 */
4264 if ( ctxt->node != NULL && ctxt->record_info ) {
4265 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4266 (CUR_PTR - ctxt->input->base);
4267 ctxt->nodeInfo->end_line = ctxt->input->line;
4268 ctxt->nodeInfo->node = ctxt->node;
4269 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4270 htmlNodeInfoPop(ctxt);
4271 }
4272 if (!IS_CHAR_CH(CUR)) {
4273 htmlAutoCloseOnEnd(ctxt);
4274 }
4275 }
4276
4277 /**
4278 * htmlParseElementInternal:
4279 * @ctxt: an HTML parser context
4280 *
4281 * parse an HTML element, new version, non recursive
4282 *
4283 * [39] element ::= EmptyElemTag | STag content ETag
4284 *
4285 * [41] Attribute ::= Name Eq AttValue
4286 */
4287
4288 static void
4289 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4290 const xmlChar *name;
4291 const htmlElemDesc * info;
4292 htmlParserNodeInfo node_info;
4293 int failed;
4294
4295 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4296 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4297 "htmlParseElementInternal: context error\n", NULL, NULL);
4298 return;
4299 }
4300
4301 if (ctxt->instate == XML_PARSER_EOF)
4302 return;
4303
4304 /* Capture start position */
4305 if (ctxt->record_info) {
4306 node_info.begin_pos = ctxt->input->consumed +
4307 (CUR_PTR - ctxt->input->base);
4308 node_info.begin_line = ctxt->input->line;
4309 }
4310
4311 failed = htmlParseStartTag(ctxt);
4312 name = ctxt->name;
4313 if ((failed == -1) || (name == NULL)) {
4314 if (CUR == '>')
4315 NEXT;
4316 return;
4317 }
4318
4319 /*
4320 * Lookup the info for that element.
4321 */
4322 info = htmlTagLookup(name);
4323 if (info == NULL) {
4324 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4325 "Tag %s invalid\n", name, NULL);
4326 }
4327
4328 /*
4329 * Check for an Empty Element labeled the XML/SGML way
4330 */
4331 if ((CUR == '/') && (NXT(1) == '>')) {
4332 SKIP(2);
4333 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4334 ctxt->sax->endElement(ctxt->userData, name);
4335 htmlnamePop(ctxt);
4336 return;
4337 }
4338
4339 if (CUR == '>') {
4340 NEXT;
4341 } else {
4342 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4343 "Couldn't find end of Start Tag %s\n", name, NULL);
4344
4345 /*
4346 * end of parsing of this node.
4347 */
4348 if (xmlStrEqual(name, ctxt->name)) {
4349 nodePop(ctxt);
4350 htmlnamePop(ctxt);
4351 }
4352
4353 if (ctxt->record_info)
4354 htmlNodeInfoPush(ctxt, &node_info);
4355 htmlParserFinishElementParsing(ctxt);
4356 return;
4357 }
4358
4359 /*
4360 * Check for an Empty Element from DTD definition
4361 */
4362 if ((info != NULL) && (info->empty)) {
4363 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4364 ctxt->sax->endElement(ctxt->userData, name);
4365 htmlnamePop(ctxt);
4366 return;
4367 }
4368
4369 if (ctxt->record_info)
4370 htmlNodeInfoPush(ctxt, &node_info);
4371 }
4372
4373 /**
4374 * htmlParseContentInternal:
4375 * @ctxt: an HTML parser context
4376 *
4377 * Parse a content: comment, sub-element, reference or text.
4378 * New version for non recursive htmlParseElementInternal
4379 */
4380
4381 static void
4382 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4383 xmlChar *currentNode;
4384 int depth;
4385 const xmlChar *name;
4386
4387 currentNode = xmlStrdup(ctxt->name);
4388 depth = ctxt->nameNr;
4389 while (1) {
4390 long cons = ctxt->nbChars;
4391
4392 GROW;
4393
4394 if (ctxt->instate == XML_PARSER_EOF)
4395 break;
4396
4397 /*
4398 * Our tag or one of it's parent or children is ending.
4399 */
4400 if ((CUR == '<') && (NXT(1) == '/')) {
4401 if (htmlParseEndTag(ctxt) &&
4402 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4403 if (currentNode != NULL)
4404 xmlFree(currentNode);
4405
4406 currentNode = xmlStrdup(ctxt->name);
4407 depth = ctxt->nameNr;
4408 }
4409 continue; /* while */
4410 }
4411
4412 else if ((CUR == '<') &&
4413 ((IS_ASCII_LETTER(NXT(1))) ||
4414 (NXT(1) == '_') || (NXT(1) == ':'))) {
4415 name = htmlParseHTMLName_nonInvasive(ctxt);
4416 if (name == NULL) {
4417 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4418 "htmlParseStartTag: invalid element name\n",
4419 NULL, NULL);
4420 /* Dump the bogus tag like browsers do */
4421 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4422 NEXT;
4423
4424 htmlParserFinishElementParsing(ctxt);
4425 if (currentNode != NULL)
4426 xmlFree(currentNode);
4427
4428 currentNode = xmlStrdup(ctxt->name);
4429 depth = ctxt->nameNr;
4430 continue;
4431 }
4432
4433 if (ctxt->name != NULL) {
4434 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4435 htmlAutoClose(ctxt, name);
4436 continue;
4437 }
4438 }
4439 }
4440
4441 /*
4442 * Has this node been popped out during parsing of
4443 * the next element
4444 */
4445 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4446 (!xmlStrEqual(currentNode, ctxt->name)))
4447 {
4448 htmlParserFinishElementParsing(ctxt);
4449 if (currentNode != NULL) xmlFree(currentNode);
4450
4451 currentNode = xmlStrdup(ctxt->name);
4452 depth = ctxt->nameNr;
4453 continue;
4454 }
4455
4456 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4457 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4458 /*
4459 * Handle SCRIPT/STYLE separately
4460 */
4461 htmlParseScript(ctxt);
4462 } else {
4463 /*
4464 * Sometimes DOCTYPE arrives in the middle of the document
4465 */
4466 if ((CUR == '<') && (NXT(1) == '!') &&
4467 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4468 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4469 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4470 (UPP(8) == 'E')) {
4471 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4472 "Misplaced DOCTYPE declaration\n",
4473 BAD_CAST "DOCTYPE" , NULL);
4474 htmlParseDocTypeDecl(ctxt);
4475 }
4476
4477 /*
4478 * First case : a comment
4479 */
4480 if ((CUR == '<') && (NXT(1) == '!') &&
4481 (NXT(2) == '-') && (NXT(3) == '-')) {
4482 htmlParseComment(ctxt);
4483 }
4484
4485 /*
4486 * Second case : a Processing Instruction.
4487 */
4488 else if ((CUR == '<') && (NXT(1) == '?')) {
4489 htmlParsePI(ctxt);
4490 }
4491
4492 /*
4493 * Third case : a sub-element.
4494 */
4495 else if (CUR == '<') {
4496 htmlParseElementInternal(ctxt);
4497 if (currentNode != NULL) xmlFree(currentNode);
4498
4499 currentNode = xmlStrdup(ctxt->name);
4500 depth = ctxt->nameNr;
4501 }
4502
4503 /*
4504 * Fourth case : a reference. If if has not been resolved,
4505 * parsing returns it's Name, create the node
4506 */
4507 else if (CUR == '&') {
4508 htmlParseReference(ctxt);
4509 }
4510
4511 /*
4512 * Fifth case : end of the resource
4513 */
4514 else if (CUR == 0) {
4515 htmlAutoCloseOnEnd(ctxt);
4516 break;
4517 }
4518
4519 /*
4520 * Last case, text. Note that References are handled directly.
4521 */
4522 else {
4523 htmlParseCharData(ctxt);
4524 }
4525
4526 if (cons == ctxt->nbChars) {
4527 if (ctxt->node != NULL) {
4528 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4529 "detected an error in element content\n",
4530 NULL, NULL);
4531 }
4532 break;
4533 }
4534 }
4535 GROW;
4536 }
4537 if (currentNode != NULL) xmlFree(currentNode);
4538 }
4539
4540 /**
4541 * htmlParseContent:
4542 * @ctxt: an HTML parser context
4543 *
4544 * Parse a content: comment, sub-element, reference or text.
4545 * This is the entry point when called from parser.c
4546 */
4547
4548 void
4549 __htmlParseContent(void *ctxt) {
4550 if (ctxt != NULL)
4551 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4552 }
4553
4088 /** 4554 /**
4089 * htmlParseDocument: 4555 * htmlParseDocument:
4090 * @ctxt: an HTML parser context 4556 * @ctxt: an HTML parser context
4091 * 4557 *
4092 * parse an HTML document (and build a tree if using the standard SAX 4558 * parse an HTML document (and build a tree if using the standard SAX
4093 * interface). 4559 * interface).
4094 * 4560 *
4095 * Returns 0, -1 in case of error. the parser context is augmented 4561 * Returns 0, -1 in case of error. the parser context is augmented
4096 * as a result of the parsing. 4562 * as a result of the parsing.
4097 */ 4563 */
4098 4564
4099 int 4565 int
4100 htmlParseDocument(htmlParserCtxtPtr ctxt) { 4566 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4567 xmlChar start[4];
4568 xmlCharEncoding enc;
4101 xmlDtdPtr dtd; 4569 xmlDtdPtr dtd;
4102 4570
4103 xmlInitParser(); 4571 xmlInitParser();
4104 4572
4105 htmlDefaultSAXHandlerInit(); 4573 htmlDefaultSAXHandlerInit();
4106 4574
4107 if ((ctxt == NULL) || (ctxt->input == NULL)) { 4575 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4108 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 4576 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4109 "htmlParseDocument: context error\n", NULL, NULL); 4577 "htmlParseDocument: context error\n", NULL, NULL);
4110 return(XML_ERR_INTERNAL_ERROR); 4578 return(XML_ERR_INTERNAL_ERROR);
4111 } 4579 }
4112 ctxt->html = 1; 4580 ctxt->html = 1;
4581 ctxt->linenumbers = 1;
4113 GROW; 4582 GROW;
4114 /* 4583 /*
4115 * SAX: beginning of the document processing. 4584 * SAX: beginning of the document processing.
4116 */ 4585 */
4117 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) 4586 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4118 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); 4587 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4119 4588
4589 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4590 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4591 /*
4592 * Get the 4 first bytes and decode the charset
4593 * if enc != XML_CHAR_ENCODING_NONE
4594 * plug some encoding conversion routines.
4595 */
4596 start[0] = RAW;
4597 start[1] = NXT(1);
4598 start[2] = NXT(2);
4599 start[3] = NXT(3);
4600 enc = xmlDetectCharEncoding(&start[0], 4);
4601 if (enc != XML_CHAR_ENCODING_NONE) {
4602 xmlSwitchEncoding(ctxt, enc);
4603 }
4604 }
4605
4120 /* 4606 /*
4121 * Wipe out everything which is before the first '<' 4607 * Wipe out everything which is before the first '<'
4122 */ 4608 */
4123 SKIP_BLANKS; 4609 SKIP_BLANKS;
4124 if (CUR == 0) { 4610 if (CUR == 0) {
4125 » htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY, 4611 » htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4126 "Document is empty\n", NULL, NULL); 4612 "Document is empty\n", NULL, NULL);
4127 } 4613 }
4128 4614
4129 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) 4615 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4130 ctxt->sax->startDocument(ctxt->userData); 4616 ctxt->sax->startDocument(ctxt->userData);
4131 4617
4132 4618
4133 /* 4619 /*
4134 * Parse possible comments and PIs before any content 4620 * Parse possible comments and PIs before any content
4135 */ 4621 */
4136 while (((CUR == '<') && (NXT(1) == '!') && 4622 while (((CUR == '<') && (NXT(1) == '!') &&
4137 (NXT(2) == '-') && (NXT(3) == '-')) || 4623 (NXT(2) == '-') && (NXT(3) == '-')) ||
4138 ((CUR == '<') && (NXT(1) == '?'))) { 4624 ((CUR == '<') && (NXT(1) == '?'))) {
4139 htmlParseComment(ctxt);» 4625 htmlParseComment(ctxt);
4140 htmlParsePI(ctxt);» 4626 htmlParsePI(ctxt);
4141 SKIP_BLANKS; 4627 SKIP_BLANKS;
4142 }» 4628 }
4143 4629
4144 4630
4145 /* 4631 /*
4146 * Then possibly doc type declaration(s) and more Misc 4632 * Then possibly doc type declaration(s) and more Misc
4147 * (doctypedecl Misc*)? 4633 * (doctypedecl Misc*)?
4148 */ 4634 */
4149 if ((CUR == '<') && (NXT(1) == '!') && 4635 if ((CUR == '<') && (NXT(1) == '!') &&
4150 (UPP(2) == 'D') && (UPP(3) == 'O') && 4636 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4151 (UPP(4) == 'C') && (UPP(5) == 'T') && 4637 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4152 (UPP(6) == 'Y') && (UPP(7) == 'P') && 4638 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4153 (UPP(8) == 'E')) { 4639 (UPP(8) == 'E')) {
4154 htmlParseDocTypeDecl(ctxt); 4640 htmlParseDocTypeDecl(ctxt);
4155 } 4641 }
4156 SKIP_BLANKS; 4642 SKIP_BLANKS;
4157 4643
4158 /* 4644 /*
4159 * Parse possible comments and PIs before any content 4645 * Parse possible comments and PIs before any content
4160 */ 4646 */
4161 while (((CUR == '<') && (NXT(1) == '!') && 4647 while (((CUR == '<') && (NXT(1) == '!') &&
4162 (NXT(2) == '-') && (NXT(3) == '-')) || 4648 (NXT(2) == '-') && (NXT(3) == '-')) ||
4163 ((CUR == '<') && (NXT(1) == '?'))) { 4649 ((CUR == '<') && (NXT(1) == '?'))) {
4164 htmlParseComment(ctxt);» 4650 htmlParseComment(ctxt);
4165 htmlParsePI(ctxt);» 4651 htmlParsePI(ctxt);
4166 SKIP_BLANKS; 4652 SKIP_BLANKS;
4167 }» 4653 }
4168 4654
4169 /* 4655 /*
4170 * Time to start parsing the tree itself 4656 * Time to start parsing the tree itself
4171 */ 4657 */
4172 htmlParseContent(ctxt); 4658 htmlParseContentInternal(ctxt);
4173 4659
4174 /* 4660 /*
4175 * autoclose 4661 * autoclose
4176 */ 4662 */
4177 if (CUR == 0) 4663 if (CUR == 0)
4178 htmlAutoCloseOnEnd(ctxt); 4664 htmlAutoCloseOnEnd(ctxt);
4179 4665
4180 4666
4181 /* 4667 /*
4182 * SAX: end of the document processing. 4668 * SAX: end of the document processing.
4183 */ 4669 */
4184 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 4670 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4185 ctxt->sax->endDocument(ctxt->userData); 4671 ctxt->sax->endDocument(ctxt->userData);
4186 4672
4187 if (ctxt->myDoc != NULL) { 4673 if (ctxt->myDoc != NULL) {
4188 dtd = xmlGetIntSubset(ctxt->myDoc); 4674 dtd = xmlGetIntSubset(ctxt->myDoc);
4189 if (dtd == NULL) 4675 if (dtd == NULL)
4190 » ctxt->myDoc->intSubset = 4676 » ctxt->myDoc->intSubset =
4191 » » xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 4677 » » xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4192 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 4678 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4193 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 4679 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4194 } 4680 }
4195 if (! ctxt->wellFormed) return(-1); 4681 if (! ctxt->wellFormed) return(-1);
4196 return(0); 4682 return(0);
4197 } 4683 }
4198 4684
4199 4685
4200 /************************************************************************ 4686 /************************************************************************
4201 * * 4687 * *
(...skipping 25 matching lines...) Expand all
4227 } 4713 }
4228 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler)); 4714 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4229 if (sax == NULL) { 4715 if (sax == NULL) {
4230 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4716 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4231 return(-1); 4717 return(-1);
4232 } 4718 }
4233 else 4719 else
4234 memset(sax, 0, sizeof(htmlSAXHandler)); 4720 memset(sax, 0, sizeof(htmlSAXHandler));
4235 4721
4236 /* Allocate the Input stack */ 4722 /* Allocate the Input stack */
4237 ctxt->inputTab = (htmlParserInputPtr *) 4723 ctxt->inputTab = (htmlParserInputPtr *)
4238 xmlMalloc(5 * sizeof(htmlParserInputPtr)); 4724 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4239 if (ctxt->inputTab == NULL) { 4725 if (ctxt->inputTab == NULL) {
4240 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4726 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4241 ctxt->inputNr = 0; 4727 ctxt->inputNr = 0;
4242 ctxt->inputMax = 0; 4728 ctxt->inputMax = 0;
4243 ctxt->input = NULL; 4729 ctxt->input = NULL;
4244 return(-1); 4730 return(-1);
4245 } 4731 }
4246 ctxt->inputNr = 0; 4732 ctxt->inputNr = 0;
4247 ctxt->inputMax = 5; 4733 ctxt->inputMax = 5;
(...skipping 17 matching lines...) Expand all
4265 } 4751 }
4266 ctxt->nodeNr = 0; 4752 ctxt->nodeNr = 0;
4267 ctxt->nodeMax = 10; 4753 ctxt->nodeMax = 10;
4268 ctxt->node = NULL; 4754 ctxt->node = NULL;
4269 4755
4270 /* Allocate the Name stack */ 4756 /* Allocate the Name stack */
4271 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *)); 4757 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4272 if (ctxt->nameTab == NULL) { 4758 if (ctxt->nameTab == NULL) {
4273 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); 4759 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4274 ctxt->nameNr = 0; 4760 ctxt->nameNr = 0;
4275 » ctxt->nameMax = 10; 4761 » ctxt->nameMax = 0;
4276 ctxt->name = NULL; 4762 ctxt->name = NULL;
4277 ctxt->nodeNr = 0; 4763 ctxt->nodeNr = 0;
4278 ctxt->nodeMax = 0; 4764 ctxt->nodeMax = 0;
4279 ctxt->node = NULL; 4765 ctxt->node = NULL;
4280 ctxt->inputNr = 0; 4766 ctxt->inputNr = 0;
4281 ctxt->inputMax = 0; 4767 ctxt->inputMax = 0;
4282 ctxt->input = NULL; 4768 ctxt->input = NULL;
4283 return(-1); 4769 return(-1);
4284 } 4770 }
4285 ctxt->nameNr = 0; 4771 ctxt->nameNr = 0;
4286 ctxt->nameMax = 10; 4772 ctxt->nameMax = 10;
4287 ctxt->name = NULL; 4773 ctxt->name = NULL;
4288 4774
4775 ctxt->nodeInfoTab = NULL;
4776 ctxt->nodeInfoNr = 0;
4777 ctxt->nodeInfoMax = 0;
4778
4289 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler; 4779 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4290 else { 4780 else {
4291 ctxt->sax = sax; 4781 ctxt->sax = sax;
4292 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 4782 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4293 } 4783 }
4294 ctxt->userData = ctxt; 4784 ctxt->userData = ctxt;
4295 ctxt->myDoc = NULL; 4785 ctxt->myDoc = NULL;
4296 ctxt->wellFormed = 1; 4786 ctxt->wellFormed = 1;
4297 ctxt->replaceEntities = 0; 4787 ctxt->replaceEntities = 0;
4298 ctxt->linenumbers = xmlLineNumbersDefaultValue; 4788 ctxt->linenumbers = xmlLineNumbersDefaultValue;
(...skipping 126 matching lines...) Expand 10 before | Expand all | Expand 10 after
4425 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding); 4915 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4426 4916
4427 enc = xmlParseCharEncoding(encoding); 4917 enc = xmlParseCharEncoding(encoding);
4428 /* 4918 /*
4429 * registered set of known encodings 4919 * registered set of known encodings
4430 */ 4920 */
4431 if (enc != XML_CHAR_ENCODING_ERROR) { 4921 if (enc != XML_CHAR_ENCODING_ERROR) {
4432 xmlSwitchEncoding(ctxt, enc); 4922 xmlSwitchEncoding(ctxt, enc);
4433 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { 4923 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4434 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 4924 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4435 » » "Unsupported encoding %s\n", 4925 » » "Unsupported encoding %s\n",
4436 (const xmlChar *) encoding, NULL); 4926 (const xmlChar *) encoding, NULL);
4437 } 4927 }
4438 } else { 4928 } else {
4439 /* 4929 /*
4440 * fallback for unknown encodings 4930 * fallback for unknown encodings
4441 */ 4931 */
4442 handler = xmlFindCharEncodingHandler((const char *) encoding); 4932 handler = xmlFindCharEncodingHandler((const char *) encoding);
4443 if (handler != NULL) { 4933 if (handler != NULL) {
4444 xmlSwitchToEncoding(ctxt, handler); 4934 xmlSwitchToEncoding(ctxt, handler);
4445 } else { 4935 } else {
4446 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, 4936 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4447 "Unsupported encoding %s\n", 4937 "Unsupported encoding %s\n",
4448 (const xmlChar *) encoding, NULL); 4938 (const xmlChar *) encoding, NULL);
4449 } 4939 }
4450 } 4940 }
4451 } 4941 }
4452 return(ctxt); 4942 return(ctxt);
4453 } 4943 }
4454 4944
4455 #ifdef LIBXML_PUSH_ENABLED 4945 #ifdef LIBXML_PUSH_ENABLED
4456 /************************************************************************ 4946 /************************************************************************
4457 * * 4947 * *
4458 * » » Progressive parsing interfaces» » » » * 4948 *» Progressive parsing interfaces» » » » *
4459 * * 4949 * *
4460 ************************************************************************/ 4950 ************************************************************************/
4461 4951
4462 /** 4952 /**
4463 * htmlParseLookupSequence: 4953 * htmlParseLookupSequence:
4464 * @ctxt: an HTML parser context 4954 * @ctxt: an HTML parser context
4465 * @first: the first char to lookup 4955 * @first: the first char to lookup
4466 * @next: the next char to lookup or zero 4956 * @next: the next char to lookup or zero
4467 * @third: the next char to lookup or zero 4957 * @third: the next char to lookup or zero
4468 * @comment: flag to force checking inside comments 4958 * @comment: flag to force checking inside comments
4469 * 4959 *
4470 * Try to find if a sequence (first, next, third) or just (first next) or 4960 * Try to find if a sequence (first, next, third) or just (first next) or
4471 * (first) is available in the input stream. 4961 * (first) is available in the input stream.
4472 * This function has a side effect of (possibly) incrementing ctxt->checkIndex 4962 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4473 * to avoid rescanning sequences of bytes, it DOES change the state of the 4963 * to avoid rescanning sequences of bytes, it DOES change the state of the
4474 * parser, do not use liberally. 4964 * parser, do not use liberally.
4475 * This is basically similar to xmlParseLookupSequence() 4965 * This is basically similar to xmlParseLookupSequence()
4476 * 4966 *
4477 * Returns the index to the current parsing point if the full sequence 4967 * Returns the index to the current parsing point if the full sequence
4478 * is available, -1 otherwise. 4968 * is available, -1 otherwise.
4479 */ 4969 */
4480 static int 4970 static int
4481 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, 4971 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4482 xmlChar next, xmlChar third, int iscomment) { 4972 xmlChar next, xmlChar third, int iscomment,
4973 int ignoreattrval)
4974 {
4483 int base, len; 4975 int base, len;
4484 htmlParserInputPtr in; 4976 htmlParserInputPtr in;
4485 const xmlChar *buf; 4977 const xmlChar *buf;
4486 int incomment = 0; 4978 int incomment = 0;
4979 int invalue = 0;
4980 char valdellim = 0x0;
4487 4981
4488 in = ctxt->input; 4982 in = ctxt->input;
4489 if (in == NULL) return(-1); 4983 if (in == NULL)
4984 return (-1);
4985
4490 base = in->cur - in->base; 4986 base = in->cur - in->base;
4491 if (base < 0) return(-1); 4987 if (base < 0)
4988 return (-1);
4989
4492 if (ctxt->checkIndex > base) 4990 if (ctxt->checkIndex > base)
4493 base = ctxt->checkIndex; 4991 base = ctxt->checkIndex;
4992
4494 if (in->buf == NULL) { 4993 if (in->buf == NULL) {
4495 » buf = in->base; 4994 buf = in->base;
4496 » len = in->length; 4995 len = in->length;
4497 } else { 4996 } else {
4498 » buf = in->buf->buffer->content; 4997 buf = in->buf->buffer->content;
4499 » len = in->buf->buffer->use; 4998 len = in->buf->buffer->use;
4500 } 4999 }
5000
4501 /* take into account the sequence length */ 5001 /* take into account the sequence length */
4502 if (third) len -= 2; 5002 if (third)
4503 else if (next) len --; 5003 len -= 2;
4504 for (;base < len;base++) { 5004 else if (next)
4505 » if (!incomment && (base + 4 < len) && !iscomment) { 5005 len--;
4506 » if ((buf[base] == '<') && (buf[base + 1] == '!') && 5006 for (; base < len; base++) {
4507 » » (buf[base + 2] == '-') && (buf[base + 3] == '-')) { 5007 if ((!incomment) && (base + 4 < len) && (!iscomment)) {
4508 » » incomment = 1; 5008 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4509 » » /* do not increment past <! - some people use <!--> */ 5009 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4510 » » base += 2; 5010 incomment = 1;
4511 » } 5011 /* do not increment past <! - some people use <!--> */
4512 » } 5012 base += 2;
4513 » if (incomment) { 5013 }
4514 » if (base + 3 > len) 5014 }
4515 » » return(-1); 5015 if (ignoreattrval) {
4516 » if ((buf[base] == '-') && (buf[base + 1] == '-') && 5016 if (buf[base] == '"' || buf[base] == '\'') {
4517 » » (buf[base + 2] == '>')) { 5017 if (invalue) {
4518 » » incomment = 0; 5018 if (buf[base] == valdellim) {
4519 » » base += 2; 5019 invalue = 0;
4520 » } 5020 continue;
4521 » continue; 5021 }
4522 » } 5022 } else {
5023 valdellim = buf[base];
5024 invalue = 1;
5025 continue;
5026 }
5027 } else if (invalue) {
5028 continue;
5029 }
5030 }
5031 if (incomment) {
5032 if (base + 3 > len)
5033 return (-1);
5034 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5035 (buf[base + 2] == '>')) {
5036 incomment = 0;
5037 base += 2;
5038 }
5039 continue;
5040 }
4523 if (buf[base] == first) { 5041 if (buf[base] == first) {
4524 » if (third != 0) { 5042 if (third != 0) {
4525 » » if ((buf[base + 1] != next) || 5043 if ((buf[base + 1] != next) || (buf[base + 2] != third))
4526 » » (buf[base + 2] != third)) continue; 5044 continue;
4527 » } else if (next != 0) { 5045 } else if (next != 0) {
4528 » » if (buf[base + 1] != next) continue; 5046 if (buf[base + 1] != next)
4529 » } 5047 continue;
4530 » ctxt->checkIndex = 0; 5048 }
5049 ctxt->checkIndex = 0;
4531 #ifdef DEBUG_PUSH 5050 #ifdef DEBUG_PUSH
4532 » if (next == 0) 5051 if (next == 0)
4533 » » xmlGenericError(xmlGenericErrorContext, 5052 xmlGenericError(xmlGenericErrorContext,
4534 » » » "HPP: lookup '%c' found at %d\n", 5053 "HPP: lookup '%c' found at %d\n",
4535 » » » first, base); 5054 first, base);
4536 » else if (third == 0) 5055 else if (third == 0)
4537 » » xmlGenericError(xmlGenericErrorContext, 5056 xmlGenericError(xmlGenericErrorContext,
4538 » » » "HPP: lookup '%c%c' found at %d\n", 5057 "HPP: lookup '%c%c' found at %d\n",
4539 » » » first, next, base); 5058 first, next, base);
4540 » else 5059 else
4541 » » xmlGenericError(xmlGenericErrorContext, 5060 xmlGenericError(xmlGenericErrorContext,
4542 » » » "HPP: lookup '%c%c%c' found at %d\n", 5061 "HPP: lookup '%c%c%c' found at %d\n",
4543 » » » first, next, third, base); 5062 first, next, third, base);
4544 #endif 5063 #endif
4545 » return(base - (in->cur - in->base)); 5064 return (base - (in->cur - in->base));
4546 » } 5065 }
4547 } 5066 }
4548 ctxt->checkIndex = base; 5067 if ((!incomment) && (!invalue))
5068 ctxt->checkIndex = base;
4549 #ifdef DEBUG_PUSH 5069 #ifdef DEBUG_PUSH
4550 if (next == 0) 5070 if (next == 0)
4551 » xmlGenericError(xmlGenericErrorContext, 5071 xmlGenericError(xmlGenericErrorContext,
4552 » » "HPP: lookup '%c' failed\n", first); 5072 "HPP: lookup '%c' failed\n", first);
4553 else if (third == 0) 5073 else if (third == 0)
4554 » xmlGenericError(xmlGenericErrorContext, 5074 xmlGenericError(xmlGenericErrorContext,
4555 » » "HPP: lookup '%c%c' failed\n", first, next); 5075 "HPP: lookup '%c%c' failed\n", first, next);
4556 else» 5076 else
4557 » xmlGenericError(xmlGenericErrorContext, 5077 xmlGenericError(xmlGenericErrorContext,
4558 » » "HPP: lookup '%c%c%c' failed\n", first, next, third); 5078 "HPP: lookup '%c%c%c' failed\n", first, next,
5079 third);
4559 #endif 5080 #endif
4560 return(-1); 5081 return (-1);
4561 } 5082 }
4562 5083
4563 /** 5084 /**
5085 * htmlParseLookupChars:
5086 * @ctxt: an HTML parser context
5087 * @stop: Array of chars, which stop the lookup.
5088 * @stopLen: Length of stop-Array
5089 *
5090 * Try to find whether any character of the stop array is present in the
5091 * input stream.
5092 * This function has the side effect of (possibly) updating ctxt->checkIndex
5093 * to avoid rescanning sequences of bytes; it DOES change the state of the
5094 * parser, so do not use it liberally.
5095 *
5096 * Returns the offset of the first stop character found, relative to the
5097 * current parsing point, or -1 if none is available.
5098 */
5099 static int
5100 htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5101 int stopLen)
5102 {
5103 int base, len;
5104 htmlParserInputPtr in;
5105 const xmlChar *buf;
5106 int incomment = 0;
5107 int i;
5108
5109 in = ctxt->input;
5110 if (in == NULL)
5111 return (-1);
5112
5113 base = in->cur - in->base;
5114 if (base < 0)
5115 return (-1);
5116
5117 if (ctxt->checkIndex > base)
5118 base = ctxt->checkIndex;
5119
5120 if (in->buf == NULL) {
5121 buf = in->base;
5122 len = in->length;
5123 } else {
5124 buf = in->buf->buffer->content;
5125 len = in->buf->buffer->use;
5126 }
5127
5128 for (; base < len; base++) {
5129 if (!incomment && (base + 4 < len)) {
5130 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5131 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5132 incomment = 1;
5133 /* do not increment past <! - some people use <!--> */
5134 base += 2;
5135 }
5136 }
5137 if (incomment) {
5138 if (base + 3 > len)
5139 return (-1);
5140 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5141 (buf[base + 2] == '>')) {
5142 incomment = 0;
5143 base += 2;
5144 }
5145 continue;
5146 }
5147 for (i = 0; i < stopLen; ++i) {
5148 if (buf[base] == stop[i]) {
5149 ctxt->checkIndex = 0;
5150 return (base - (in->cur - in->base));
5151 }
5152 }
5153 }
5154 ctxt->checkIndex = base;
5155 return (-1);
5156 }
5157
5158 /**
4564 * htmlParseTryOrFinish: 5159 * htmlParseTryOrFinish:
4565 * @ctxt: an HTML parser context 5160 * @ctxt: an HTML parser context
4566 * @terminate: last chunk indicator 5161 * @terminate: last chunk indicator
4567 * 5162 *
4568 * Try to progress on parsing 5163 * Try to progress on parsing
4569 * 5164 *
4570 * Returns zero if no parsing was possible 5165 * Returns zero if no parsing was possible
4571 */ 5166 */
4572 static int 5167 static int
4573 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { 5168 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(...skipping 58 matching lines...) Expand 10 before | Expand all | Expand 10 after
4632 while (1) { 5227 while (1) {
4633 5228
4634 in = ctxt->input; 5229 in = ctxt->input;
4635 if (in == NULL) break; 5230 if (in == NULL) break;
4636 if (in->buf == NULL) 5231 if (in->buf == NULL)
4637 avail = in->length - (in->cur - in->base); 5232 avail = in->length - (in->cur - in->base);
4638 else 5233 else
4639 avail = in->buf->buffer->use - (in->cur - in->base); 5234 avail = in->buf->buffer->use - (in->cur - in->base);
4640 if ((avail == 0) && (terminate)) { 5235 if ((avail == 0) && (terminate)) {
4641 htmlAutoCloseOnEnd(ctxt); 5236 htmlAutoCloseOnEnd(ctxt);
4642 » if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 5237 » if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4643 /* 5238 /*
4644 * SAX: end of the document processing. 5239 * SAX: end of the document processing.
4645 */ 5240 */
4646 ctxt->instate = XML_PARSER_EOF; 5241 ctxt->instate = XML_PARSER_EOF;
4647 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5242 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4648 ctxt->sax->endDocument(ctxt->userData); 5243 ctxt->sax->endDocument(ctxt->userData);
4649 } 5244 }
4650 } 5245 }
4651 if (avail < 1) 5246 if (avail < 1)
4652 goto done; 5247 goto done;
(...skipping 29 matching lines...) Expand all
4682 ctxt->sax->startDocument(ctxt->userData); 5277 ctxt->sax->startDocument(ctxt->userData);
4683 5278
4684 cur = in->cur[0]; 5279 cur = in->cur[0];
4685 next = in->cur[1]; 5280 next = in->cur[1];
4686 if ((cur == '<') && (next == '!') && 5281 if ((cur == '<') && (next == '!') &&
4687 (UPP(2) == 'D') && (UPP(3) == 'O') && 5282 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4688 (UPP(4) == 'C') && (UPP(5) == 'T') && 5283 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4689 (UPP(6) == 'Y') && (UPP(7) == 'P') && 5284 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4690 (UPP(8) == 'E')) { 5285 (UPP(8) == 'E')) {
4691 if ((!terminate) && 5286 if ((!terminate) &&
4692 » » (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) 5287 » » (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
4693 goto done; 5288 goto done;
4694 #ifdef DEBUG_PUSH 5289 #ifdef DEBUG_PUSH
4695 xmlGenericError(xmlGenericErrorContext, 5290 xmlGenericError(xmlGenericErrorContext,
4696 "HPP: Parsing internal subset\n"); 5291 "HPP: Parsing internal subset\n");
4697 #endif 5292 #endif
4698 htmlParseDocTypeDecl(ctxt); 5293 htmlParseDocTypeDecl(ctxt);
4699 ctxt->instate = XML_PARSER_PROLOG; 5294 ctxt->instate = XML_PARSER_PROLOG;
4700 #ifdef DEBUG_PUSH 5295 #ifdef DEBUG_PUSH
4701 xmlGenericError(xmlGenericErrorContext, 5296 xmlGenericError(xmlGenericErrorContext,
4702 "HPP: entering PROLOG\n"); 5297 "HPP: entering PROLOG\n");
(...skipping 12 matching lines...) Expand all
4715 avail = in->length - (in->cur - in->base); 5310 avail = in->length - (in->cur - in->base);
4716 else 5311 else
4717 avail = in->buf->buffer->use - (in->cur - in->base); 5312 avail = in->buf->buffer->use - (in->cur - in->base);
4718 if (avail < 2) 5313 if (avail < 2)
4719 goto done; 5314 goto done;
4720 cur = in->cur[0]; 5315 cur = in->cur[0];
4721 next = in->cur[1]; 5316 next = in->cur[1];
4722 if ((cur == '<') && (next == '!') && 5317 if ((cur == '<') && (next == '!') &&
4723 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5318 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4724 if ((!terminate) && 5319 if ((!terminate) &&
4725 » » (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0)) 5320 » » (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0) )
4726 goto done; 5321 goto done;
4727 #ifdef DEBUG_PUSH 5322 #ifdef DEBUG_PUSH
4728 xmlGenericError(xmlGenericErrorContext, 5323 xmlGenericError(xmlGenericErrorContext,
4729 "HPP: Parsing Comment\n"); 5324 "HPP: Parsing Comment\n");
4730 #endif 5325 #endif
4731 htmlParseComment(ctxt); 5326 htmlParseComment(ctxt);
4732 ctxt->instate = XML_PARSER_MISC; 5327 ctxt->instate = XML_PARSER_MISC;
4733 } else if ((cur == '<') && (next == '?')) { 5328 } else if ((cur == '<') && (next == '?')) {
4734 if ((!terminate) && 5329 if ((!terminate) &&
4735 » » (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) 5330 » » (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
4736 goto done; 5331 goto done;
4737 #ifdef DEBUG_PUSH 5332 #ifdef DEBUG_PUSH
4738 xmlGenericError(xmlGenericErrorContext, 5333 xmlGenericError(xmlGenericErrorContext,
4739 "HPP: Parsing PI\n"); 5334 "HPP: Parsing PI\n");
4740 #endif 5335 #endif
4741 htmlParsePI(ctxt); 5336 htmlParsePI(ctxt);
4742 ctxt->instate = XML_PARSER_MISC; 5337 ctxt->instate = XML_PARSER_MISC;
4743 } else if ((cur == '<') && (next == '!') && 5338 } else if ((cur == '<') && (next == '!') &&
4744 (UPP(2) == 'D') && (UPP(3) == 'O') && 5339 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4745 (UPP(4) == 'C') && (UPP(5) == 'T') && 5340 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4746 (UPP(6) == 'Y') && (UPP(7) == 'P') && 5341 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4747 (UPP(8) == 'E')) { 5342 (UPP(8) == 'E')) {
4748 if ((!terminate) && 5343 if ((!terminate) &&
4749 » » (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) 5344 » » (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
4750 goto done; 5345 goto done;
4751 #ifdef DEBUG_PUSH 5346 #ifdef DEBUG_PUSH
4752 xmlGenericError(xmlGenericErrorContext, 5347 xmlGenericError(xmlGenericErrorContext,
4753 "HPP: Parsing internal subset\n"); 5348 "HPP: Parsing internal subset\n");
4754 #endif 5349 #endif
4755 htmlParseDocTypeDecl(ctxt); 5350 htmlParseDocTypeDecl(ctxt);
4756 ctxt->instate = XML_PARSER_PROLOG; 5351 ctxt->instate = XML_PARSER_PROLOG;
4757 #ifdef DEBUG_PUSH 5352 #ifdef DEBUG_PUSH
4758 xmlGenericError(xmlGenericErrorContext, 5353 xmlGenericError(xmlGenericErrorContext,
4759 "HPP: entering PROLOG\n"); 5354 "HPP: entering PROLOG\n");
4760 #endif 5355 #endif
4761 } else if ((cur == '<') && (next == '!') && 5356 } else if ((cur == '<') && (next == '!') &&
4762 (avail < 9)) { 5357 (avail < 9)) {
4763 goto done; 5358 goto done;
4764 } else { 5359 } else {
4765 ctxt->instate = XML_PARSER_START_TAG; 5360 ctxt->instate = XML_PARSER_START_TAG;
4766 #ifdef DEBUG_PUSH 5361 #ifdef DEBUG_PUSH
4767 xmlGenericError(xmlGenericErrorContext, 5362 xmlGenericError(xmlGenericErrorContext,
4768 "HPP: entering START_TAG\n"); 5363 "HPP: entering START_TAG\n");
4769 #endif 5364 #endif
4770 } 5365 }
4771 break; 5366 break;
4772 case XML_PARSER_PROLOG: 5367 case XML_PARSER_PROLOG:
4773 SKIP_BLANKS; 5368 SKIP_BLANKS;
4774 if (in->buf == NULL) 5369 if (in->buf == NULL)
4775 avail = in->length - (in->cur - in->base); 5370 avail = in->length - (in->cur - in->base);
4776 else 5371 else
4777 avail = in->buf->buffer->use - (in->cur - in->base); 5372 avail = in->buf->buffer->use - (in->cur - in->base);
4778 » » if (avail < 2) 5373 » » if (avail < 2)
4779 goto done; 5374 goto done;
4780 cur = in->cur[0]; 5375 cur = in->cur[0];
4781 next = in->cur[1]; 5376 next = in->cur[1];
4782 if ((cur == '<') && (next == '!') && 5377 if ((cur == '<') && (next == '!') &&
4783 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5378 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4784 if ((!terminate) && 5379 if ((!terminate) &&
4785 » » (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0)) 5380 » » (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0) )
4786 goto done; 5381 goto done;
4787 #ifdef DEBUG_PUSH 5382 #ifdef DEBUG_PUSH
4788 xmlGenericError(xmlGenericErrorContext, 5383 xmlGenericError(xmlGenericErrorContext,
4789 "HPP: Parsing Comment\n"); 5384 "HPP: Parsing Comment\n");
4790 #endif 5385 #endif
4791 htmlParseComment(ctxt); 5386 htmlParseComment(ctxt);
4792 ctxt->instate = XML_PARSER_PROLOG; 5387 ctxt->instate = XML_PARSER_PROLOG;
4793 } else if ((cur == '<') && (next == '?')) { 5388 } else if ((cur == '<') && (next == '?')) {
4794 if ((!terminate) && 5389 if ((!terminate) &&
4795 » » (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) 5390 » » (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
4796 goto done; 5391 goto done;
4797 #ifdef DEBUG_PUSH 5392 #ifdef DEBUG_PUSH
4798 xmlGenericError(xmlGenericErrorContext, 5393 xmlGenericError(xmlGenericErrorContext,
4799 "HPP: Parsing PI\n"); 5394 "HPP: Parsing PI\n");
4800 #endif 5395 #endif
4801 htmlParsePI(ctxt); 5396 htmlParsePI(ctxt);
4802 ctxt->instate = XML_PARSER_PROLOG; 5397 ctxt->instate = XML_PARSER_PROLOG;
4803 } else if ((cur == '<') && (next == '!') && 5398 } else if ((cur == '<') && (next == '!') &&
4804 (avail < 4)) { 5399 (avail < 4)) {
4805 goto done; 5400 goto done;
(...skipping 16 matching lines...) Expand all
4822 if (IS_BLANK_CH(cur)) { 5417 if (IS_BLANK_CH(cur)) {
4823 htmlParseCharData(ctxt); 5418 htmlParseCharData(ctxt);
4824 goto done; 5419 goto done;
4825 } 5420 }
4826 if (avail < 2) 5421 if (avail < 2)
4827 goto done; 5422 goto done;
4828 next = in->cur[1]; 5423 next = in->cur[1];
4829 if ((cur == '<') && (next == '!') && 5424 if ((cur == '<') && (next == '!') &&
4830 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5425 (in->cur[2] == '-') && (in->cur[3] == '-')) {
4831 if ((!terminate) && 5426 if ((!terminate) &&
4832 » » (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0)) 5427 » » (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0) )
4833 goto done; 5428 goto done;
4834 #ifdef DEBUG_PUSH 5429 #ifdef DEBUG_PUSH
4835 xmlGenericError(xmlGenericErrorContext, 5430 xmlGenericError(xmlGenericErrorContext,
4836 "HPP: Parsing Comment\n"); 5431 "HPP: Parsing Comment\n");
4837 #endif 5432 #endif
4838 htmlParseComment(ctxt); 5433 htmlParseComment(ctxt);
4839 ctxt->instate = XML_PARSER_EPILOG; 5434 ctxt->instate = XML_PARSER_EPILOG;
4840 } else if ((cur == '<') && (next == '?')) { 5435 } else if ((cur == '<') && (next == '?')) {
4841 if ((!terminate) && 5436 if ((!terminate) &&
4842 » » (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) 5437 » » (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
4843 goto done; 5438 goto done;
4844 #ifdef DEBUG_PUSH 5439 #ifdef DEBUG_PUSH
4845 xmlGenericError(xmlGenericErrorContext, 5440 xmlGenericError(xmlGenericErrorContext,
4846 "HPP: Parsing PI\n"); 5441 "HPP: Parsing PI\n");
4847 #endif 5442 #endif
4848 htmlParsePI(ctxt); 5443 htmlParsePI(ctxt);
4849 ctxt->instate = XML_PARSER_EPILOG; 5444 ctxt->instate = XML_PARSER_EPILOG;
4850 } else if ((cur == '<') && (next == '!') && 5445 } else if ((cur == '<') && (next == '!') &&
4851 (avail < 4)) { 5446 (avail < 4)) {
4852 goto done; 5447 goto done;
(...skipping 29 matching lines...) Expand all
4882 if (in->cur[1] == '/') { 5477 if (in->cur[1] == '/') {
4883 ctxt->instate = XML_PARSER_END_TAG; 5478 ctxt->instate = XML_PARSER_END_TAG;
4884 ctxt->checkIndex = 0; 5479 ctxt->checkIndex = 0;
4885 #ifdef DEBUG_PUSH 5480 #ifdef DEBUG_PUSH
4886 xmlGenericError(xmlGenericErrorContext, 5481 xmlGenericError(xmlGenericErrorContext,
4887 "HPP: entering END_TAG\n"); 5482 "HPP: entering END_TAG\n");
4888 #endif 5483 #endif
4889 break; 5484 break;
4890 } 5485 }
4891 if ((!terminate) && 5486 if ((!terminate) &&
4892 » » (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) 5487 » » (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
4893 goto done; 5488 goto done;
4894 5489
4895 failed = htmlParseStartTag(ctxt); 5490 failed = htmlParseStartTag(ctxt);
4896 name = ctxt->name; 5491 name = ctxt->name;
4897 if ((failed == -1) || 5492 if ((failed == -1) ||
4898 (name == NULL)) { 5493 (name == NULL)) {
4899 if (CUR == '>') 5494 if (CUR == '>')
4900 NEXT; 5495 NEXT;
4901 break; 5496 break;
4902 } 5497 }
(...skipping 26 matching lines...) Expand all
4929 if (CUR == '>') { 5524 if (CUR == '>') {
4930 NEXT; 5525 NEXT;
4931 } else { 5526 } else {
4932 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 5527 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4933 "Couldn't find end of Start Tag %s\n", 5528 "Couldn't find end of Start Tag %s\n",
4934 name, NULL); 5529 name, NULL);
4935 5530
4936 /* 5531 /*
4937 * end of parsing of this node. 5532 * end of parsing of this node.
4938 */ 5533 */
4939 » » if (xmlStrEqual(name, ctxt->name)) { 5534 » » if (xmlStrEqual(name, ctxt->name)) {
4940 nodePop(ctxt); 5535 nodePop(ctxt);
4941 htmlnamePop(ctxt); 5536 htmlnamePop(ctxt);
4942 » » } 5537 » » }
4943 5538
4944 ctxt->instate = XML_PARSER_CONTENT; 5539 ctxt->instate = XML_PARSER_CONTENT;
4945 #ifdef DEBUG_PUSH 5540 #ifdef DEBUG_PUSH
4946 xmlGenericError(xmlGenericErrorContext, 5541 xmlGenericError(xmlGenericErrorContext,
4947 "HPP: entering CONTENT\n"); 5542 "HPP: entering CONTENT\n");
4948 #endif 5543 #endif
4949 break; 5544 break;
4950 } 5545 }
4951 5546
4952 /* 5547 /*
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after
5007 cons = ctxt->nbChars; 5602 cons = ctxt->nbChars;
5008 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) || 5603 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5009 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) { 5604 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5010 /* 5605 /*
5011 * Handle SCRIPT/STYLE separately 5606 * Handle SCRIPT/STYLE separately
5012 */ 5607 */
5013 if (!terminate) { 5608 if (!terminate) {
5014 int idx; 5609 int idx;
5015 xmlChar val; 5610 xmlChar val;
5016 5611
5017 » » » idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0); 5612 » » » idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
5018 if (idx < 0) 5613 if (idx < 0)
5019 goto done; 5614 goto done;
5020 val = in->cur[idx + 2]; 5615 val = in->cur[idx + 2];
5021 if (val == 0) /* bad cut of input */ 5616 if (val == 0) /* bad cut of input */
5022 goto done; 5617 goto done;
5023 } 5618 }
5024 htmlParseScript(ctxt); 5619 htmlParseScript(ctxt);
5025 if ((cur == '<') && (next == '/')) { 5620 if ((cur == '<') && (next == '/')) {
5026 ctxt->instate = XML_PARSER_END_TAG; 5621 ctxt->instate = XML_PARSER_END_TAG;
5027 ctxt->checkIndex = 0; 5622 ctxt->checkIndex = 0;
5028 #ifdef DEBUG_PUSH 5623 #ifdef DEBUG_PUSH
5029 xmlGenericError(xmlGenericErrorContext, 5624 xmlGenericError(xmlGenericErrorContext,
5030 "HPP: entering END_TAG\n"); 5625 "HPP: entering END_TAG\n");
5031 #endif 5626 #endif
5032 break; 5627 break;
5033 } 5628 }
5034 } else { 5629 } else {
5035 /* 5630 /*
5036 * Sometimes DOCTYPE arrives in the middle of the document 5631 * Sometimes DOCTYPE arrives in the middle of the document
5037 */ 5632 */
5038 if ((cur == '<') && (next == '!') && 5633 if ((cur == '<') && (next == '!') &&
5039 (UPP(2) == 'D') && (UPP(3) == 'O') && 5634 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5040 (UPP(4) == 'C') && (UPP(5) == 'T') && 5635 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5041 (UPP(6) == 'Y') && (UPP(7) == 'P') && 5636 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5042 (UPP(8) == 'E')) { 5637 (UPP(8) == 'E')) {
5043 if ((!terminate) && 5638 if ((!terminate) &&
5044 » » » (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) 5639 » » » (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0) )
5045 goto done; 5640 goto done;
5046 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 5641 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5047 "Misplaced DOCTYPE declaration\n", 5642 "Misplaced DOCTYPE declaration\n",
5048 BAD_CAST "DOCTYPE" , NULL); 5643 BAD_CAST "DOCTYPE" , NULL);
5049 htmlParseDocTypeDecl(ctxt); 5644 htmlParseDocTypeDecl(ctxt);
5050 } else if ((cur == '<') && (next == '!') && 5645 } else if ((cur == '<') && (next == '!') &&
5051 (in->cur[2] == '-') && (in->cur[3] == '-')) { 5646 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5052 if ((!terminate) && 5647 if ((!terminate) &&
5053 (htmlParseLookupSequence( 5648 (htmlParseLookupSequence(
5054 » » » » » ctxt, '-', '-', '>', 1) < 0)) 5649 » » » » ctxt, '-', '-', '>', 1, 1) < 0))
5055 goto done; 5650 goto done;
5056 #ifdef DEBUG_PUSH 5651 #ifdef DEBUG_PUSH
5057 xmlGenericError(xmlGenericErrorContext, 5652 xmlGenericError(xmlGenericErrorContext,
5058 "HPP: Parsing Comment\n"); 5653 "HPP: Parsing Comment\n");
5059 #endif 5654 #endif
5060 htmlParseComment(ctxt); 5655 htmlParseComment(ctxt);
5061 ctxt->instate = XML_PARSER_CONTENT; 5656 ctxt->instate = XML_PARSER_CONTENT;
5062 } else if ((cur == '<') && (next == '?')) { 5657 } else if ((cur == '<') && (next == '?')) {
5063 if ((!terminate) && 5658 if ((!terminate) &&
5064 » » » (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) 5659 » » » (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0) )
5065 goto done; 5660 goto done;
5066 #ifdef DEBUG_PUSH 5661 #ifdef DEBUG_PUSH
5067 xmlGenericError(xmlGenericErrorContext, 5662 xmlGenericError(xmlGenericErrorContext,
5068 "HPP: Parsing PI\n"); 5663 "HPP: Parsing PI\n");
5069 #endif 5664 #endif
5070 htmlParsePI(ctxt); 5665 htmlParsePI(ctxt);
5071 ctxt->instate = XML_PARSER_CONTENT; 5666 ctxt->instate = XML_PARSER_CONTENT;
5072 } else if ((cur == '<') && (next == '!') && (avail < 4)) { 5667 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5073 goto done; 5668 goto done;
5074 } else if ((cur == '<') && (next == '/')) { 5669 } else if ((cur == '<') && (next == '/')) {
5075 ctxt->instate = XML_PARSER_END_TAG; 5670 ctxt->instate = XML_PARSER_END_TAG;
5076 ctxt->checkIndex = 0; 5671 ctxt->checkIndex = 0;
5077 #ifdef DEBUG_PUSH 5672 #ifdef DEBUG_PUSH
5078 xmlGenericError(xmlGenericErrorContext, 5673 xmlGenericError(xmlGenericErrorContext,
5079 "HPP: entering END_TAG\n"); 5674 "HPP: entering END_TAG\n");
5080 #endif 5675 #endif
5081 break; 5676 break;
5082 } else if (cur == '<') { 5677 } else if (cur == '<') {
5083 ctxt->instate = XML_PARSER_START_TAG; 5678 ctxt->instate = XML_PARSER_START_TAG;
5084 ctxt->checkIndex = 0; 5679 ctxt->checkIndex = 0;
5085 #ifdef DEBUG_PUSH 5680 #ifdef DEBUG_PUSH
5086 xmlGenericError(xmlGenericErrorContext, 5681 xmlGenericError(xmlGenericErrorContext,
5087 "HPP: entering START_TAG\n"); 5682 "HPP: entering START_TAG\n");
5088 #endif 5683 #endif
5089 break; 5684 break;
5090 } else if (cur == '&') { 5685 } else if (cur == '&') {
5091 if ((!terminate) && 5686 if ((!terminate) &&
5092 » » » (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0)) 5687 » » » (htmlParseLookupChars(ctxt,
5688 BAD_CAST "; >/", 4) < 0))
5093 goto done; 5689 goto done;
5094 #ifdef DEBUG_PUSH 5690 #ifdef DEBUG_PUSH
5095 xmlGenericError(xmlGenericErrorContext, 5691 xmlGenericError(xmlGenericErrorContext,
5096 "HPP: Parsing Reference\n"); 5692 "HPP: Parsing Reference\n");
5097 #endif 5693 #endif
5098 /* TODO: check generation of subtrees if noent !!! */ 5694 /* TODO: check generation of subtrees if noent !!! */
5099 htmlParseReference(ctxt); 5695 htmlParseReference(ctxt);
5100 } else { 5696 } else {
5101 /* 5697 /*
5102 * check that the text sequence is complete 5698 * check that the text sequence is complete
5103 * before handing out the data to the parser 5699 * before handing out the data to the parser
5104 * to avoid problems with erroneous end of 5700 * to avoid problems with erroneous end of
5105 * data detection. 5701 * data detection.
5106 */ 5702 */
5107 if ((!terminate) && 5703 if ((!terminate) &&
5108 » » » (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0)) 5704 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
5109 goto done; 5705 goto done;
5110 ctxt->checkIndex = 0; 5706 ctxt->checkIndex = 0;
5111 #ifdef DEBUG_PUSH 5707 #ifdef DEBUG_PUSH
5112 xmlGenericError(xmlGenericErrorContext, 5708 xmlGenericError(xmlGenericErrorContext,
5113 "HPP: Parsing char data\n"); 5709 "HPP: Parsing char data\n");
5114 #endif 5710 #endif
5115 htmlParseCharData(ctxt); 5711 htmlParseCharData(ctxt);
5116 } 5712 }
5117 } 5713 }
5118 if (cons == ctxt->nbChars) { 5714 if (cons == ctxt->nbChars) {
5119 if (ctxt->node != NULL) { 5715 if (ctxt->node != NULL) {
5120 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5716 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5121 "detected an error in element content\n", 5717 "detected an error in element content\n",
5122 NULL, NULL); 5718 NULL, NULL);
5123 } 5719 }
5124 NEXT; 5720 NEXT;
5125 break; 5721 break;
5126 } 5722 }
5127 5723
5128 break; 5724 break;
5129 } 5725 }
5130 case XML_PARSER_END_TAG: 5726 case XML_PARSER_END_TAG:
5131 if (avail < 2) 5727 if (avail < 2)
5132 goto done; 5728 goto done;
5133 if ((!terminate) && 5729 if ((!terminate) &&
5134 » » (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) 5730 » » (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5135 goto done; 5731 goto done;
5136 htmlParseEndTag(ctxt); 5732 htmlParseEndTag(ctxt);
5137 if (ctxt->nameNr == 0) { 5733 if (ctxt->nameNr == 0) {
5138 ctxt->instate = XML_PARSER_EPILOG; 5734 ctxt->instate = XML_PARSER_EPILOG;
5139 } else { 5735 } else {
5140 ctxt->instate = XML_PARSER_CONTENT; 5736 ctxt->instate = XML_PARSER_CONTENT;
5141 } 5737 }
5142 ctxt->checkIndex = 0; 5738 ctxt->checkIndex = 0;
5143 #ifdef DEBUG_PUSH 5739 #ifdef DEBUG_PUSH
5144 xmlGenericError(xmlGenericErrorContext, 5740 xmlGenericError(xmlGenericErrorContext,
(...skipping 106 matching lines...) Expand 10 before | Expand all | Expand 10 after
5251 ctxt->instate = XML_PARSER_CONTENT; 5847 ctxt->instate = XML_PARSER_CONTENT;
5252 ctxt->checkIndex = 0; 5848 ctxt->checkIndex = 0;
5253 #ifdef DEBUG_PUSH 5849 #ifdef DEBUG_PUSH
5254 xmlGenericError(xmlGenericErrorContext, 5850 xmlGenericError(xmlGenericErrorContext,
5255 "HPP: entering CONTENT\n"); 5851 "HPP: entering CONTENT\n");
5256 #endif 5852 #endif
5257 break; 5853 break;
5258 5854
5259 } 5855 }
5260 } 5856 }
5261 done: 5857 done:
5262 if ((avail == 0) && (terminate)) { 5858 if ((avail == 0) && (terminate)) {
5263 htmlAutoCloseOnEnd(ctxt); 5859 htmlAutoCloseOnEnd(ctxt);
5264 » if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 5860 » if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5265 /* 5861 /*
5266 * SAX: end of the document processing. 5862 * SAX: end of the document processing.
5267 */ 5863 */
5268 ctxt->instate = XML_PARSER_EOF; 5864 ctxt->instate = XML_PARSER_EOF;
5269 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5865 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5270 ctxt->sax->endDocument(ctxt->userData); 5866 ctxt->sax->endDocument(ctxt->userData);
5271 } 5867 }
5272 } 5868 }
5273 if ((ctxt->myDoc != NULL) && 5869 if ((ctxt->myDoc != NULL) &&
5274 ((terminate) || (ctxt->instate == XML_PARSER_EOF) || 5870 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5275 (ctxt->instate == XML_PARSER_EPILOG))) { 5871 (ctxt->instate == XML_PARSER_EPILOG))) {
5276 xmlDtdPtr dtd; 5872 xmlDtdPtr dtd;
5277 dtd = xmlGetIntSubset(ctxt->myDoc); 5873 dtd = xmlGetIntSubset(ctxt->myDoc);
5278 if (dtd == NULL) 5874 if (dtd == NULL)
5279 » ctxt->myDoc->intSubset = 5875 » ctxt->myDoc->intSubset =
5280 » » xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 5876 » » xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5281 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 5877 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5282 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 5878 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5283 } 5879 }
5284 #ifdef DEBUG_PUSH 5880 #ifdef DEBUG_PUSH
5285 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret); 5881 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5286 #endif 5882 #endif
5287 return(ret); 5883 return(ret);
5288 } 5884 }
5289 5885
5290 /** 5886 /**
(...skipping 13 matching lines...) Expand all
5304 if ((ctxt == NULL) || (ctxt->input == NULL)) { 5900 if ((ctxt == NULL) || (ctxt->input == NULL)) {
5305 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5901 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5306 "htmlParseChunk: context error\n", NULL, NULL); 5902 "htmlParseChunk: context error\n", NULL, NULL);
5307 return(XML_ERR_INTERNAL_ERROR); 5903 return(XML_ERR_INTERNAL_ERROR);
5308 } 5904 }
5309 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 5905 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5310 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) { 5906 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5311 int base = ctxt->input->base - ctxt->input->buf->buffer->content; 5907 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5312 int cur = ctxt->input->cur - ctxt->input->base; 5908 int cur = ctxt->input->cur - ctxt->input->base;
5313 int res; 5909 int res;
5314 » 5910
5315 » res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);» 5911 » res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5316 if (res < 0) { 5912 if (res < 0) {
5317 ctxt->errNo = XML_PARSER_EOF; 5913 ctxt->errNo = XML_PARSER_EOF;
5318 ctxt->disableSAX = 1; 5914 ctxt->disableSAX = 1;
5319 return (XML_PARSER_EOF); 5915 return (XML_PARSER_EOF);
5320 } 5916 }
5321 ctxt->input->base = ctxt->input->buf->buffer->content + base; 5917 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5322 ctxt->input->cur = ctxt->input->base + cur; 5918 ctxt->input->cur = ctxt->input->base + cur;
5323 ctxt->input->end = 5919 ctxt->input->end =
5324 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; 5920 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5325 #ifdef DEBUG_PUSH 5921 #ifdef DEBUG_PUSH
5326 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 5922 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5327 #endif 5923 #endif
5328 5924
5329 #if 0 5925 #if 0
5330 if ((terminate) || (ctxt->input->buf->buffer->use > 80)) 5926 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5331 htmlParseTryOrFinish(ctxt, terminate); 5927 htmlParseTryOrFinish(ctxt, terminate);
5332 #endif 5928 #endif
5333 } else if (ctxt->instate != XML_PARSER_EOF) { 5929 } else if (ctxt->instate != XML_PARSER_EOF) {
5334 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) { 5930 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5335 xmlParserInputBufferPtr in = ctxt->input->buf; 5931 xmlParserInputBufferPtr in = ctxt->input->buf;
5336 if ((in->encoder != NULL) && (in->buffer != NULL) && 5932 if ((in->encoder != NULL) && (in->buffer != NULL) &&
5337 (in->raw != NULL)) { 5933 (in->raw != NULL)) {
5338 int nbchars; 5934 int nbchars;
5339 » » 5935
5340 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw); 5936 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5341 if (nbchars < 0) { 5937 if (nbchars < 0) {
5342 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 5938 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5343 "encoder error\n", NULL, NULL); 5939 "encoder error\n", NULL, NULL);
5344 return(XML_ERR_INVALID_ENCODING); 5940 return(XML_ERR_INVALID_ENCODING);
5345 } 5941 }
5346 } 5942 }
5347 } 5943 }
5348 } 5944 }
5349 htmlParseTryOrFinish(ctxt, terminate); 5945 htmlParseTryOrFinish(ctxt, terminate);
5350 if (terminate) { 5946 if (terminate) {
5351 if ((ctxt->instate != XML_PARSER_EOF) && 5947 if ((ctxt->instate != XML_PARSER_EOF) &&
5352 (ctxt->instate != XML_PARSER_EPILOG) && 5948 (ctxt->instate != XML_PARSER_EPILOG) &&
5353 (ctxt->instate != XML_PARSER_MISC)) { 5949 (ctxt->instate != XML_PARSER_MISC)) {
5354 ctxt->errNo = XML_ERR_DOCUMENT_END; 5950 ctxt->errNo = XML_ERR_DOCUMENT_END;
5355 ctxt->wellFormed = 0; 5951 ctxt->wellFormed = 0;
5356 » } 5952 » }
5357 if (ctxt->instate != XML_PARSER_EOF) { 5953 if (ctxt->instate != XML_PARSER_EOF) {
5358 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5954 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5359 ctxt->sax->endDocument(ctxt->userData); 5955 ctxt->sax->endDocument(ctxt->userData);
5360 } 5956 }
5361 ctxt->instate = XML_PARSER_EOF; 5957 ctxt->instate = XML_PARSER_EOF;
5362 } 5958 }
5363 return((xmlParserErrors) ctxt->errNo);» 5959 return((xmlParserErrors) ctxt->errNo);
5364 } 5960 }
5365 5961
5366 /************************************************************************ 5962 /************************************************************************
5367 * * 5963 * *
5368 * User entry points * 5964 * User entry points *
5369 * * 5965 * *
5370 ************************************************************************/ 5966 ************************************************************************/
5371 5967
5372 /** 5968 /**
5373 * htmlCreatePushParserCtxt: 5969 * htmlCreatePushParserCtxt:
5374 * @sax: a SAX handler 5970 * @sax: a SAX handler
5375 * @user_data: The user data returned on SAX callbacks 5971 * @user_data: The user data returned on SAX callbacks
5376 * @chunk: a pointer to an array of chars 5972 * @chunk: a pointer to an array of chars
5377 * @size: number of chars in the array 5973 * @size: number of chars in the array
5378 * @filename: an optional file name or URI 5974 * @filename: an optional file name or URI
5379 * @enc: an optional encoding 5975 * @enc: an optional encoding
5380 * 5976 *
5381 * Create a parser context for using the HTML parser in push mode 5977 * Create a parser context for using the HTML parser in push mode
5382 * The value of @filename is used for fetching external entities 5978 * The value of @filename is used for fetching external entities
5383 * and error/warning reports. 5979 * and error/warning reports.
5384 * 5980 *
5385 * Returns the new parser context or NULL 5981 * Returns the new parser context or NULL
5386 */ 5982 */
5387 htmlParserCtxtPtr 5983 htmlParserCtxtPtr
5388 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, 5984 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5389 const char *chunk, int size, const char *filename, 5985 const char *chunk, int size, const char *filename,
5390 xmlCharEncoding enc) { 5986 xmlCharEncoding enc) {
5391 htmlParserCtxtPtr ctxt; 5987 htmlParserCtxtPtr ctxt;
5392 htmlParserInputPtr inputStream; 5988 htmlParserInputPtr inputStream;
5393 xmlParserInputBufferPtr buf; 5989 xmlParserInputBufferPtr buf;
5394 5990
5395 xmlInitParser(); 5991 xmlInitParser();
5396 5992
5397 buf = xmlAllocParserInputBuffer(enc); 5993 buf = xmlAllocParserInputBuffer(enc);
5398 if (buf == NULL) return(NULL); 5994 if (buf == NULL) return(NULL);
(...skipping 10 matching lines...) Expand all
5409 xmlFree(ctxt->sax); 6005 xmlFree(ctxt->sax);
5410 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler)); 6006 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5411 if (ctxt->sax == NULL) { 6007 if (ctxt->sax == NULL) {
5412 xmlFree(buf); 6008 xmlFree(buf);
5413 xmlFree(ctxt); 6009 xmlFree(ctxt);
5414 return(NULL); 6010 return(NULL);
5415 } 6011 }
5416 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler)); 6012 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5417 if (user_data != NULL) 6013 if (user_data != NULL)
5418 ctxt->userData = user_data; 6014 ctxt->userData = user_data;
5419 }» 6015 }
5420 if (filename == NULL) { 6016 if (filename == NULL) {
5421 ctxt->directory = NULL; 6017 ctxt->directory = NULL;
5422 } else { 6018 } else {
5423 ctxt->directory = xmlParserGetDirectory(filename); 6019 ctxt->directory = xmlParserGetDirectory(filename);
5424 } 6020 }
5425 6021
5426 inputStream = htmlNewInputStream(ctxt); 6022 inputStream = htmlNewInputStream(ctxt);
5427 if (inputStream == NULL) { 6023 if (inputStream == NULL) {
5428 xmlFreeParserCtxt(ctxt); 6024 xmlFreeParserCtxt(ctxt);
5429 xmlFree(buf); 6025 xmlFree(buf);
5430 return(NULL); 6026 return(NULL);
5431 } 6027 }
5432 6028
5433 if (filename == NULL) 6029 if (filename == NULL)
5434 inputStream->filename = NULL; 6030 inputStream->filename = NULL;
5435 else 6031 else
5436 inputStream->filename = (char *) 6032 inputStream->filename = (char *)
5437 xmlCanonicPath((const xmlChar *) filename); 6033 xmlCanonicPath((const xmlChar *) filename);
5438 inputStream->buf = buf; 6034 inputStream->buf = buf;
5439 inputStream->base = inputStream->buf->buffer->content; 6035 inputStream->base = inputStream->buf->buffer->content;
5440 inputStream->cur = inputStream->buf->buffer->content; 6036 inputStream->cur = inputStream->buf->buffer->content;
5441 inputStream->end = 6037 inputStream->end =
5442 &inputStream->buf->buffer->content[inputStream->buf->buffer->use]; 6038 &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
5443 6039
5444 inputPush(ctxt, inputStream); 6040 inputPush(ctxt, inputStream);
5445 6041
5446 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 6042 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5447 (ctxt->input->buf != NULL)) {» 6043 (ctxt->input->buf != NULL)) {
5448 int base = ctxt->input->base - ctxt->input->buf->buffer->content; 6044 int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5449 int cur = ctxt->input->cur - ctxt->input->base; 6045 int cur = ctxt->input->cur - ctxt->input->base;
5450 6046
5451 » xmlParserInputBufferPush(ctxt->input->buf, size, chunk);» 6047 » xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5452 6048
5453 ctxt->input->base = ctxt->input->buf->buffer->content + base; 6049 ctxt->input->base = ctxt->input->buf->buffer->content + base;
5454 ctxt->input->cur = ctxt->input->base + cur; 6050 ctxt->input->cur = ctxt->input->base + cur;
5455 ctxt->input->end = 6051 ctxt->input->end =
5456 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; 6052 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5457 #ifdef DEBUG_PUSH 6053 #ifdef DEBUG_PUSH
5458 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 6054 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5459 #endif 6055 #endif
5460 } 6056 }
5461 ctxt->progressive = 1; 6057 ctxt->progressive = 1;
5462 6058
5463 return(ctxt); 6059 return(ctxt);
5464 } 6060 }
5465 #endif /* LIBXML_PUSH_ENABLED */ 6061 #endif /* LIBXML_PUSH_ENABLED */
5466 6062
5467 /** 6063 /**
5468 * htmlSAXParseDoc: 6064 * htmlSAXParseDoc:
5469 * @cur: a pointer to an array of xmlChar 6065 * @cur: a pointer to an array of xmlChar
5470 * @encoding: a free form C string describing the HTML document encoding, or NULL 6066 * @encoding: a free form C string describing the HTML document encoding, or NULL
5471 * @sax: the SAX handler block 6067 * @sax: the SAX handler block
5472 * @userData: if using SAX, this pointer will be provided on callbacks. 6068 * @userData: if using SAX, this pointer will be provided on callbacks.
5473 * 6069 *
5474 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks 6070 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5475 * to handle parse events. If sax is NULL, fallback to the default DOM 6071 * to handle parse events. If sax is NULL, fallback to the default DOM
5476 * behavior and return a tree. 6072 * behavior and return a tree.
5477 * 6073 *
5478 * Returns the resulting document tree unless SAX is NULL or the document is 6074 * Returns the resulting document tree unless SAX is NULL or the document is
5479 * not well formed. 6075 * not well formed.
5480 */ 6076 */
5481 6077
5482 htmlDocPtr 6078 htmlDocPtr
5483 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) { 6079 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5484 htmlDocPtr ret; 6080 htmlDocPtr ret;
5485 htmlParserCtxtPtr ctxt; 6081 htmlParserCtxtPtr ctxt;
5486 6082
5487 xmlInitParser(); 6083 xmlInitParser();
5488 6084
5489 if (cur == NULL) return(NULL); 6085 if (cur == NULL) return(NULL);
5490 6086
5491 6087
5492 ctxt = htmlCreateDocParserCtxt(cur, encoding); 6088 ctxt = htmlCreateDocParserCtxt(cur, encoding);
5493 if (ctxt == NULL) return(NULL); 6089 if (ctxt == NULL) return(NULL);
5494 if (sax != NULL) { 6090 if (sax != NULL) {
5495 if (ctxt->sax != NULL) xmlFree (ctxt->sax); 6091 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
5496 ctxt->sax = sax; 6092 ctxt->sax = sax;
5497 ctxt->userData = userData; 6093 ctxt->userData = userData;
5498 } 6094 }
5499 6095
5500 htmlParseDocument(ctxt); 6096 htmlParseDocument(ctxt);
5501 ret = ctxt->myDoc; 6097 ret = ctxt->myDoc;
5502 if (sax != NULL) { 6098 if (sax != NULL) {
5503 ctxt->sax = NULL; 6099 ctxt->sax = NULL;
5504 ctxt->userData = NULL; 6100 ctxt->userData = NULL;
5505 } 6101 }
5506 htmlFreeParserCtxt(ctxt); 6102 htmlFreeParserCtxt(ctxt);
5507 6103
5508 return(ret); 6104 return(ret);
5509 } 6105 }
5510 6106
5511 /** 6107 /**
5512 * htmlParseDoc: 6108 * htmlParseDoc:
5513 * @cur: a pointer to an array of xmlChar 6109 * @cur: a pointer to an array of xmlChar
5514 * @encoding: a free form C string describing the HTML document encoding, or NULL 6110 * @encoding: a free form C string describing the HTML document encoding, or NULL
5515 * 6111 *
5516 * parse an HTML in-memory document and build a tree. 6112 * parse an HTML in-memory document and build a tree.
5517 * 6113 *
5518 * Returns the resulting document tree 6114 * Returns the resulting document tree
5519 */ 6115 */
5520 6116
5521 htmlDocPtr 6117 htmlDocPtr
5522 htmlParseDoc(xmlChar *cur, const char *encoding) { 6118 htmlParseDoc(xmlChar *cur, const char *encoding) {
5523 return(htmlSAXParseDoc(cur, encoding, NULL, NULL)); 6119 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5524 } 6120 }
5525 6121
5526 6122
5527 /** 6123 /**
5528 * htmlCreateFileParserCtxt: 6124 * htmlCreateFileParserCtxt:
5529 * @filename: the filename 6125 * @filename: the filename
5530 * @encoding: a free form C string describing the HTML document encoding, or NULL 6126 * @encoding: a free form C string describing the HTML document encoding, or NULL
5531 * 6127 *
5532 * Create a parser context for a file content. 6128 * Create a parser context for a file content.
5533 * Automatic support for ZLIB/Compress compressed document is provided 6129 * Automatic support for ZLIB/Compress compressed document is provided
5534 * by default if found at compile-time. 6130 * by default if found at compile-time.
5535 * 6131 *
5536 * Returns the new parser context or NULL 6132 * Returns the new parser context or NULL
5537 */ 6133 */
5538 htmlParserCtxtPtr 6134 htmlParserCtxtPtr
5539 htmlCreateFileParserCtxt(const char *filename, const char *encoding) 6135 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5540 { 6136 {
5541 htmlParserCtxtPtr ctxt; 6137 htmlParserCtxtPtr ctxt;
5542 htmlParserInputPtr inputStream; 6138 htmlParserInputPtr inputStream;
(...skipping 11 matching lines...) Expand all
5554 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename); 6150 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5555 if (canonicFilename == NULL) { 6151 if (canonicFilename == NULL) {
5556 #ifdef LIBXML_SAX1_ENABLED 6152 #ifdef LIBXML_SAX1_ENABLED
5557 if (xmlDefaultSAXHandler.error != NULL) { 6153 if (xmlDefaultSAXHandler.error != NULL) {
5558 xmlDefaultSAXHandler.error(NULL, "out of memory\n"); 6154 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5559 } 6155 }
5560 #endif 6156 #endif
5561 xmlFreeParserCtxt(ctxt); 6157 xmlFreeParserCtxt(ctxt);
5562 return(NULL); 6158 return(NULL);
5563 } 6159 }
5564 6160
5565 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt); 6161 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5566 xmlFree(canonicFilename); 6162 xmlFree(canonicFilename);
5567 if (inputStream == NULL) { 6163 if (inputStream == NULL) {
5568 xmlFreeParserCtxt(ctxt); 6164 xmlFreeParserCtxt(ctxt);
5569 return(NULL); 6165 return(NULL);
5570 } 6166 }
5571 6167
5572 inputPush(ctxt, inputStream); 6168 inputPush(ctxt, inputStream);
5573 6169
5574 /* set encoding */ 6170 /* set encoding */
5575 if (encoding) { 6171 if (encoding) {
5576 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1); 6172 content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
5577 » if (content) { 6173 » if (content) {
5578 strcpy ((char *)content, (char *)content_line); 6174 strcpy ((char *)content, (char *)content_line);
5579 strcat ((char *)content, (char *)encoding); 6175 strcat ((char *)content, (char *)encoding);
5580 htmlCheckEncoding (ctxt, content); 6176 htmlCheckEncoding (ctxt, content);
5581 xmlFree (content); 6177 xmlFree (content);
5582 } 6178 }
5583 } 6179 }
5584 6180
5585 return(ctxt); 6181 return(ctxt);
5586 } 6182 }
5587 6183
5588 /** 6184 /**
5589 * htmlSAXParseFile: 6185 * htmlSAXParseFile:
5590 * @filename: the filename 6186 * @filename: the filename
5591 * @encoding: a free form C string describing the HTML document encoding, or NULL 6187 * @encoding: a free form C string describing the HTML document encoding, or NULL
5592 * @sax: the SAX handler block 6188 * @sax: the SAX handler block
5593 * @userData: if using SAX, this pointer will be provided on callbacks. 6189 * @userData: if using SAX, this pointer will be provided on callbacks.
5594 * 6190 *
5595 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 6191 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5596 * compressed document is provided by default if found at compile-time. 6192 * compressed document is provided by default if found at compile-time.
5597 * It use the given SAX function block to handle the parsing callback. 6193 * It use the given SAX function block to handle the parsing callback.
5598 * If sax is NULL, fallback to the default DOM tree building routines. 6194 * If sax is NULL, fallback to the default DOM tree building routines.
5599 * 6195 *
5600 * Returns the resulting document tree unless SAX is NULL or the document is 6196 * Returns the resulting document tree unless SAX is NULL or the document is
5601 * not well formed. 6197 * not well formed.
5602 */ 6198 */
5603 6199
5604 htmlDocPtr 6200 htmlDocPtr
5605 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax, 6201 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5606 void *userData) { 6202 void *userData) {
5607 htmlDocPtr ret; 6203 htmlDocPtr ret;
5608 htmlParserCtxtPtr ctxt; 6204 htmlParserCtxtPtr ctxt;
5609 htmlSAXHandlerPtr oldsax = NULL; 6205 htmlSAXHandlerPtr oldsax = NULL;
5610 6206
5611 xmlInitParser(); 6207 xmlInitParser();
5612 6208
5613 ctxt = htmlCreateFileParserCtxt(filename, encoding); 6209 ctxt = htmlCreateFileParserCtxt(filename, encoding);
5614 if (ctxt == NULL) return(NULL); 6210 if (ctxt == NULL) return(NULL);
5615 if (sax != NULL) { 6211 if (sax != NULL) {
5616 oldsax = ctxt->sax; 6212 oldsax = ctxt->sax;
5617 ctxt->sax = sax; 6213 ctxt->sax = sax;
5618 ctxt->userData = userData; 6214 ctxt->userData = userData;
5619 } 6215 }
5620 6216
5621 htmlParseDocument(ctxt); 6217 htmlParseDocument(ctxt);
5622 6218
5623 ret = ctxt->myDoc; 6219 ret = ctxt->myDoc;
5624 if (sax != NULL) { 6220 if (sax != NULL) {
5625 ctxt->sax = oldsax; 6221 ctxt->sax = oldsax;
5626 ctxt->userData = NULL; 6222 ctxt->userData = NULL;
5627 } 6223 }
5628 htmlFreeParserCtxt(ctxt); 6224 htmlFreeParserCtxt(ctxt);
5629 6225
5630 return(ret); 6226 return(ret);
5631 } 6227 }
5632 6228
5633 /** 6229 /**
5634 * htmlParseFile: 6230 * htmlParseFile:
5635 * @filename: the filename 6231 * @filename: the filename
5636 * @encoding: a free form C string describing the HTML document encoding, or NULL 6232 * @encoding: a free form C string describing the HTML document encoding, or NULL
5637 * 6233 *
5638 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 6234 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5639 * compressed document is provided by default if found at compile-time. 6235 * compressed document is provided by default if found at compile-time.
5640 * 6236 *
5641 * Returns the resulting document tree 6237 * Returns the resulting document tree
5642 */ 6238 */
5643 6239
5644 htmlDocPtr 6240 htmlDocPtr
5645 htmlParseFile(const char *filename, const char *encoding) { 6241 htmlParseFile(const char *filename, const char *encoding) {
5646 return(htmlSAXParseFile(filename, encoding, NULL, NULL)); 6242 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5647 } 6243 }
5648 6244
5649 /** 6245 /**
5650 * htmlHandleOmittedElem: 6246 * htmlHandleOmittedElem:
5651 * @val: int 0 or 1 6247 * @val: int 0 or 1
5652 * 6248 *
5653 * Set and return the previous value for handling HTML omitted tags. 6249 * Set and return the previous value for handling HTML omitted tags.
5654 * 6250 *
5655 * Returns the last value for 0 for no handling, 1 for auto insertion. 6251 * Returns the last value for 0 for no handling, 1 for auto insertion.
5656 */ 6252 */
5657 6253
5658 int 6254 int
5659 htmlHandleOmittedElem(int val) { 6255 htmlHandleOmittedElem(int val) {
5660 int old = htmlOmittedDefaultValue; 6256 int old = htmlOmittedDefaultValue;
5661 6257
(...skipping 119 matching lines...) Expand 10 before | Expand all | Expand 10 after
5781 * * 6377 * *
5782 ************************************************************************/ 6378 ************************************************************************/
5783 /** 6379 /**
5784 * DICT_FREE: 6380 * DICT_FREE:
5785 * @str: a string 6381 * @str: a string
5786 * 6382 *
5787 * Free a string if it is not owned by the "dict" dictionnary in the 6383 * Free a string if it is not owned by the "dict" dictionnary in the
5788 * current scope 6384 * current scope
5789 */ 6385 */
5790 #define DICT_FREE(str) \ 6386 #define DICT_FREE(str) \
5791 » if ((str) && ((!dict) || » » » » \ 6387 » if ((str) && ((!dict) ||» » » » \
5792 (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \ 6388 (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \
5793 xmlFree((char *)(str)); 6389 xmlFree((char *)(str));
5794 6390
5795 /** 6391 /**
5796 * htmlCtxtReset: 6392 * htmlCtxtReset:
5797 * @ctxt: an HTML parser context 6393 * @ctxt: an HTML parser context
5798 * 6394 *
5799 * Reset a parser context 6395 * Reset a parser context
5800 */ 6396 */
5801 void 6397 void
5802 htmlCtxtReset(htmlParserCtxtPtr ctxt) 6398 htmlCtxtReset(htmlParserCtxtPtr ctxt)
5803 { 6399 {
5804 xmlParserInputPtr input; 6400 xmlParserInputPtr input;
5805 xmlDictPtr dict; 6401 xmlDictPtr dict;
5806 6402
5807 if (ctxt == NULL) 6403 if (ctxt == NULL)
5808 return; 6404 return;
5809 6405
5810 xmlInitParser(); 6406 xmlInitParser();
5811 dict = ctxt->dict; 6407 dict = ctxt->dict;
5812 6408
5813 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */ 6409 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5814 xmlFreeInputStream(input); 6410 xmlFreeInputStream(input);
5815 } 6411 }
5816 ctxt->inputNr = 0; 6412 ctxt->inputNr = 0;
(...skipping 106 matching lines...) Expand 10 before | Expand all | Expand 10 after
5923 ctxt->keepBlanks = 1; 6519 ctxt->keepBlanks = 1;
5924 if (options & HTML_PARSE_RECOVER) { 6520 if (options & HTML_PARSE_RECOVER) {
5925 ctxt->recovery = 1; 6521 ctxt->recovery = 1;
5926 options -= HTML_PARSE_RECOVER; 6522 options -= HTML_PARSE_RECOVER;
5927 } else 6523 } else
5928 ctxt->recovery = 0; 6524 ctxt->recovery = 0;
5929 if (options & HTML_PARSE_COMPACT) { 6525 if (options & HTML_PARSE_COMPACT) {
5930 ctxt->options |= HTML_PARSE_COMPACT; 6526 ctxt->options |= HTML_PARSE_COMPACT;
5931 options -= HTML_PARSE_COMPACT; 6527 options -= HTML_PARSE_COMPACT;
5932 } 6528 }
6529 if (options & XML_PARSE_HUGE) {
6530 ctxt->options |= XML_PARSE_HUGE;
6531 options -= XML_PARSE_HUGE;
6532 }
5933 ctxt->dictNames = 0; 6533 ctxt->dictNames = 0;
5934 return (options); 6534 return (options);
5935 } 6535 }
5936 6536
5937 /** 6537 /**
5938 * htmlDoRead: 6538 * htmlDoRead:
5939 * @ctxt: an HTML parser context 6539 * @ctxt: an HTML parser context
5940 * @URL: the base URL to use for the document 6540 * @URL: the base URL to use for the document
5941 * @encoding: the document encoding, or NULL 6541 * @encoding: the document encoding, or NULL
5942 * @options: a combination of htmlParserOption(s) 6542 * @options: a combination of htmlParserOption(s)
5943 * @reuse: keep the context for reuse 6543 * @reuse: keep the context for reuse
5944 * 6544 *
5945 * Common front-end for the htmlRead functions 6545 * Common front-end for the htmlRead functions
5946 * 6546 *
5947 * Returns the resulting document tree or NULL 6547 * Returns the resulting document tree or NULL
5948 */ 6548 */
5949 static htmlDocPtr 6549 static htmlDocPtr
5950 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding, 6550 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5951 int options, int reuse) 6551 int options, int reuse)
5952 { 6552 {
5953 htmlDocPtr ret; 6553 htmlDocPtr ret;
5954 6554
5955 htmlCtxtUseOptions(ctxt, options); 6555 htmlCtxtUseOptions(ctxt, options);
5956 ctxt->html = 1; 6556 ctxt->html = 1;
5957 if (encoding != NULL) { 6557 if (encoding != NULL) {
5958 xmlCharEncodingHandlerPtr hdlr; 6558 xmlCharEncodingHandlerPtr hdlr;
5959 6559
5960 hdlr = xmlFindCharEncodingHandler(encoding); 6560 hdlr = xmlFindCharEncodingHandler(encoding);
5961 » if (hdlr != NULL) 6561 » if (hdlr != NULL) {
5962 xmlSwitchToEncoding(ctxt, hdlr); 6562 xmlSwitchToEncoding(ctxt, hdlr);
6563 if (ctxt->input->encoding != NULL)
6564 xmlFree((xmlChar *) ctxt->input->encoding);
6565 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6566 }
5963 } 6567 }
5964 if ((URL != NULL) && (ctxt->input != NULL) && 6568 if ((URL != NULL) && (ctxt->input != NULL) &&
5965 (ctxt->input->filename == NULL)) 6569 (ctxt->input->filename == NULL))
5966 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL); 6570 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5967 htmlParseDocument(ctxt); 6571 htmlParseDocument(ctxt);
5968 ret = ctxt->myDoc; 6572 ret = ctxt->myDoc;
5969 ctxt->myDoc = NULL; 6573 ctxt->myDoc = NULL;
5970 if (!reuse) { 6574 if (!reuse) {
5971 if ((ctxt->dictNames) && 6575 if ((ctxt->dictNames) &&
5972 (ret != NULL) && 6576 (ret != NULL) &&
5973 (ret->dict == ctxt->dict)) 6577 (ret->dict == ctxt->dict))
5974 ctxt->dict = NULL; 6578 ctxt->dict = NULL;
5975 xmlFreeParserCtxt(ctxt); 6579 xmlFreeParserCtxt(ctxt);
5976 } 6580 }
5977 return (ret); 6581 return (ret);
5978 } 6582 }
5979 6583
5980 /** 6584 /**
5981 * htmlReadDoc: 6585 * htmlReadDoc:
5982 * @cur: a pointer to a zero terminated string 6586 * @cur: a pointer to a zero terminated string
5983 * @URL: the base URL to use for the document 6587 * @URL: the base URL to use for the document
5984 * @encoding: the document encoding, or NULL 6588 * @encoding: the document encoding, or NULL
5985 * @options: a combination of htmlParserOption(s) 6589 * @options: a combination of htmlParserOption(s)
5986 * 6590 *
5987 * parse an XML in-memory document and build a tree. 6591 * parse an XML in-memory document and build a tree.
5988 * 6592 *
5989 * Returns the resulting document tree 6593 * Returns the resulting document tree
5990 */ 6594 */
5991 htmlDocPtr 6595 htmlDocPtr
5992 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options) 6596 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5993 { 6597 {
5994 htmlParserCtxtPtr ctxt; 6598 htmlParserCtxtPtr ctxt;
5995 6599
5996 if (cur == NULL) 6600 if (cur == NULL)
5997 return (NULL); 6601 return (NULL);
5998 6602
5999 xmlInitParser(); 6603 xmlInitParser();
6000 ctxt = htmlCreateDocParserCtxt(cur, NULL); 6604 ctxt = htmlCreateDocParserCtxt(cur, NULL);
6001 if (ctxt == NULL) 6605 if (ctxt == NULL)
6002 return (NULL); 6606 return (NULL);
6003 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6607 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6004 } 6608 }
6005 6609
6006 /** 6610 /**
6007 * htmlReadFile: 6611 * htmlReadFile:
6008 * @filename: a file or URL 6612 * @filename: a file or URL
6009 * @encoding: the document encoding, or NULL 6613 * @encoding: the document encoding, or NULL
6010 * @options: a combination of htmlParserOption(s) 6614 * @options: a combination of htmlParserOption(s)
6011 * 6615 *
6012 * parse an XML file from the filesystem or the network. 6616 * parse an XML file from the filesystem or the network.
6013 * 6617 *
6014 * Returns the resulting document tree 6618 * Returns the resulting document tree
6015 */ 6619 */
6016 htmlDocPtr 6620 htmlDocPtr
6017 htmlReadFile(const char *filename, const char *encoding, int options) 6621 htmlReadFile(const char *filename, const char *encoding, int options)
6018 { 6622 {
6019 htmlParserCtxtPtr ctxt; 6623 htmlParserCtxtPtr ctxt;
6020 6624
6021 xmlInitParser(); 6625 xmlInitParser();
6022 ctxt = htmlCreateFileParserCtxt(filename, encoding); 6626 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6023 if (ctxt == NULL) 6627 if (ctxt == NULL)
6024 return (NULL); 6628 return (NULL);
6025 return (htmlDoRead(ctxt, NULL, NULL, options, 0)); 6629 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6026 } 6630 }
6027 6631
6028 /** 6632 /**
6029 * htmlReadMemory: 6633 * htmlReadMemory:
6030 * @buffer: a pointer to a char array 6634 * @buffer: a pointer to a char array
6031 * @size: the size of the array 6635 * @size: the size of the array
6032 * @URL: the base URL to use for the document 6636 * @URL: the base URL to use for the document
6033 * @encoding: the document encoding, or NULL 6637 * @encoding: the document encoding, or NULL
6034 * @options: a combination of htmlParserOption(s) 6638 * @options: a combination of htmlParserOption(s)
6035 * 6639 *
6036 * parse an XML in-memory document and build a tree. 6640 * parse an XML in-memory document and build a tree.
6037 * 6641 *
6038 * Returns the resulting document tree 6642 * Returns the resulting document tree
6039 */ 6643 */
6040 htmlDocPtr 6644 htmlDocPtr
6041 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options) 6645 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6042 { 6646 {
6043 htmlParserCtxtPtr ctxt; 6647 htmlParserCtxtPtr ctxt;
6044 6648
6045 xmlInitParser(); 6649 xmlInitParser();
6046 ctxt = xmlCreateMemoryParserCtxt(buffer, size); 6650 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6047 if (ctxt == NULL) 6651 if (ctxt == NULL)
6048 return (NULL); 6652 return (NULL);
6049 htmlDefaultSAXHandlerInit(); 6653 htmlDefaultSAXHandlerInit();
6050 if (ctxt->sax != NULL) 6654 if (ctxt->sax != NULL)
6051 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 6655 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6052 return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6656 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6053 } 6657 }
6054 6658
6055 /** 6659 /**
6056 * htmlReadFd: 6660 * htmlReadFd:
6057 * @fd: an open file descriptor 6661 * @fd: an open file descriptor
6058 * @URL: the base URL to use for the document 6662 * @URL: the base URL to use for the document
6059 * @encoding: the document encoding, or NULL 6663 * @encoding: the document encoding, or NULL
6060 * @options: a combination of htmlParserOption(s) 6664 * @options: a combination of htmlParserOption(s)
6061 * 6665 *
6062 * parse an XML from a file descriptor and build a tree. 6666 * parse an XML from a file descriptor and build a tree.
6063 * 6667 *
6064 * Returns the resulting document tree 6668 * Returns the resulting document tree
6065 */ 6669 */
6066 htmlDocPtr 6670 htmlDocPtr
6067 htmlReadFd(int fd, const char *URL, const char *encoding, int options) 6671 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6068 { 6672 {
6069 htmlParserCtxtPtr ctxt; 6673 htmlParserCtxtPtr ctxt;
6070 xmlParserInputBufferPtr input; 6674 xmlParserInputBufferPtr input;
6071 xmlParserInputPtr stream; 6675 xmlParserInputPtr stream;
6072 6676
6073 if (fd < 0) 6677 if (fd < 0)
(...skipping 21 matching lines...) Expand all
6095 /** 6699 /**
6096 * htmlReadIO: 6700 * htmlReadIO:
6097 * @ioread: an I/O read function 6701 * @ioread: an I/O read function
6098 * @ioclose: an I/O close function 6702 * @ioclose: an I/O close function
6099 * @ioctx: an I/O handler 6703 * @ioctx: an I/O handler
6100 * @URL: the base URL to use for the document 6704 * @URL: the base URL to use for the document
6101 * @encoding: the document encoding, or NULL 6705 * @encoding: the document encoding, or NULL
6102 * @options: a combination of htmlParserOption(s) 6706 * @options: a combination of htmlParserOption(s)
6103 * 6707 *
6104 * parse an HTML document from I/O functions and source and build a tree. 6708 * parse an HTML document from I/O functions and source and build a tree.
6105 * 6709 *
6106 * Returns the resulting document tree 6710 * Returns the resulting document tree
6107 */ 6711 */
6108 htmlDocPtr 6712 htmlDocPtr
6109 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, 6713 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6110 void *ioctx, const char *URL, const char *encoding, int options) 6714 void *ioctx, const char *URL, const char *encoding, int options)
6111 { 6715 {
6112 htmlParserCtxtPtr ctxt; 6716 htmlParserCtxtPtr ctxt;
6113 xmlParserInputBufferPtr input; 6717 xmlParserInputBufferPtr input;
6114 xmlParserInputPtr stream; 6718 xmlParserInputPtr stream;
6115 6719
(...skipping 23 matching lines...) Expand all
6139 /** 6743 /**
6140 * htmlCtxtReadDoc: 6744 * htmlCtxtReadDoc:
6141 * @ctxt: an HTML parser context 6745 * @ctxt: an HTML parser context
6142 * @cur: a pointer to a zero terminated string 6746 * @cur: a pointer to a zero terminated string
6143 * @URL: the base URL to use for the document 6747 * @URL: the base URL to use for the document
6144 * @encoding: the document encoding, or NULL 6748 * @encoding: the document encoding, or NULL
6145 * @options: a combination of htmlParserOption(s) 6749 * @options: a combination of htmlParserOption(s)
6146 * 6750 *
6147 * parse an XML in-memory document and build a tree. 6751 * parse an XML in-memory document and build a tree.
6148 * This reuses the existing @ctxt parser context 6752 * This reuses the existing @ctxt parser context
6149 * 6753 *
6150 * Returns the resulting document tree 6754 * Returns the resulting document tree
6151 */ 6755 */
6152 htmlDocPtr 6756 htmlDocPtr
6153 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur, 6757 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6154 const char *URL, const char *encoding, int options) 6758 const char *URL, const char *encoding, int options)
6155 { 6759 {
6156 xmlParserInputPtr stream; 6760 xmlParserInputPtr stream;
6157 6761
6158 if (cur == NULL) 6762 if (cur == NULL)
6159 return (NULL); 6763 return (NULL);
(...skipping 12 matching lines...) Expand all
6172 6776
6173 /** 6777 /**
6174 * htmlCtxtReadFile: 6778 * htmlCtxtReadFile:
6175 * @ctxt: an HTML parser context 6779 * @ctxt: an HTML parser context
6176 * @filename: a file or URL 6780 * @filename: a file or URL
6177 * @encoding: the document encoding, or NULL 6781 * @encoding: the document encoding, or NULL
6178 * @options: a combination of htmlParserOption(s) 6782 * @options: a combination of htmlParserOption(s)
6179 * 6783 *
6180 * parse an XML file from the filesystem or the network. 6784 * parse an XML file from the filesystem or the network.
6181 * This reuses the existing @ctxt parser context 6785 * This reuses the existing @ctxt parser context
6182 * 6786 *
6183 * Returns the resulting document tree 6787 * Returns the resulting document tree
6184 */ 6788 */
6185 htmlDocPtr 6789 htmlDocPtr
6186 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename, 6790 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6187 const char *encoding, int options) 6791 const char *encoding, int options)
6188 { 6792 {
6189 xmlParserInputPtr stream; 6793 xmlParserInputPtr stream;
6190 6794
6191 if (filename == NULL) 6795 if (filename == NULL)
6192 return (NULL); 6796 return (NULL);
(...skipping 14 matching lines...) Expand all
6207 * htmlCtxtReadMemory: 6811 * htmlCtxtReadMemory:
6208 * @ctxt: an HTML parser context 6812 * @ctxt: an HTML parser context
6209 * @buffer: a pointer to a char array 6813 * @buffer: a pointer to a char array
6210 * @size: the size of the array 6814 * @size: the size of the array
6211 * @URL: the base URL to use for the document 6815 * @URL: the base URL to use for the document
6212 * @encoding: the document encoding, or NULL 6816 * @encoding: the document encoding, or NULL
6213 * @options: a combination of htmlParserOption(s) 6817 * @options: a combination of htmlParserOption(s)
6214 * 6818 *
6215 * parse an HTML in-memory document and build a tree. 6819 * parse an HTML in-memory document and build a tree.
6216 * This reuses the existing @ctxt parser context 6820 * This reuses the existing @ctxt parser context
6217 * 6821 *
6218 * Returns the resulting document tree 6822 * Returns the resulting document tree
6219 */ 6823 */
6220 htmlDocPtr 6824 htmlDocPtr
6221 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size, 6825 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6222 const char *URL, const char *encoding, int options) 6826 const char *URL, const char *encoding, int options)
6223 { 6827 {
6224 xmlParserInputBufferPtr input; 6828 xmlParserInputBufferPtr input;
6225 xmlParserInputPtr stream; 6829 xmlParserInputPtr stream;
6226 6830
6227 if (ctxt == NULL) 6831 if (ctxt == NULL)
(...skipping 21 matching lines...) Expand all
6249 /** 6853 /**
6250 * htmlCtxtReadFd: 6854 * htmlCtxtReadFd:
6251 * @ctxt: an HTML parser context 6855 * @ctxt: an HTML parser context
6252 * @fd: an open file descriptor 6856 * @fd: an open file descriptor
6253 * @URL: the base URL to use for the document 6857 * @URL: the base URL to use for the document
6254 * @encoding: the document encoding, or NULL 6858 * @encoding: the document encoding, or NULL
6255 * @options: a combination of htmlParserOption(s) 6859 * @options: a combination of htmlParserOption(s)
6256 * 6860 *
6257 * parse an HTML document from a file descriptor and build a tree. 6861 * parse an HTML document from a file descriptor and build a tree.
6258 * This reuses the existing @ctxt parser context 6862 * This reuses the existing @ctxt parser context
6259 * 6863 *
6260 * Returns the resulting document tree 6864 * Returns the resulting document tree
6261 */ 6865 */
6262 htmlDocPtr 6866 htmlDocPtr
6263 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd, 6867 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6264 const char *URL, const char *encoding, int options) 6868 const char *URL, const char *encoding, int options)
6265 { 6869 {
6266 xmlParserInputBufferPtr input; 6870 xmlParserInputBufferPtr input;
6267 xmlParserInputPtr stream; 6871 xmlParserInputPtr stream;
6268 6872
6269 if (fd < 0) 6873 if (fd < 0)
(...skipping 21 matching lines...) Expand all
6291 * @ctxt: an HTML parser context 6895 * @ctxt: an HTML parser context
6292 * @ioread: an I/O read function 6896 * @ioread: an I/O read function
6293 * @ioclose: an I/O close function 6897 * @ioclose: an I/O close function
6294 * @ioctx: an I/O handler 6898 * @ioctx: an I/O handler
6295 * @URL: the base URL to use for the document 6899 * @URL: the base URL to use for the document
6296 * @encoding: the document encoding, or NULL 6900 * @encoding: the document encoding, or NULL
6297 * @options: a combination of htmlParserOption(s) 6901 * @options: a combination of htmlParserOption(s)
6298 * 6902 *
6299 * parse an HTML document from I/O functions and source and build a tree. 6903 * parse an HTML document from I/O functions and source and build a tree.
6300 * This reuses the existing @ctxt parser context 6904 * This reuses the existing @ctxt parser context
6301 * 6905 *
6302 * Returns the resulting document tree 6906 * Returns the resulting document tree
6303 */ 6907 */
6304 htmlDocPtr 6908 htmlDocPtr
6305 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, 6909 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6306 xmlInputCloseCallback ioclose, void *ioctx, 6910 xmlInputCloseCallback ioclose, void *ioctx,
6307 const char *URL, 6911 const char *URL,
6308 const char *encoding, int options) 6912 const char *encoding, int options)
6309 { 6913 {
6310 xmlParserInputBufferPtr input; 6914 xmlParserInputBufferPtr input;
6311 xmlParserInputPtr stream; 6915 xmlParserInputPtr stream;
(...skipping 14 matching lines...) Expand all
6326 xmlFreeParserInputBuffer(input); 6930 xmlFreeParserInputBuffer(input);
6327 return (NULL); 6931 return (NULL);
6328 } 6932 }
6329 inputPush(ctxt, stream); 6933 inputPush(ctxt, stream);
6330 return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6934 return (htmlDoRead(ctxt, URL, encoding, options, 1));
6331 } 6935 }
6332 6936
6333 #define bottom_HTMLparser 6937 #define bottom_HTMLparser
6334 #include "elfgcchack.h" 6938 #include "elfgcchack.h"
6335 #endif /* LIBXML_HTML_ENABLED */ 6939 #endif /* LIBXML_HTML_ENABLED */
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698