OLD | NEW |
1 /* | 1 /* |
2 * encoding.c : implements the encoding conversion functions needed for XML | 2 * encoding.c : implements the encoding conversion functions needed for XML |
3 * | 3 * |
4 * Related specs: | 4 * Related specs: |
5 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies | 5 * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies |
6 * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau | 6 * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau |
7 * [ISO-10646] UTF-8 and UTF-16 in Annexes | 7 * [ISO-10646] UTF-8 and UTF-16 in Annexes |
8 * [ISO-8859-1] ISO Latin-1 characters codes. | 8 * [ISO-8859-1] ISO Latin-1 characters codes. |
9 * [UNICODE] The Unicode Consortium, "The Unicode Standard -- | 9 * [UNICODE] The Unicode Consortium, "The Unicode Standard -- |
10 * Worldwide Character Encoding -- Version 1.0", Addison- | 10 * Worldwide Character Encoding -- Version 1.0", Addison- |
(...skipping 1448 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1459 | 1459 |
1460 /** | 1460 /** |
1461 * xmlRegisterCharEncodingHandler: | 1461 * xmlRegisterCharEncodingHandler: |
1462 * @handler: the xmlCharEncodingHandlerPtr handler block | 1462 * @handler: the xmlCharEncodingHandlerPtr handler block |
1463 * | 1463 * |
1464 * Register the char encoding handler, surprising, isn't it ? | 1464 * Register the char encoding handler, surprising, isn't it ? |
1465 */ | 1465 */ |
1466 void | 1466 void |
1467 xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) { | 1467 xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) { |
1468 if (handlers == NULL) xmlInitCharEncodingHandlers(); | 1468 if (handlers == NULL) xmlInitCharEncodingHandlers(); |
1469 if (handler == NULL) { | 1469 if ((handler == NULL) || (handlers == NULL)) { |
1470 xmlEncodingErr(XML_I18N_NO_HANDLER, | 1470 xmlEncodingErr(XML_I18N_NO_HANDLER, |
1471 "xmlRegisterCharEncodingHandler: NULL handler !\n", NULL); | 1471 "xmlRegisterCharEncodingHandler: NULL handler !\n", NULL); |
1472 return; | 1472 return; |
1473 } | 1473 } |
1474 | 1474 |
1475 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) { | 1475 if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) { |
1476 xmlEncodingErr(XML_I18N_EXCESS_HANDLER, | 1476 xmlEncodingErr(XML_I18N_EXCESS_HANDLER, |
1477 "xmlRegisterCharEncodingHandler: Too many handler registered, see %s\n", | 1477 "xmlRegisterCharEncodingHandler: Too many handler registered, see %s\n", |
1478 "MAX_ENCODING_HANDLERS"); | 1478 "MAX_ENCODING_HANDLERS"); |
1479 return; | 1479 return; |
(...skipping 23 matching lines...) Expand all Loading... |
1503 return(NULL); | 1503 return(NULL); |
1504 case XML_CHAR_ENCODING_UTF16LE: | 1504 case XML_CHAR_ENCODING_UTF16LE: |
1505 return(xmlUTF16LEHandler); | 1505 return(xmlUTF16LEHandler); |
1506 case XML_CHAR_ENCODING_UTF16BE: | 1506 case XML_CHAR_ENCODING_UTF16BE: |
1507 return(xmlUTF16BEHandler); | 1507 return(xmlUTF16BEHandler); |
1508 case XML_CHAR_ENCODING_EBCDIC: | 1508 case XML_CHAR_ENCODING_EBCDIC: |
1509 handler = xmlFindCharEncodingHandler("EBCDIC"); | 1509 handler = xmlFindCharEncodingHandler("EBCDIC"); |
1510 if (handler != NULL) return(handler); | 1510 if (handler != NULL) return(handler); |
1511 handler = xmlFindCharEncodingHandler("ebcdic"); | 1511 handler = xmlFindCharEncodingHandler("ebcdic"); |
1512 if (handler != NULL) return(handler); | 1512 if (handler != NULL) return(handler); |
| 1513 handler = xmlFindCharEncodingHandler("EBCDIC-US"); |
| 1514 if (handler != NULL) return(handler); |
1513 break; | 1515 break; |
1514 case XML_CHAR_ENCODING_UCS4BE: | 1516 case XML_CHAR_ENCODING_UCS4BE: |
1515 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4"); | 1517 handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4"); |
1516 if (handler != NULL) return(handler); | 1518 if (handler != NULL) return(handler); |
1517 handler = xmlFindCharEncodingHandler("UCS-4"); | 1519 handler = xmlFindCharEncodingHandler("UCS-4"); |
1518 if (handler != NULL) return(handler); | 1520 if (handler != NULL) return(handler); |
1519 handler = xmlFindCharEncodingHandler("UCS4"); | 1521 handler = xmlFindCharEncodingHandler("UCS4"); |
1520 if (handler != NULL) return(handler); | 1522 if (handler != NULL) return(handler); |
1521 break; | 1523 break; |
1522 case XML_CHAR_ENCODING_UCS4LE: | 1524 case XML_CHAR_ENCODING_UCS4LE: |
(...skipping 126 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1649 | 1651 |
1650 /* | 1652 /* |
1651 * Check first for directly registered encoding names | 1653 * Check first for directly registered encoding names |
1652 */ | 1654 */ |
1653 for (i = 0;i < 99;i++) { | 1655 for (i = 0;i < 99;i++) { |
1654 upper[i] = toupper(name[i]); | 1656 upper[i] = toupper(name[i]); |
1655 if (upper[i] == 0) break; | 1657 if (upper[i] == 0) break; |
1656 } | 1658 } |
1657 upper[i] = 0; | 1659 upper[i] = 0; |
1658 | 1660 |
1659 for (i = 0;i < nbCharEncodingHandler; i++) | 1661 if (handlers != NULL) { |
1660 if (!strcmp(upper, handlers[i]->name)) { | 1662 for (i = 0;i < nbCharEncodingHandler; i++) { |
| 1663 if (!strcmp(upper, handlers[i]->name)) { |
1661 #ifdef DEBUG_ENCODING | 1664 #ifdef DEBUG_ENCODING |
1662 xmlGenericError(xmlGenericErrorContext, | 1665 xmlGenericError(xmlGenericErrorContext, |
1663 » » "Found registered handler for encoding %s\n", name); | 1666 "Found registered handler for encoding %s\n", name); |
1664 #endif | 1667 #endif |
1665 » return(handlers[i]); | 1668 return(handlers[i]); |
1666 » } | 1669 } |
| 1670 } |
| 1671 } |
1667 | 1672 |
1668 #ifdef LIBXML_ICONV_ENABLED | 1673 #ifdef LIBXML_ICONV_ENABLED |
1669 /* check whether iconv can handle this */ | 1674 /* check whether iconv can handle this */ |
1670 icv_in = iconv_open("UTF-8", name); | 1675 icv_in = iconv_open("UTF-8", name); |
1671 icv_out = iconv_open(name, "UTF-8"); | 1676 icv_out = iconv_open(name, "UTF-8"); |
1672 if (icv_in == (iconv_t) -1) { | 1677 if (icv_in == (iconv_t) -1) { |
1673 icv_in = iconv_open("UTF-8", upper); | 1678 icv_in = iconv_open("UTF-8", upper); |
1674 } | 1679 } |
1675 if (icv_out == (iconv_t) -1) { | 1680 if (icv_out == (iconv_t) -1) { |
1676 icv_out = iconv_open(upper, "UTF-8"); | 1681 icv_out = iconv_open(upper, "UTF-8"); |
(...skipping 205 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1882 /* if (err == U_TRUNCATED_CHAR_FOUND) */ | 1887 /* if (err == U_TRUNCATED_CHAR_FOUND) */ |
1883 return -3; | 1888 return -3; |
1884 } | 1889 } |
1885 #endif /* LIBXML_ICU_ENABLED */ | 1890 #endif /* LIBXML_ICU_ENABLED */ |
1886 | 1891 |
1887 /************************************************************************ | 1892 /************************************************************************ |
1888 * * | 1893 * * |
1889 * The real API used by libxml for on-the-fly conversion * | 1894 * The real API used by libxml for on-the-fly conversion * |
1890 * * | 1895 * * |
1891 ************************************************************************/ | 1896 ************************************************************************/ |
| 1897 int |
| 1898 xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out, |
| 1899 xmlBufferPtr in, int len); |
1892 | 1900 |
1893 /** | 1901 /** |
1894 * xmlCharEncFirstLine: | 1902 * xmlCharEncFirstLineInt: |
1895 * @handler: char encoding transformation data structure | 1903 * @handler: char encoding transformation data structure |
1896 * @out: an xmlBuffer for the output. | 1904 * @out: an xmlBuffer for the output. |
1897 * @in: an xmlBuffer for the input | 1905 * @in: an xmlBuffer for the input |
1898 * | 1906 * @len: number of bytes to convert for the first line, or -1 |
| 1907 * |
1899 * Front-end for the encoding handler input function, but handle only | 1908 * Front-end for the encoding handler input function, but handle only |
1900 * the very first line, i.e. limit itself to 45 chars. | 1909 * the very first line, i.e. limit itself to 45 chars. |
1901 * | 1910 * |
1902 * Returns the number of bytes written on success, or | 1911 * Returns the number of bytes written on success, or |
1903 * -1 general error | 1912 * -1 general error |
1904 * -2 if the transcoding fails (because *in is not a valid UTF-8 string or | 1913 * -2 if the transcoding fails (because *in is not a valid UTF-8 string or |
1905 * the result of the transformation can't fit into the encoding we want) | 1914 * the result of the transformation can't fit into the encoding we want) |
1906 */ | 1915 */ |
1907 int | 1916 int |
1908 xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, | 1917 xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out, |
1909 xmlBufferPtr in) { | 1918 xmlBufferPtr in, int len) { |
1910 int ret = -2; | 1919 int ret = -2; |
1911 int written; | 1920 int written; |
1912 int toconv; | 1921 int toconv; |
1913 | 1922 |
1914 if (handler == NULL) return(-1); | 1923 if (handler == NULL) return(-1); |
1915 if (out == NULL) return(-1); | 1924 if (out == NULL) return(-1); |
1916 if (in == NULL) return(-1); | 1925 if (in == NULL) return(-1); |
1917 | 1926 |
1918 /* calculate space available */ | 1927 /* calculate space available */ |
1919 written = out->size - out->use; | 1928 written = out->size - out->use; |
1920 toconv = in->use; | 1929 toconv = in->use; |
1921 /* | 1930 /* |
1922 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38 | 1931 * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38 |
1923 * 45 chars should be sufficient to reach the end of the encoding | 1932 * 45 chars should be sufficient to reach the end of the encoding |
1924 * declaration without going too far inside the document content. | 1933 * declaration without going too far inside the document content. |
1925 * on UTF-16 this means 90bytes, on UCS4 this means 180 | 1934 * on UTF-16 this means 90bytes, on UCS4 this means 180 |
| 1935 * The actual value depending on guessed encoding is passed as @len |
| 1936 * if provided |
1926 */ | 1937 */ |
1927 if (toconv > 180) | 1938 if (len >= 0) { |
1928 » toconv = 180; | 1939 if (toconv > len) |
| 1940 toconv = len; |
| 1941 } else { |
| 1942 if (toconv > 180) |
| 1943 toconv = 180; |
| 1944 } |
1929 if (toconv * 2 >= written) { | 1945 if (toconv * 2 >= written) { |
1930 xmlBufferGrow(out, toconv); | 1946 xmlBufferGrow(out, toconv); |
1931 written = out->size - out->use - 1; | 1947 written = out->size - out->use - 1; |
1932 } | 1948 } |
1933 | 1949 |
1934 if (handler->input != NULL) { | 1950 if (handler->input != NULL) { |
1935 ret = handler->input(&out->content[out->use], &written, | 1951 ret = handler->input(&out->content[out->use], &written, |
1936 in->content, &toconv); | 1952 in->content, &toconv); |
1937 xmlBufferShrink(in, toconv); | 1953 xmlBufferShrink(in, toconv); |
1938 out->use += written; | 1954 out->use += written; |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1983 #endif /* DEBUG_ENCODING */ | 1999 #endif /* DEBUG_ENCODING */ |
1984 /* | 2000 /* |
1985 * Ignore when input buffer is not on a boundary | 2001 * Ignore when input buffer is not on a boundary |
1986 */ | 2002 */ |
1987 if (ret == -3) ret = 0; | 2003 if (ret == -3) ret = 0; |
1988 if (ret == -1) ret = 0; | 2004 if (ret == -1) ret = 0; |
1989 return(ret); | 2005 return(ret); |
1990 } | 2006 } |
1991 | 2007 |
1992 /** | 2008 /** |
| 2009 * xmlCharEncFirstLine: |
| 2010 * @handler: char encoding transformation data structure |
| 2011 * @out: an xmlBuffer for the output. |
| 2012 * @in: an xmlBuffer for the input |
| 2013 * |
| 2014 * Front-end for the encoding handler input function, but handle only |
| 2015 * the very first line, i.e. limit itself to 45 chars. |
| 2016 * |
| 2017 * Returns the number of bytes written on success, or |
| 2018 * -1 general error |
| 2019 * -2 if the transcoding fails (because *in is not a valid UTF-8 string or |
| 2020 * the result of the transformation can't fit into the encoding we want) |
| 2021 */ |
| 2022 int |
| 2023 xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, |
| 2024 xmlBufferPtr in) { |
| 2025 return(xmlCharEncFirstLineInt(handler, out, in, -1)); |
| 2026 } |
| 2027 |
| 2028 /** |
1993 * xmlCharEncInFunc: | 2029 * xmlCharEncInFunc: |
1994 * @handler: char encoding transformation data structure | 2030 * @handler: char encoding transformation data structure |
1995 * @out: an xmlBuffer for the output. | 2031 * @out: an xmlBuffer for the output. |
1996 * @in: an xmlBuffer for the input | 2032 * @in: an xmlBuffer for the input |
1997 * | 2033 * |
1998 * Generic front-end for the encoding handler input function | 2034 * Generic front-end for the encoding handler input function |
1999 * | 2035 * |
2000 * Returns the number of bytes written on success, or | 2036 * Returns the number of bytes written on success, or |
2001 * -1 general error | 2037 * -1 general error |
2002 * -2 if the transcoding fails (because *in is not a valid UTF-8 string or | 2038 * -2 if the transcoding fails (because *in is not a valid UTF-8 string or |
2003 * the result of the transformation can't fit into the encoding we want) | 2039 * the result of the transformation can't fit into the encoding we want) |
2004 */ | 2040 */ |
2005 int | 2041 int |
2006 xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out, | 2042 xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out, |
2007 xmlBufferPtr in) | 2043 xmlBufferPtr in) |
2008 { | 2044 { |
2009 int ret = -2; | 2045 int ret = -2; |
2010 int written; | 2046 int written; |
(...skipping 165 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2176 toconv = in->use; | 2212 toconv = in->use; |
2177 if (toconv == 0) | 2213 if (toconv == 0) |
2178 return(0); | 2214 return(0); |
2179 if (toconv * 4 >= written) { | 2215 if (toconv * 4 >= written) { |
2180 xmlBufferGrow(out, toconv * 4); | 2216 xmlBufferGrow(out, toconv * 4); |
2181 written = out->size - out->use - 1; | 2217 written = out->size - out->use - 1; |
2182 } | 2218 } |
2183 if (handler->output != NULL) { | 2219 if (handler->output != NULL) { |
2184 ret = handler->output(&out->content[out->use], &written, | 2220 ret = handler->output(&out->content[out->use], &written, |
2185 in->content, &toconv); | 2221 in->content, &toconv); |
2186 » xmlBufferShrink(in, toconv); | 2222 » if (written > 0) { |
2187 » out->use += written; | 2223 » xmlBufferShrink(in, toconv); |
2188 » writtentot += written; | 2224 » out->use += written; |
| 2225 » writtentot += written; |
| 2226 » } |
2189 out->content[out->use] = 0; | 2227 out->content[out->use] = 0; |
2190 } | 2228 } |
2191 #ifdef LIBXML_ICONV_ENABLED | 2229 #ifdef LIBXML_ICONV_ENABLED |
2192 else if (handler->iconv_out != NULL) { | 2230 else if (handler->iconv_out != NULL) { |
2193 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use], | 2231 ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use], |
2194 &written, in->content, &toconv); | 2232 &written, in->content, &toconv); |
2195 xmlBufferShrink(in, toconv); | 2233 xmlBufferShrink(in, toconv); |
2196 out->use += written; | 2234 out->use += written; |
2197 writtentot += written; | 2235 writtentot += written; |
2198 out->content[out->use] = 0; | 2236 out->content[out->use] = 0; |
(...skipping 1325 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3524 xmlNewCharEncodingHandler ("ISO-8859-14", ISO8859_14ToUTF8, UTF8ToISO8859_14
); | 3562 xmlNewCharEncodingHandler ("ISO-8859-14", ISO8859_14ToUTF8, UTF8ToISO8859_14
); |
3525 xmlNewCharEncodingHandler ("ISO-8859-15", ISO8859_15ToUTF8, UTF8ToISO8859_15
); | 3563 xmlNewCharEncodingHandler ("ISO-8859-15", ISO8859_15ToUTF8, UTF8ToISO8859_15
); |
3526 xmlNewCharEncodingHandler ("ISO-8859-16", ISO8859_16ToUTF8, UTF8ToISO8859_16
); | 3564 xmlNewCharEncodingHandler ("ISO-8859-16", ISO8859_16ToUTF8, UTF8ToISO8859_16
); |
3527 } | 3565 } |
3528 | 3566 |
3529 #endif | 3567 #endif |
3530 #endif | 3568 #endif |
3531 | 3569 |
3532 #define bottom_encoding | 3570 #define bottom_encoding |
3533 #include "elfgcchack.h" | 3571 #include "elfgcchack.h" |
| 3572 |
OLD | NEW |