OLD | NEW |
| (Empty) |
1 Add code support for ICU. | |
2 | |
3 diff --git a/third_party/libxml/encoding.c b/third_party/libxml/encoding.c | |
4 index b86a547..0f41df9 100644 | |
5 --- a/third_party/libxml/encoding.c | |
6 +++ b/third_party/libxml/encoding.c | |
7 @@ -58,7 +58,7 @@ static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL; | |
8 static int xmlCharEncodingAliasesNb = 0; | |
9 static int xmlCharEncodingAliasesMax = 0; | |
10 | |
11 -#ifdef LIBXML_ICONV_ENABLED | |
12 +#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED) | |
13 #if 0 | |
14 #define DEBUG_ENCODING /* Define this to get encoding traces */ | |
15 #endif | |
16 @@ -97,6 +97,54 @@ xmlEncodingErr(xmlParserErrors error, const char *msg, const
char *val) | |
17 NULL, 0, val, NULL, NULL, 0, 0, msg, val); | |
18 } | |
19 | |
20 +#ifdef LIBXML_ICU_ENABLED | |
21 +static uconv_t* /* returns a newly allocated uconv_t, or NULL on any failure */ | |
22 +openIcuConverter(const char* name, int toUnicode) /* name: encoding name handed to ucnv_open; toUnicode: picks which callback is installed */ | |
23 +{ | |
24 + UErrorCode status = U_ZERO_ERROR; | |
25 + uconv_t *conv = (uconv_t *) xmlMalloc(sizeof(uconv_t)); | |
26 + if (conv == NULL) | |
27 + return NULL; | |
28 + | |
29 + conv->uconv = ucnv_open(name, &status); /* converter for the requested encoding */ | |
30 + if (U_FAILURE(status)) | |
31 + goto error; | |
32 + | |
33 + status = U_ZERO_ERROR; | |
34 + if (toUnicode) { /* install the STOP callback for the direction this converter will be used in */ | |
35 + ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP, | |
36 + NULL, NULL, NULL, &status); | |
37 + } | |
38 + else { | |
39 + ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP, | |
40 + NULL, NULL, NULL, &status); | |
41 + } | |
42 + if (U_FAILURE(status)) | |
43 + goto error; | |
44 + | |
45 + status = U_ZERO_ERROR; | |
46 + conv->utf8 = ucnv_open("UTF-8", &status); /* second converter for the UTF-8 side */ | |
47 + if (U_SUCCESS(status)) | |
48 + return conv; /* both converters opened: caller owns conv and releases it with closeIcuConverter */ | |
49 + | |
50 +error: /* shared failure path, also reached by fall-through when the UTF-8 open fails */ | |
51 + if (conv->uconv) | |
52 + ucnv_close(conv->uconv); | |
53 + xmlFree(conv); | |
54 + return NULL; | |
55 +} | |
56 + | |
57 +static void | |
58 +closeIcuConverter(uconv_t *conv) /* releases a uconv_t obtained from openIcuConverter */ | |
59 +{ | |
60 + if (conv != NULL) { /* a NULL argument is accepted and ignored */ | |
61 + ucnv_close(conv->uconv); | |
62 + ucnv_close(conv->utf8); | |
63 + xmlFree(conv); /* the struct itself was allocated with xmlMalloc */ | |
64 + } | |
65 +} | |
66 +#endif /* LIBXML_ICU_ENABLED */ | |
67 + | |
68 /************************************************************************ | |
69 * * | |
70 * Conversions To/From UTF8 encoding * | |
71 @@ -1306,7 +1354,11 @@ xmlNewCharEncodingHandler(const char *name, | |
72 #ifdef LIBXML_ICONV_ENABLED | |
73 handler->iconv_in = NULL; | |
74 handler->iconv_out = NULL; | |
75 -#endif /* LIBXML_ICONV_ENABLED */ | |
76 +#endif | |
77 +#ifdef LIBXML_ICU_ENABLED | |
78 + handler->uconv_in = NULL; | |
79 + handler->uconv_out = NULL; | |
80 +#endif | |
81 | |
82 /* | |
83 * registers and returns the handler. | |
84 @@ -1371,7 +1423,7 @@ xmlInitCharEncodingHandlers(void) { | |
85 xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL); | |
86 xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL); | |
87 #endif /* LIBXML_OUTPUT_ENABLED */ | |
88 -#ifndef LIBXML_ICONV_ENABLED | |
89 +#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) | |
90 #ifdef LIBXML_ISO8859X_ENABLED | |
91 xmlRegisterCharEncodingHandlersISO8859x (); | |
92 #endif | |
93 @@ -1578,6 +1630,10 @@ xmlFindCharEncodingHandler(const char *name) { | |
94 xmlCharEncodingHandlerPtr enc; | |
95 iconv_t icv_in, icv_out; | |
96 #endif /* LIBXML_ICONV_ENABLED */ | |
97 +#ifdef LIBXML_ICU_ENABLED | |
98 + xmlCharEncodingHandlerPtr enc; | |
99 + uconv_t *ucv_in, *ucv_out; | |
100 +#endif /* LIBXML_ICU_ENABLED */ | |
101 char upper[100]; | |
102 int i; | |
103 | |
104 @@ -1647,6 +1703,35 @@ xmlFindCharEncodingHandler(const char *name) { | |
105 "iconv : problems with filters for '%s'\n", name); | |
106 } | |
107 #endif /* LIBXML_ICONV_ENABLED */ | |
108 +#ifdef LIBXML_ICU_ENABLED | |
109 + /* check whether icu can handle this */ | |
110 + ucv_in = openIcuConverter(name, 1); | |
111 + ucv_out = openIcuConverter(name, 0); | |
112 + if (ucv_in != NULL && ucv_out != NULL) { | |
113 + enc = (xmlCharEncodingHandlerPtr) | |
114 + xmlMalloc(sizeof(xmlCharEncodingHandler)); | |
115 + if (enc == NULL) { | |
116 + closeIcuConverter(ucv_in); | |
117 + closeIcuConverter(ucv_out); | |
118 + return(NULL); | |
119 + } | |
120 + enc->name = xmlMemStrdup(name); | |
121 + enc->input = NULL; | |
122 + enc->output = NULL; | |
123 + enc->uconv_in = ucv_in; | |
124 + enc->uconv_out = ucv_out; | |
125 +#ifdef DEBUG_ENCODING | |
126 + xmlGenericError(xmlGenericErrorContext, | |
127 + "Found ICU converter handler for encoding %s\n", name); | |
128 +#endif | |
129 + return enc; | |
130 + } else if (ucv_in != NULL || ucv_out != NULL) { | |
131 + closeIcuConverter(ucv_in); | |
132 + closeIcuConverter(ucv_out); | |
133 + xmlEncodingErr(XML_ERR_INTERNAL_ERROR, | |
134 + "ICU converter : problems with filters for '%s'\n", name); | |
135 + } | |
136 +#endif /* LIBXML_ICU_ENABLED */ | |
137 | |
138 #ifdef DEBUG_ENCODING | |
139 xmlGenericError(xmlGenericErrorContext, | |
140 @@ -1737,6 +1822,75 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outl
en, | |
141 | |
142 /************************************************************************ | |
143 * * | |
144 + * ICU based generic conversion functions * | |
145 + * * | |
146 + ************************************************************************/ | |
147 + | |
148 +#ifdef LIBXML_ICU_ENABLED | |
149 +/** | |
150 + * xmlUconvWrapper: | |
151 + * @cd: ICU converter pair (converter for the encoding plus a UTF-8 converter) | |
152 + * @toUnicode: non-zero to convert from the encoding to UTF-8, 0 for UTF-8 to the encoding | |
153 + * @out: a pointer to an array of bytes to store the result | |
154 + * @outlen: on entry the capacity of @out, on return the number of octets written | |
155 + * @in: a pointer to the input bytes (in the encoding if @toUnicode, else UTF-8) | |
156 + * @inlen: on entry the length of @in, on return the number of octets consumed | |
157 + * | |
158 + * Returns 0 if success, or | |
159 + * -1 by lack of space, or if @out, @outlen, @in or @inlen is NULL, or | |
160 + * -2 if the transcoding fails because ICU reports an invalid or illegal | |
161 + * character (U_INVALID_CHAR_FOUND or U_ILLEGAL_CHAR_FOUND), or | |
162 + * -3 for any other ICU error, e.g. a truncated trailing character. | |
163 + * | |
164 + * Once the conversion has been attempted, @inlen and @outlen are updated | |
165 + * whatever the return value. On the NULL-argument path only @outlen is | |
166 + * touched: it is set to 0 when it is not itself NULL. | |
167 + */ | |
168 +static int | |
169 +xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen, | |
170 + const unsigned char *in, int *inlen) { | |
171 + const char *ucv_in = (const char *) in; | |
172 + char *ucv_out = (char *) out; | |
173 + UErrorCode err = U_ZERO_ERROR; | |
174 + | |
175 + if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) { | |
176 + if (outlen != NULL) *outlen = 0; | |
177 + return(-1); | |
178 + } | |
179 + | |
180 + /* | |
181 + * TODO(jungshik) | |
182 + * 1. is ucnv_convert(To|From)Algorithmic better? | |
183 + * 2. had we better use an explicit pivot buffer? | |
184 + * 3. error returned comes from 'fromUnicode' only even | |
185 + * when toUnicode is true ! | |
186 + */ | |
187 + if (toUnicode) { | |
188 + /* encoding => UTF-16 => UTF-8 */ | |
189 + ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen, | |
190 + &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL, | |
191 + 0, TRUE, &err); | |
192 + } else { | |
193 + /* UTF-8 => UTF-16 => encoding */ | |
194 + ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen, | |
195 + &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL, | |
196 + 0, TRUE, &err); | |
197 + } | |
198 + *inlen = ucv_in - (const char*) in; /* distance the input cursor moved */ | |
199 + *outlen = ucv_out - (char *) out; /* distance the output cursor moved */ | |
200 + if (U_SUCCESS(err)) | |
201 + return 0; | |
202 + if (err == U_BUFFER_OVERFLOW_ERROR) | |
203 + return -1; | |
204 + if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND) | |
205 + return -2; | |
206 + /* if (err == U_TRUNCATED_CHAR_FOUND) */ | |
207 + return -3; | |
208 +} | |
209 +#endif /* LIBXML_ICU_ENABLED */ | |
210 + | |
211 +/************************************************************************ | |
212 + * * | |
213 * The real API used by libxml for on-the-fly conversion * | |
214 * * | |
215 ************************************************************************/ | |
216 @@ -1810,6 +1964,16 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, x
mlBufferPtr out, | |
217 if (ret == -1) ret = -3; | |
218 } | |
219 #endif /* LIBXML_ICONV_ENABLED */ | |
220 +#ifdef LIBXML_ICU_ENABLED | |
221 + else if (handler->uconv_in != NULL) { | |
222 + ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use], | |
223 + &written, in->content, &toconv); | |
224 + xmlBufferShrink(in, toconv); | |
225 + out->use += written; | |
226 + out->content[out->use] = 0; | |
227 + if (ret == -1) ret = -3; | |
228 + } | |
229 +#endif /* LIBXML_ICU_ENABLED */ | |
230 #ifdef DEBUG_ENCODING | |
231 switch (ret) { | |
232 case 0: | |
233 @@ -1915,6 +2079,17 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBuf
ferPtr out, | |
234 ret = -3; | |
235 } | |
236 #endif /* LIBXML_ICONV_ENABLED */ | |
237 +#ifdef LIBXML_ICU_ENABLED | |
238 + else if (handler->uconv_in != NULL) { | |
239 + ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use], | |
240 + &written, in->content, &toconv); | |
241 + xmlBufferShrink(in, toconv); | |
242 + out->use += written; | |
243 + out->content[out->use] = 0; | |
244 + if (ret == -1) | |
245 + ret = -3; | |
246 + } | |
247 +#endif /* LIBXML_ICU_ENABLED */ | |
248 switch (ret) { | |
249 case 0: | |
250 #ifdef DEBUG_ENCODING | |
251 @@ -2015,6 +2190,15 @@ retry: | |
252 out->content[out->use] = 0; | |
253 } | |
254 #endif /* LIBXML_ICONV_ENABLED */ | |
255 +#ifdef LIBXML_ICU_ENABLED | |
256 + else if (handler->uconv_out != NULL) { | |
257 + ret = xmlUconvWrapper(handler->uconv_out, 0, | |
258 + &out->content[out->use], | |
259 + &written, NULL, &toconv); | |
260 + out->use += written; | |
261 + out->content[out->use] = 0; | |
262 + } | |
263 +#endif /* LIBXML_ICU_ENABLED */ | |
264 #ifdef DEBUG_ENCODING | |
265 xmlGenericError(xmlGenericErrorContext, | |
266 "initialized encoder\n"); | |
267 @@ -2061,6 +2245,26 @@ retry: | |
268 } | |
269 } | |
270 #endif /* LIBXML_ICONV_ENABLED */ | |
271 +#ifdef LIBXML_ICU_ENABLED | |
272 + else if (handler->uconv_out != NULL) { | |
273 + ret = xmlUconvWrapper(handler->uconv_out, 0, | |
274 + &out->content[out->use], | |
275 + &written, in->content, &toconv); | |
276 + xmlBufferShrink(in, toconv); | |
277 + out->use += written; | |
278 + writtentot += written; | |
279 + out->content[out->use] = 0; | |
280 + if (ret == -1) { | |
281 + if (written > 0) { | |
282 + /* | |
283 + * Can be a limitation of iconv | |
284 + */ | |
285 + goto retry; | |
286 + } | |
287 + ret = -3; | |
288 + } | |
289 + } | |
290 +#endif /* LIBXML_ICU_ENABLED */ | |
291 else { | |
292 xmlEncodingErr(XML_I18N_NO_OUTPUT, | |
293 "xmlCharEncOutFunc: no output function !\n", NULL); | |
294 @@ -2173,6 +2377,22 @@ xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) { | |
295 xmlFree(handler); | |
296 } | |
297 #endif /* LIBXML_ICONV_ENABLED */ | |
298 +#ifdef LIBXML_ICU_ENABLED | |
299 + if ((handler->uconv_out != NULL) || (handler->uconv_in != NULL)) { | |
300 + if (handler->name != NULL) | |
301 + xmlFree(handler->name); | |
302 + handler->name = NULL; | |
303 + if (handler->uconv_out != NULL) { | |
304 + closeIcuConverter(handler->uconv_out); | |
305 + handler->uconv_out = NULL; | |
306 + } | |
307 + if (handler->uconv_in != NULL) { | |
308 + closeIcuConverter(handler->uconv_in); | |
309 + handler->uconv_in = NULL; | |
310 + } | |
311 + xmlFree(handler); | |
312 + } | |
313 +#endif | |
314 #ifdef DEBUG_ENCODING | |
315 if (ret) | |
316 xmlGenericError(xmlGenericErrorContext, | |
317 @@ -2248,6 +2468,22 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) { | |
318 cur += toconv; | |
319 } while (ret == -2); | |
320 #endif | |
321 +#ifdef LIBXML_ICU_ENABLED | |
322 + } else if (handler->uconv_out != NULL) { | |
323 + do { | |
324 + toconv = in->end - cur; | |
325 + written = 32000; | |
326 + ret = xmlUconvWrapper(handler->uconv_out, 0, &convbuf[0], | |
327 + &written, cur, &toconv); | |
328 + if (ret < 0) { | |
329 + if (written > 0) | |
330 + ret = -2; | |
331 + else | |
332 + return(-1); | |
333 + } | |
334 + unused += written; | |
335 + cur += toconv; | |
336 + } while (ret == -2); | |
337 } else { | |
338 /* could not find a converter */ | |
339 return(-1); | |
340 @@ -2259,8 +2495,9 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) { | |
341 } | |
342 return(in->consumed + (in->cur - in->base)); | |
343 } | |
344 +#endif | |
345 | |
346 -#ifndef LIBXML_ICONV_ENABLED | |
347 +#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) | |
348 #ifdef LIBXML_ISO8859X_ENABLED | |
349 | |
350 /** | |
351 diff --git a/third_party/libxml/include/libxml/encoding.h b/third_party/libxml/i
nclude/libxml/encoding.h | |
352 index c74b25f..b5f8b48 100644 | |
353 --- a/third_party/libxml/include/libxml/encoding.h | |
354 +++ b/third_party/libxml/include/libxml/encoding.h | |
355 @@ -26,6 +26,24 @@ | |
356 | |
357 #ifdef LIBXML_ICONV_ENABLED | |
358 #include <iconv.h> | |
359 +#else | |
360 +#ifdef LIBXML_ICU_ENABLED | |
361 +#include <unicode/ucnv.h> | |
362 +#if 0 | |
363 +/* Forward-declare UConverter here rather than pulling in <unicode/ucnv.h> | |
364 + * to prevent unwanted ICU symbols being exposed to users of libxml2. | |
365 + * One particular case is Qt4 conflicting on UChar32. | |
366 + */ | |
367 +#include <stdint.h> | |
368 +struct UConverter; | |
369 +typedef struct UConverter UConverter; | |
370 +#ifdef _MSC_VER | |
371 +typedef wchar_t UChar; | |
372 +#else | |
373 +typedef uint16_t UChar; | |
374 +#endif | |
375 +#endif | |
376 +#endif | |
377 #endif | |
378 #ifdef __cplusplus | |
379 extern "C" { | |
380 @@ -125,6 +143,13 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *ou
t, int *outlen, | |
381 * Block defining the handlers for non UTF-8 encodings. | |
382 * If iconv is supported, there are two extra fields. | |
383 */ | |
384 +#ifdef LIBXML_ICU_ENABLED | |
385 +struct _uconv_t { /* pair of ICU converters; the handler keeps one such pair for input and one for output */ | |
386 + UConverter *uconv; /* for conversion between an encoding and UTF-16 */ | |
387 + UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */ | |
388 +}; | |
389 +typedef struct _uconv_t uconv_t; /* name deliberately parallels iconv_t */ | |
390 +#endif | |
391 | |
392 typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler; | |
393 typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr; | |
394 @@ -136,6 +161,10 @@ struct _xmlCharEncodingHandler { | |
395 iconv_t iconv_in; | |
396 iconv_t iconv_out; | |
397 #endif /* LIBXML_ICONV_ENABLED */ | |
398 +#ifdef LIBXML_ICU_ENABLED | |
399 + uconv_t *uconv_in; | |
400 + uconv_t *uconv_out; | |
401 +#endif /* LIBXML_ICU_ENABLED */ | |
402 }; | |
403 | |
404 #ifdef __cplusplus | |
405 diff --git a/third_party/libxml/include/libxml/parser.h b/third_party/libxml/inc
lude/libxml/parser.h | |
406 index dd79c42..3580b63 100644 | |
407 --- a/third_party/libxml/include/libxml/parser.h | |
408 +++ b/third_party/libxml/include/libxml/parser.h | |
409 @@ -1222,6 +1222,7 @@ typedef enum { | |
410 XML_WITH_DEBUG_MEM = 29, | |
411 XML_WITH_DEBUG_RUN = 30, | |
412 XML_WITH_ZLIB = 31, | |
413 + XML_WITH_ICU = 32, | |
414 XML_WITH_NONE = 99999 /* just to be sure of allocation size */ | |
415 } xmlFeature; | |
416 | |
417 diff --git a/third_party/libxml/include/libxml/xmlversion.h.in b/third_party/lib
xml/include/libxml/xmlversion.h.in | |
418 index 4739f3a..de310ab 100644 | |
419 --- a/third_party/libxml/include/libxml/xmlversion.h.in | |
420 +++ b/third_party/libxml/include/libxml/xmlversion.h.in | |
421 @@ -269,6 +269,15 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version); | |
422 #endif | |
423 | |
424 /** | |
425 + * LIBXML_ICU_ENABLED: | |
426 + * | |
427 + * Whether ICU support is available | |
428 + */ | |
429 +#if @WITH_ICU@ | |
430 +#define LIBXML_ICU_ENABLED | |
431 +#endif | |
432 + | |
433 +/** | |
434 * LIBXML_ISO8859X_ENABLED: | |
435 * | |
436 * Whether ISO-8859-* support is made available in case iconv is not | |
437 diff --git a/third_party/libxml/parser.c b/third_party/libxml/parser.c | |
438 index 85e7599..3ba2a06 100644 | |
439 --- a/third_party/libxml/parser.c | |
440 +++ b/third_party/libxml/parser.c | |
441 @@ -954,6 +954,12 @@ xmlHasFeature(xmlFeature feature) | |
442 #else | |
443 return(0); | |
444 #endif | |
445 + case XML_WITH_ICU: | |
446 +#ifdef LIBXML_ICU_ENABLED | |
447 + return(1); | |
448 +#else | |
449 + return(0); | |
450 +#endif | |
451 default: | |
452 break; | |
453 } | |
OLD | NEW |