OLD | NEW |
1 /* | 1 /* |
2 ** 2004 April 13 | 2 ** 2004 April 13 |
3 ** | 3 ** |
4 ** The author disclaims copyright to this source code. In place of | 4 ** The author disclaims copyright to this source code. In place of |
5 ** a legal notice, here is a blessing: | 5 ** a legal notice, here is a blessing: |
6 ** | 6 ** |
7 ** May you do good and not evil. | 7 ** May you do good and not evil. |
8 ** May you find forgiveness for yourself and forgive others. | 8 ** May you find forgiveness for yourself and forgive others. |
9 ** May you share freely, never taking more than you give. | 9 ** May you share freely, never taking more than you give. |
10 ** | 10 ** |
11 ************************************************************************* | 11 ************************************************************************* |
12 ** This file contains routines used to translate between UTF-8, | 12 ** This file contains routines used to translate between UTF-8, |
13 ** UTF-16, UTF-16BE, and UTF-16LE. | 13 ** UTF-16, UTF-16BE, and UTF-16LE. |
14 ** | 14 ** |
15 ** $Id: utf.c,v 1.73 2009/04/01 18:40:32 drh Exp $ | |
16 ** | |
17 ** Notes on UTF-8: | 15 ** Notes on UTF-8: |
18 ** | 16 ** |
19 ** Byte-0 Byte-1 Byte-2 Byte-3 Value | 17 ** Byte-0 Byte-1 Byte-2 Byte-3 Value |
20 ** 0xxxxxxx 00000000 00000000 0xxxxxxx | 18 ** 0xxxxxxx 00000000 00000000 0xxxxxxx |
21 ** 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx | 19 ** 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx |
22 ** 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx | 20 ** 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx |
23 ** 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 000uuuuu zzzzyyyy yyxxxxxx | 21 ** 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 000uuuuu zzzzyyyy yyxxxxxx |
24 ** | 22 ** |
25 ** | 23 ** |
26 ** Notes on UTF-16: (with wwww+1==uuuuu) | 24 ** Notes on UTF-16: (with wwww+1==uuuuu) |
(...skipping 73 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
100 *zOut++ = (u8)((c>>8)&0x00FF); \ | 98 *zOut++ = (u8)((c>>8)&0x00FF); \ |
101 *zOut++ = (u8)(c&0x00FF); \ | 99 *zOut++ = (u8)(c&0x00FF); \ |
102 }else{ \ | 100 }else{ \ |
103 *zOut++ = (u8)(0x00D8 + (((c-0x10000)>>18)&0x03)); \ | 101 *zOut++ = (u8)(0x00D8 + (((c-0x10000)>>18)&0x03)); \ |
104 *zOut++ = (u8)(((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ | 102 *zOut++ = (u8)(((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ |
105 *zOut++ = (u8)(0x00DC + ((c>>8)&0x03)); \ | 103 *zOut++ = (u8)(0x00DC + ((c>>8)&0x03)); \ |
106 *zOut++ = (u8)(c&0x00FF); \ | 104 *zOut++ = (u8)(c&0x00FF); \ |
107 } \ | 105 } \ |
108 } | 106 } |
109 | 107 |
110 #define READ_UTF16LE(zIn, c){ \ | 108 #define READ_UTF16LE(zIn, TERM, c){ \ |
111 c = (*zIn++); \ | 109 c = (*zIn++); \ |
112 c += ((*zIn++)<<8); \ | 110 c += ((*zIn++)<<8); \ |
113 if( c>=0xD800 && c<0xE000 ){ \ | 111 if( c>=0xD800 && c<0xE000 && TERM ){ \ |
114 int c2 = (*zIn++); \ | 112 int c2 = (*zIn++); \ |
115 c2 += ((*zIn++)<<8); \ | 113 c2 += ((*zIn++)<<8); \ |
116 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ | 114 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ |
117 } \ | 115 } \ |
118 } | 116 } |
119 | 117 |
120 #define READ_UTF16BE(zIn, c){ \ | 118 #define READ_UTF16BE(zIn, TERM, c){ \ |
121 c = ((*zIn++)<<8); \ | 119 c = ((*zIn++)<<8); \ |
122 c += (*zIn++); \ | 120 c += (*zIn++); \ |
123 if( c>=0xD800 && c<0xE000 ){ \ | 121 if( c>=0xD800 && c<0xE000 && TERM ){ \ |
124 int c2 = ((*zIn++)<<8); \ | 122 int c2 = ((*zIn++)<<8); \ |
125 c2 += (*zIn++); \ | 123 c2 += (*zIn++); \ |
126 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ | 124 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ |
127 } \ | 125 } \ |
128 } | 126 } |
129 | 127 |
130 /* | 128 /* |
131 ** Translate a single UTF-8 character. Return the unicode value. | 129 ** Translate a single UTF-8 character. Return the unicode value. |
132 ** | 130 ** |
133 ** During translation, assume that the byte that zTerm points | 131 ** During translation, assume that the byte that zTerm points |
(...skipping 28 matching lines...) Expand all Loading... |
162 c = (c<<6) + (0x3f & *(zIn++)); \ | 160 c = (c<<6) + (0x3f & *(zIn++)); \ |
163 } \ | 161 } \ |
164 if( c<0x80 \ | 162 if( c<0x80 \ |
165 || (c&0xFFFFF800)==0xD800 \ | 163 || (c&0xFFFFF800)==0xD800 \ |
166 || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \ | 164 || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \ |
167 } | 165 } |
168 int sqlite3Utf8Read( | 166 int sqlite3Utf8Read( |
169 const unsigned char *zIn, /* First byte of UTF-8 character */ | 167 const unsigned char *zIn, /* First byte of UTF-8 character */ |
170 const unsigned char **pzNext /* Write first byte past UTF-8 char here */ | 168 const unsigned char **pzNext /* Write first byte past UTF-8 char here */ |
171 ){ | 169 ){ |
172 int c; | 170 unsigned int c; |
173 | 171 |
174 /* Same as READ_UTF8() above but without the zTerm parameter. | 172 /* Same as READ_UTF8() above but without the zTerm parameter. |
175 ** For this routine, we assume the UTF8 string is always zero-terminated. | 173 ** For this routine, we assume the UTF8 string is always zero-terminated. |
176 */ | 174 */ |
177 c = *(zIn++); | 175 c = *(zIn++); |
178 if( c>=0xc0 ){ | 176 if( c>=0xc0 ){ |
179 c = sqlite3Utf8Trans1[c-0xc0]; | 177 c = sqlite3Utf8Trans1[c-0xc0]; |
180 while( (*zIn & 0xc0)==0x80 ){ | 178 while( (*zIn & 0xc0)==0x80 ){ |
181 c = (c<<6) + (0x3f & *(zIn++)); | 179 c = (c<<6) + (0x3f & *(zIn++)); |
182 } | 180 } |
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
298 WRITE_UTF16BE(z, c); | 296 WRITE_UTF16BE(z, c); |
299 } | 297 } |
300 } | 298 } |
301 pMem->n = (int)(z - zOut); | 299 pMem->n = (int)(z - zOut); |
302 *z++ = 0; | 300 *z++ = 0; |
303 }else{ | 301 }else{ |
304 assert( desiredEnc==SQLITE_UTF8 ); | 302 assert( desiredEnc==SQLITE_UTF8 ); |
305 if( pMem->enc==SQLITE_UTF16LE ){ | 303 if( pMem->enc==SQLITE_UTF16LE ){ |
306 /* UTF-16 Little-endian -> UTF-8 */ | 304 /* UTF-16 Little-endian -> UTF-8 */ |
307 while( zIn<zTerm ){ | 305 while( zIn<zTerm ){ |
308 READ_UTF16LE(zIn, c); | 306 READ_UTF16LE(zIn, zIn<zTerm, c); |
309 WRITE_UTF8(z, c); | 307 WRITE_UTF8(z, c); |
310 } | 308 } |
311 }else{ | 309 }else{ |
312 /* UTF-16 Big-endian -> UTF-8 */ | 310 /* UTF-16 Big-endian -> UTF-8 */ |
313 while( zIn<zTerm ){ | 311 while( zIn<zTerm ){ |
314 READ_UTF16BE(zIn, c); | 312 READ_UTF16BE(zIn, zIn<zTerm, c); |
315 WRITE_UTF8(z, c); | 313 WRITE_UTF8(z, c); |
316 } | 314 } |
317 } | 315 } |
318 pMem->n = (int)(z - zOut); | 316 pMem->n = (int)(z - zOut); |
319 } | 317 } |
320 *z = 0; | 318 *z = 0; |
321 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len ); | 319 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len ); |
322 | 320 |
323 sqlite3VdbeMemRelease(pMem); | 321 sqlite3VdbeMemRelease(pMem); |
324 pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem); | 322 pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem); |
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
405 /* This test function is not currently used by the automated test-suite. | 403 /* This test function is not currently used by the automated test-suite. |
406 ** Hence it is only available in debug builds. | 404 ** Hence it is only available in debug builds. |
407 */ | 405 */ |
408 #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) | 406 #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) |
409 /* | 407 /* |
410 ** Translate UTF-8 to UTF-8. | 408 ** Translate UTF-8 to UTF-8. |
411 ** | 409 ** |
412 ** This has the effect of making sure that the string is well-formed | 410 ** This has the effect of making sure that the string is well-formed |
413 ** UTF-8. Miscoded characters are removed. | 411 ** UTF-8. Miscoded characters are removed. |
414 ** | 412 ** |
415 ** The translation is done in-place (since it is impossible for the | 413 ** The translation is done in-place and aborted if the output |
416 ** correct UTF-8 encoding to be longer than a malformed encoding). | 414 ** overruns the input. |
417 */ | 415 */ |
418 int sqlite3Utf8To8(unsigned char *zIn){ | 416 int sqlite3Utf8To8(unsigned char *zIn){ |
419 unsigned char *zOut = zIn; | 417 unsigned char *zOut = zIn; |
420 unsigned char *zStart = zIn; | 418 unsigned char *zStart = zIn; |
421 u32 c; | 419 u32 c; |
422 | 420 |
423 while( zIn[0] ){ | 421 while( zIn[0] && zOut<=zIn ){ |
424 c = sqlite3Utf8Read(zIn, (const u8**)&zIn); | 422 c = sqlite3Utf8Read(zIn, (const u8**)&zIn); |
425 if( c!=0xfffd ){ | 423 if( c!=0xfffd ){ |
426 WRITE_UTF8(zOut, c); | 424 WRITE_UTF8(zOut, c); |
427 } | 425 } |
428 } | 426 } |
429 *zOut = 0; | 427 *zOut = 0; |
430 return (int)(zOut - zStart); | 428 return (int)(zOut - zStart); |
431 } | 429 } |
432 #endif | 430 #endif |
433 | 431 |
434 #ifndef SQLITE_OMIT_UTF16 | 432 #ifndef SQLITE_OMIT_UTF16 |
435 /* | 433 /* |
436 ** Convert a UTF-16 string in the native encoding into a UTF-8 string. | 434 ** Convert a UTF-16 string in the native encoding into a UTF-8 string. |
437 ** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must | 435 ** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must |
438 ** be freed by the calling function. | 436 ** be freed by the calling function. |
439 ** | 437 ** |
440 ** NULL is returned if there is an allocation error. | 438 ** NULL is returned if there is an allocation error. |
441 */ | 439 */ |
442 char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte){ | 440 char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte, u8 enc){ |
443 Mem m; | 441 Mem m; |
444 memset(&m, 0, sizeof(m)); | 442 memset(&m, 0, sizeof(m)); |
445 m.db = db; | 443 m.db = db; |
446 sqlite3VdbeMemSetStr(&m, z, nByte, SQLITE_UTF16NATIVE, SQLITE_STATIC); | 444 sqlite3VdbeMemSetStr(&m, z, nByte, enc, SQLITE_STATIC); |
447 sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8); | 445 sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8); |
448 if( db->mallocFailed ){ | 446 if( db->mallocFailed ){ |
449 sqlite3VdbeMemRelease(&m); | 447 sqlite3VdbeMemRelease(&m); |
450 m.z = 0; | 448 m.z = 0; |
451 } | 449 } |
452 assert( (m.flags & MEM_Term)!=0 || db->mallocFailed ); | 450 assert( (m.flags & MEM_Term)!=0 || db->mallocFailed ); |
453 assert( (m.flags & MEM_Str)!=0 || db->mallocFailed ); | 451 assert( (m.flags & MEM_Str)!=0 || db->mallocFailed ); |
454 return (m.flags & MEM_Dyn)!=0 ? m.z : sqlite3DbStrDup(db, m.z); | 452 assert( (m.flags & MEM_Dyn)!=0 || db->mallocFailed ); |
| 453 assert( m.z || db->mallocFailed ); |
| 454 return m.z; |
455 } | 455 } |
456 | 456 |
457 /* | 457 /* |
458 ** Convert a UTF-8 string to the UTF-16 encoding specified by parameter | 458 ** Convert a UTF-8 string to the UTF-16 encoding specified by parameter |
459 ** enc. A pointer to the new string is returned, and the value of *pnOut | 459 ** enc. A pointer to the new string is returned, and the value of *pnOut |
460 ** is set to the length of the returned string in bytes. The caller should | 460 ** is set to the length of the returned string in bytes. The caller should |
461 ** arrange to call sqlite3DbFree() on the returned pointer when it is | 461 ** arrange to call sqlite3DbFree() on the returned pointer when it is |
462 ** no longer required. | 462 ** no longer required. |
463 ** | 463 ** |
464 ** If a malloc failure occurs, NULL is returned and the db.mallocFailed | 464 ** If a malloc failure occurs, NULL is returned and the db.mallocFailed |
465 ** flag is set. | 465 ** flag is set. |
466 */ | 466 */ |
467 #ifdef SQLITE_ENABLE_STAT2 | 467 #ifdef SQLITE_ENABLE_STAT2 |
468 char *sqlite3Utf8to16(sqlite3 *db, u8 enc, char *z, int n, int *pnOut){ | 468 char *sqlite3Utf8to16(sqlite3 *db, u8 enc, char *z, int n, int *pnOut){ |
469 Mem m; | 469 Mem m; |
470 memset(&m, 0, sizeof(m)); | 470 memset(&m, 0, sizeof(m)); |
471 m.db = db; | 471 m.db = db; |
472 sqlite3VdbeMemSetStr(&m, z, n, SQLITE_UTF8, SQLITE_STATIC); | 472 sqlite3VdbeMemSetStr(&m, z, n, SQLITE_UTF8, SQLITE_STATIC); |
473 if( sqlite3VdbeMemTranslate(&m, enc) ){ | 473 if( sqlite3VdbeMemTranslate(&m, enc) ){ |
474 assert( db->mallocFailed ); | 474 assert( db->mallocFailed ); |
475 return 0; | 475 return 0; |
476 } | 476 } |
477 assert( m.z==m.zMalloc ); | 477 assert( m.z==m.zMalloc ); |
478 *pnOut = m.n; | 478 *pnOut = m.n; |
479 return m.z; | 479 return m.z; |
480 } | 480 } |
481 #endif | 481 #endif |
482 | 482 |
483 /* | 483 /* |
484 ** pZ is a UTF-16 encoded unicode string at least nChar characters long. | 484 ** zIn is a UTF-16 encoded unicode string at least nChar characters long. |
485 ** Return the number of bytes in the first nChar unicode characters | 485 ** Return the number of bytes in the first nChar unicode characters |
486 ** in pZ. nChar must be non-negative. | 486 ** in zIn. nChar must be non-negative. |
487 */ | 487 */ |
488 int sqlite3Utf16ByteLen(const void *zIn, int nChar){ | 488 int sqlite3Utf16ByteLen(const void *zIn, int nChar){ |
489 int c; | 489 int c; |
490 unsigned char const *z = zIn; | 490 unsigned char const *z = zIn; |
491 int n = 0; | 491 int n = 0; |
| 492 |
492 if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){ | 493 if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){ |
493 /* Using an "if (SQLITE_UTF16NATIVE==SQLITE_UTF16BE)" construct here | |
494 ** and in other parts of this file means that at one branch will | |
495 ** not be covered by coverage testing on any single host. But coverage | |
496 ** will be complete if the tests are run on both a little-endian and | |
497 ** big-endian host. Because both the UTF16NATIVE and SQLITE_UTF16BE | |
498 ** macros are constant at compile time the compiler can determine | |
499 ** which branch will be followed. It is therefore assumed that no runtime | |
500 ** penalty is paid for this "if" statement. | |
501 */ | |
502 while( n<nChar ){ | 494 while( n<nChar ){ |
503 READ_UTF16BE(z, c); | 495 READ_UTF16BE(z, 1, c); |
504 n++; | 496 n++; |
505 } | 497 } |
506 }else{ | 498 }else{ |
507 while( n<nChar ){ | 499 while( n<nChar ){ |
508 READ_UTF16LE(z, c); | 500 READ_UTF16LE(z, 1, c); |
509 n++; | 501 n++; |
510 } | 502 } |
511 } | 503 } |
512 return (int)(z-(unsigned char const *)zIn); | 504 return (int)(z-(unsigned char const *)zIn); |
513 } | 505 } |
514 | 506 |
515 #if defined(SQLITE_TEST) | 507 #if defined(SQLITE_TEST) |
516 /* | 508 /* |
517 ** This routine is called from the TCL test function "translate_selftest". | 509 ** This routine is called from the TCL test function "translate_selftest". |
518 ** It checks that the primitives for serializing and deserializing | 510 ** It checks that the primitives for serializing and deserializing |
(...skipping 21 matching lines...) Expand all Loading... |
540 assert( (z-zBuf)==n ); | 532 assert( (z-zBuf)==n ); |
541 } | 533 } |
542 for(i=0; i<0x00110000; i++){ | 534 for(i=0; i<0x00110000; i++){ |
543 if( i>=0xD800 && i<0xE000 ) continue; | 535 if( i>=0xD800 && i<0xE000 ) continue; |
544 z = zBuf; | 536 z = zBuf; |
545 WRITE_UTF16LE(z, i); | 537 WRITE_UTF16LE(z, i); |
546 n = (int)(z-zBuf); | 538 n = (int)(z-zBuf); |
547 assert( n>0 && n<=4 ); | 539 assert( n>0 && n<=4 ); |
548 z[0] = 0; | 540 z[0] = 0; |
549 z = zBuf; | 541 z = zBuf; |
550 READ_UTF16LE(z, c); | 542 READ_UTF16LE(z, 1, c); |
551 assert( c==i ); | 543 assert( c==i ); |
552 assert( (z-zBuf)==n ); | 544 assert( (z-zBuf)==n ); |
553 } | 545 } |
554 for(i=0; i<0x00110000; i++){ | 546 for(i=0; i<0x00110000; i++){ |
555 if( i>=0xD800 && i<0xE000 ) continue; | 547 if( i>=0xD800 && i<0xE000 ) continue; |
556 z = zBuf; | 548 z = zBuf; |
557 WRITE_UTF16BE(z, i); | 549 WRITE_UTF16BE(z, i); |
558 n = (int)(z-zBuf); | 550 n = (int)(z-zBuf); |
559 assert( n>0 && n<=4 ); | 551 assert( n>0 && n<=4 ); |
560 z[0] = 0; | 552 z[0] = 0; |
561 z = zBuf; | 553 z = zBuf; |
562 READ_UTF16BE(z, c); | 554 READ_UTF16BE(z, 1, c); |
563 assert( c==i ); | 555 assert( c==i ); |
564 assert( (z-zBuf)==n ); | 556 assert( (z-zBuf)==n ); |
565 } | 557 } |
566 } | 558 } |
567 #endif /* SQLITE_TEST */ | 559 #endif /* SQLITE_TEST */ |
568 #endif /* SQLITE_OMIT_UTF16 */ | 560 #endif /* SQLITE_OMIT_UTF16 */ |
OLD | NEW |