| OLD | NEW |
| 1 /* | 1 /* |
| 2 ** 2004 April 13 | 2 ** 2004 April 13 |
| 3 ** | 3 ** |
| 4 ** The author disclaims copyright to this source code. In place of | 4 ** The author disclaims copyright to this source code. In place of |
| 5 ** a legal notice, here is a blessing: | 5 ** a legal notice, here is a blessing: |
| 6 ** | 6 ** |
| 7 ** May you do good and not evil. | 7 ** May you do good and not evil. |
| 8 ** May you find forgiveness for yourself and forgive others. | 8 ** May you find forgiveness for yourself and forgive others. |
| 9 ** May you share freely, never taking more than you give. | 9 ** May you share freely, never taking more than you give. |
| 10 ** | 10 ** |
| 11 ************************************************************************* | 11 ************************************************************************* |
| 12 ** This file contains routines used to translate between UTF-8, | 12 ** This file contains routines used to translate between UTF-8, |
| 13 ** UTF-16, UTF-16BE, and UTF-16LE. | 13 ** UTF-16, UTF-16BE, and UTF-16LE. |
| 14 ** | 14 ** |
| 15 ** $Id: utf.c,v 1.73 2009/04/01 18:40:32 drh Exp $ | |
| 16 ** | |
| 17 ** Notes on UTF-8: | 15 ** Notes on UTF-8: |
| 18 ** | 16 ** |
| 19 ** Byte-0 Byte-1 Byte-2 Byte-3 Value | 17 ** Byte-0 Byte-1 Byte-2 Byte-3 Value |
| 20 ** 0xxxxxxx 00000000 00000000 0xxxxxxx | 18 ** 0xxxxxxx 00000000 00000000 0xxxxxxx |
| 21 ** 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx | 19 ** 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx |
| 22 ** 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx | 20 ** 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx |
| 23 ** 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 000uuuuu zzzzyyyy yyxxxxxx | 21 ** 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 000uuuuu zzzzyyyy yyxxxxxx |
| 24 ** | 22 ** |
| 25 ** | 23 ** |
| 26 ** Notes on UTF-16: (with wwww+1==uuuuu) | 24 ** Notes on UTF-16: (with wwww+1==uuuuu) |
| (...skipping 73 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 100 *zOut++ = (u8)((c>>8)&0x00FF); \ | 98 *zOut++ = (u8)((c>>8)&0x00FF); \ |
| 101 *zOut++ = (u8)(c&0x00FF); \ | 99 *zOut++ = (u8)(c&0x00FF); \ |
| 102 }else{ \ | 100 }else{ \ |
| 103 *zOut++ = (u8)(0x00D8 + (((c-0x10000)>>18)&0x03)); \ | 101 *zOut++ = (u8)(0x00D8 + (((c-0x10000)>>18)&0x03)); \ |
| 104 *zOut++ = (u8)(((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ | 102 *zOut++ = (u8)(((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ |
| 105 *zOut++ = (u8)(0x00DC + ((c>>8)&0x03)); \ | 103 *zOut++ = (u8)(0x00DC + ((c>>8)&0x03)); \ |
| 106 *zOut++ = (u8)(c&0x00FF); \ | 104 *zOut++ = (u8)(c&0x00FF); \ |
| 107 } \ | 105 } \ |
| 108 } | 106 } |
| 109 | 107 |
| 110 #define READ_UTF16LE(zIn, c){ \ | 108 #define READ_UTF16LE(zIn, TERM, c){ \ |
| 111 c = (*zIn++); \ | 109 c = (*zIn++); \ |
| 112 c += ((*zIn++)<<8); \ | 110 c += ((*zIn++)<<8); \ |
| 113 if( c>=0xD800 && c<0xE000 ){ \ | 111 if( c>=0xD800 && c<0xE000 && TERM ){ \ |
| 114 int c2 = (*zIn++); \ | 112 int c2 = (*zIn++); \ |
| 115 c2 += ((*zIn++)<<8); \ | 113 c2 += ((*zIn++)<<8); \ |
| 116 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ | 114 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ |
| 117 } \ | 115 } \ |
| 118 } | 116 } |
| 119 | 117 |
| 120 #define READ_UTF16BE(zIn, c){ \ | 118 #define READ_UTF16BE(zIn, TERM, c){ \ |
| 121 c = ((*zIn++)<<8); \ | 119 c = ((*zIn++)<<8); \ |
| 122 c += (*zIn++); \ | 120 c += (*zIn++); \ |
| 123 if( c>=0xD800 && c<0xE000 ){ \ | 121 if( c>=0xD800 && c<0xE000 && TERM ){ \ |
| 124 int c2 = ((*zIn++)<<8); \ | 122 int c2 = ((*zIn++)<<8); \ |
| 125 c2 += (*zIn++); \ | 123 c2 += (*zIn++); \ |
| 126 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ | 124 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ |
| 127 } \ | 125 } \ |
| 128 } | 126 } |
| 129 | 127 |
| 130 /* | 128 /* |
| 131 ** Translate a single UTF-8 character. Return the unicode value. | 129 ** Translate a single UTF-8 character. Return the unicode value. |
| 132 ** | 130 ** |
| 133 ** During translation, assume that the byte that zTerm points | 131 ** During translation, assume that the byte that zTerm points |
| (...skipping 28 matching lines...) Expand all Loading... |
| 162 c = (c<<6) + (0x3f & *(zIn++)); \ | 160 c = (c<<6) + (0x3f & *(zIn++)); \ |
| 163 } \ | 161 } \ |
| 164 if( c<0x80 \ | 162 if( c<0x80 \ |
| 165 || (c&0xFFFFF800)==0xD800 \ | 163 || (c&0xFFFFF800)==0xD800 \ |
| 166 || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \ | 164 || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \ |
| 167 } | 165 } |
| 168 int sqlite3Utf8Read( | 166 int sqlite3Utf8Read( |
| 169 const unsigned char *zIn, /* First byte of UTF-8 character */ | 167 const unsigned char *zIn, /* First byte of UTF-8 character */ |
| 170 const unsigned char **pzNext /* Write first byte past UTF-8 char here */ | 168 const unsigned char **pzNext /* Write first byte past UTF-8 char here */ |
| 171 ){ | 169 ){ |
| 172 int c; | 170 unsigned int c; |
| 173 | 171 |
| 174 /* Same as READ_UTF8() above but without the zTerm parameter. | 172 /* Same as READ_UTF8() above but without the zTerm parameter. |
| 175 ** For this routine, we assume the UTF8 string is always zero-terminated. | 173 ** For this routine, we assume the UTF8 string is always zero-terminated. |
| 176 */ | 174 */ |
| 177 c = *(zIn++); | 175 c = *(zIn++); |
| 178 if( c>=0xc0 ){ | 176 if( c>=0xc0 ){ |
| 179 c = sqlite3Utf8Trans1[c-0xc0]; | 177 c = sqlite3Utf8Trans1[c-0xc0]; |
| 180 while( (*zIn & 0xc0)==0x80 ){ | 178 while( (*zIn & 0xc0)==0x80 ){ |
| 181 c = (c<<6) + (0x3f & *(zIn++)); | 179 c = (c<<6) + (0x3f & *(zIn++)); |
| 182 } | 180 } |
| (...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 298 WRITE_UTF16BE(z, c); | 296 WRITE_UTF16BE(z, c); |
| 299 } | 297 } |
| 300 } | 298 } |
| 301 pMem->n = (int)(z - zOut); | 299 pMem->n = (int)(z - zOut); |
| 302 *z++ = 0; | 300 *z++ = 0; |
| 303 }else{ | 301 }else{ |
| 304 assert( desiredEnc==SQLITE_UTF8 ); | 302 assert( desiredEnc==SQLITE_UTF8 ); |
| 305 if( pMem->enc==SQLITE_UTF16LE ){ | 303 if( pMem->enc==SQLITE_UTF16LE ){ |
| 306 /* UTF-16 Little-endian -> UTF-8 */ | 304 /* UTF-16 Little-endian -> UTF-8 */ |
| 307 while( zIn<zTerm ){ | 305 while( zIn<zTerm ){ |
| 308 READ_UTF16LE(zIn, c); | 306 READ_UTF16LE(zIn, zIn<zTerm, c); |
| 309 WRITE_UTF8(z, c); | 307 WRITE_UTF8(z, c); |
| 310 } | 308 } |
| 311 }else{ | 309 }else{ |
| 312 /* UTF-16 Big-endian -> UTF-8 */ | 310 /* UTF-16 Big-endian -> UTF-8 */ |
| 313 while( zIn<zTerm ){ | 311 while( zIn<zTerm ){ |
| 314 READ_UTF16BE(zIn, c); | 312 READ_UTF16BE(zIn, zIn<zTerm, c); |
| 315 WRITE_UTF8(z, c); | 313 WRITE_UTF8(z, c); |
| 316 } | 314 } |
| 317 } | 315 } |
| 318 pMem->n = (int)(z - zOut); | 316 pMem->n = (int)(z - zOut); |
| 319 } | 317 } |
| 320 *z = 0; | 318 *z = 0; |
| 321 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len ); | 319 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len ); |
| 322 | 320 |
| 323 sqlite3VdbeMemRelease(pMem); | 321 sqlite3VdbeMemRelease(pMem); |
| 324 pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem); | 322 pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem); |
| (...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 405 /* This test function is not currently used by the automated test-suite. | 403 /* This test function is not currently used by the automated test-suite. |
| 406 ** Hence it is only available in debug builds. | 404 ** Hence it is only available in debug builds. |
| 407 */ | 405 */ |
| 408 #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) | 406 #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) |
| 409 /* | 407 /* |
| 410 ** Translate UTF-8 to UTF-8. | 408 ** Translate UTF-8 to UTF-8. |
| 411 ** | 409 ** |
| 412 ** This has the effect of making sure that the string is well-formed | 410 ** This has the effect of making sure that the string is well-formed |
| 413 ** UTF-8. Miscoded characters are removed. | 411 ** UTF-8. Miscoded characters are removed. |
| 414 ** | 412 ** |
| 415 ** The translation is done in-place (since it is impossible for the | 413 ** The translation is done in-place and aborted if the output |
| 416 ** correct UTF-8 encoding to be longer than a malformed encoding). | 414 ** overruns the input. |
| 417 */ | 415 */ |
| 418 int sqlite3Utf8To8(unsigned char *zIn){ | 416 int sqlite3Utf8To8(unsigned char *zIn){ |
| 419 unsigned char *zOut = zIn; | 417 unsigned char *zOut = zIn; |
| 420 unsigned char *zStart = zIn; | 418 unsigned char *zStart = zIn; |
| 421 u32 c; | 419 u32 c; |
| 422 | 420 |
| 423 while( zIn[0] ){ | 421 while( zIn[0] && zOut<=zIn ){ |
| 424 c = sqlite3Utf8Read(zIn, (const u8**)&zIn); | 422 c = sqlite3Utf8Read(zIn, (const u8**)&zIn); |
| 425 if( c!=0xfffd ){ | 423 if( c!=0xfffd ){ |
| 426 WRITE_UTF8(zOut, c); | 424 WRITE_UTF8(zOut, c); |
| 427 } | 425 } |
| 428 } | 426 } |
| 429 *zOut = 0; | 427 *zOut = 0; |
| 430 return (int)(zOut - zStart); | 428 return (int)(zOut - zStart); |
| 431 } | 429 } |
| 432 #endif | 430 #endif |
| 433 | 431 |
| 434 #ifndef SQLITE_OMIT_UTF16 | 432 #ifndef SQLITE_OMIT_UTF16 |
| 435 /* | 433 /* |
| 436 ** Convert a UTF-16 string in the native encoding into a UTF-8 string. | 434 ** Convert a UTF-16 string in the native encoding into a UTF-8 string. |
| 437 ** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must | 435 ** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must |
| 438 ** be freed by the calling function. | 436 ** be freed by the calling function. |
| 439 ** | 437 ** |
| 440 ** NULL is returned if there is an allocation error. | 438 ** NULL is returned if there is an allocation error. |
| 441 */ | 439 */ |
| 442 char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte){ | 440 char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte, u8 enc){ |
| 443 Mem m; | 441 Mem m; |
| 444 memset(&m, 0, sizeof(m)); | 442 memset(&m, 0, sizeof(m)); |
| 445 m.db = db; | 443 m.db = db; |
| 446 sqlite3VdbeMemSetStr(&m, z, nByte, SQLITE_UTF16NATIVE, SQLITE_STATIC); | 444 sqlite3VdbeMemSetStr(&m, z, nByte, enc, SQLITE_STATIC); |
| 447 sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8); | 445 sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8); |
| 448 if( db->mallocFailed ){ | 446 if( db->mallocFailed ){ |
| 449 sqlite3VdbeMemRelease(&m); | 447 sqlite3VdbeMemRelease(&m); |
| 450 m.z = 0; | 448 m.z = 0; |
| 451 } | 449 } |
| 452 assert( (m.flags & MEM_Term)!=0 || db->mallocFailed ); | 450 assert( (m.flags & MEM_Term)!=0 || db->mallocFailed ); |
| 453 assert( (m.flags & MEM_Str)!=0 || db->mallocFailed ); | 451 assert( (m.flags & MEM_Str)!=0 || db->mallocFailed ); |
| 454 return (m.flags & MEM_Dyn)!=0 ? m.z : sqlite3DbStrDup(db, m.z); | 452 assert( (m.flags & MEM_Dyn)!=0 || db->mallocFailed ); |
| 453 assert( m.z || db->mallocFailed ); |
| 454 return m.z; |
| 455 } | 455 } |
| 456 | 456 |
| 457 /* | 457 /* |
| 458 ** Convert a UTF-8 string to the UTF-16 encoding specified by parameter | 458 ** Convert a UTF-8 string to the UTF-16 encoding specified by parameter |
| 459 ** enc. A pointer to the new string is returned, and the value of *pnOut | 459 ** enc. A pointer to the new string is returned, and the value of *pnOut |
| 460 ** is set to the length of the returned string in bytes. The call should | 460 ** is set to the length of the returned string in bytes. The caller should |
| 461 ** arrange to call sqlite3DbFree() on the returned pointer when it is | 461 ** arrange to call sqlite3DbFree() on the returned pointer when it is |
| 462 ** no longer required. | 462 ** no longer required. |
| 463 ** | 463 ** |
| 464 ** If a malloc failure occurs, NULL is returned and the db.mallocFailed | 464 ** If a malloc failure occurs, NULL is returned and the db.mallocFailed |
| 465 ** flag set. | 465 ** flag set. |
| 466 */ | 466 */ |
| 467 #ifdef SQLITE_ENABLE_STAT2 | 467 #ifdef SQLITE_ENABLE_STAT2 |
| 468 char *sqlite3Utf8to16(sqlite3 *db, u8 enc, char *z, int n, int *pnOut){ | 468 char *sqlite3Utf8to16(sqlite3 *db, u8 enc, char *z, int n, int *pnOut){ |
| 469 Mem m; | 469 Mem m; |
| 470 memset(&m, 0, sizeof(m)); | 470 memset(&m, 0, sizeof(m)); |
| 471 m.db = db; | 471 m.db = db; |
| 472 sqlite3VdbeMemSetStr(&m, z, n, SQLITE_UTF8, SQLITE_STATIC); | 472 sqlite3VdbeMemSetStr(&m, z, n, SQLITE_UTF8, SQLITE_STATIC); |
| 473 if( sqlite3VdbeMemTranslate(&m, enc) ){ | 473 if( sqlite3VdbeMemTranslate(&m, enc) ){ |
| 474 assert( db->mallocFailed ); | 474 assert( db->mallocFailed ); |
| 475 return 0; | 475 return 0; |
| 476 } | 476 } |
| 477 assert( m.z==m.zMalloc ); | 477 assert( m.z==m.zMalloc ); |
| 478 *pnOut = m.n; | 478 *pnOut = m.n; |
| 479 return m.z; | 479 return m.z; |
| 480 } | 480 } |
| 481 #endif | 481 #endif |
| 482 | 482 |
| 483 /* | 483 /* |
| 484 ** pZ is a UTF-16 encoded unicode string at least nChar characters long. | 484 ** zIn is a UTF-16 encoded unicode string at least nChar characters long. |
| 485 ** Return the number of bytes in the first nChar unicode characters | 485 ** Return the number of bytes in the first nChar unicode characters |
| 486 ** in pZ. nChar must be non-negative. | 486 ** in zIn. nChar must be non-negative. |
| 487 */ | 487 */ |
| 488 int sqlite3Utf16ByteLen(const void *zIn, int nChar){ | 488 int sqlite3Utf16ByteLen(const void *zIn, int nChar){ |
| 489 int c; | 489 int c; |
| 490 unsigned char const *z = zIn; | 490 unsigned char const *z = zIn; |
| 491 int n = 0; | 491 int n = 0; |
| 492 |
| 492 if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){ | 493 if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){ |
| 493 /* Using an "if (SQLITE_UTF16NATIVE==SQLITE_UTF16BE)" construct here | |
| 494 ** and in other parts of this file means that at one branch will | |
| 495 ** not be covered by coverage testing on any single host. But coverage | |
| 496 ** will be complete if the tests are run on both a little-endian and | |
| 497 ** big-endian host. Because both the UTF16NATIVE and SQLITE_UTF16BE | |
| 498 ** macros are constant at compile time the compiler can determine | |
| 499 ** which branch will be followed. It is therefore assumed that no runtime | |
| 500 ** penalty is paid for this "if" statement. | |
| 501 */ | |
| 502 while( n<nChar ){ | 494 while( n<nChar ){ |
| 503 READ_UTF16BE(z, c); | 495 READ_UTF16BE(z, 1, c); |
| 504 n++; | 496 n++; |
| 505 } | 497 } |
| 506 }else{ | 498 }else{ |
| 507 while( n<nChar ){ | 499 while( n<nChar ){ |
| 508 READ_UTF16LE(z, c); | 500 READ_UTF16LE(z, 1, c); |
| 509 n++; | 501 n++; |
| 510 } | 502 } |
| 511 } | 503 } |
| 512 return (int)(z-(unsigned char const *)zIn); | 504 return (int)(z-(unsigned char const *)zIn); |
| 513 } | 505 } |
| 514 | 506 |
| 515 #if defined(SQLITE_TEST) | 507 #if defined(SQLITE_TEST) |
| 516 /* | 508 /* |
| 517 ** This routine is called from the TCL test function "translate_selftest". | 509 ** This routine is called from the TCL test function "translate_selftest". |
| 518 ** It checks that the primitives for serializing and deserializing | 510 ** It checks that the primitives for serializing and deserializing |
| (...skipping 21 matching lines...) Expand all Loading... |
| 540 assert( (z-zBuf)==n ); | 532 assert( (z-zBuf)==n ); |
| 541 } | 533 } |
| 542 for(i=0; i<0x00110000; i++){ | 534 for(i=0; i<0x00110000; i++){ |
| 543 if( i>=0xD800 && i<0xE000 ) continue; | 535 if( i>=0xD800 && i<0xE000 ) continue; |
| 544 z = zBuf; | 536 z = zBuf; |
| 545 WRITE_UTF16LE(z, i); | 537 WRITE_UTF16LE(z, i); |
| 546 n = (int)(z-zBuf); | 538 n = (int)(z-zBuf); |
| 547 assert( n>0 && n<=4 ); | 539 assert( n>0 && n<=4 ); |
| 548 z[0] = 0; | 540 z[0] = 0; |
| 549 z = zBuf; | 541 z = zBuf; |
| 550 READ_UTF16LE(z, c); | 542 READ_UTF16LE(z, 1, c); |
| 551 assert( c==i ); | 543 assert( c==i ); |
| 552 assert( (z-zBuf)==n ); | 544 assert( (z-zBuf)==n ); |
| 553 } | 545 } |
| 554 for(i=0; i<0x00110000; i++){ | 546 for(i=0; i<0x00110000; i++){ |
| 555 if( i>=0xD800 && i<0xE000 ) continue; | 547 if( i>=0xD800 && i<0xE000 ) continue; |
| 556 z = zBuf; | 548 z = zBuf; |
| 557 WRITE_UTF16BE(z, i); | 549 WRITE_UTF16BE(z, i); |
| 558 n = (int)(z-zBuf); | 550 n = (int)(z-zBuf); |
| 559 assert( n>0 && n<=4 ); | 551 assert( n>0 && n<=4 ); |
| 560 z[0] = 0; | 552 z[0] = 0; |
| 561 z = zBuf; | 553 z = zBuf; |
| 562 READ_UTF16BE(z, c); | 554 READ_UTF16BE(z, 1, c); |
| 563 assert( c==i ); | 555 assert( c==i ); |
| 564 assert( (z-zBuf)==n ); | 556 assert( (z-zBuf)==n ); |
| 565 } | 557 } |
| 566 } | 558 } |
| 567 #endif /* SQLITE_TEST */ | 559 #endif /* SQLITE_TEST */ |
| 568 #endif /* SQLITE_OMIT_UTF16 */ | 560 #endif /* SQLITE_OMIT_UTF16 */ |
| OLD | NEW |