Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(17)

Side by Side Diff: third_party/sqlite/src/src/utf.c

Issue 6990047: Import SQLite 3.7.6.3. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 9 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « third_party/sqlite/src/src/update.c ('k') | third_party/sqlite/src/src/util.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 ** 2004 April 13 2 ** 2004 April 13
3 ** 3 **
4 ** The author disclaims copyright to this source code. In place of 4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing: 5 ** a legal notice, here is a blessing:
6 ** 6 **
7 ** May you do good and not evil. 7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others. 8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give. 9 ** May you share freely, never taking more than you give.
10 ** 10 **
11 ************************************************************************* 11 *************************************************************************
12 ** This file contains routines used to translate between UTF-8, 12 ** This file contains routines used to translate between UTF-8,
13 ** UTF-16, UTF-16BE, and UTF-16LE. 13 ** UTF-16, UTF-16BE, and UTF-16LE.
14 ** 14 **
15 ** $Id: utf.c,v 1.73 2009/04/01 18:40:32 drh Exp $
16 **
17 ** Notes on UTF-8: 15 ** Notes on UTF-8:
18 ** 16 **
19 ** Byte-0 Byte-1 Byte-2 Byte-3 Value 17 ** Byte-0 Byte-1 Byte-2 Byte-3 Value
20 ** 0xxxxxxx 00000000 00000000 0xxxxxxx 18 ** 0xxxxxxx 00000000 00000000 0xxxxxxx
21 ** 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx 19 ** 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx
22 ** 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx 20 ** 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx
23 ** 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 000uuuuu zzzzyyyy yyxxxxxx 21 ** 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 000uuuuu zzzzyyyy yyxxxxxx
24 ** 22 **
25 ** 23 **
26 ** Notes on UTF-16: (with wwww+1==uuuuu) 24 ** Notes on UTF-16: (with wwww+1==uuuuu)
(...skipping 73 matching lines...) Expand 10 before | Expand all | Expand 10 after
100 *zOut++ = (u8)((c>>8)&0x00FF); \ 98 *zOut++ = (u8)((c>>8)&0x00FF); \
101 *zOut++ = (u8)(c&0x00FF); \ 99 *zOut++ = (u8)(c&0x00FF); \
102 }else{ \ 100 }else{ \
103 *zOut++ = (u8)(0x00D8 + (((c-0x10000)>>18)&0x03)); \ 101 *zOut++ = (u8)(0x00D8 + (((c-0x10000)>>18)&0x03)); \
104 *zOut++ = (u8)(((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ 102 *zOut++ = (u8)(((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \
105 *zOut++ = (u8)(0x00DC + ((c>>8)&0x03)); \ 103 *zOut++ = (u8)(0x00DC + ((c>>8)&0x03)); \
106 *zOut++ = (u8)(c&0x00FF); \ 104 *zOut++ = (u8)(c&0x00FF); \
107 } \ 105 } \
108 } 106 }
109 107
110 #define READ_UTF16LE(zIn, c){ \ 108 #define READ_UTF16LE(zIn, TERM, c){ \
111 c = (*zIn++); \ 109 c = (*zIn++); \
112 c += ((*zIn++)<<8); \ 110 c += ((*zIn++)<<8); \
113 if( c>=0xD800 && c<0xE000 ){ \ 111 if( c>=0xD800 && c<0xE000 && TERM ){ \
114 int c2 = (*zIn++); \ 112 int c2 = (*zIn++); \
115 c2 += ((*zIn++)<<8); \ 113 c2 += ((*zIn++)<<8); \
116 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ 114 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \
117 } \ 115 } \
118 } 116 }
119 117
120 #define READ_UTF16BE(zIn, c){ \ 118 #define READ_UTF16BE(zIn, TERM, c){ \
121 c = ((*zIn++)<<8); \ 119 c = ((*zIn++)<<8); \
122 c += (*zIn++); \ 120 c += (*zIn++); \
123 if( c>=0xD800 && c<0xE000 ){ \ 121 if( c>=0xD800 && c<0xE000 && TERM ){ \
124 int c2 = ((*zIn++)<<8); \ 122 int c2 = ((*zIn++)<<8); \
125 c2 += (*zIn++); \ 123 c2 += (*zIn++); \
126 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ 124 c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \
127 } \ 125 } \
128 } 126 }
129 127
130 /* 128 /*
131 ** Translate a single UTF-8 character. Return the unicode value. 129 ** Translate a single UTF-8 character. Return the unicode value.
132 ** 130 **
133 ** During translation, assume that the byte that zTerm points 131 ** During translation, assume that the byte that zTerm points
(...skipping 28 matching lines...) Expand all
162 c = (c<<6) + (0x3f & *(zIn++)); \ 160 c = (c<<6) + (0x3f & *(zIn++)); \
163 } \ 161 } \
164 if( c<0x80 \ 162 if( c<0x80 \
165 || (c&0xFFFFF800)==0xD800 \ 163 || (c&0xFFFFF800)==0xD800 \
166 || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \ 164 || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \
167 } 165 }
168 int sqlite3Utf8Read( 166 int sqlite3Utf8Read(
169 const unsigned char *zIn, /* First byte of UTF-8 character */ 167 const unsigned char *zIn, /* First byte of UTF-8 character */
170 const unsigned char **pzNext /* Write first byte past UTF-8 char here */ 168 const unsigned char **pzNext /* Write first byte past UTF-8 char here */
171 ){ 169 ){
172 int c; 170 unsigned int c;
173 171
174 /* Same as READ_UTF8() above but without the zTerm parameter. 172 /* Same as READ_UTF8() above but without the zTerm parameter.
175 ** For this routine, we assume the UTF8 string is always zero-terminated. 173 ** For this routine, we assume the UTF8 string is always zero-terminated.
176 */ 174 */
177 c = *(zIn++); 175 c = *(zIn++);
178 if( c>=0xc0 ){ 176 if( c>=0xc0 ){
179 c = sqlite3Utf8Trans1[c-0xc0]; 177 c = sqlite3Utf8Trans1[c-0xc0];
180 while( (*zIn & 0xc0)==0x80 ){ 178 while( (*zIn & 0xc0)==0x80 ){
181 c = (c<<6) + (0x3f & *(zIn++)); 179 c = (c<<6) + (0x3f & *(zIn++));
182 } 180 }
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after
298 WRITE_UTF16BE(z, c); 296 WRITE_UTF16BE(z, c);
299 } 297 }
300 } 298 }
301 pMem->n = (int)(z - zOut); 299 pMem->n = (int)(z - zOut);
302 *z++ = 0; 300 *z++ = 0;
303 }else{ 301 }else{
304 assert( desiredEnc==SQLITE_UTF8 ); 302 assert( desiredEnc==SQLITE_UTF8 );
305 if( pMem->enc==SQLITE_UTF16LE ){ 303 if( pMem->enc==SQLITE_UTF16LE ){
306 /* UTF-16 Little-endian -> UTF-8 */ 304 /* UTF-16 Little-endian -> UTF-8 */
307 while( zIn<zTerm ){ 305 while( zIn<zTerm ){
308 READ_UTF16LE(zIn, c); 306 READ_UTF16LE(zIn, zIn<zTerm, c);
309 WRITE_UTF8(z, c); 307 WRITE_UTF8(z, c);
310 } 308 }
311 }else{ 309 }else{
312 /* UTF-16 Big-endian -> UTF-8 */ 310 /* UTF-16 Big-endian -> UTF-8 */
313 while( zIn<zTerm ){ 311 while( zIn<zTerm ){
314 READ_UTF16BE(zIn, c); 312 READ_UTF16BE(zIn, zIn<zTerm, c);
315 WRITE_UTF8(z, c); 313 WRITE_UTF8(z, c);
316 } 314 }
317 } 315 }
318 pMem->n = (int)(z - zOut); 316 pMem->n = (int)(z - zOut);
319 } 317 }
320 *z = 0; 318 *z = 0;
321 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len ); 319 assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
322 320
323 sqlite3VdbeMemRelease(pMem); 321 sqlite3VdbeMemRelease(pMem);
324 pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem); 322 pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem);
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after
405 /* This test function is not currently used by the automated test-suite. 403 /* This test function is not currently used by the automated test-suite.
406 ** Hence it is only available in debug builds. 404 ** Hence it is only available in debug builds.
407 */ 405 */
408 #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) 406 #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
409 /* 407 /*
410 ** Translate UTF-8 to UTF-8. 408 ** Translate UTF-8 to UTF-8.
411 ** 409 **
412 ** This has the effect of making sure that the string is well-formed 410 ** This has the effect of making sure that the string is well-formed
413 ** UTF-8. Miscoded characters are removed. 411 ** UTF-8. Miscoded characters are removed.
414 ** 412 **
415 ** The translation is done in-place (since it is impossible for the 413 ** The translation is done in-place and aborted if the output
416 ** correct UTF-8 encoding to be longer than a malformed encoding). 414 ** overruns the input.
417 */ 415 */
418 int sqlite3Utf8To8(unsigned char *zIn){ 416 int sqlite3Utf8To8(unsigned char *zIn){
419 unsigned char *zOut = zIn; 417 unsigned char *zOut = zIn;
420 unsigned char *zStart = zIn; 418 unsigned char *zStart = zIn;
421 u32 c; 419 u32 c;
422 420
423 while( zIn[0] ){ 421 while( zIn[0] && zOut<=zIn ){
424 c = sqlite3Utf8Read(zIn, (const u8**)&zIn); 422 c = sqlite3Utf8Read(zIn, (const u8**)&zIn);
425 if( c!=0xfffd ){ 423 if( c!=0xfffd ){
426 WRITE_UTF8(zOut, c); 424 WRITE_UTF8(zOut, c);
427 } 425 }
428 } 426 }
429 *zOut = 0; 427 *zOut = 0;
430 return (int)(zOut - zStart); 428 return (int)(zOut - zStart);
431 } 429 }
432 #endif 430 #endif
433 431
434 #ifndef SQLITE_OMIT_UTF16 432 #ifndef SQLITE_OMIT_UTF16
435 /* 433 /*
436 ** Convert a UTF-16 string in the native encoding into a UTF-8 string. 434 ** Convert a UTF-16 string in the native encoding into a UTF-8 string.
437 ** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must 435 ** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must
438 ** be freed by the calling function. 436 ** be freed by the calling function.
439 ** 437 **
440 ** NULL is returned if there is an allocation error. 438 ** NULL is returned if there is an allocation error.
441 */ 439 */
442 char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte){ 440 char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte, u8 enc){
443 Mem m; 441 Mem m;
444 memset(&m, 0, sizeof(m)); 442 memset(&m, 0, sizeof(m));
445 m.db = db; 443 m.db = db;
446 sqlite3VdbeMemSetStr(&m, z, nByte, SQLITE_UTF16NATIVE, SQLITE_STATIC); 444 sqlite3VdbeMemSetStr(&m, z, nByte, enc, SQLITE_STATIC);
447 sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8); 445 sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8);
448 if( db->mallocFailed ){ 446 if( db->mallocFailed ){
449 sqlite3VdbeMemRelease(&m); 447 sqlite3VdbeMemRelease(&m);
450 m.z = 0; 448 m.z = 0;
451 } 449 }
452 assert( (m.flags & MEM_Term)!=0 || db->mallocFailed ); 450 assert( (m.flags & MEM_Term)!=0 || db->mallocFailed );
453 assert( (m.flags & MEM_Str)!=0 || db->mallocFailed ); 451 assert( (m.flags & MEM_Str)!=0 || db->mallocFailed );
454 return (m.flags & MEM_Dyn)!=0 ? m.z : sqlite3DbStrDup(db, m.z); 452 assert( (m.flags & MEM_Dyn)!=0 || db->mallocFailed );
453 assert( m.z || db->mallocFailed );
454 return m.z;
455 } 455 }
456 456
457 /* 457 /*
458 ** Convert a UTF-8 string to the UTF-16 encoding specified by parameter 458 ** Convert a UTF-8 string to the UTF-16 encoding specified by parameter
459 ** enc. A pointer to the new string is returned, and the value of *pnOut 459 ** enc. A pointer to the new string is returned, and the value of *pnOut
460 ** is set to the length of the returned string in bytes. The call should 460 ** is set to the length of the returned string in bytes. The call should
461 ** arrange to call sqlite3DbFree() on the returned pointer when it is 461 ** arrange to call sqlite3DbFree() on the returned pointer when it is
462 ** no longer required. 462 ** no longer required.
463 ** 463 **
464 ** If a malloc failure occurs, NULL is returned and the db.mallocFailed 464 ** If a malloc failure occurs, NULL is returned and the db.mallocFailed
465 ** flag set. 465 ** flag set.
466 */ 466 */
467 #ifdef SQLITE_ENABLE_STAT2 467 #ifdef SQLITE_ENABLE_STAT2
468 char *sqlite3Utf8to16(sqlite3 *db, u8 enc, char *z, int n, int *pnOut){ 468 char *sqlite3Utf8to16(sqlite3 *db, u8 enc, char *z, int n, int *pnOut){
469 Mem m; 469 Mem m;
470 memset(&m, 0, sizeof(m)); 470 memset(&m, 0, sizeof(m));
471 m.db = db; 471 m.db = db;
472 sqlite3VdbeMemSetStr(&m, z, n, SQLITE_UTF8, SQLITE_STATIC); 472 sqlite3VdbeMemSetStr(&m, z, n, SQLITE_UTF8, SQLITE_STATIC);
473 if( sqlite3VdbeMemTranslate(&m, enc) ){ 473 if( sqlite3VdbeMemTranslate(&m, enc) ){
474 assert( db->mallocFailed ); 474 assert( db->mallocFailed );
475 return 0; 475 return 0;
476 } 476 }
477 assert( m.z==m.zMalloc ); 477 assert( m.z==m.zMalloc );
478 *pnOut = m.n; 478 *pnOut = m.n;
479 return m.z; 479 return m.z;
480 } 480 }
481 #endif 481 #endif
482 482
483 /* 483 /*
484 ** pZ is a UTF-16 encoded unicode string at least nChar characters long. 484 ** zIn is a UTF-16 encoded unicode string at least nChar characters long.
485 ** Return the number of bytes in the first nChar unicode characters 485 ** Return the number of bytes in the first nChar unicode characters
486 ** in pZ. nChar must be non-negative. 486 ** in pZ. nChar must be non-negative.
487 */ 487 */
488 int sqlite3Utf16ByteLen(const void *zIn, int nChar){ 488 int sqlite3Utf16ByteLen(const void *zIn, int nChar){
489 int c; 489 int c;
490 unsigned char const *z = zIn; 490 unsigned char const *z = zIn;
491 int n = 0; 491 int n = 0;
492
492 if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){ 493 if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
493 /* Using an "if (SQLITE_UTF16NATIVE==SQLITE_UTF16BE)" construct here
494 ** and in other parts of this file means that at one branch will
495 ** not be covered by coverage testing on any single host. But coverage
496 ** will be complete if the tests are run on both a little-endian and
497 ** big-endian host. Because both the UTF16NATIVE and SQLITE_UTF16BE
498 ** macros are constant at compile time the compiler can determine
499 ** which branch will be followed. It is therefore assumed that no runtime
500 ** penalty is paid for this "if" statement.
501 */
502 while( n<nChar ){ 494 while( n<nChar ){
503 READ_UTF16BE(z, c); 495 READ_UTF16BE(z, 1, c);
504 n++; 496 n++;
505 } 497 }
506 }else{ 498 }else{
507 while( n<nChar ){ 499 while( n<nChar ){
508 READ_UTF16LE(z, c); 500 READ_UTF16LE(z, 1, c);
509 n++; 501 n++;
510 } 502 }
511 } 503 }
512 return (int)(z-(unsigned char const *)zIn); 504 return (int)(z-(unsigned char const *)zIn);
513 } 505 }
514 506
515 #if defined(SQLITE_TEST) 507 #if defined(SQLITE_TEST)
516 /* 508 /*
517 ** This routine is called from the TCL test function "translate_selftest". 509 ** This routine is called from the TCL test function "translate_selftest".
518 ** It checks that the primitives for serializing and deserializing 510 ** It checks that the primitives for serializing and deserializing
(...skipping 21 matching lines...) Expand all
540 assert( (z-zBuf)==n ); 532 assert( (z-zBuf)==n );
541 } 533 }
542 for(i=0; i<0x00110000; i++){ 534 for(i=0; i<0x00110000; i++){
543 if( i>=0xD800 && i<0xE000 ) continue; 535 if( i>=0xD800 && i<0xE000 ) continue;
544 z = zBuf; 536 z = zBuf;
545 WRITE_UTF16LE(z, i); 537 WRITE_UTF16LE(z, i);
546 n = (int)(z-zBuf); 538 n = (int)(z-zBuf);
547 assert( n>0 && n<=4 ); 539 assert( n>0 && n<=4 );
548 z[0] = 0; 540 z[0] = 0;
549 z = zBuf; 541 z = zBuf;
550 READ_UTF16LE(z, c); 542 READ_UTF16LE(z, 1, c);
551 assert( c==i ); 543 assert( c==i );
552 assert( (z-zBuf)==n ); 544 assert( (z-zBuf)==n );
553 } 545 }
554 for(i=0; i<0x00110000; i++){ 546 for(i=0; i<0x00110000; i++){
555 if( i>=0xD800 && i<0xE000 ) continue; 547 if( i>=0xD800 && i<0xE000 ) continue;
556 z = zBuf; 548 z = zBuf;
557 WRITE_UTF16BE(z, i); 549 WRITE_UTF16BE(z, i);
558 n = (int)(z-zBuf); 550 n = (int)(z-zBuf);
559 assert( n>0 && n<=4 ); 551 assert( n>0 && n<=4 );
560 z[0] = 0; 552 z[0] = 0;
561 z = zBuf; 553 z = zBuf;
562 READ_UTF16BE(z, c); 554 READ_UTF16BE(z, 1, c);
563 assert( c==i ); 555 assert( c==i );
564 assert( (z-zBuf)==n ); 556 assert( (z-zBuf)==n );
565 } 557 }
566 } 558 }
567 #endif /* SQLITE_TEST */ 559 #endif /* SQLITE_TEST */
568 #endif /* SQLITE_OMIT_UTF16 */ 560 #endif /* SQLITE_OMIT_UTF16 */
OLDNEW
« no previous file with comments | « third_party/sqlite/src/src/update.c ('k') | third_party/sqlite/src/src/util.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698