| OLD | NEW | 
 | (Empty) | 
|    1 /* |  | 
|    2 ** 2004 April 13 |  | 
|    3 ** |  | 
|    4 ** The author disclaims copyright to this source code.  In place of |  | 
|    5 ** a legal notice, here is a blessing: |  | 
|    6 ** |  | 
|    7 **    May you do good and not evil. |  | 
|    8 **    May you find forgiveness for yourself and forgive others. |  | 
|    9 **    May you share freely, never taking more than you give. |  | 
|   10 ** |  | 
|   11 ************************************************************************* |  | 
|   12 ** This file contains routines used to translate between UTF-8,  |  | 
|   13 ** UTF-16, UTF-16BE, and UTF-16LE. |  | 
|   14 ** |  | 
|   15 ** $Id: utf.c,v 1.73 2009/04/01 18:40:32 drh Exp $ |  | 
|   16 ** |  | 
|   17 ** Notes on UTF-8: |  | 
|   18 ** |  | 
|   19 **   Byte-0    Byte-1    Byte-2    Byte-3    Value |  | 
|   20 **  0xxxxxxx                                 00000000 00000000 0xxxxxxx |  | 
|   21 **  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx |  | 
|   22 **  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx |  | 
|   23 **  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx |  | 
|   24 ** |  | 
|   25 ** |  | 
|   26 ** Notes on UTF-16:  (with wwww+1==uuuuu) |  | 
|   27 ** |  | 
|   28 **      Word-0               Word-1          Value |  | 
|   29 **  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx |  | 
|   30 **  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx |  | 
|   31 ** |  | 
|   32 ** |  | 
|   33 ** BOM or Byte Order Mark: |  | 
|   34 **     0xff 0xfe   little-endian utf-16 follows |  | 
|   35 **     0xfe 0xff   big-endian utf-16 follows |  | 
|   36 ** |  | 
|   37 */ |  | 
|   38 #include "sqliteInt.h" |  | 
|   39 #include <assert.h> |  | 
|   40 #include "vdbeInt.h" |  | 
|   41  |  | 
#ifndef SQLITE_AMALGAMATION
/*
** An integer object holding the value 1.  The SQLITE_BIGENDIAN and
** SQLITE_LITTLEENDIAN macros make use of this constant.  It is only
** defined here when SQLITE_AMALGAMATION is not set.
*/
const int sqlite3one = 1;
#endif /* SQLITE_AMALGAMATION */
|   49  |  | 
/*
** Decoding aid for the first byte of a multi-byte UTF-8 character.
**
** The table is indexed by (lead byte - 0xc0) and yields the payload bits
** carried by that lead byte, i.e. the initial value of the character
** before any continuation bytes are folded in.
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  /* lead bytes 0xc0..0xcf */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* lead bytes 0xd0..0xdf */
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  /* lead bytes 0xe0..0xef */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* lead bytes 0xf0..0xf7 */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* lead bytes 0xf8..0xfb, 0xfc..0xfd, 0xfe, 0xff */
  0x00, 0x01, 0x02, 0x03,
  0x00, 0x01,
  0x00,
  0x00,
};
|   64  |  | 
|   65  |  | 
/*
** Append the UTF-8 encoding of the value c to the buffer at zOut and
** advance zOut past the bytes written (between 1 and 4 bytes).
**
**   zOut   Modifiable pointer into a byte buffer.  Updated in place.
**   c      Value to encode.  Only the low 21 bits are represented when
**          the 4-byte form is used.
**
** Both arguments are expanded more than once, so neither may have side
** effects.  The body is wrapped in do{...}while(0) so that the macro
** followed by a semicolon is exactly one statement and can be used as
** the unbraced arm of an if/else.
*/
#define WRITE_UTF8(zOut, c) do {                         \
  if( (c)<0x00080 ){                                     \
    *(zOut)++ = (u8)((c)&0xFF);                          \
  }else if( (c)<0x00800 ){                               \
    *(zOut)++ = 0xC0 + (u8)(((c)>>6)&0x1F);              \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                 \
  }else if( (c)<0x10000 ){                               \
    *(zOut)++ = 0xE0 + (u8)(((c)>>12)&0x0F);             \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);            \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                 \
  }else{                                                 \
    *(zOut)++ = 0xF0 + (u8)(((c)>>18) & 0x07);           \
    *(zOut)++ = 0x80 + (u8)(((c)>>12) & 0x3F);           \
    *(zOut)++ = 0x80 + (u8)(((c)>>6) & 0x3F);            \
    *(zOut)++ = 0x80 + (u8)((c) & 0x3F);                 \
  }                                                      \
} while( 0 )
|   85  |  | 
/*
** Append the little-endian UTF-16 encoding of the value c to the buffer
** at zOut and advance zOut past the bytes written.
**
** Values up to 0xFFFF are written as one 16-bit unit (2 bytes, low byte
** first).  Larger values are written as a surrogate pair (4 bytes), each
** unit again low byte first.
**
** Both arguments are expanded more than once, so neither may have side
** effects.  The do{...}while(0) wrapper makes the macro plus its
** trailing semicolon a single statement.
*/
#define WRITE_UTF16LE(zOut, c) do {                                       \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                    \
  }else{                                                                  \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));  \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));                \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                           \
  }                                                                       \
} while( 0 )
|   97  |  | 
/*
** Append the big-endian UTF-16 encoding of the value c to the buffer
** at zOut and advance zOut past the bytes written.
**
** Values up to 0xFFFF are written as one 16-bit unit (2 bytes, high byte
** first).  Larger values are written as a surrogate pair (4 bytes), each
** unit again high byte first.
**
** Both arguments are expanded more than once, so neither may have side
** effects.  The do{...}while(0) wrapper makes the macro plus its
** trailing semicolon a single statement.
*/
#define WRITE_UTF16BE(zOut, c) do {                                       \
  if( (c)<=0xFFFF ){                                                      \
    *(zOut)++ = (u8)(((c)>>8)&0x00FF);                                    \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
  }else{                                                                  \
    *(zOut)++ = (u8)(0x00D8 + ((((c)-0x10000)>>18)&0x03));                \
    *(zOut)++ = (u8)((((c)>>10)&0x003F) + ((((c)-0x10000)>>10)&0x00C0));  \
    *(zOut)++ = (u8)(0x00DC + (((c)>>8)&0x03));                           \
    *(zOut)++ = (u8)((c)&0x00FF);                                         \
  }                                                                       \
} while( 0 )
|  109  |  | 
/*
** Read one little-endian UTF-16 character starting at zIn, store its
** value in c, and advance zIn past the bytes consumed.
**
** Two bytes are always consumed.  If the 16-bit unit just read lies in
** the range 0xD800..0xDFFF, a second 16-bit unit is read unconditionally
** and the two are combined as a surrogate pair, so four bytes are
** consumed in total.
**
** NOTE: no end-of-input check is performed.  The caller must make sure
** that the bytes of a possible second unit are readable.
**
** zIn and c are expanded more than once and must not have side effects.
** The do{...}while(0) wrapper makes the macro plus its trailing
** semicolon a single statement.
*/
#define READ_UTF16LE(zIn, c) do {                                           \
  (c) = (*(zIn)++);                                                         \
  (c) += ((*(zIn)++)<<8);                                                   \
  if( (c)>=0xD800 && (c)<0xE000 ){                                          \
    int c2 = (*(zIn)++);                                                    \
    c2 += ((*(zIn)++)<<8);                                                  \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10);   \
  }                                                                         \
} while( 0 )
|  119  |  | 
/*
** Read one big-endian UTF-16 character starting at zIn, store its
** value in c, and advance zIn past the bytes consumed.
**
** Two bytes are always consumed.  If the 16-bit unit just read lies in
** the range 0xD800..0xDFFF, a second 16-bit unit is read unconditionally
** and the two are combined as a surrogate pair, so four bytes are
** consumed in total.
**
** NOTE: no end-of-input check is performed.  The caller must make sure
** that the bytes of a possible second unit are readable.
**
** zIn and c are expanded more than once and must not have side effects.
** The do{...}while(0) wrapper makes the macro plus its trailing
** semicolon a single statement.
*/
#define READ_UTF16BE(zIn, c) do {                                           \
  (c) = ((*(zIn)++)<<8);                                                    \
  (c) += (*(zIn)++);                                                        \
  if( (c)>=0xD800 && (c)<0xE000 ){                                          \
    int c2 = ((*(zIn)++)<<8);                                               \
    c2 += (*(zIn)++);                                                       \
    (c) = (c2&0x03FF) + (((c)&0x003F)<<10) + ((((c)&0x03C0)+0x0040)<<10);   \
  }                                                                         \
} while( 0 )
|  129  |  | 
|  130 /* |  | 
|  131 ** Translate a single UTF-8 character.  Return the unicode value. |  | 
|  132 ** |  | 
|  133 ** During translation, assume that the byte that zTerm points |  | 
|  134 ** is a 0x00. |  | 
|  135 ** |  | 
|  136 ** Write a pointer to the next unread byte back into *pzNext. |  | 
|  137 ** |  | 
|  138 ** Notes On Invalid UTF-8: |  | 
|  139 ** |  | 
|  140 **  *  This routine never allows a 7-bit character (0x00 through 0x7f) to |  | 
|  141 **     be encoded as a multi-byte character.  Any multi-byte character that |  | 
|  142 **     attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd. |  | 
|  143 ** |  | 
|  144 **  *  This routine never allows a UTF16 surrogate value to be encoded. |  | 
|  145 **     If a multi-byte character attempts to encode a value between |  | 
|  146 **     0xd800 and 0xe000 then it is rendered as 0xfffd. |  | 
|  147 ** |  | 
|  148 **  *  Bytes in the range of 0x80 through 0xbf which occur as the first |  | 
|  149 **     byte of a character are interpreted as single-byte characters |  | 
|  150 **     and rendered as themselves even though they are technically |  | 
|  151 **     invalid characters. |  | 
|  152 ** |  | 
|  153 **  *  This routine accepts an infinite number of different UTF8 encodings |  | 
|  154 **     for unicode values 0x80 and greater.  It do not change over-length |  | 
|  155 **     encodings to 0xfffd as some systems recommend. |  | 
|  156 */ |  | 
|  157 #define READ_UTF8(zIn, zTerm, c)                           \ |  | 
|  158   c = *(zIn++);                                            \ |  | 
|  159   if( c>=0xc0 ){                                           \ |  | 
|  160     c = sqlite3Utf8Trans1[c-0xc0];                         \ |  | 
|  161     while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \ |  | 
|  162       c = (c<<6) + (0x3f & *(zIn++));                      \ |  | 
|  163     }                                                      \ |  | 
|  164     if( c<0x80                                             \ |  | 
|  165         || (c&0xFFFFF800)==0xD800                          \ |  | 
|  166         || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \ |  | 
|  167   } |  | 
|  168 int sqlite3Utf8Read( |  | 
|  169   const unsigned char *zIn,       /* First byte of UTF-8 character */ |  | 
|  170   const unsigned char **pzNext    /* Write first byte past UTF-8 char here */ |  | 
|  171 ){ |  | 
|  172   int c; |  | 
|  173  |  | 
|  174   /* Same as READ_UTF8() above but without the zTerm parameter. |  | 
|  175   ** For this routine, we assume the UTF8 string is always zero-terminated. |  | 
|  176   */ |  | 
|  177   c = *(zIn++); |  | 
|  178   if( c>=0xc0 ){ |  | 
|  179     c = sqlite3Utf8Trans1[c-0xc0]; |  | 
|  180     while( (*zIn & 0xc0)==0x80 ){ |  | 
|  181       c = (c<<6) + (0x3f & *(zIn++)); |  | 
|  182     } |  | 
|  183     if( c<0x80 |  | 
|  184         || (c&0xFFFFF800)==0xD800 |  | 
|  185         || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; } |  | 
|  186   } |  | 
|  187   *pzNext = zIn; |  | 
|  188   return c; |  | 
|  189 } |  | 
|  190  |  | 
|  191  |  | 
|  192  |  | 
|  193  |  | 
|  194 /* |  | 
|  195 ** If the TRANSLATE_TRACE macro is defined, the value of each Mem is |  | 
|  196 ** printed on stderr on the way into and out of sqlite3VdbeMemTranslate(). |  | 
|  197 */  |  | 
|  198 /* #define TRANSLATE_TRACE 1 */ |  | 
|  199  |  | 
|  200 #ifndef SQLITE_OMIT_UTF16 |  | 
|  201 /* |  | 
|  202 ** This routine transforms the internal text encoding used by pMem to |  | 
|  203 ** desiredEnc. It is an error if the string is already of the desired |  | 
|  204 ** encoding, or if *pMem does not contain a string value. |  | 
|  205 */ |  | 
|  206 int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){ |  | 
|  207   int len;                    /* Maximum length of output string in bytes */ |  | 
|  208   unsigned char *zOut;                  /* Output buffer */ |  | 
|  209   unsigned char *zIn;                   /* Input iterator */ |  | 
|  210   unsigned char *zTerm;                 /* End of input */ |  | 
|  211   unsigned char *z;                     /* Output iterator */ |  | 
|  212   unsigned int c; |  | 
|  213  |  | 
|  214   assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) ); |  | 
|  215   assert( pMem->flags&MEM_Str ); |  | 
|  216   assert( pMem->enc!=desiredEnc ); |  | 
|  217   assert( pMem->enc!=0 ); |  | 
|  218   assert( pMem->n>=0 ); |  | 
|  219  |  | 
|  220 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) |  | 
|  221   { |  | 
|  222     char zBuf[100]; |  | 
|  223     sqlite3VdbeMemPrettyPrint(pMem, zBuf); |  | 
|  224     fprintf(stderr, "INPUT:  %s\n", zBuf); |  | 
|  225   } |  | 
|  226 #endif |  | 
|  227  |  | 
|  228   /* If the translation is between UTF-16 little and big endian, then  |  | 
|  229   ** all that is required is to swap the byte order. This case is handled |  | 
|  230   ** differently from the others. |  | 
|  231   */ |  | 
|  232   if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){ |  | 
|  233     u8 temp; |  | 
|  234     int rc; |  | 
|  235     rc = sqlite3VdbeMemMakeWriteable(pMem); |  | 
|  236     if( rc!=SQLITE_OK ){ |  | 
|  237       assert( rc==SQLITE_NOMEM ); |  | 
|  238       return SQLITE_NOMEM; |  | 
|  239     } |  | 
|  240     zIn = (u8*)pMem->z; |  | 
|  241     zTerm = &zIn[pMem->n&~1]; |  | 
|  242     while( zIn<zTerm ){ |  | 
|  243       temp = *zIn; |  | 
|  244       *zIn = *(zIn+1); |  | 
|  245       zIn++; |  | 
|  246       *zIn++ = temp; |  | 
|  247     } |  | 
|  248     pMem->enc = desiredEnc; |  | 
|  249     goto translate_out; |  | 
|  250   } |  | 
|  251  |  | 
|  252   /* Set len to the maximum number of bytes required in the output buffer. */ |  | 
|  253   if( desiredEnc==SQLITE_UTF8 ){ |  | 
|  254     /* When converting from UTF-16, the maximum growth results from |  | 
|  255     ** translating a 2-byte character to a 4-byte UTF-8 character. |  | 
|  256     ** A single byte is required for the output string |  | 
|  257     ** nul-terminator. |  | 
|  258     */ |  | 
|  259     pMem->n &= ~1; |  | 
|  260     len = pMem->n * 2 + 1; |  | 
|  261   }else{ |  | 
|  262     /* When converting from UTF-8 to UTF-16 the maximum growth is caused |  | 
|  263     ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16 |  | 
|  264     ** character. Two bytes are required in the output buffer for the |  | 
|  265     ** nul-terminator. |  | 
|  266     */ |  | 
|  267     len = pMem->n * 2 + 2; |  | 
|  268   } |  | 
|  269  |  | 
|  270   /* Set zIn to point at the start of the input buffer and zTerm to point 1 |  | 
|  271   ** byte past the end. |  | 
|  272   ** |  | 
|  273   ** Variable zOut is set to point at the output buffer, space obtained |  | 
|  274   ** from sqlite3_malloc(). |  | 
|  275   */ |  | 
|  276   zIn = (u8*)pMem->z; |  | 
|  277   zTerm = &zIn[pMem->n]; |  | 
|  278   zOut = sqlite3DbMallocRaw(pMem->db, len); |  | 
|  279   if( !zOut ){ |  | 
|  280     return SQLITE_NOMEM; |  | 
|  281   } |  | 
|  282   z = zOut; |  | 
|  283  |  | 
|  284   if( pMem->enc==SQLITE_UTF8 ){ |  | 
|  285     if( desiredEnc==SQLITE_UTF16LE ){ |  | 
|  286       /* UTF-8 -> UTF-16 Little-endian */ |  | 
|  287       while( zIn<zTerm ){ |  | 
|  288         /* c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); */ |  | 
|  289         READ_UTF8(zIn, zTerm, c); |  | 
|  290         WRITE_UTF16LE(z, c); |  | 
|  291       } |  | 
|  292     }else{ |  | 
|  293       assert( desiredEnc==SQLITE_UTF16BE ); |  | 
|  294       /* UTF-8 -> UTF-16 Big-endian */ |  | 
|  295       while( zIn<zTerm ){ |  | 
|  296         /* c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); */ |  | 
|  297         READ_UTF8(zIn, zTerm, c); |  | 
|  298         WRITE_UTF16BE(z, c); |  | 
|  299       } |  | 
|  300     } |  | 
|  301     pMem->n = (int)(z - zOut); |  | 
|  302     *z++ = 0; |  | 
|  303   }else{ |  | 
|  304     assert( desiredEnc==SQLITE_UTF8 ); |  | 
|  305     if( pMem->enc==SQLITE_UTF16LE ){ |  | 
|  306       /* UTF-16 Little-endian -> UTF-8 */ |  | 
|  307       while( zIn<zTerm ){ |  | 
|  308         READ_UTF16LE(zIn, c);  |  | 
|  309         WRITE_UTF8(z, c); |  | 
|  310       } |  | 
|  311     }else{ |  | 
|  312       /* UTF-16 Big-endian -> UTF-8 */ |  | 
|  313       while( zIn<zTerm ){ |  | 
|  314         READ_UTF16BE(zIn, c);  |  | 
|  315         WRITE_UTF8(z, c); |  | 
|  316       } |  | 
|  317     } |  | 
|  318     pMem->n = (int)(z - zOut); |  | 
|  319   } |  | 
|  320   *z = 0; |  | 
|  321   assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len ); |  | 
|  322  |  | 
|  323   sqlite3VdbeMemRelease(pMem); |  | 
|  324   pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem); |  | 
|  325   pMem->enc = desiredEnc; |  | 
|  326   pMem->flags |= (MEM_Term|MEM_Dyn); |  | 
|  327   pMem->z = (char*)zOut; |  | 
|  328   pMem->zMalloc = pMem->z; |  | 
|  329  |  | 
|  330 translate_out: |  | 
|  331 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) |  | 
|  332   { |  | 
|  333     char zBuf[100]; |  | 
|  334     sqlite3VdbeMemPrettyPrint(pMem, zBuf); |  | 
|  335     fprintf(stderr, "OUTPUT: %s\n", zBuf); |  | 
|  336   } |  | 
|  337 #endif |  | 
|  338   return SQLITE_OK; |  | 
|  339 } |  | 
|  340  |  | 
|  341 /* |  | 
|  342 ** This routine checks for a byte-order mark at the beginning of the  |  | 
|  343 ** UTF-16 string stored in *pMem. If one is present, it is removed and |  | 
|  344 ** the encoding of the Mem adjusted. This routine does not do any |  | 
|  345 ** byte-swapping, it just sets Mem.enc appropriately. |  | 
|  346 ** |  | 
|  347 ** The allocation (static, dynamic etc.) and encoding of the Mem may be |  | 
|  348 ** changed by this function. |  | 
|  349 */ |  | 
|  350 int sqlite3VdbeMemHandleBom(Mem *pMem){ |  | 
|  351   int rc = SQLITE_OK; |  | 
|  352   u8 bom = 0; |  | 
|  353  |  | 
|  354   assert( pMem->n>=0 ); |  | 
|  355   if( pMem->n>1 ){ |  | 
|  356     u8 b1 = *(u8 *)pMem->z; |  | 
|  357     u8 b2 = *(((u8 *)pMem->z) + 1); |  | 
|  358     if( b1==0xFE && b2==0xFF ){ |  | 
|  359       bom = SQLITE_UTF16BE; |  | 
|  360     } |  | 
|  361     if( b1==0xFF && b2==0xFE ){ |  | 
|  362       bom = SQLITE_UTF16LE; |  | 
|  363     } |  | 
|  364   } |  | 
|  365    |  | 
|  366   if( bom ){ |  | 
|  367     rc = sqlite3VdbeMemMakeWriteable(pMem); |  | 
|  368     if( rc==SQLITE_OK ){ |  | 
|  369       pMem->n -= 2; |  | 
|  370       memmove(pMem->z, &pMem->z[2], pMem->n); |  | 
|  371       pMem->z[pMem->n] = '\0'; |  | 
|  372       pMem->z[pMem->n+1] = '\0'; |  | 
|  373       pMem->flags |= MEM_Term; |  | 
|  374       pMem->enc = bom; |  | 
|  375     } |  | 
|  376   } |  | 
|  377   return rc; |  | 
|  378 } |  | 
|  379 #endif /* SQLITE_OMIT_UTF16 */ |  | 
|  380  |  | 
|  381 /* |  | 
|  382 ** pZ is a UTF-8 encoded unicode string. If nByte is less than zero, |  | 
|  383 ** return the number of unicode characters in pZ up to (but not including) |  | 
|  384 ** the first 0x00 byte. If nByte is not less than zero, return the |  | 
|  385 ** number of unicode characters in the first nByte of pZ (or up to  |  | 
|  386 ** the first 0x00, whichever comes first). |  | 
|  387 */ |  | 
|  388 int sqlite3Utf8CharLen(const char *zIn, int nByte){ |  | 
|  389   int r = 0; |  | 
|  390   const u8 *z = (const u8*)zIn; |  | 
|  391   const u8 *zTerm; |  | 
|  392   if( nByte>=0 ){ |  | 
|  393     zTerm = &z[nByte]; |  | 
|  394   }else{ |  | 
|  395     zTerm = (const u8*)(-1); |  | 
|  396   } |  | 
|  397   assert( z<=zTerm ); |  | 
|  398   while( *z!=0 && z<zTerm ){ |  | 
|  399     SQLITE_SKIP_UTF8(z); |  | 
|  400     r++; |  | 
|  401   } |  | 
|  402   return r; |  | 
|  403 } |  | 
|  404  |  | 
/* This test function is not currently used by the automated test-suite. 
** Hence it is only available in debug builds.
*/
#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
/*
** Translate UTF-8 to UTF-8, in place.
**
** Each character is decoded with sqlite3Utf8Read() and written back with
** WRITE_UTF8().  Characters that decode to 0xfffd are dropped from the
** output.  The result is zero-terminated and its length in bytes (not
** counting the terminator) is returned.
**
** Working in place relies on the re-encoded text never being longer
** than the input it was decoded from.
*/
int sqlite3Utf8To8(unsigned char *zIn){
  unsigned char *const zBase = zIn;      /* Start of the buffer */
  unsigned char *zWrite = zIn;           /* Next byte to write */
  const unsigned char *zRead = zIn;      /* Next byte to decode */
  u32 ch;

  while( *zRead!=0 ){
    ch = sqlite3Utf8Read(zRead, &zRead);
    if( ch==0xfffd ) continue;
    WRITE_UTF8(zWrite, ch);
  }
  *zWrite = 0;
  return (int)(zWrite - zBase);
}
#endif
|  433  |  | 
|  434 #ifndef SQLITE_OMIT_UTF16 |  | 
|  435 /* |  | 
|  436 ** Convert a UTF-16 string in the native encoding into a UTF-8 string. |  | 
|  437 ** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must |  | 
|  438 ** be freed by the calling function. |  | 
|  439 ** |  | 
|  440 ** NULL is returned if there is an allocation error. |  | 
|  441 */ |  | 
|  442 char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte){ |  | 
|  443   Mem m; |  | 
|  444   memset(&m, 0, sizeof(m)); |  | 
|  445   m.db = db; |  | 
|  446   sqlite3VdbeMemSetStr(&m, z, nByte, SQLITE_UTF16NATIVE, SQLITE_STATIC); |  | 
|  447   sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8); |  | 
|  448   if( db->mallocFailed ){ |  | 
|  449     sqlite3VdbeMemRelease(&m); |  | 
|  450     m.z = 0; |  | 
|  451   } |  | 
|  452   assert( (m.flags & MEM_Term)!=0 || db->mallocFailed ); |  | 
|  453   assert( (m.flags & MEM_Str)!=0 || db->mallocFailed ); |  | 
|  454   return (m.flags & MEM_Dyn)!=0 ? m.z : sqlite3DbStrDup(db, m.z); |  | 
|  455 } |  | 
|  456  |  | 
/*
** Convert the UTF-8 string z (length n, as passed to
** sqlite3VdbeMemSetStr()) to the UTF-16 encoding selected by enc.
**
** On success a pointer to the new string is returned and *pnOut is set
** to its length in bytes.  The caller should arrange to call
** sqlite3DbFree() on the returned pointer when it is no longer required.
**
** If a malloc failure occurs, NULL is returned and the db.mallocFailed
** flag is set; *pnOut is left unchanged in that case.
*/
#ifdef SQLITE_ENABLE_STAT2
char *sqlite3Utf8to16(sqlite3 *db, u8 enc, char *z, int n, int *pnOut){
  Mem conv;                   /* Scratch value used for the conversion */
  int rc;

  memset(&conv, 0, sizeof(conv));
  conv.db = db;
  sqlite3VdbeMemSetStr(&conv, z, n, SQLITE_UTF8, SQLITE_STATIC);
  rc = sqlite3VdbeMemTranslate(&conv, enc);
  if( rc!=0 ){
    assert( db->mallocFailed );
    return 0;
  }
  assert( conv.z==conv.zMalloc );
  *pnOut = conv.n;
  return conv.z;
}
#endif
|  482  |  | 
|  483 /* |  | 
|  484 ** pZ is a UTF-16 encoded unicode string at least nChar characters long. |  | 
|  485 ** Return the number of bytes in the first nChar unicode characters |  | 
|  486 ** in pZ.  nChar must be non-negative. |  | 
|  487 */ |  | 
|  488 int sqlite3Utf16ByteLen(const void *zIn, int nChar){ |  | 
|  489   int c; |  | 
|  490   unsigned char const *z = zIn; |  | 
|  491   int n = 0; |  | 
|  492   if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){ |  | 
|  493     /* Using an "if (SQLITE_UTF16NATIVE==SQLITE_UTF16BE)" construct here |  | 
|  494     ** and in other parts of this file means that at one branch will |  | 
|  495     ** not be covered by coverage testing on any single host. But coverage |  | 
|  496     ** will be complete if the tests are run on both a little-endian and  |  | 
|  497     ** big-endian host. Because both the UTF16NATIVE and SQLITE_UTF16BE |  | 
|  498     ** macros are constant at compile time the compiler can determine |  | 
|  499     ** which branch will be followed. It is therefore assumed that no runtime |  | 
|  500     ** penalty is paid for this "if" statement. |  | 
|  501     */ |  | 
|  502     while( n<nChar ){ |  | 
|  503       READ_UTF16BE(z, c); |  | 
|  504       n++; |  | 
|  505     } |  | 
|  506   }else{ |  | 
|  507     while( n<nChar ){ |  | 
|  508       READ_UTF16LE(z, c); |  | 
|  509       n++; |  | 
|  510     } |  | 
|  511   } |  | 
|  512   return (int)(z-(unsigned char const *)zIn); |  | 
|  513 } |  | 
|  514  |  | 
#if defined(SQLITE_TEST)
/*
** This routine is called from the TCL test function "translate_selftest".
**
** For every value from 0 up to (but not including) 0x110000 it encodes
** the value, decodes the result again, and asserts that the decoded value
** and the number of bytes consumed match what was written.  This is done
** for UTF-8, UTF-16LE and UTF-16BE in turn.
**
** For UTF-8 the surrogate range 0xD800..0xDFFF and the values 0xFFFE and
** 0xFFFF are expected to read back as 0xFFFD.  The surrogate range is
** skipped entirely for the two UTF-16 passes.
*/
void sqlite3UtfSelfTest(void){
  unsigned int cp;            /* Value being round-tripped */
  unsigned int expect;        /* Value the UTF-8 decoder should yield */
  unsigned int got;           /* Value actually decoded */
  unsigned char aBuf[20];     /* Scratch buffer for the encoded form */
  unsigned char *zPos;        /* Cursor into aBuf */
  int nWritten;               /* Bytes produced by the encoder */

  /* Pass 1: UTF-8 */
  for(cp=0; cp<0x00110000; cp++){
    zPos = aBuf;
    WRITE_UTF8(zPos, cp);
    nWritten = (int)(zPos-aBuf);
    assert( nWritten>0 && nWritten<=4 );
    zPos[0] = 0;
    zPos = aBuf;
    got = sqlite3Utf8Read(zPos, (const u8**)&zPos);
    expect = cp;
    if( (cp>=0xD800 && cp<=0xDFFF) || (cp&0xFFFFFFFE)==0xFFFE ){
      expect = 0xFFFD;
    }
    assert( got==expect );
    assert( (zPos-aBuf)==nWritten );
  }

  /* Pass 2: UTF-16 little-endian */
  for(cp=0; cp<0x00110000; cp++){
    if( cp>=0xD800 && cp<0xE000 ) continue;
    zPos = aBuf;
    WRITE_UTF16LE(zPos, cp);
    nWritten = (int)(zPos-aBuf);
    assert( nWritten>0 && nWritten<=4 );
    zPos[0] = 0;
    zPos = aBuf;
    READ_UTF16LE(zPos, got);
    assert( got==cp );
    assert( (zPos-aBuf)==nWritten );
  }

  /* Pass 3: UTF-16 big-endian */
  for(cp=0; cp<0x00110000; cp++){
    if( cp>=0xD800 && cp<0xE000 ) continue;
    zPos = aBuf;
    WRITE_UTF16BE(zPos, cp);
    nWritten = (int)(zPos-aBuf);
    assert( nWritten>0 && nWritten<=4 );
    zPos[0] = 0;
    zPos = aBuf;
    READ_UTF16BE(zPos, got);
    assert( got==cp );
    assert( (zPos-aBuf)==nWritten );
  }
}
#endif /* SQLITE_TEST */
|  568 #endif /* SQLITE_OMIT_UTF16 */ |  | 
| OLD | NEW |