OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ****************************************************************************** |
| 3 * |
| 4 * Copyright (C) 1999-2006, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ****************************************************************************** |
| 8 * file name: utf_impl.c |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 1999sep13 |
| 14 * created by: Markus W. Scherer |
| 15 * |
| 16 * This file provides implementation functions for macros in the utfXX.h |
| 17 * that would otherwise be too long as macros. |
| 18 */ |
| 19 |
| 20 /* set import/export definitions */ |
| 21 #ifndef U_UTF8_IMPL |
| 22 # define U_UTF8_IMPL |
| 23 #endif |
| 24 |
| 25 #include "unicode/utypes.h" |
| 26 |
| 27 /* |
| 28 * This table could be replaced on many machines by |
| 29 * a few lines of assembler code using an |
| 30 * "index of first 0-bit from msb" instruction and |
| 31 * one or two more integer instructions. |
| 32 * |
| 33 * For example, on an i386, do something like |
| 34 * - MOV AL, leadByte |
| 35 * - NOT AL (8-bit, leave b15..b8==0..0, reverse only b7..b0) |
| 36 * - MOV AH, 0 |
| 37 * - BSR BX, AX (16-bit) |
| 38 * - MOV AX, 6 (result) |
| 39 * - JZ finish (ZF==1 if leadByte==0xff) |
| 40 * - SUB AX, BX (result) |
| 41 * -finish: |
| 42 * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB) |
| 43 * |
| 44 * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal; |
| 45 * lead bytes above 0xf4 are illegal. |
| 46 * We keep them in this table for skipping long ISO 10646-UTF-8 sequences. |
| 47 */ |
| 48 U_EXPORT const uint8_t |
| 49 utf8_countTrailBytes[256]={ |
| 50 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 51 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 52 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 53 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 54 |
| 55 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 56 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 57 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 58 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 59 |
| 60 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 61 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 62 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 63 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 64 |
| 65 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 66 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 67 |
| 68 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 69 3, 3, 3, 3, 3, |
| 70 3, 3, 3, /* illegal in Unicode */ |
| 71 4, 4, 4, 4, /* illegal in Unicode */ |
| 72 5, 5, /* illegal in Unicode */ |
| 73 0, 0 /* illegal bytes 0xfe and 0xff */ |
| 74 }; |
| 75 |
| 76 static const UChar32 |
| 77 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; |
| 78 |
| 79 static const UChar32 |
| 80 utf8_errorValue[6]={ |
| 81 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff, |
| 82 0x3ffffff, 0x7fffffff |
| 83 }; |
| 84 |
| 85 /* |
| 86 * Handle the non-inline part of the U8_NEXT() macro and its obsolete sibling |
| 87 * UTF8_NEXT_CHAR_SAFE(). |
| 88 * |
| 89 * The "strict" parameter controls the error behavior: |
| 90 * <0 "Safe" behavior of U8_NEXT(): All illegal byte sequences yield a negative |
| 91 * code point result. |
| 92 * 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE): |
| 93 * All illegal byte sequences yield a positive code point such that this |
| 94 * result code point would be encoded with the same number of bytes as |
| 95 * the illegal sequence. |
| 96 * >0 Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE): |
| 97 * Same as the obsolete "safe" behavior, but non-characters are also treated |
| 98 * like illegal sequences. |
| 99 * |
| 100 * The special negative (<0) value -2 is used for lenient treatment of surrogate |
| 101 * code points as legal. Some implementations use this for roundtripping of |
| 102 * Unicode 16-bit strings that are not well-formed UTF-16, that is, they |
| 103 * contain unpaired surrogates. |
| 104 * |
| 105 * Note that a UBool is the same as an int8_t. |
| 106 */ |
| 107 U_CAPI UChar32 U_EXPORT2 |
| 108 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
UBool strict) { |
| 109 int32_t i=*pi; |
| 110 uint8_t count=UTF8_COUNT_TRAIL_BYTES(c); |
| 111 if((i)+count<=(length)) { |
| 112 uint8_t trail, illegal=0; |
| 113 |
| 114 UTF8_MASK_LEAD_BYTE((c), count); |
| 115 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe
and 0xff */ |
| 116 switch(count) { |
| 117 /* each branch falls through to the next one */ |
| 118 case 5: |
| 119 case 4: |
| 120 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode
's UTF-8 */ |
| 121 illegal=1; |
| 122 break; |
| 123 case 3: |
| 124 trail=s[(i)++]; |
| 125 (c)=((c)<<6)|(trail&0x3f); |
| 126 if(c<0x110) { |
| 127 illegal|=(trail&0xc0)^0x80; |
| 128 } else { |
| 129 /* code point>0x10ffff, outside Unicode */ |
| 130 illegal=1; |
| 131 break; |
| 132 } |
| 133 case 2: |
| 134 trail=s[(i)++]; |
| 135 (c)=((c)<<6)|(trail&0x3f); |
| 136 illegal|=(trail&0xc0)^0x80; |
| 137 case 1: |
| 138 trail=s[(i)++]; |
| 139 (c)=((c)<<6)|(trail&0x3f); |
| 140 illegal|=(trail&0xc0)^0x80; |
| 141 break; |
| 142 case 0: |
| 143 if(strict>=0) { |
| 144 return UTF8_ERROR_VALUE_1; |
| 145 } else { |
| 146 return U_SENTINEL; |
| 147 } |
| 148 /* no default branch to optimize switch() - all values are covered */ |
| 149 } |
| 150 |
| 151 /* |
| 152 * All the error handling should return a value |
| 153 * that needs count bytes so that UTF8_GET_CHAR_SAFE() works right. |
| 154 * |
| 155 * Starting with Unicode 3.0.1, non-shortest forms are illegal. |
| 156 * Starting with Unicode 3.2, surrogate code points must not be |
| 157 * encoded in UTF-8, and there are no irregular sequences any more. |
| 158 * |
| 159 * U8_ macros (new in ICU 2.4) return negative values for error conditio
ns. |
| 160 */ |
| 161 |
| 162 /* correct sequence - all trail bytes have (b7..b6)==(10)? */ |
| 163 /* illegal is also set if count>=4 */ |
| 164 if(illegal || (c)<utf8_minLegal[count] || (UTF_IS_SURROGATE(c) && strict
!=-2)) { |
| 165 /* error handling */ |
| 166 uint8_t errorCount=count; |
| 167 /* don't go beyond this sequence */ |
| 168 i=*pi; |
| 169 while(count>0 && UTF8_IS_TRAIL(s[i])) { |
| 170 ++(i); |
| 171 --count; |
| 172 } |
| 173 if(strict>=0) { |
| 174 c=utf8_errorValue[errorCount-count]; |
| 175 } else { |
| 176 c=U_SENTINEL; |
| 177 } |
| 178 } else if((strict)>0 && UTF_IS_UNICODE_NONCHAR(c)) { |
| 179 /* strict: forbid non-characters like U+fffe */ |
| 180 c=utf8_errorValue[count]; |
| 181 } |
| 182 } else /* too few bytes left */ { |
| 183 /* error handling */ |
| 184 int32_t i0=i; |
| 185 /* don't just set (i)=(length) in case there is an illegal sequence */ |
| 186 while((i)<(length) && UTF8_IS_TRAIL(s[i])) { |
| 187 ++(i); |
| 188 } |
| 189 if(strict>=0) { |
| 190 c=utf8_errorValue[i-i0]; |
| 191 } else { |
| 192 c=U_SENTINEL; |
| 193 } |
| 194 } |
| 195 *pi=i; |
| 196 return c; |
| 197 } |
| 198 |
| 199 U_CAPI int32_t U_EXPORT2 |
| 200 utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool
*pIsError) { |
| 201 if((uint32_t)(c)<=0x7ff) { |
| 202 if((i)+1<(length)) { |
| 203 (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); |
| 204 (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); |
| 205 return i; |
| 206 } |
| 207 } else if((uint32_t)(c)<=0xffff) { |
| 208 /* Starting with Unicode 3.2, surrogate code points must not be encoded
in UTF-8. */ |
| 209 if((i)+2<(length) && !U_IS_SURROGATE(c)) { |
| 210 (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); |
| 211 (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); |
| 212 (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); |
| 213 return i; |
| 214 } |
| 215 } else if((uint32_t)(c)<=0x10ffff) { |
| 216 if((i)+3<(length)) { |
| 217 (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); |
| 218 (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); |
| 219 (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); |
| 220 (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); |
| 221 return i; |
| 222 } |
| 223 } |
| 224 /* c>0x10ffff or not enough space, write an error value */ |
| 225 if(pIsError!=NULL) { |
| 226 *pIsError=TRUE; |
| 227 } else { |
| 228 length-=i; |
| 229 if(length>0) { |
| 230 int32_t offset; |
| 231 if(length>3) { |
| 232 length=3; |
| 233 } |
| 234 s+=i; |
| 235 offset=0; |
| 236 c=utf8_errorValue[length-1]; |
| 237 UTF8_APPEND_CHAR_UNSAFE(s, offset, c); |
| 238 i=i+offset; |
| 239 } |
| 240 } |
| 241 return i; |
| 242 } |
| 243 |
| 244 U_CAPI UChar32 U_EXPORT2 |
| 245 utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
Bool strict) { |
| 246 int32_t i=*pi; |
| 247 uint8_t b, count=1, shift=6; |
| 248 |
| 249 /* extract value bits from the last trail byte */ |
| 250 c&=0x3f; |
| 251 |
| 252 for(;;) { |
| 253 if(i<=start) { |
| 254 /* no lead byte at all */ |
| 255 if(strict>=0) { |
| 256 return UTF8_ERROR_VALUE_1; |
| 257 } else { |
| 258 return U_SENTINEL; |
| 259 } |
| 260 /*break;*/ |
| 261 } |
| 262 |
| 263 /* read another previous byte */ |
| 264 b=s[--i]; |
| 265 if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */ |
| 266 if(b&0x40) { |
| 267 /* lead byte, this will always end the loop */ |
| 268 uint8_t shouldCount=UTF8_COUNT_TRAIL_BYTES(b); |
| 269 |
| 270 if(count==shouldCount) { |
| 271 /* set the new position */ |
| 272 *pi=i; |
| 273 UTF8_MASK_LEAD_BYTE(b, count); |
| 274 c|=(UChar32)b<<shift; |
| 275 if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (UTF_
IS_SURROGATE(c) && strict!=-2) || (strict>0 && UTF_IS_UNICODE_NONCHAR(c))) { |
| 276 /* illegal sequence or (strict and non-character) */ |
| 277 if(count>=4) { |
| 278 count=3; |
| 279 } |
| 280 if(strict>=0) { |
| 281 c=utf8_errorValue[count]; |
| 282 } else { |
| 283 c=U_SENTINEL; |
| 284 } |
| 285 } else { |
| 286 /* exit with correct c */ |
| 287 } |
| 288 } else { |
| 289 /* the lead byte does not match the number of trail bytes */ |
| 290 /* only set the position to the lead byte if it would |
| 291 include the trail byte that we started with */ |
| 292 if(count<shouldCount) { |
| 293 *pi=i; |
| 294 if(strict>=0) { |
| 295 c=utf8_errorValue[count]; |
| 296 } else { |
| 297 c=U_SENTINEL; |
| 298 } |
| 299 } else { |
| 300 if(strict>=0) { |
| 301 c=UTF8_ERROR_VALUE_1; |
| 302 } else { |
| 303 c=U_SENTINEL; |
| 304 } |
| 305 } |
| 306 } |
| 307 break; |
| 308 } else if(count<5) { |
| 309 /* trail byte */ |
| 310 c|=(UChar32)(b&0x3f)<<shift; |
| 311 ++count; |
| 312 shift+=6; |
| 313 } else { |
| 314 /* more than 5 trail bytes is illegal */ |
| 315 if(strict>=0) { |
| 316 c=UTF8_ERROR_VALUE_1; |
| 317 } else { |
| 318 c=U_SENTINEL; |
| 319 } |
| 320 break; |
| 321 } |
| 322 } else { |
| 323 /* single-byte character precedes trailing bytes */ |
| 324 if(strict>=0) { |
| 325 c=UTF8_ERROR_VALUE_1; |
| 326 } else { |
| 327 c=U_SENTINEL; |
| 328 } |
| 329 break; |
| 330 } |
| 331 } |
| 332 return c; |
| 333 } |
| 334 |
| 335 U_CAPI int32_t U_EXPORT2 |
| 336 utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) { |
| 337 /* i had been decremented once before the function call */ |
| 338 int32_t I=i, Z; |
| 339 uint8_t b; |
| 340 |
| 341 /* read at most the 6 bytes s[Z] to s[i], inclusively */ |
| 342 if(I-5>start) { |
| 343 Z=I-5; |
| 344 } else { |
| 345 Z=start; |
| 346 } |
| 347 |
| 348 /* return I if the sequence starting there is long enough to include i */ |
| 349 do { |
| 350 b=s[I]; |
| 351 if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */ |
| 352 break; |
| 353 } else if(b>=0xc0) { |
| 354 if(UTF8_COUNT_TRAIL_BYTES(b)>=(i-I)) { |
| 355 return I; |
| 356 } else { |
| 357 break; |
| 358 } |
| 359 } |
| 360 } while(Z<=--I); |
| 361 |
| 362 /* return i itself to be consistent with the FWD_1 macro */ |
| 363 return i; |
| 364 } |
OLD | NEW |