OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ****************************************************************************** |
| 3 * |
| 4 * Copyright (C) 2000-2009, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ****************************************************************************** |
| 8 * file name: ucnvscsu.c |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2000nov18 |
| 14 * created by: Markus W. Scherer |
| 15 * |
| 16 * This is an implementation of the Standard Compression Scheme for Unicode |
| 17 * as defined in http://www.unicode.org/unicode/reports/tr6/ . |
| 18 * Reserved commands and window settings are treated as illegal sequences and |
| 19 * will result in callback calls. |
| 20 */ |
| 21 |
| 22 #include "unicode/utypes.h" |
| 23 |
| 24 #if !UCONFIG_NO_CONVERSION |
| 25 |
| 26 #include "unicode/ucnv.h" |
| 27 #include "unicode/ucnv_cb.h" |
| 28 #include "ucnv_bld.h" |
| 29 #include "ucnv_cnv.h" |
| 30 #include "cmemory.h" |
| 31 |
| 32 /* SCSU definitions --------------------------------------------------------- */ |
| 33 |
/*
 * SCSU command byte values.
 * Sxx tags are used in single-byte mode, Uxx tags in Unicode mode.
 * Only the first and last member of each contiguous range is named.
 */
enum {
    SQ0=0x01, /* Quote from window pair 0 */
    SQ7=0x08, /* Quote from window pair 7 */
    SDX=0x0B, /* Define a window as extended */
    Srs=0x0C, /* reserved */
    SQU=0x0E, /* Quote a single Unicode character */
    SCU=0x0F, /* Change to Unicode mode */
    SC0=0x10, /* Select window 0 */
    SC7=0x17, /* Select window 7 */
    SD0=0x18, /* Define and select window 0 */
    SD7=0x1F, /* Define and select window 7 */

    UC0=0xE0, /* Select window 0 */
    UC7=0xE7, /* Select window 7 */
    UD0=0xE8, /* Define and select window 0 */
    UD7=0xEF, /* Define and select window 7 */
    UQU=0xF0, /* Quote a single Unicode character */
    UDX=0xF1, /* Define a Window as extended */
    Urs=0xF2  /* reserved */
};
| 55 |
/* ranges of the one-byte window offset argument of the SDn/UDn commands */
enum {
    /*
     * Unicode code points from 3400 to E000 are not addressable by
     * dynamic window, since in these areas no short run alphabets are
     * found. Therefore add gapOffset to all values from gapThreshold.
     */
    gapThreshold=0x68,
    gapOffset=0xAC00,

    /* values between reservedStart and fixedThreshold are reserved */
    reservedStart=0xA8,

    /* use table of predefined fixed offsets for values from fixedThreshold */
    fixedThreshold=0xF9
};
| 71 |
/* constant offsets for the 8 static windows (all of them are in the BMP) */
static const uint32_t staticOffsets[8]={
    0x0000, /* ASCII for quoted tags */
    0x0080, /* Latin - 1 Supplement (for access to punctuation) */
    0x0100, /* Latin Extended-A */
    0x0300, /* Combining Diacritical Marks */
    0x2000, /* General Punctuation */
    0x2080, /* Currency Symbols */
    0x2100, /* Letterlike Symbols and Number Forms */
    0x3000  /* CJK Symbols and punctuation */
};
| 83 |
/*
 * initial offsets for the 8 dynamic (sliding) windows;
 * copied into the per-converter state whenever a direction is reset
 */
static const uint32_t initialDynamicOffsets[8]={
    0x0080, /* Latin-1 */
    0x00C0, /* Latin Extended A */
    0x0400, /* Cyrillic */
    0x0600, /* Arabic */
    0x0900, /* Devanagari */
    0x3040, /* Hiragana */
    0x30A0, /* Katakana */
    0xFF00  /* Fullwidth ASCII */
};
| 95 |
/*
 * Table of fixed predefined Offsets,
 * indexed by (window offset byte - fixedThreshold)
 */
static const uint32_t fixedOffsets[]={
    /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
    /* 0xFA */ 0x0250, /* IPA extensions */
    /* 0xFB */ 0x0370, /* Greek */
    /* 0xFC */ 0x0530, /* Armenian */
    /* 0xFD */ 0x3040, /* Hiragana */
    /* 0xFE */ 0x30A0, /* Katakana */
    /* 0xFF */ 0xFF60  /* Halfwidth Katakana */
};
| 106 |
/* state values of the toUnicode state machine: what the next byte is */
enum {
    readCommand,    /* a command or character byte */
    quotePairOne,   /* first byte of a quoted 16-bit unit */
    quotePairTwo,   /* second byte of a 16-bit unit */
    quoteOne,       /* the single byte quoted from a window */
    definePairOne,  /* first argument byte of an extended window definition */
    definePairTwo,  /* second argument byte of an extended window definition */
    defineOne       /* the one-byte window offset argument */
};
| 117 |
| 118 typedef struct SCSUData { |
| 119 /* dynamic window offsets, intitialize to default values from initialDynamic
Offsets */ |
| 120 uint32_t toUDynamicOffsets[8]; |
| 121 uint32_t fromUDynamicOffsets[8]; |
| 122 |
| 123 /* state machine state - toUnicode */ |
| 124 UBool toUIsSingleByteMode; |
| 125 uint8_t toUState; |
| 126 int8_t toUQuoteWindow, toUDynamicWindow; |
| 127 uint8_t toUByteOne; |
| 128 uint8_t toUPadding[3]; |
| 129 |
| 130 /* state machine state - fromUnicode */ |
| 131 UBool fromUIsSingleByteMode; |
| 132 int8_t fromUDynamicWindow; |
| 133 |
| 134 /* |
| 135 * windowUse[] keeps track of the use of the dynamic windows: |
| 136 * At nextWindowUseIndex there is the least recently used window, |
| 137 * and the following windows (in a wrapping manner) are more and more |
| 138 * recently used. |
| 139 * At nextWindowUseIndex-1 there is the most recently used window. |
| 140 */ |
| 141 uint8_t locale; |
| 142 int8_t nextWindowUseIndex; |
| 143 int8_t windowUse[8]; |
| 144 } SCSUData; |
| 145 |
/*
 * initial contents of SCSUData.windowUse[]:
 * dynamic window numbers ordered from least to most recently used
 */
static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 };
static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 };

/* values for SCSUData.locale */
enum {
    lGeneric, l_ja
};
| 152 |
| 153 /* SCSU setup functions ----------------------------------------------------- */ |
| 154 |
| 155 static void |
| 156 _SCSUReset(UConverter *cnv, UConverterResetChoice choice) { |
| 157 SCSUData *scsu=(SCSUData *)cnv->extraInfo; |
| 158 |
| 159 if(choice<=UCNV_RESET_TO_UNICODE) { |
| 160 /* reset toUnicode */ |
| 161 uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32); |
| 162 |
| 163 scsu->toUIsSingleByteMode=TRUE; |
| 164 scsu->toUState=readCommand; |
| 165 scsu->toUQuoteWindow=scsu->toUDynamicWindow=0; |
| 166 scsu->toUByteOne=0; |
| 167 |
| 168 cnv->toULength=0; |
| 169 } |
| 170 if(choice!=UCNV_RESET_TO_UNICODE) { |
| 171 /* reset fromUnicode */ |
| 172 uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32); |
| 173 |
| 174 scsu->fromUIsSingleByteMode=TRUE; |
| 175 scsu->fromUDynamicWindow=0; |
| 176 |
| 177 scsu->nextWindowUseIndex=0; |
| 178 switch(scsu->locale) { |
| 179 case l_ja: |
| 180 uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8); |
| 181 break; |
| 182 default: |
| 183 uprv_memcpy(scsu->windowUse, initialWindowUse, 8); |
| 184 break; |
| 185 } |
| 186 |
| 187 cnv->fromUChar32=0; |
| 188 } |
| 189 } |
| 190 |
| 191 static void |
| 192 _SCSUOpen(UConverter *cnv, |
| 193 UConverterLoadArgs *pArgs, |
| 194 UErrorCode *pErrorCode) { |
| 195 const char *locale=pArgs->locale; |
| 196 if(pArgs->onlyTestIsLoadable) { |
| 197 return; |
| 198 } |
| 199 cnv->extraInfo=uprv_malloc(sizeof(SCSUData)); |
| 200 if(cnv->extraInfo!=NULL) { |
| 201 if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 ||
locale[2]=='_')) { |
| 202 ((SCSUData *)cnv->extraInfo)->locale=l_ja; |
| 203 } else { |
| 204 ((SCSUData *)cnv->extraInfo)->locale=lGeneric; |
| 205 } |
| 206 _SCSUReset(cnv, UCNV_RESET_BOTH); |
| 207 } else { |
| 208 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; |
| 209 } |
| 210 |
| 211 /* Set the substitution character U+fffd as a Unicode string. */ |
| 212 cnv->subUChars[0]=0xfffd; |
| 213 cnv->subCharLen=-1; |
| 214 } |
| 215 |
| 216 static void |
| 217 _SCSUClose(UConverter *cnv) { |
| 218 if(cnv->extraInfo!=NULL) { |
| 219 if(!cnv->isExtraLocal) { |
| 220 uprv_free(cnv->extraInfo); |
| 221 } |
| 222 cnv->extraInfo=NULL; |
| 223 } |
| 224 } |
| 225 |
| 226 /* SCSU-to-Unicode conversion functions ------------------------------------- */ |
| 227 |
| 228 static void |
| 229 _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
| 230 UErrorCode *pErrorCode) { |
| 231 UConverter *cnv; |
| 232 SCSUData *scsu; |
| 233 const uint8_t *source, *sourceLimit; |
| 234 UChar *target; |
| 235 const UChar *targetLimit; |
| 236 int32_t *offsets; |
| 237 UBool isSingleByteMode; |
| 238 uint8_t state, byteOne; |
| 239 int8_t quoteWindow, dynamicWindow; |
| 240 |
| 241 int32_t sourceIndex, nextSourceIndex; |
| 242 |
| 243 uint8_t b; |
| 244 |
| 245 /* set up the local pointers */ |
| 246 cnv=pArgs->converter; |
| 247 scsu=(SCSUData *)cnv->extraInfo; |
| 248 |
| 249 source=(const uint8_t *)pArgs->source; |
| 250 sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
| 251 target=pArgs->target; |
| 252 targetLimit=pArgs->targetLimit; |
| 253 offsets=pArgs->offsets; |
| 254 |
| 255 /* get the state machine state */ |
| 256 isSingleByteMode=scsu->toUIsSingleByteMode; |
| 257 state=scsu->toUState; |
| 258 quoteWindow=scsu->toUQuoteWindow; |
| 259 dynamicWindow=scsu->toUDynamicWindow; |
| 260 byteOne=scsu->toUByteOne; |
| 261 |
| 262 /* sourceIndex=-1 if the current character began in the previous buffer */ |
| 263 sourceIndex=state==readCommand ? 0 : -1; |
| 264 nextSourceIndex=0; |
| 265 |
| 266 /* |
| 267 * conversion "loop" |
| 268 * |
| 269 * For performance, this is not a normal C loop. |
| 270 * Instead, there are two code blocks for the two SCSU modes. |
| 271 * The function branches to either one, and a change of the mode is done wit
h a goto to |
| 272 * the other branch. |
| 273 * |
| 274 * Each branch has two conventional loops: |
| 275 * - a fast-path loop for the most common codes in the mode |
| 276 * - a loop for all other codes in the mode |
| 277 * When the fast-path runs into a code that it cannot handle, its loop ends
and it |
| 278 * runs into the following loop to handle the other codes. |
| 279 * The end of the input or output buffer is also handled by the slower loop. |
| 280 * The slow loop jumps (goto) to the fast-path loop again as soon as possibl
e. |
| 281 * |
| 282 * The callback handling is done by returning with an error code. |
| 283 * The conversion framework actually calls the callback function. |
| 284 */ |
| 285 if(isSingleByteMode) { |
| 286 /* fast path for single-byte mode */ |
| 287 if(state==readCommand) { |
| 288 fastSingle: |
| 289 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20)
{ |
| 290 ++source; |
| 291 ++nextSourceIndex; |
| 292 if(b<=0x7f) { |
| 293 /* write US-ASCII graphic character or DEL */ |
| 294 *target++=(UChar)b; |
| 295 if(offsets!=NULL) { |
| 296 *offsets++=sourceIndex; |
| 297 } |
| 298 } else { |
| 299 /* write from dynamic window */ |
| 300 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); |
| 301 if(c<=0xffff) { |
| 302 *target++=(UChar)c; |
| 303 if(offsets!=NULL) { |
| 304 *offsets++=sourceIndex; |
| 305 } |
| 306 } else { |
| 307 /* output surrogate pair */ |
| 308 *target++=(UChar)(0xd7c0+(c>>10)); |
| 309 if(target<targetLimit) { |
| 310 *target++=(UChar)(0xdc00|(c&0x3ff)); |
| 311 if(offsets!=NULL) { |
| 312 *offsets++=sourceIndex; |
| 313 *offsets++=sourceIndex; |
| 314 } |
| 315 } else { |
| 316 /* target overflow */ |
| 317 if(offsets!=NULL) { |
| 318 *offsets++=sourceIndex; |
| 319 } |
| 320 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); |
| 321 cnv->UCharErrorBufferLength=1; |
| 322 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 323 goto endloop; |
| 324 } |
| 325 } |
| 326 } |
| 327 sourceIndex=nextSourceIndex; |
| 328 } |
| 329 } |
| 330 |
| 331 /* normal state machine for single-byte mode, minus handling for what fa
stSingle covers */ |
| 332 singleByteMode: |
| 333 while(source<sourceLimit) { |
| 334 if(target>=targetLimit) { |
| 335 /* target is full */ |
| 336 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 337 break; |
| 338 } |
| 339 b=*source++; |
| 340 ++nextSourceIndex; |
| 341 switch(state) { |
| 342 case readCommand: |
| 343 /* redundant conditions are commented out */ |
| 344 /* here: b<0x20 because otherwise we would be in fastSingle */ |
| 345 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0
xd || b==0xa || b==9 || b==0 */) { |
| 346 /* CR/LF/TAB/NUL */ |
| 347 *target++=(UChar)b; |
| 348 if(offsets!=NULL) { |
| 349 *offsets++=sourceIndex; |
| 350 } |
| 351 sourceIndex=nextSourceIndex; |
| 352 goto fastSingle; |
| 353 } else if(SC0<=b) { |
| 354 if(b<=SC7) { |
| 355 dynamicWindow=(int8_t)(b-SC0); |
| 356 sourceIndex=nextSourceIndex; |
| 357 goto fastSingle; |
| 358 } else /* if(SD0<=b && b<=SD7) */ { |
| 359 dynamicWindow=(int8_t)(b-SD0); |
| 360 state=defineOne; |
| 361 } |
| 362 } else if(/* SQ0<=b && */ b<=SQ7) { |
| 363 quoteWindow=(int8_t)(b-SQ0); |
| 364 state=quoteOne; |
| 365 } else if(b==SDX) { |
| 366 state=definePairOne; |
| 367 } else if(b==SQU) { |
| 368 state=quotePairOne; |
| 369 } else if(b==SCU) { |
| 370 sourceIndex=nextSourceIndex; |
| 371 isSingleByteMode=FALSE; |
| 372 goto fastUnicode; |
| 373 } else /* Srs */ { |
| 374 /* callback(illegal) */ |
| 375 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 376 cnv->toUBytes[0]=b; |
| 377 cnv->toULength=1; |
| 378 goto endloop; |
| 379 } |
| 380 |
| 381 /* store the first byte of a multibyte sequence in toUBytes[] */ |
| 382 cnv->toUBytes[0]=b; |
| 383 cnv->toULength=1; |
| 384 break; |
| 385 case quotePairOne: |
| 386 byteOne=b; |
| 387 cnv->toUBytes[1]=b; |
| 388 cnv->toULength=2; |
| 389 state=quotePairTwo; |
| 390 break; |
| 391 case quotePairTwo: |
| 392 *target++=(UChar)((byteOne<<8)|b); |
| 393 if(offsets!=NULL) { |
| 394 *offsets++=sourceIndex; |
| 395 } |
| 396 sourceIndex=nextSourceIndex; |
| 397 state=readCommand; |
| 398 goto fastSingle; |
| 399 case quoteOne: |
| 400 if(b<0x80) { |
| 401 /* all static offsets are in the BMP */ |
| 402 *target++=(UChar)(staticOffsets[quoteWindow]+b); |
| 403 if(offsets!=NULL) { |
| 404 *offsets++=sourceIndex; |
| 405 } |
| 406 } else { |
| 407 /* write from dynamic window */ |
| 408 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); |
| 409 if(c<=0xffff) { |
| 410 *target++=(UChar)c; |
| 411 if(offsets!=NULL) { |
| 412 *offsets++=sourceIndex; |
| 413 } |
| 414 } else { |
| 415 /* output surrogate pair */ |
| 416 *target++=(UChar)(0xd7c0+(c>>10)); |
| 417 if(target<targetLimit) { |
| 418 *target++=(UChar)(0xdc00|(c&0x3ff)); |
| 419 if(offsets!=NULL) { |
| 420 *offsets++=sourceIndex; |
| 421 *offsets++=sourceIndex; |
| 422 } |
| 423 } else { |
| 424 /* target overflow */ |
| 425 if(offsets!=NULL) { |
| 426 *offsets++=sourceIndex; |
| 427 } |
| 428 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); |
| 429 cnv->UCharErrorBufferLength=1; |
| 430 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 431 goto endloop; |
| 432 } |
| 433 } |
| 434 } |
| 435 sourceIndex=nextSourceIndex; |
| 436 state=readCommand; |
| 437 goto fastSingle; |
| 438 case definePairOne: |
| 439 dynamicWindow=(int8_t)((b>>5)&7); |
| 440 byteOne=(uint8_t)(b&0x1f); |
| 441 cnv->toUBytes[1]=b; |
| 442 cnv->toULength=2; |
| 443 state=definePairTwo; |
| 444 break; |
| 445 case definePairTwo: |
| 446 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL |
b<<7UL); |
| 447 sourceIndex=nextSourceIndex; |
| 448 state=readCommand; |
| 449 goto fastSingle; |
| 450 case defineOne: |
| 451 if(b==0) { |
| 452 /* callback(illegal): Reserved window offset value 0 */ |
| 453 cnv->toUBytes[1]=b; |
| 454 cnv->toULength=2; |
| 455 goto endloop; |
| 456 } else if(b<gapThreshold) { |
| 457 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; |
| 458 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)
) { |
| 459 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; |
| 460 } else if(b>=fixedThreshold) { |
| 461 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedT
hreshold]; |
| 462 } else { |
| 463 /* callback(illegal): Reserved window offset value 0xa8..0xf
8 */ |
| 464 cnv->toUBytes[1]=b; |
| 465 cnv->toULength=2; |
| 466 goto endloop; |
| 467 } |
| 468 sourceIndex=nextSourceIndex; |
| 469 state=readCommand; |
| 470 goto fastSingle; |
| 471 } |
| 472 } |
| 473 } else { |
| 474 /* fast path for Unicode mode */ |
| 475 if(state==readCommand) { |
| 476 fastUnicode: |
| 477 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*so
urce)-UC0)>(Urs-UC0)) { |
| 478 *target++=(UChar)((b<<8)|source[1]); |
| 479 if(offsets!=NULL) { |
| 480 *offsets++=sourceIndex; |
| 481 } |
| 482 sourceIndex=nextSourceIndex; |
| 483 nextSourceIndex+=2; |
| 484 source+=2; |
| 485 } |
| 486 } |
| 487 |
| 488 /* normal state machine for Unicode mode */ |
| 489 /* unicodeByteMode: */ |
| 490 while(source<sourceLimit) { |
| 491 if(target>=targetLimit) { |
| 492 /* target is full */ |
| 493 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 494 break; |
| 495 } |
| 496 b=*source++; |
| 497 ++nextSourceIndex; |
| 498 switch(state) { |
| 499 case readCommand: |
| 500 if((uint8_t)(b-UC0)>(Urs-UC0)) { |
| 501 byteOne=b; |
| 502 cnv->toUBytes[0]=b; |
| 503 cnv->toULength=1; |
| 504 state=quotePairTwo; |
| 505 } else if(/* UC0<=b && */ b<=UC7) { |
| 506 dynamicWindow=(int8_t)(b-UC0); |
| 507 sourceIndex=nextSourceIndex; |
| 508 isSingleByteMode=TRUE; |
| 509 goto fastSingle; |
| 510 } else if(/* UD0<=b && */ b<=UD7) { |
| 511 dynamicWindow=(int8_t)(b-UD0); |
| 512 isSingleByteMode=TRUE; |
| 513 cnv->toUBytes[0]=b; |
| 514 cnv->toULength=1; |
| 515 state=defineOne; |
| 516 goto singleByteMode; |
| 517 } else if(b==UDX) { |
| 518 isSingleByteMode=TRUE; |
| 519 cnv->toUBytes[0]=b; |
| 520 cnv->toULength=1; |
| 521 state=definePairOne; |
| 522 goto singleByteMode; |
| 523 } else if(b==UQU) { |
| 524 cnv->toUBytes[0]=b; |
| 525 cnv->toULength=1; |
| 526 state=quotePairOne; |
| 527 } else /* Urs */ { |
| 528 /* callback(illegal) */ |
| 529 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 530 cnv->toUBytes[0]=b; |
| 531 cnv->toULength=1; |
| 532 goto endloop; |
| 533 } |
| 534 break; |
| 535 case quotePairOne: |
| 536 byteOne=b; |
| 537 cnv->toUBytes[1]=b; |
| 538 cnv->toULength=2; |
| 539 state=quotePairTwo; |
| 540 break; |
| 541 case quotePairTwo: |
| 542 *target++=(UChar)((byteOne<<8)|b); |
| 543 if(offsets!=NULL) { |
| 544 *offsets++=sourceIndex; |
| 545 } |
| 546 sourceIndex=nextSourceIndex; |
| 547 state=readCommand; |
| 548 goto fastUnicode; |
| 549 } |
| 550 } |
| 551 } |
| 552 endloop: |
| 553 |
| 554 /* set the converter state back into UConverter */ |
| 555 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { |
| 556 /* reset to deal with the next character */ |
| 557 state=readCommand; |
| 558 } else if(state==readCommand) { |
| 559 /* not in a multi-byte sequence, reset toULength */ |
| 560 cnv->toULength=0; |
| 561 } |
| 562 scsu->toUIsSingleByteMode=isSingleByteMode; |
| 563 scsu->toUState=state; |
| 564 scsu->toUQuoteWindow=quoteWindow; |
| 565 scsu->toUDynamicWindow=dynamicWindow; |
| 566 scsu->toUByteOne=byteOne; |
| 567 |
| 568 /* write back the updated pointers */ |
| 569 pArgs->source=(const char *)source; |
| 570 pArgs->target=target; |
| 571 pArgs->offsets=offsets; |
| 572 return; |
| 573 } |
| 574 |
| 575 /* |
| 576 * Identical to _SCSUToUnicodeWithOffsets but without offset handling. |
| 577 * If a change is made in the original function, then either |
| 578 * change this function the same way or |
| 579 * re-copy the original function and remove the variables |
| 580 * offsets, sourceIndex, and nextSourceIndex. |
| 581 */ |
| 582 static void |
| 583 _SCSUToUnicode(UConverterToUnicodeArgs *pArgs, |
| 584 UErrorCode *pErrorCode) { |
| 585 UConverter *cnv; |
| 586 SCSUData *scsu; |
| 587 const uint8_t *source, *sourceLimit; |
| 588 UChar *target; |
| 589 const UChar *targetLimit; |
| 590 UBool isSingleByteMode; |
| 591 uint8_t state, byteOne; |
| 592 int8_t quoteWindow, dynamicWindow; |
| 593 |
| 594 uint8_t b; |
| 595 |
| 596 /* set up the local pointers */ |
| 597 cnv=pArgs->converter; |
| 598 scsu=(SCSUData *)cnv->extraInfo; |
| 599 |
| 600 source=(const uint8_t *)pArgs->source; |
| 601 sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
| 602 target=pArgs->target; |
| 603 targetLimit=pArgs->targetLimit; |
| 604 |
| 605 /* get the state machine state */ |
| 606 isSingleByteMode=scsu->toUIsSingleByteMode; |
| 607 state=scsu->toUState; |
| 608 quoteWindow=scsu->toUQuoteWindow; |
| 609 dynamicWindow=scsu->toUDynamicWindow; |
| 610 byteOne=scsu->toUByteOne; |
| 611 |
| 612 /* |
| 613 * conversion "loop" |
| 614 * |
| 615 * For performance, this is not a normal C loop. |
| 616 * Instead, there are two code blocks for the two SCSU modes. |
| 617 * The function branches to either one, and a change of the mode is done wit
h a goto to |
| 618 * the other branch. |
| 619 * |
| 620 * Each branch has two conventional loops: |
| 621 * - a fast-path loop for the most common codes in the mode |
| 622 * - a loop for all other codes in the mode |
| 623 * When the fast-path runs into a code that it cannot handle, its loop ends
and it |
| 624 * runs into the following loop to handle the other codes. |
| 625 * The end of the input or output buffer is also handled by the slower loop. |
| 626 * The slow loop jumps (goto) to the fast-path loop again as soon as possibl
e. |
| 627 * |
| 628 * The callback handling is done by returning with an error code. |
| 629 * The conversion framework actually calls the callback function. |
| 630 */ |
| 631 if(isSingleByteMode) { |
| 632 /* fast path for single-byte mode */ |
| 633 if(state==readCommand) { |
| 634 fastSingle: |
| 635 while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20)
{ |
| 636 ++source; |
| 637 if(b<=0x7f) { |
| 638 /* write US-ASCII graphic character or DEL */ |
| 639 *target++=(UChar)b; |
| 640 } else { |
| 641 /* write from dynamic window */ |
| 642 uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); |
| 643 if(c<=0xffff) { |
| 644 *target++=(UChar)c; |
| 645 } else { |
| 646 /* output surrogate pair */ |
| 647 *target++=(UChar)(0xd7c0+(c>>10)); |
| 648 if(target<targetLimit) { |
| 649 *target++=(UChar)(0xdc00|(c&0x3ff)); |
| 650 } else { |
| 651 /* target overflow */ |
| 652 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); |
| 653 cnv->UCharErrorBufferLength=1; |
| 654 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 655 goto endloop; |
| 656 } |
| 657 } |
| 658 } |
| 659 } |
| 660 } |
| 661 |
| 662 /* normal state machine for single-byte mode, minus handling for what fa
stSingle covers */ |
| 663 singleByteMode: |
| 664 while(source<sourceLimit) { |
| 665 if(target>=targetLimit) { |
| 666 /* target is full */ |
| 667 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 668 break; |
| 669 } |
| 670 b=*source++; |
| 671 switch(state) { |
| 672 case readCommand: |
| 673 /* redundant conditions are commented out */ |
| 674 /* here: b<0x20 because otherwise we would be in fastSingle */ |
| 675 if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0
xd || b==0xa || b==9 || b==0 */) { |
| 676 /* CR/LF/TAB/NUL */ |
| 677 *target++=(UChar)b; |
| 678 goto fastSingle; |
| 679 } else if(SC0<=b) { |
| 680 if(b<=SC7) { |
| 681 dynamicWindow=(int8_t)(b-SC0); |
| 682 goto fastSingle; |
| 683 } else /* if(SD0<=b && b<=SD7) */ { |
| 684 dynamicWindow=(int8_t)(b-SD0); |
| 685 state=defineOne; |
| 686 } |
| 687 } else if(/* SQ0<=b && */ b<=SQ7) { |
| 688 quoteWindow=(int8_t)(b-SQ0); |
| 689 state=quoteOne; |
| 690 } else if(b==SDX) { |
| 691 state=definePairOne; |
| 692 } else if(b==SQU) { |
| 693 state=quotePairOne; |
| 694 } else if(b==SCU) { |
| 695 isSingleByteMode=FALSE; |
| 696 goto fastUnicode; |
| 697 } else /* Srs */ { |
| 698 /* callback(illegal) */ |
| 699 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 700 cnv->toUBytes[0]=b; |
| 701 cnv->toULength=1; |
| 702 goto endloop; |
| 703 } |
| 704 |
| 705 /* store the first byte of a multibyte sequence in toUBytes[] */ |
| 706 cnv->toUBytes[0]=b; |
| 707 cnv->toULength=1; |
| 708 break; |
| 709 case quotePairOne: |
| 710 byteOne=b; |
| 711 cnv->toUBytes[1]=b; |
| 712 cnv->toULength=2; |
| 713 state=quotePairTwo; |
| 714 break; |
| 715 case quotePairTwo: |
| 716 *target++=(UChar)((byteOne<<8)|b); |
| 717 state=readCommand; |
| 718 goto fastSingle; |
| 719 case quoteOne: |
| 720 if(b<0x80) { |
| 721 /* all static offsets are in the BMP */ |
| 722 *target++=(UChar)(staticOffsets[quoteWindow]+b); |
| 723 } else { |
| 724 /* write from dynamic window */ |
| 725 uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); |
| 726 if(c<=0xffff) { |
| 727 *target++=(UChar)c; |
| 728 } else { |
| 729 /* output surrogate pair */ |
| 730 *target++=(UChar)(0xd7c0+(c>>10)); |
| 731 if(target<targetLimit) { |
| 732 *target++=(UChar)(0xdc00|(c&0x3ff)); |
| 733 } else { |
| 734 /* target overflow */ |
| 735 cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); |
| 736 cnv->UCharErrorBufferLength=1; |
| 737 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 738 goto endloop; |
| 739 } |
| 740 } |
| 741 } |
| 742 state=readCommand; |
| 743 goto fastSingle; |
| 744 case definePairOne: |
| 745 dynamicWindow=(int8_t)((b>>5)&7); |
| 746 byteOne=(uint8_t)(b&0x1f); |
| 747 cnv->toUBytes[1]=b; |
| 748 cnv->toULength=2; |
| 749 state=definePairTwo; |
| 750 break; |
| 751 case definePairTwo: |
| 752 scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL |
b<<7UL); |
| 753 state=readCommand; |
| 754 goto fastSingle; |
| 755 case defineOne: |
| 756 if(b==0) { |
| 757 /* callback(illegal): Reserved window offset value 0 */ |
| 758 cnv->toUBytes[1]=b; |
| 759 cnv->toULength=2; |
| 760 goto endloop; |
| 761 } else if(b<gapThreshold) { |
| 762 scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; |
| 763 } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)
) { |
| 764 scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; |
| 765 } else if(b>=fixedThreshold) { |
| 766 scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedT
hreshold]; |
| 767 } else { |
| 768 /* callback(illegal): Reserved window offset value 0xa8..0xf
8 */ |
| 769 cnv->toUBytes[1]=b; |
| 770 cnv->toULength=2; |
| 771 goto endloop; |
| 772 } |
| 773 state=readCommand; |
| 774 goto fastSingle; |
| 775 } |
| 776 } |
| 777 } else { |
| 778 /* fast path for Unicode mode */ |
| 779 if(state==readCommand) { |
| 780 fastUnicode: |
| 781 while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*so
urce)-UC0)>(Urs-UC0)) { |
| 782 *target++=(UChar)((b<<8)|source[1]); |
| 783 source+=2; |
| 784 } |
| 785 } |
| 786 |
| 787 /* normal state machine for Unicode mode */ |
| 788 /* unicodeByteMode: */ |
| 789 while(source<sourceLimit) { |
| 790 if(target>=targetLimit) { |
| 791 /* target is full */ |
| 792 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 793 break; |
| 794 } |
| 795 b=*source++; |
| 796 switch(state) { |
| 797 case readCommand: |
| 798 if((uint8_t)(b-UC0)>(Urs-UC0)) { |
| 799 byteOne=b; |
| 800 cnv->toUBytes[0]=b; |
| 801 cnv->toULength=1; |
| 802 state=quotePairTwo; |
| 803 } else if(/* UC0<=b && */ b<=UC7) { |
| 804 dynamicWindow=(int8_t)(b-UC0); |
| 805 isSingleByteMode=TRUE; |
| 806 goto fastSingle; |
| 807 } else if(/* UD0<=b && */ b<=UD7) { |
| 808 dynamicWindow=(int8_t)(b-UD0); |
| 809 isSingleByteMode=TRUE; |
| 810 cnv->toUBytes[0]=b; |
| 811 cnv->toULength=1; |
| 812 state=defineOne; |
| 813 goto singleByteMode; |
| 814 } else if(b==UDX) { |
| 815 isSingleByteMode=TRUE; |
| 816 cnv->toUBytes[0]=b; |
| 817 cnv->toULength=1; |
| 818 state=definePairOne; |
| 819 goto singleByteMode; |
| 820 } else if(b==UQU) { |
| 821 cnv->toUBytes[0]=b; |
| 822 cnv->toULength=1; |
| 823 state=quotePairOne; |
| 824 } else /* Urs */ { |
| 825 /* callback(illegal) */ |
| 826 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 827 cnv->toUBytes[0]=b; |
| 828 cnv->toULength=1; |
| 829 goto endloop; |
| 830 } |
| 831 break; |
| 832 case quotePairOne: |
| 833 byteOne=b; |
| 834 cnv->toUBytes[1]=b; |
| 835 cnv->toULength=2; |
| 836 state=quotePairTwo; |
| 837 break; |
| 838 case quotePairTwo: |
| 839 *target++=(UChar)((byteOne<<8)|b); |
| 840 state=readCommand; |
| 841 goto fastUnicode; |
| 842 } |
| 843 } |
| 844 } |
| 845 endloop: |
| 846 |
| 847 /* set the converter state back into UConverter */ |
| 848 if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { |
| 849 /* reset to deal with the next character */ |
| 850 state=readCommand; |
| 851 } else if(state==readCommand) { |
| 852 /* not in a multi-byte sequence, reset toULength */ |
| 853 cnv->toULength=0; |
| 854 } |
| 855 scsu->toUIsSingleByteMode=isSingleByteMode; |
| 856 scsu->toUState=state; |
| 857 scsu->toUQuoteWindow=quoteWindow; |
| 858 scsu->toUDynamicWindow=dynamicWindow; |
| 859 scsu->toUByteOne=byteOne; |
| 860 |
| 861 /* write back the updated pointers */ |
| 862 pArgs->source=(const char *)source; |
| 863 pArgs->target=target; |
| 864 return; |
| 865 } |
| 866 |
| 867 /* SCSU-from-Unicode conversion functions ----------------------------------- */ |
| 868 |
| 869 /* |
| 870 * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve |
| 871 * reasonable results. The lookahead is minimal. |
| 872 * Many cases are simple: |
| 873 * A character fits directly into the current mode, a dynamic or static window, |
| 874 * or is not compressible. These cases are tested first. |
| 875 * Real compression heuristics are applied to the rest, in code branches for |
| 876 * single/Unicode mode and BMP/supplementary code points. |
| 877 * The heuristics used here are extremely simple. |
| 878 */ |
| 879 |
/*
 * Get the number of the window that this character is in, or -1.
 *
 * @param offsets start offsets of 8 windows, each covering 0x80 code points
 * @param c       code point to look up
 * @return index (0..7) of the first window with offsets[i]<=c<=offsets[i]+0x7f,
 *         or -1 if there is none
 */
static int8_t
getWindow(const uint32_t offsets[8], uint32_t c) {
    int i;
    for(i=0; i<8; ++i) {
        /* unsigned subtraction wraps for c<offsets[i], so one compare checks both bounds */
        if((uint32_t)(c-offsets[i])<=0x7f) {
            return (int8_t)(i);
        }
    }
    return -1;
}
| 891 |
| 892 /* is the character in the dynamic window starting at the offset, or in the dire
ct-encoded range? */ |
| 893 static UBool |
| 894 isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) { |
| 895 return (UBool)(c<=offset+0x7f && |
| 896 (c>=offset || (c<=0x7f && |
| 897 (c>=0x20 || (1UL<<c)&0x2601)))); |
| 898 /* binary 0010 0110 0000 0001, |
| 899 check for b==0xd || b==0xa || b==9 || b==0 */ |
| 900 } |
| 901 |
| 902 /* |
| 903 * getNextDynamicWindow returns the next dynamic window to be redefined |
| 904 */ |
| 905 static int8_t |
| 906 getNextDynamicWindow(SCSUData *scsu) { |
| 907 int8_t window=scsu->windowUse[scsu->nextWindowUseIndex]; |
| 908 if(++scsu->nextWindowUseIndex==8) { |
| 909 scsu->nextWindowUseIndex=0; |
| 910 } |
| 911 return window; |
| 912 } |
| 913 |
| 914 /* |
| 915 * useDynamicWindow() adjusts |
| 916 * windowUse[] and nextWindowUseIndex for the algorithm to choose |
| 917 * the next dynamic window to be defined; |
| 918 * a subclass may override it and provide its own algorithm. |
| 919 */ |
| 920 static void |
| 921 useDynamicWindow(SCSUData *scsu, int8_t window) { |
| 922 /* |
| 923 * move the existing window, which just became the most recently used one, |
| 924 * up in windowUse[] to nextWindowUseIndex-1 |
| 925 */ |
| 926 |
| 927 /* first, find the index of the window - backwards to favor the more recentl
y used windows */ |
| 928 int i, j; |
| 929 |
| 930 i=scsu->nextWindowUseIndex; |
| 931 do { |
| 932 if(--i<0) { |
| 933 i=7; |
| 934 } |
| 935 } while(scsu->windowUse[i]!=window); |
| 936 |
| 937 /* now copy each windowUse[i+1] to [i] */ |
| 938 j=i+1; |
| 939 if(j==8) { |
| 940 j=0; |
| 941 } |
| 942 while(j!=scsu->nextWindowUseIndex) { |
| 943 scsu->windowUse[i]=scsu->windowUse[j]; |
| 944 i=j; |
| 945 if(++j==8) { j=0; } |
| 946 } |
| 947 |
| 948 /* finally, set the window into the most recently used index */ |
| 949 scsu->windowUse[i]=window; |
| 950 } |
| 951 |
| 952 /* |
| 953 * calculate the offset and the code for a dynamic window that contains the char
acter |
| 954 * takes fixed offsets into account |
| 955 * the offset of the window is stored in the offset variable, |
| 956 * the code is returned |
| 957 * |
| 958 * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX,
subtract 0x200 to get the true code |
| 959 */ |
| 960 static int |
| 961 getDynamicOffset(uint32_t c, uint32_t *pOffset) { |
| 962 int i; |
| 963 |
| 964 for(i=0; i<7; ++i) { |
| 965 if((uint32_t)(c-fixedOffsets[i])<=0x7f) { |
| 966 *pOffset=fixedOffsets[i]; |
| 967 return 0xf9+i; |
| 968 } |
| 969 } |
| 970 |
| 971 if(c<0x80) { |
| 972 /* No dynamic window for US-ASCII. */ |
| 973 return -1; |
| 974 } else if(c<0x3400 || |
| 975 (uint32_t)(c-0x10000)<(0x14000-0x10000) || |
| 976 (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000) |
| 977 ) { |
| 978 /* This character is in a code range for a "small", i.e., reasonably win
dowable, script. */ |
| 979 *pOffset=c&0x7fffff80; |
| 980 return (int)(c>>7); |
| 981 } else if(0xe000<=c && c!=0xfeff && c<0xfff0) { |
| 982 /* For these characters we need to take the gapOffset into account. */ |
| 983 *pOffset=c&0x7fffff80; |
| 984 return (int)((c-gapOffset)>>7); |
| 985 } else { |
| 986 return -1; |
| 987 } |
| 988 } |
| 989 |
/*
 * Idea for compression:
 *  - save SCSUData and other state before really starting work
 *  - at endloop, see if compression could be better with just Unicode mode
 *  - don't do this if a callback has been called
 *  - if Unicode mode would be smaller, then override the results with it - may need SCU at the beginning
 *  - different buffer handling!
 *
 * Drawback or need for corrective handling:
 * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
 * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible,
 * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
 *
 * How to achieve both?
 *  - Only replace the result after an SDX or SCU?
 */
| 1006 |
| 1007 static void |
| 1008 _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, |
| 1009 UErrorCode *pErrorCode) { |
| 1010 UConverter *cnv; |
| 1011 SCSUData *scsu; |
| 1012 const UChar *source, *sourceLimit; |
| 1013 uint8_t *target; |
| 1014 int32_t targetCapacity; |
| 1015 int32_t *offsets; |
| 1016 |
| 1017 UBool isSingleByteMode; |
| 1018 uint8_t dynamicWindow; |
| 1019 uint32_t currentOffset; |
| 1020 |
| 1021 uint32_t c, delta; |
| 1022 |
| 1023 int32_t sourceIndex, nextSourceIndex; |
| 1024 |
| 1025 int32_t length; |
| 1026 |
| 1027 /* variables for compression heuristics */ |
| 1028 uint32_t offset; |
| 1029 UChar lead, trail; |
| 1030 int code; |
| 1031 int8_t window; |
| 1032 |
| 1033 /* set up the local pointers */ |
| 1034 cnv=pArgs->converter; |
| 1035 scsu=(SCSUData *)cnv->extraInfo; |
| 1036 |
| 1037 /* set up the local pointers */ |
| 1038 source=pArgs->source; |
| 1039 sourceLimit=pArgs->sourceLimit; |
| 1040 target=(uint8_t *)pArgs->target; |
| 1041 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
| 1042 offsets=pArgs->offsets; |
| 1043 |
| 1044 /* get the state machine state */ |
| 1045 isSingleByteMode=scsu->fromUIsSingleByteMode; |
| 1046 dynamicWindow=scsu->fromUDynamicWindow; |
| 1047 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| 1048 |
| 1049 c=cnv->fromUChar32; |
| 1050 |
| 1051 /* sourceIndex=-1 if the current character began in the previous buffer */ |
| 1052 sourceIndex= c==0 ? 0 : -1; |
| 1053 nextSourceIndex=0; |
| 1054 |
| 1055 /* similar conversion "loop" as in toUnicode */ |
| 1056 loop: |
| 1057 if(isSingleByteMode) { |
| 1058 if(c!=0 && targetCapacity>0) { |
| 1059 goto getTrailSingle; |
| 1060 } |
| 1061 |
| 1062 /* state machine for single-byte mode */ |
| 1063 /* singleByteMode: */ |
| 1064 while(source<sourceLimit) { |
| 1065 if(targetCapacity<=0) { |
| 1066 /* target is full */ |
| 1067 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 1068 break; |
| 1069 } |
| 1070 c=*source++; |
| 1071 ++nextSourceIndex; |
| 1072 |
| 1073 if((c-0x20)<=0x5f) { |
| 1074 /* pass US-ASCII graphic character through */ |
| 1075 *target++=(uint8_t)c; |
| 1076 if(offsets!=NULL) { |
| 1077 *offsets++=sourceIndex; |
| 1078 } |
| 1079 --targetCapacity; |
| 1080 } else if(c<0x20) { |
| 1081 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0
xd || b==0xa || b==9 || b==0 */) { |
| 1082 /* CR/LF/TAB/NUL */ |
| 1083 *target++=(uint8_t)c; |
| 1084 if(offsets!=NULL) { |
| 1085 *offsets++=sourceIndex; |
| 1086 } |
| 1087 --targetCapacity; |
| 1088 } else { |
| 1089 /* quote C0 control character */ |
| 1090 c|=SQ0<<8; |
| 1091 length=2; |
| 1092 goto outputBytes; |
| 1093 } |
| 1094 } else if((delta=c-currentOffset)<=0x7f) { |
| 1095 /* use the current dynamic window */ |
| 1096 *target++=(uint8_t)(delta|0x80); |
| 1097 if(offsets!=NULL) { |
| 1098 *offsets++=sourceIndex; |
| 1099 } |
| 1100 --targetCapacity; |
| 1101 } else if(UTF_IS_SURROGATE(c)) { |
| 1102 if(UTF_IS_SURROGATE_FIRST(c)) { |
| 1103 getTrailSingle: |
| 1104 lead=(UChar)c; |
| 1105 if(source<sourceLimit) { |
| 1106 /* test the following code unit */ |
| 1107 trail=*source; |
| 1108 if(UTF_IS_SECOND_SURROGATE(trail)) { |
| 1109 ++source; |
| 1110 ++nextSourceIndex; |
| 1111 c=UTF16_GET_PAIR_VALUE(c, trail); |
| 1112 /* convert this surrogate code point */ |
| 1113 /* exit this condition tree */ |
| 1114 } else { |
| 1115 /* this is an unmatched lead code unit (1st surrogat
e) */ |
| 1116 /* callback(illegal) */ |
| 1117 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 1118 goto endloop; |
| 1119 } |
| 1120 } else { |
| 1121 /* no more input */ |
| 1122 break; |
| 1123 } |
| 1124 } else { |
| 1125 /* this is an unmatched trail code unit (2nd surrogate) */ |
| 1126 /* callback(illegal) */ |
| 1127 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 1128 goto endloop; |
| 1129 } |
| 1130 |
| 1131 /* compress supplementary character U+10000..U+10ffff */ |
| 1132 if((delta=c-currentOffset)<=0x7f) { |
| 1133 /* use the current dynamic window */ |
| 1134 *target++=(uint8_t)(delta|0x80); |
| 1135 if(offsets!=NULL) { |
| 1136 *offsets++=sourceIndex; |
| 1137 } |
| 1138 --targetCapacity; |
| 1139 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
| 1140 /* there is a dynamic window that contains this character, c
hange to it */ |
| 1141 dynamicWindow=window; |
| 1142 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| 1143 useDynamicWindow(scsu, dynamicWindow); |
| 1144 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
| 1145 length=2; |
| 1146 goto outputBytes; |
| 1147 } else if((code=getDynamicOffset(c, &offset))>=0) { |
| 1148 /* might check if there are more characters in this window t
o come */ |
| 1149 /* define an extended window with this character */ |
| 1150 code-=0x200; |
| 1151 dynamicWindow=getNextDynamicWindow(scsu); |
| 1152 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offse
t; |
| 1153 useDynamicWindow(scsu, dynamicWindow); |
| 1154 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32
_t)code<<8)|(c-currentOffset)|0x80; |
| 1155 length=4; |
| 1156 goto outputBytes; |
| 1157 } else { |
| 1158 /* change to Unicode mode and output this (lead, trail) pair
*/ |
| 1159 isSingleByteMode=FALSE; |
| 1160 *target++=(uint8_t)SCU; |
| 1161 if(offsets!=NULL) { |
| 1162 *offsets++=sourceIndex; |
| 1163 } |
| 1164 --targetCapacity; |
| 1165 c=((uint32_t)lead<<16)|trail; |
| 1166 length=4; |
| 1167 goto outputBytes; |
| 1168 } |
| 1169 } else if(c<0xa0) { |
| 1170 /* quote C1 control character */ |
| 1171 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ |
| 1172 length=2; |
| 1173 goto outputBytes; |
| 1174 } else if(c==0xfeff || c>=0xfff0) { |
| 1175 /* quote signature character=byte order mark and specials */ |
| 1176 c|=SQU<<16; |
| 1177 length=3; |
| 1178 goto outputBytes; |
| 1179 } else { |
| 1180 /* compress all other BMP characters */ |
| 1181 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
| 1182 /* there is a window defined that contains this character -
switch to it or quote from it? */ |
| 1183 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fro
mUDynamicOffsets[window], *source)) { |
| 1184 /* change to dynamic window */ |
| 1185 dynamicWindow=window; |
| 1186 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| 1187 useDynamicWindow(scsu, dynamicWindow); |
| 1188 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0
x80; |
| 1189 length=2; |
| 1190 goto outputBytes; |
| 1191 } else { |
| 1192 /* quote from dynamic window */ |
| 1193 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffse
ts[window])|0x80; |
| 1194 length=2; |
| 1195 goto outputBytes; |
| 1196 } |
| 1197 } else if((window=getWindow(staticOffsets, c))>=0) { |
| 1198 /* quote from static window */ |
| 1199 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); |
| 1200 length=2; |
| 1201 goto outputBytes; |
| 1202 } else if((code=getDynamicOffset(c, &offset))>=0) { |
| 1203 /* define a dynamic window with this character */ |
| 1204 dynamicWindow=getNextDynamicWindow(scsu); |
| 1205 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offse
t; |
| 1206 useDynamicWindow(scsu, dynamicWindow); |
| 1207 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c
-currentOffset)|0x80; |
| 1208 length=3; |
| 1209 goto outputBytes; |
| 1210 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && |
| 1211 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0x
d800-0x3400)) |
| 1212 ) { |
| 1213 /* |
| 1214 * this character is not compressible (a BMP ideograph or si
milar); |
| 1215 * switch to Unicode mode if this is the last character in t
he block |
| 1216 * or there is at least one more ideograph following immedia
tely |
| 1217 */ |
| 1218 isSingleByteMode=FALSE; |
| 1219 c|=SCU<<16; |
| 1220 length=3; |
| 1221 goto outputBytes; |
| 1222 } else { |
| 1223 /* quote Unicode */ |
| 1224 c|=SQU<<16; |
| 1225 length=3; |
| 1226 goto outputBytes; |
| 1227 } |
| 1228 } |
| 1229 |
| 1230 /* normal end of conversion: prepare for a new character */ |
| 1231 c=0; |
| 1232 sourceIndex=nextSourceIndex; |
| 1233 } |
| 1234 } else { |
| 1235 if(c!=0 && targetCapacity>0) { |
| 1236 goto getTrailUnicode; |
| 1237 } |
| 1238 |
| 1239 /* state machine for Unicode mode */ |
| 1240 /* unicodeByteMode: */ |
| 1241 while(source<sourceLimit) { |
| 1242 if(targetCapacity<=0) { |
| 1243 /* target is full */ |
| 1244 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 1245 break; |
| 1246 } |
| 1247 c=*source++; |
| 1248 ++nextSourceIndex; |
| 1249 |
| 1250 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) { |
| 1251 /* not compressible, write character directly */ |
| 1252 if(targetCapacity>=2) { |
| 1253 *target++=(uint8_t)(c>>8); |
| 1254 *target++=(uint8_t)c; |
| 1255 if(offsets!=NULL) { |
| 1256 *offsets++=sourceIndex; |
| 1257 *offsets++=sourceIndex; |
| 1258 } |
| 1259 targetCapacity-=2; |
| 1260 } else { |
| 1261 length=2; |
| 1262 goto outputBytes; |
| 1263 } |
| 1264 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0x
f300 */) { |
| 1265 /* compress BMP character if the following one is not an uncompr
essible ideograph */ |
| 1266 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x
3400))) { |
| 1267 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint3
2_t)(c-0x41)<26)) { |
| 1268 /* ASCII digit or letter */ |
| 1269 isSingleByteMode=TRUE; |
| 1270 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; |
| 1271 length=2; |
| 1272 goto outputBytes; |
| 1273 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=
0) { |
| 1274 /* there is a dynamic window that contains this characte
r, change to it */ |
| 1275 isSingleByteMode=TRUE; |
| 1276 dynamicWindow=window; |
| 1277 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| 1278 useDynamicWindow(scsu, dynamicWindow); |
| 1279 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0
x80; |
| 1280 length=2; |
| 1281 goto outputBytes; |
| 1282 } else if((code=getDynamicOffset(c, &offset))>=0) { |
| 1283 /* define a dynamic window with this character */ |
| 1284 isSingleByteMode=TRUE; |
| 1285 dynamicWindow=getNextDynamicWindow(scsu); |
| 1286 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=o
ffset; |
| 1287 useDynamicWindow(scsu, dynamicWindow); |
| 1288 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8
)|(c-currentOffset)|0x80; |
| 1289 length=3; |
| 1290 goto outputBytes; |
| 1291 } |
| 1292 } |
| 1293 |
| 1294 /* don't know how to compress this character, just write it dire
ctly */ |
| 1295 length=2; |
| 1296 goto outputBytes; |
| 1297 } else if(c<0xe000) { |
| 1298 /* c is a surrogate */ |
| 1299 if(UTF_IS_SURROGATE_FIRST(c)) { |
| 1300 getTrailUnicode: |
| 1301 lead=(UChar)c; |
| 1302 if(source<sourceLimit) { |
| 1303 /* test the following code unit */ |
| 1304 trail=*source; |
| 1305 if(UTF_IS_SECOND_SURROGATE(trail)) { |
| 1306 ++source; |
| 1307 ++nextSourceIndex; |
| 1308 c=UTF16_GET_PAIR_VALUE(c, trail); |
| 1309 /* convert this surrogate code point */ |
| 1310 /* exit this condition tree */ |
| 1311 } else { |
| 1312 /* this is an unmatched lead code unit (1st surrogat
e) */ |
| 1313 /* callback(illegal) */ |
| 1314 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 1315 goto endloop; |
| 1316 } |
| 1317 } else { |
| 1318 /* no more input */ |
| 1319 break; |
| 1320 } |
| 1321 } else { |
| 1322 /* this is an unmatched trail code unit (2nd surrogate) */ |
| 1323 /* callback(illegal) */ |
| 1324 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 1325 goto endloop; |
| 1326 } |
| 1327 |
| 1328 /* compress supplementary character */ |
| 1329 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && |
| 1330 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0
x3400)) |
| 1331 ) { |
| 1332 /* |
| 1333 * there is a dynamic window that contains this character an
d |
| 1334 * the following character is not uncompressible, |
| 1335 * change to the window |
| 1336 */ |
| 1337 isSingleByteMode=TRUE; |
| 1338 dynamicWindow=window; |
| 1339 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| 1340 useDynamicWindow(scsu, dynamicWindow); |
| 1341 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
| 1342 length=2; |
| 1343 goto outputBytes; |
| 1344 } else if(source<sourceLimit && lead==*source && /* too lazy to
check trail in same window as source[1] */ |
| 1345 (code=getDynamicOffset(c, &offset))>=0 |
| 1346 ) { |
| 1347 /* two supplementary characters in (probably) the same windo
w - define an extended one */ |
| 1348 isSingleByteMode=TRUE; |
| 1349 code-=0x200; |
| 1350 dynamicWindow=getNextDynamicWindow(scsu); |
| 1351 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offse
t; |
| 1352 useDynamicWindow(scsu, dynamicWindow); |
| 1353 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32
_t)code<<8)|(c-currentOffset)|0x80; |
| 1354 length=4; |
| 1355 goto outputBytes; |
| 1356 } else { |
| 1357 /* don't know how to compress this character, just write it
directly */ |
| 1358 c=((uint32_t)lead<<16)|trail; |
| 1359 length=4; |
| 1360 goto outputBytes; |
| 1361 } |
| 1362 } else /* 0xe000<=c<0xf300 */ { |
| 1363 /* quote to avoid SCSU tags */ |
| 1364 c|=UQU<<16; |
| 1365 length=3; |
| 1366 goto outputBytes; |
| 1367 } |
| 1368 |
| 1369 /* normal end of conversion: prepare for a new character */ |
| 1370 c=0; |
| 1371 sourceIndex=nextSourceIndex; |
| 1372 } |
| 1373 } |
| 1374 endloop: |
| 1375 |
| 1376 /* set the converter state back into UConverter */ |
| 1377 scsu->fromUIsSingleByteMode=isSingleByteMode; |
| 1378 scsu->fromUDynamicWindow=dynamicWindow; |
| 1379 |
| 1380 cnv->fromUChar32=c; |
| 1381 |
| 1382 /* write back the updated pointers */ |
| 1383 pArgs->source=source; |
| 1384 pArgs->target=(char *)target; |
| 1385 pArgs->offsets=offsets; |
| 1386 return; |
| 1387 |
| 1388 outputBytes: |
| 1389 /* write the output character bytes from c and length [code copied from ucnv
mbcs.c] */ |
| 1390 /* from the first if in the loop we know that targetCapacity>0 */ |
| 1391 if(length<=targetCapacity) { |
| 1392 if(offsets==NULL) { |
| 1393 switch(length) { |
| 1394 /* each branch falls through to the next one */ |
| 1395 case 4: |
| 1396 *target++=(uint8_t)(c>>24); |
| 1397 case 3: |
| 1398 *target++=(uint8_t)(c>>16); |
| 1399 case 2: |
| 1400 *target++=(uint8_t)(c>>8); |
| 1401 case 1: |
| 1402 *target++=(uint8_t)c; |
| 1403 default: |
| 1404 /* will never occur */ |
| 1405 break; |
| 1406 } |
| 1407 } else { |
| 1408 switch(length) { |
| 1409 /* each branch falls through to the next one */ |
| 1410 case 4: |
| 1411 *target++=(uint8_t)(c>>24); |
| 1412 *offsets++=sourceIndex; |
| 1413 case 3: |
| 1414 *target++=(uint8_t)(c>>16); |
| 1415 *offsets++=sourceIndex; |
| 1416 case 2: |
| 1417 *target++=(uint8_t)(c>>8); |
| 1418 *offsets++=sourceIndex; |
| 1419 case 1: |
| 1420 *target++=(uint8_t)c; |
| 1421 *offsets++=sourceIndex; |
| 1422 default: |
| 1423 /* will never occur */ |
| 1424 break; |
| 1425 } |
| 1426 } |
| 1427 targetCapacity-=length; |
| 1428 |
| 1429 /* normal end of conversion: prepare for a new character */ |
| 1430 c=0; |
| 1431 sourceIndex=nextSourceIndex; |
| 1432 goto loop; |
| 1433 } else { |
| 1434 uint8_t *p; |
| 1435 |
| 1436 /* |
| 1437 * We actually do this backwards here: |
| 1438 * In order to save an intermediate variable, we output |
| 1439 * first to the overflow buffer what does not fit into the |
| 1440 * regular target. |
| 1441 */ |
| 1442 /* we know that 0<=targetCapacity<length<=4 */ |
| 1443 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapa
city==1 */ |
| 1444 length-=targetCapacity; |
| 1445 p=(uint8_t *)cnv->charErrorBuffer; |
| 1446 switch(length) { |
| 1447 /* each branch falls through to the next one */ |
| 1448 case 4: |
| 1449 *p++=(uint8_t)(c>>24); |
| 1450 case 3: |
| 1451 *p++=(uint8_t)(c>>16); |
| 1452 case 2: |
| 1453 *p++=(uint8_t)(c>>8); |
| 1454 case 1: |
| 1455 *p=(uint8_t)c; |
| 1456 default: |
| 1457 /* will never occur */ |
| 1458 break; |
| 1459 } |
| 1460 cnv->charErrorBufferLength=(int8_t)length; |
| 1461 |
| 1462 /* now output what fits into the regular target */ |
| 1463 c>>=8*length; /* length was reduced by targetCapacity */ |
| 1464 switch(targetCapacity) { |
| 1465 /* each branch falls through to the next one */ |
| 1466 case 3: |
| 1467 *target++=(uint8_t)(c>>16); |
| 1468 if(offsets!=NULL) { |
| 1469 *offsets++=sourceIndex; |
| 1470 } |
| 1471 case 2: |
| 1472 *target++=(uint8_t)(c>>8); |
| 1473 if(offsets!=NULL) { |
| 1474 *offsets++=sourceIndex; |
| 1475 } |
| 1476 case 1: |
| 1477 *target++=(uint8_t)c; |
| 1478 if(offsets!=NULL) { |
| 1479 *offsets++=sourceIndex; |
| 1480 } |
| 1481 default: |
| 1482 break; |
| 1483 } |
| 1484 |
| 1485 /* target overflow */ |
| 1486 targetCapacity=0; |
| 1487 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 1488 c=0; |
| 1489 goto endloop; |
| 1490 } |
| 1491 } |
| 1492 |
| 1493 /* |
| 1494 * Identical to _SCSUFromUnicodeWithOffsets but without offset handling. |
| 1495 * If a change is made in the original function, then either |
| 1496 * change this function the same way or |
| 1497 * re-copy the original function and remove the variables |
| 1498 * offsets, sourceIndex, and nextSourceIndex. |
| 1499 */ |
| 1500 static void |
| 1501 _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs, |
| 1502 UErrorCode *pErrorCode) { |
| 1503 UConverter *cnv; |
| 1504 SCSUData *scsu; |
| 1505 const UChar *source, *sourceLimit; |
| 1506 uint8_t *target; |
| 1507 int32_t targetCapacity; |
| 1508 |
| 1509 UBool isSingleByteMode; |
| 1510 uint8_t dynamicWindow; |
| 1511 uint32_t currentOffset; |
| 1512 |
| 1513 uint32_t c, delta; |
| 1514 |
| 1515 int32_t length; |
| 1516 |
| 1517 /* variables for compression heuristics */ |
| 1518 uint32_t offset; |
| 1519 UChar lead, trail; |
| 1520 int code; |
| 1521 int8_t window; |
| 1522 |
| 1523 /* set up the local pointers */ |
| 1524 cnv=pArgs->converter; |
| 1525 scsu=(SCSUData *)cnv->extraInfo; |
| 1526 |
| 1527 /* set up the local pointers */ |
| 1528 source=pArgs->source; |
| 1529 sourceLimit=pArgs->sourceLimit; |
| 1530 target=(uint8_t *)pArgs->target; |
| 1531 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); |
| 1532 |
| 1533 /* get the state machine state */ |
| 1534 isSingleByteMode=scsu->fromUIsSingleByteMode; |
| 1535 dynamicWindow=scsu->fromUDynamicWindow; |
| 1536 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| 1537 |
| 1538 c=cnv->fromUChar32; |
| 1539 |
| 1540 /* similar conversion "loop" as in toUnicode */ |
| 1541 loop: |
| 1542 if(isSingleByteMode) { |
| 1543 if(c!=0 && targetCapacity>0) { |
| 1544 goto getTrailSingle; |
| 1545 } |
| 1546 |
| 1547 /* state machine for single-byte mode */ |
| 1548 /* singleByteMode: */ |
| 1549 while(source<sourceLimit) { |
| 1550 if(targetCapacity<=0) { |
| 1551 /* target is full */ |
| 1552 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 1553 break; |
| 1554 } |
| 1555 c=*source++; |
| 1556 |
| 1557 if((c-0x20)<=0x5f) { |
| 1558 /* pass US-ASCII graphic character through */ |
| 1559 *target++=(uint8_t)c; |
| 1560 --targetCapacity; |
| 1561 } else if(c<0x20) { |
| 1562 if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0
xd || b==0xa || b==9 || b==0 */) { |
| 1563 /* CR/LF/TAB/NUL */ |
| 1564 *target++=(uint8_t)c; |
| 1565 --targetCapacity; |
| 1566 } else { |
| 1567 /* quote C0 control character */ |
| 1568 c|=SQ0<<8; |
| 1569 length=2; |
| 1570 goto outputBytes; |
| 1571 } |
| 1572 } else if((delta=c-currentOffset)<=0x7f) { |
| 1573 /* use the current dynamic window */ |
| 1574 *target++=(uint8_t)(delta|0x80); |
| 1575 --targetCapacity; |
| 1576 } else if(UTF_IS_SURROGATE(c)) { |
| 1577 if(UTF_IS_SURROGATE_FIRST(c)) { |
| 1578 getTrailSingle: |
| 1579 lead=(UChar)c; |
| 1580 if(source<sourceLimit) { |
| 1581 /* test the following code unit */ |
| 1582 trail=*source; |
| 1583 if(UTF_IS_SECOND_SURROGATE(trail)) { |
| 1584 ++source; |
| 1585 c=UTF16_GET_PAIR_VALUE(c, trail); |
| 1586 /* convert this surrogate code point */ |
| 1587 /* exit this condition tree */ |
| 1588 } else { |
| 1589 /* this is an unmatched lead code unit (1st surrogat
e) */ |
| 1590 /* callback(illegal) */ |
| 1591 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 1592 goto endloop; |
| 1593 } |
| 1594 } else { |
| 1595 /* no more input */ |
| 1596 break; |
| 1597 } |
| 1598 } else { |
| 1599 /* this is an unmatched trail code unit (2nd surrogate) */ |
| 1600 /* callback(illegal) */ |
| 1601 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 1602 goto endloop; |
| 1603 } |
| 1604 |
| 1605 /* compress supplementary character U+10000..U+10ffff */ |
| 1606 if((delta=c-currentOffset)<=0x7f) { |
| 1607 /* use the current dynamic window */ |
| 1608 *target++=(uint8_t)(delta|0x80); |
| 1609 --targetCapacity; |
| 1610 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
| 1611 /* there is a dynamic window that contains this character, c
hange to it */ |
| 1612 dynamicWindow=window; |
| 1613 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| 1614 useDynamicWindow(scsu, dynamicWindow); |
| 1615 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
| 1616 length=2; |
| 1617 goto outputBytes; |
| 1618 } else if((code=getDynamicOffset(c, &offset))>=0) { |
| 1619 /* might check if there are more characters in this window t
o come */ |
| 1620 /* define an extended window with this character */ |
| 1621 code-=0x200; |
| 1622 dynamicWindow=getNextDynamicWindow(scsu); |
| 1623 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offse
t; |
| 1624 useDynamicWindow(scsu, dynamicWindow); |
| 1625 c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32
_t)code<<8)|(c-currentOffset)|0x80; |
| 1626 length=4; |
| 1627 goto outputBytes; |
| 1628 } else { |
| 1629 /* change to Unicode mode and output this (lead, trail) pair
*/ |
| 1630 isSingleByteMode=FALSE; |
| 1631 *target++=(uint8_t)SCU; |
| 1632 --targetCapacity; |
| 1633 c=((uint32_t)lead<<16)|trail; |
| 1634 length=4; |
| 1635 goto outputBytes; |
| 1636 } |
| 1637 } else if(c<0xa0) { |
| 1638 /* quote C1 control character */ |
| 1639 c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ |
| 1640 length=2; |
| 1641 goto outputBytes; |
| 1642 } else if(c==0xfeff || c>=0xfff0) { |
| 1643 /* quote signature character=byte order mark and specials */ |
| 1644 c|=SQU<<16; |
| 1645 length=3; |
| 1646 goto outputBytes; |
| 1647 } else { |
| 1648 /* compress all other BMP characters */ |
| 1649 if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { |
| 1650 /* there is a window defined that contains this character -
switch to it or quote from it? */ |
| 1651 if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fro
mUDynamicOffsets[window], *source)) { |
| 1652 /* change to dynamic window */ |
| 1653 dynamicWindow=window; |
| 1654 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| 1655 useDynamicWindow(scsu, dynamicWindow); |
| 1656 c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0
x80; |
| 1657 length=2; |
| 1658 goto outputBytes; |
| 1659 } else { |
| 1660 /* quote from dynamic window */ |
| 1661 c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffse
ts[window])|0x80; |
| 1662 length=2; |
| 1663 goto outputBytes; |
| 1664 } |
| 1665 } else if((window=getWindow(staticOffsets, c))>=0) { |
| 1666 /* quote from static window */ |
| 1667 c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); |
| 1668 length=2; |
| 1669 goto outputBytes; |
| 1670 } else if((code=getDynamicOffset(c, &offset))>=0) { |
| 1671 /* define a dynamic window with this character */ |
| 1672 dynamicWindow=getNextDynamicWindow(scsu); |
| 1673 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offse
t; |
| 1674 useDynamicWindow(scsu, dynamicWindow); |
| 1675 c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c
-currentOffset)|0x80; |
| 1676 length=3; |
| 1677 goto outputBytes; |
| 1678 } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && |
| 1679 (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0x
d800-0x3400)) |
| 1680 ) { |
| 1681 /* |
| 1682 * this character is not compressible (a BMP ideograph or si
milar); |
| 1683 * switch to Unicode mode if this is the last character in t
he block |
| 1684 * or there is at least one more ideograph following immedia
tely |
| 1685 */ |
| 1686 isSingleByteMode=FALSE; |
| 1687 c|=SCU<<16; |
| 1688 length=3; |
| 1689 goto outputBytes; |
| 1690 } else { |
| 1691 /* quote Unicode */ |
| 1692 c|=SQU<<16; |
| 1693 length=3; |
| 1694 goto outputBytes; |
| 1695 } |
| 1696 } |
| 1697 |
| 1698 /* normal end of conversion: prepare for a new character */ |
| 1699 c=0; |
| 1700 } |
| 1701 } else { |
| 1702 if(c!=0 && targetCapacity>0) { |
| 1703 goto getTrailUnicode; |
| 1704 } |
| 1705 |
| 1706 /* state machine for Unicode mode */ |
| 1707 /* unicodeByteMode: */ |
| 1708 while(source<sourceLimit) { |
| 1709 if(targetCapacity<=0) { |
| 1710 /* target is full */ |
| 1711 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 1712 break; |
| 1713 } |
| 1714 c=*source++; |
| 1715 |
| 1716 if((uint32_t)(c-0x3400)<(0xd800-0x3400)) { |
| 1717 /* not compressible, write character directly */ |
| 1718 if(targetCapacity>=2) { |
| 1719 *target++=(uint8_t)(c>>8); |
| 1720 *target++=(uint8_t)c; |
| 1721 targetCapacity-=2; |
| 1722 } else { |
| 1723 length=2; |
| 1724 goto outputBytes; |
| 1725 } |
| 1726 } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0x
f300 */) { |
| 1727 /* compress BMP character if the following one is not an uncompr
essible ideograph */ |
| 1728 if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x
3400))) { |
| 1729 if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint3
2_t)(c-0x41)<26)) { |
| 1730 /* ASCII digit or letter */ |
| 1731 isSingleByteMode=TRUE; |
| 1732 c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; |
| 1733 length=2; |
| 1734 goto outputBytes; |
| 1735 } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=
0) { |
| 1736 /* there is a dynamic window that contains this characte
r, change to it */ |
| 1737 isSingleByteMode=TRUE; |
| 1738 dynamicWindow=window; |
| 1739 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| 1740 useDynamicWindow(scsu, dynamicWindow); |
| 1741 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0
x80; |
| 1742 length=2; |
| 1743 goto outputBytes; |
| 1744 } else if((code=getDynamicOffset(c, &offset))>=0) { |
| 1745 /* define a dynamic window with this character */ |
| 1746 isSingleByteMode=TRUE; |
| 1747 dynamicWindow=getNextDynamicWindow(scsu); |
| 1748 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=o
ffset; |
| 1749 useDynamicWindow(scsu, dynamicWindow); |
| 1750 c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8
)|(c-currentOffset)|0x80; |
| 1751 length=3; |
| 1752 goto outputBytes; |
| 1753 } |
| 1754 } |
| 1755 |
| 1756 /* don't know how to compress this character, just write it dire
ctly */ |
| 1757 length=2; |
| 1758 goto outputBytes; |
| 1759 } else if(c<0xe000) { |
| 1760 /* c is a surrogate */ |
| 1761 if(UTF_IS_SURROGATE_FIRST(c)) { |
| 1762 getTrailUnicode: |
| 1763 lead=(UChar)c; |
| 1764 if(source<sourceLimit) { |
| 1765 /* test the following code unit */ |
| 1766 trail=*source; |
| 1767 if(UTF_IS_SECOND_SURROGATE(trail)) { |
| 1768 ++source; |
| 1769 c=UTF16_GET_PAIR_VALUE(c, trail); |
| 1770 /* convert this surrogate code point */ |
| 1771 /* exit this condition tree */ |
| 1772 } else { |
| 1773 /* this is an unmatched lead code unit (1st surrogat
e) */ |
| 1774 /* callback(illegal) */ |
| 1775 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 1776 goto endloop; |
| 1777 } |
| 1778 } else { |
| 1779 /* no more input */ |
| 1780 break; |
| 1781 } |
| 1782 } else { |
| 1783 /* this is an unmatched trail code unit (2nd surrogate) */ |
| 1784 /* callback(illegal) */ |
| 1785 *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
| 1786 goto endloop; |
| 1787 } |
| 1788 |
| 1789 /* compress supplementary character */ |
| 1790 if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && |
| 1791 !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0
x3400)) |
| 1792 ) { |
| 1793 /* |
| 1794 * there is a dynamic window that contains this character an
d |
| 1795 * the following character is not uncompressible, |
| 1796 * change to the window |
| 1797 */ |
| 1798 isSingleByteMode=TRUE; |
| 1799 dynamicWindow=window; |
| 1800 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; |
| 1801 useDynamicWindow(scsu, dynamicWindow); |
| 1802 c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; |
| 1803 length=2; |
| 1804 goto outputBytes; |
| 1805 } else if(source<sourceLimit && lead==*source && /* too lazy to
check trail in same window as source[1] */ |
| 1806 (code=getDynamicOffset(c, &offset))>=0 |
| 1807 ) { |
| 1808 /* two supplementary characters in (probably) the same windo
w - define an extended one */ |
| 1809 isSingleByteMode=TRUE; |
| 1810 code-=0x200; |
| 1811 dynamicWindow=getNextDynamicWindow(scsu); |
| 1812 currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offse
t; |
| 1813 useDynamicWindow(scsu, dynamicWindow); |
| 1814 c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32
_t)code<<8)|(c-currentOffset)|0x80; |
| 1815 length=4; |
| 1816 goto outputBytes; |
| 1817 } else { |
| 1818 /* don't know how to compress this character, just write it
directly */ |
| 1819 c=((uint32_t)lead<<16)|trail; |
| 1820 length=4; |
| 1821 goto outputBytes; |
| 1822 } |
| 1823 } else /* 0xe000<=c<0xf300 */ { |
| 1824 /* quote to avoid SCSU tags */ |
| 1825 c|=UQU<<16; |
| 1826 length=3; |
| 1827 goto outputBytes; |
| 1828 } |
| 1829 |
| 1830 /* normal end of conversion: prepare for a new character */ |
| 1831 c=0; |
| 1832 } |
| 1833 } |
| 1834 endloop: |
| 1835 |
| 1836 /* set the converter state back into UConverter */ |
| 1837 scsu->fromUIsSingleByteMode=isSingleByteMode; |
| 1838 scsu->fromUDynamicWindow=dynamicWindow; |
| 1839 |
| 1840 cnv->fromUChar32=c; |
| 1841 |
| 1842 /* write back the updated pointers */ |
| 1843 pArgs->source=source; |
| 1844 pArgs->target=(char *)target; |
| 1845 return; |
| 1846 |
| 1847 outputBytes: |
| 1848 /* write the output character bytes from c and length [code copied from ucnv
mbcs.c] */ |
| 1849 /* from the first if in the loop we know that targetCapacity>0 */ |
| 1850 if(length<=targetCapacity) { |
| 1851 switch(length) { |
| 1852 /* each branch falls through to the next one */ |
| 1853 case 4: |
| 1854 *target++=(uint8_t)(c>>24); |
| 1855 case 3: |
| 1856 *target++=(uint8_t)(c>>16); |
| 1857 case 2: |
| 1858 *target++=(uint8_t)(c>>8); |
| 1859 case 1: |
| 1860 *target++=(uint8_t)c; |
| 1861 default: |
| 1862 /* will never occur */ |
| 1863 break; |
| 1864 } |
| 1865 targetCapacity-=length; |
| 1866 |
| 1867 /* normal end of conversion: prepare for a new character */ |
| 1868 c=0; |
| 1869 goto loop; |
| 1870 } else { |
| 1871 uint8_t *p; |
| 1872 |
| 1873 /* |
| 1874 * We actually do this backwards here: |
| 1875 * In order to save an intermediate variable, we output |
| 1876 * first to the overflow buffer what does not fit into the |
| 1877 * regular target. |
| 1878 */ |
| 1879 /* we know that 0<=targetCapacity<length<=4 */ |
| 1880 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapa
city==1 */ |
| 1881 length-=targetCapacity; |
| 1882 p=(uint8_t *)cnv->charErrorBuffer; |
| 1883 switch(length) { |
| 1884 /* each branch falls through to the next one */ |
| 1885 case 4: |
| 1886 *p++=(uint8_t)(c>>24); |
| 1887 case 3: |
| 1888 *p++=(uint8_t)(c>>16); |
| 1889 case 2: |
| 1890 *p++=(uint8_t)(c>>8); |
| 1891 case 1: |
| 1892 *p=(uint8_t)c; |
| 1893 default: |
| 1894 /* will never occur */ |
| 1895 break; |
| 1896 } |
| 1897 cnv->charErrorBufferLength=(int8_t)length; |
| 1898 |
| 1899 /* now output what fits into the regular target */ |
| 1900 c>>=8*length; /* length was reduced by targetCapacity */ |
| 1901 switch(targetCapacity) { |
| 1902 /* each branch falls through to the next one */ |
| 1903 case 3: |
| 1904 *target++=(uint8_t)(c>>16); |
| 1905 case 2: |
| 1906 *target++=(uint8_t)(c>>8); |
| 1907 case 1: |
| 1908 *target++=(uint8_t)c; |
| 1909 default: |
| 1910 break; |
| 1911 } |
| 1912 |
| 1913 /* target overflow */ |
| 1914 targetCapacity=0; |
| 1915 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
| 1916 c=0; |
| 1917 goto endloop; |
| 1918 } |
| 1919 } |
| 1920 |
| 1921 /* miscellaneous ------------------------------------------------------------ */ |
| 1922 |
| 1923 static const char * |
| 1924 _SCSUGetName(const UConverter *cnv) { |
| 1925 SCSUData *scsu=(SCSUData *)cnv->extraInfo; |
| 1926 |
| 1927 switch(scsu->locale) { |
| 1928 case l_ja: |
| 1929 return "SCSU,locale=ja"; |
| 1930 default: |
| 1931 return "SCSU"; |
| 1932 } |
| 1933 } |
| 1934 |
| 1935 /* structure for SafeClone calculations */ |
| 1936 struct cloneSCSUStruct |
| 1937 { |
| 1938 UConverter cnv; |
| 1939 SCSUData mydata; |
| 1940 }; |
| 1941 |
| 1942 static UConverter * |
| 1943 _SCSUSafeClone(const UConverter *cnv, |
| 1944 void *stackBuffer, |
| 1945 int32_t *pBufferSize, |
| 1946 UErrorCode *status) |
| 1947 { |
| 1948 struct cloneSCSUStruct * localClone; |
| 1949 int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct); |
| 1950 |
| 1951 if (U_FAILURE(*status)){ |
| 1952 return 0; |
| 1953 } |
| 1954 |
| 1955 if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pB
ufferSize */ |
| 1956 *pBufferSize = bufferSizeNeeded; |
| 1957 return 0; |
| 1958 } |
| 1959 |
| 1960 localClone = (struct cloneSCSUStruct *)stackBuffer; |
| 1961 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ |
| 1962 |
| 1963 uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData)); |
| 1964 localClone->cnv.extraInfo = &localClone->mydata; |
| 1965 localClone->cnv.isExtraLocal = TRUE; |
| 1966 |
| 1967 return &localClone->cnv; |
| 1968 } |
| 1969 |
| 1970 |
| 1971 static const UConverterImpl _SCSUImpl={ |
| 1972 UCNV_SCSU, |
| 1973 |
| 1974 NULL, |
| 1975 NULL, |
| 1976 |
| 1977 _SCSUOpen, |
| 1978 _SCSUClose, |
| 1979 _SCSUReset, |
| 1980 |
| 1981 _SCSUToUnicode, |
| 1982 _SCSUToUnicodeWithOffsets, |
| 1983 _SCSUFromUnicode, |
| 1984 _SCSUFromUnicodeWithOffsets, |
| 1985 NULL, |
| 1986 |
| 1987 NULL, |
| 1988 _SCSUGetName, |
| 1989 NULL, |
| 1990 _SCSUSafeClone, |
| 1991 ucnv_getCompleteUnicodeSet |
| 1992 }; |
| 1993 |
| 1994 static const UConverterStaticData _SCSUStaticData={ |
| 1995 sizeof(UConverterStaticData), |
| 1996 "SCSU", |
| 1997 1212, /* CCSID for SCSU */ |
| 1998 UCNV_IBM, UCNV_SCSU, |
| 1999 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */ |
| 2000 /* |
| 2001 * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode |
| 2002 * substitution string. |
| 2003 */ |
| 2004 { 0x0e, 0xff, 0xfd, 0 }, 3, |
| 2005 FALSE, FALSE, |
| 2006 0, |
| 2007 0, |
| 2008 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
| 2009 }; |
| 2010 |
| 2011 const UConverterSharedData _SCSUData={ |
| 2012 sizeof(UConverterSharedData), ~((uint32_t)0), |
| 2013 NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl, |
| 2014 0 |
| 2015 }; |
| 2016 |
| 2017 #endif |
OLD | NEW |