| Index: third_party/icu38/source/common/ucnv2022.c
|
| ===================================================================
|
| --- third_party/icu38/source/common/ucnv2022.c (revision 10949)
|
| +++ third_party/icu38/source/common/ucnv2022.c (working copy)
|
| @@ -201,6 +201,7 @@
|
| #ifdef U_ENABLE_GENERIC_ISO_2022
|
| UBool isFirstBuffer;
|
| #endif
|
| + UBool isEmptySegment;
|
| char name[30];
|
| char locale[3];
|
| }UConverterDataISO2022;
|
| @@ -609,6 +610,7 @@
|
| if(choice<=UCNV_RESET_TO_UNICODE) {
|
| uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
|
| myConverterData->key = 0;
|
| + myConverterData->isEmptySegment = FALSE;
|
| }
|
| if(choice!=UCNV_RESET_TO_UNICODE) {
|
| uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
|
| @@ -814,6 +816,7 @@
|
| if(chosenConverterName == NULL) {
|
| /* SS2 or SS3 */
|
| *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
|
| + _this->toUCallbackReason = UCNV_UNASSIGNED;
|
| return;
|
| }
|
|
|
| @@ -964,6 +967,8 @@
|
| }
|
| _this->toULength=1;
|
| }
|
| + } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
|
| + _this->toUCallbackReason = UCNV_UNASSIGNED;
|
| }
|
| }
|
|
|
| @@ -1126,11 +1131,11 @@
|
| }
|
|
|
| /*
|
| - * * Check that the result is a 2-byte value with each byte in the range A1..FE
|
| - * * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
|
| - * * to move it to the ISO 2022 range 21..7E.
|
| - * * Return 0 if out of range.
|
| - * */
|
| + * Check that the result is a 2-byte value with each byte in the range A1..FE
|
| + * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
|
| + * to move it to the ISO 2022 range 21..7E.
|
| + * Return 0 if out of range.
|
| + */
|
| static U_INLINE uint32_t
|
| _2022FromGR94DBCS(uint32_t value) {
|
| if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
|
| @@ -1144,19 +1149,18 @@
|
|
|
| #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
|
| /*
|
| - * Check that the result is a 2-byte value with each byte in the range A1..FE
|
| - * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
|
| - * to move it to the ISO 2022 range 21..7E.
|
| - * Return 0 if out of range.
|
| + * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
|
| + * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
|
| + * unchanged.
|
| */
|
| static U_INLINE uint32_t
|
| -_2022FromGR94DBCS(uint32_t value) {
|
| - if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
|
| - (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
|
| - ) {
|
| - return value - 0x8080; /* shift down to 21..7e byte range */
|
| +_2022ToGR94DBCS(uint32_t value) {
|
| + uint32_t returnValue = value + 0x8080;
|
| + if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
|
| + (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
|
| + return returnValue;
|
| } else {
|
| - return 0; /* not valid for ISO 2022 */
|
| + return value;
|
| }
|
| }
|
| #endif
|
| @@ -2036,6 +2040,7 @@
|
| continue;
|
| } else {
|
| /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
|
| + myData->isEmptySegment = FALSE; /* reset this, we have a different error */
|
| break;
|
| }
|
|
|
| @@ -2047,21 +2052,39 @@
|
| continue;
|
| } else {
|
| /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
|
| + myData->isEmptySegment = FALSE; /* reset this, we have a different error */
|
| break;
|
| }
|
|
|
| case ESC_2022:
|
| mySource--;
|
| escape:
|
| - changeState_2022(args->converter,&(mySource),
|
| - mySourceLimit, ISO_2022_JP,err);
|
| + {
|
| + const char * mySourceBefore = mySource;
|
| + int8_t toULengthBefore = args->converter->toULength;
|
|
|
| + changeState_2022(args->converter,&(mySource),
|
| + mySourceLimit, ISO_2022_JP,err);
|
| +
|
| + /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
|
| + if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
|
| + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| + args->converter->toUCallbackReason = UCNV_IRREGULAR;
|
| + args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
|
| + }
|
| + }
|
| +
|
| /* invalid or illegal escape sequence */
|
| if(U_FAILURE(*err)){
|
| args->target = myTarget;
|
| args->source = mySource;
|
| + myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
|
| return;
|
| }
|
| + /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
|
| + if(myData->key==0) {
|
| + myData->isEmptySegment = TRUE;
|
| + }
|
| continue;
|
|
|
| /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
|
| @@ -2078,6 +2101,7 @@
|
| /* falls through */
|
| default:
|
| /* convert one or two bytes */
|
| + myData->isEmptySegment = FALSE;
|
| cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
|
| if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
|
| !IS_JP_DBCS(cs)
|
| @@ -2606,15 +2630,27 @@
|
|
|
| if(mySourceChar==UCNV_SI){
|
| myData->toU2022State.g = 0;
|
| + if (myData->isEmptySegment) {
|
| + myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
|
| + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| + args->converter->toUCallbackReason = UCNV_IRREGULAR;
|
| + args->converter->toUBytes[0] = (uint8_t)mySourceChar;
|
| + args->converter->toULength = 1;
|
| + args->target = myTarget;
|
| + args->source = mySource;
|
| + return;
|
| + }
|
| /*consume the source */
|
| continue;
|
| }else if(mySourceChar==UCNV_SO){
|
| myData->toU2022State.g = 1;
|
| + myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
|
| /*consume the source */
|
| continue;
|
| }else if(mySourceChar==ESC_2022){
|
| mySource--;
|
| escape:
|
| + myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
|
| changeState_2022(args->converter,&(mySource),
|
| mySourceLimit, ISO_2022_KR, err);
|
| if(U_FAILURE(*err)){
|
| @@ -2625,6 +2661,7 @@
|
| continue;
|
| }
|
|
|
| + myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
|
| if(myData->toU2022State.g == 1) {
|
| if(mySource < mySourceLimit) {
|
| int leadIsOk, trailIsOk;
|
| @@ -3177,27 +3214,52 @@
|
| switch(mySourceChar){
|
| case UCNV_SI:
|
| pToU2022State->g=0;
|
| + if (myData->isEmptySegment) {
|
| + myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
|
| + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| + args->converter->toUCallbackReason = UCNV_IRREGULAR;
|
| + args->converter->toUBytes[0] = mySourceChar;
|
| + args->converter->toULength = 1;
|
| + args->target = myTarget;
|
| + args->source = mySource;
|
| + return;
|
| + }
|
| continue;
|
|
|
| case UCNV_SO:
|
| if(pToU2022State->cs[1] != 0) {
|
| pToU2022State->g=1;
|
| + myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
|
| continue;
|
| } else {
|
| /* illegal to have SO before a matching designator */
|
| + myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
|
| break;
|
| }
|
|
|
| case ESC_2022:
|
| mySource--;
|
| escape:
|
| - changeState_2022(args->converter,&(mySource),
|
| - mySourceLimit, ISO_2022_CN,err);
|
| + {
|
| + const char * mySourceBefore = mySource;
|
| + int8_t toULengthBefore = args->converter->toULength;
|
|
|
| + changeState_2022(args->converter,&(mySource),
|
| + mySourceLimit, ISO_2022_CN,err);
|
| +
|
| + /* After SO there must be at least one character before a designator (designator error handled separately) */
|
| + if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
|
| + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| + args->converter->toUCallbackReason = UCNV_IRREGULAR;
|
| + args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
|
| + }
|
| + }
|
| +
|
| /* invalid or illegal escape sequence */
|
| if(U_FAILURE(*err)){
|
| args->target = myTarget;
|
| args->source = mySource;
|
| + myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
|
| return;
|
| }
|
| continue;
|
| @@ -3211,6 +3273,7 @@
|
| /* falls through */
|
| default:
|
| /* convert one or two bytes */
|
| + myData->isEmptySegment = FALSE;
|
| if(pToU2022State->g != 0) {
|
| if(mySource < mySourceLimit) {
|
| UConverterSharedData *cnv;
|
| @@ -3522,11 +3585,19 @@
|
| /* include ASCII for JP */
|
| sa->addRange(sa->set, 0, 0x7f);
|
| }
|
| - if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
|
| + if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
|
| /*
|
| - * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks,
|
| - * we need to include half-width Katakana for all JP variants because
|
| - * JIS X 0208 has hardcoded fallbacks for them.
|
| + * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
|
| + * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
|
| + * use half-width Katakana.
|
| + * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
|
| + * half-width Katakana via the ESC ( I sequence.
|
| + * However, we only emit (fromUnicode) half-width Katakana according to the
|
| + * definition of each variant.
|
| + *
|
| + * When including fallbacks,
|
| + * we need to include half-width Katakana Unicode code points for all JP variants because
|
| + * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
|
| */
|
| /* include half-width Katakana for JP */
|
| sa->addRange(sa->set, HWKANA_START, HWKANA_END);
|
| @@ -3580,6 +3651,12 @@
|
| * corresponding to JIS X 0208.
|
| */
|
| filter=UCNV_SET_FILTER_SJIS;
|
| + } else if(i==KSC5601) {
|
| + /*
|
| + * Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables)
|
| + * are broader than GR94.
|
| + */
|
| + filter=UCNV_SET_FILTER_GR94DBCS;
|
| } else {
|
| filter=UCNV_SET_FILTER_NONE;
|
| }
|
| @@ -3595,6 +3672,9 @@
|
| sa->remove(sa->set, 0x0e);
|
| sa->remove(sa->set, 0x0f);
|
| sa->remove(sa->set, 0x1b);
|
| +
|
| + /* ISO 2022 converters do not convert C1 controls either */
|
| + sa->removeRange(sa->set, 0x80, 0x9f);
|
| }
|
|
|
| static const UConverterImpl _ISO2022Impl={
|
|
|