Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(327)

Unified Diff: third_party/icu38/source/common/ucnv2022.c

Issue 52030: Apply ICU patches for ICU tickets 6175 (ISO-2022 and ... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 11 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/icu38/source/common/ucnv.c ('k') | third_party/icu38/source/common/ucnv_bld.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/icu38/source/common/ucnv2022.c
===================================================================
--- third_party/icu38/source/common/ucnv2022.c (revision 10949)
+++ third_party/icu38/source/common/ucnv2022.c (working copy)
@@ -201,6 +201,7 @@
#ifdef U_ENABLE_GENERIC_ISO_2022
UBool isFirstBuffer;
#endif
+ UBool isEmptySegment;
char name[30];
char locale[3];
}UConverterDataISO2022;
@@ -609,6 +610,7 @@
if(choice<=UCNV_RESET_TO_UNICODE) {
uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
myConverterData->key = 0;
+ myConverterData->isEmptySegment = FALSE;
}
if(choice!=UCNV_RESET_TO_UNICODE) {
uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
@@ -814,6 +816,7 @@
if(chosenConverterName == NULL) {
/* SS2 or SS3 */
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
+ _this->toUCallbackReason = UCNV_UNASSIGNED;
return;
}
@@ -964,6 +967,8 @@
}
_this->toULength=1;
}
+ } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
+ _this->toUCallbackReason = UCNV_UNASSIGNED;
}
}
@@ -1126,11 +1131,11 @@
}
/*
- * * Check that the result is a 2-byte value with each byte in the range A1..FE
- * * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
- * * to move it to the ISO 2022 range 21..7E.
- * * Return 0 if out of range.
- * */
+ * Check that the result is a 2-byte value with each byte in the range A1..FE
+ * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
+ * to move it to the ISO 2022 range 21..7E.
+ * Return 0 if out of range.
+ */
static U_INLINE uint32_t
_2022FromGR94DBCS(uint32_t value) {
if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
@@ -1144,19 +1149,18 @@
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
- * Check that the result is a 2-byte value with each byte in the range A1..FE
- * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
- * to move it to the ISO 2022 range 21..7E.
- * Return 0 if out of range.
+ * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
+ * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
+ * unchanged.
*/
static U_INLINE uint32_t
-_2022FromGR94DBCS(uint32_t value) {
- if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
- (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
- ) {
- return value - 0x8080; /* shift down to 21..7e byte range */
+_2022ToGR94DBCS(uint32_t value) {
+ uint32_t returnValue = value + 0x8080;
+ if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
+ (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
+ return returnValue;
} else {
- return 0; /* not valid for ISO 2022 */
+ return value;
}
}
#endif
@@ -2036,6 +2040,7 @@
continue;
} else {
/* only JIS7 uses SI/SO, not ISO-2022-JP-x */
+ myData->isEmptySegment = FALSE; /* reset this, we have a different error */
break;
}
@@ -2047,21 +2052,39 @@
continue;
} else {
/* only JIS7 uses SI/SO, not ISO-2022-JP-x */
+ myData->isEmptySegment = FALSE; /* reset this, we have a different error */
break;
}
case ESC_2022:
mySource--;
escape:
- changeState_2022(args->converter,&(mySource),
- mySourceLimit, ISO_2022_JP,err);
+ {
+ const char * mySourceBefore = mySource;
+ int8_t toULengthBefore = args->converter->toULength;
+ changeState_2022(args->converter,&(mySource),
+ mySourceLimit, ISO_2022_JP,err);
+
+ /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
+ if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
+ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+ args->converter->toUCallbackReason = UCNV_IRREGULAR;
+ args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
+ }
+ }
+
/* invalid or illegal escape sequence */
if(U_FAILURE(*err)){
args->target = myTarget;
args->source = mySource;
+ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
return;
}
+ /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
+ if(myData->key==0) {
+ myData->isEmptySegment = TRUE;
+ }
continue;
/* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
@@ -2078,6 +2101,7 @@
/* falls through */
default:
/* convert one or two bytes */
+ myData->isEmptySegment = FALSE;
cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
!IS_JP_DBCS(cs)
@@ -2606,15 +2630,27 @@
if(mySourceChar==UCNV_SI){
myData->toU2022State.g = 0;
+ if (myData->isEmptySegment) {
+ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
+ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+ args->converter->toUCallbackReason = UCNV_IRREGULAR;
+ args->converter->toUBytes[0] = (uint8_t)mySourceChar;
+ args->converter->toULength = 1;
+ args->target = myTarget;
+ args->source = mySource;
+ return;
+ }
/*consume the source */
continue;
}else if(mySourceChar==UCNV_SO){
myData->toU2022State.g = 1;
+ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
/*consume the source */
continue;
}else if(mySourceChar==ESC_2022){
mySource--;
escape:
+ myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
changeState_2022(args->converter,&(mySource),
mySourceLimit, ISO_2022_KR, err);
if(U_FAILURE(*err)){
@@ -2625,6 +2661,7 @@
continue;
}
+ myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
if(myData->toU2022State.g == 1) {
if(mySource < mySourceLimit) {
int leadIsOk, trailIsOk;
@@ -3177,27 +3214,52 @@
switch(mySourceChar){
case UCNV_SI:
pToU2022State->g=0;
+ if (myData->isEmptySegment) {
+ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
+ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+ args->converter->toUCallbackReason = UCNV_IRREGULAR;
+ args->converter->toUBytes[0] = mySourceChar;
+ args->converter->toULength = 1;
+ args->target = myTarget;
+ args->source = mySource;
+ return;
+ }
continue;
case UCNV_SO:
if(pToU2022State->cs[1] != 0) {
pToU2022State->g=1;
+ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
continue;
} else {
/* illegal to have SO before a matching designator */
+ myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
break;
}
case ESC_2022:
mySource--;
escape:
- changeState_2022(args->converter,&(mySource),
- mySourceLimit, ISO_2022_CN,err);
+ {
+ const char * mySourceBefore = mySource;
+ int8_t toULengthBefore = args->converter->toULength;
+ changeState_2022(args->converter,&(mySource),
+ mySourceLimit, ISO_2022_CN,err);
+
+ /* After SO there must be at least one character before a designator (designator error handled separately) */
+ if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
+ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+ args->converter->toUCallbackReason = UCNV_IRREGULAR;
+ args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
+ }
+ }
+
/* invalid or illegal escape sequence */
if(U_FAILURE(*err)){
args->target = myTarget;
args->source = mySource;
+ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
return;
}
continue;
@@ -3211,6 +3273,7 @@
/* falls through */
default:
/* convert one or two bytes */
+ myData->isEmptySegment = FALSE;
if(pToU2022State->g != 0) {
if(mySource < mySourceLimit) {
UConverterSharedData *cnv;
@@ -3522,11 +3585,19 @@
/* include ASCII for JP */
sa->addRange(sa->set, 0, 0x7f);
}
- if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
+ if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
/*
- * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks,
- * we need to include half-width Katakana for all JP variants because
- * JIS X 0208 has hardcoded fallbacks for them.
+ * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
+ * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
+ * use half-width Katakana.
+ * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
+ * half-width Katakana via the ESC ( I sequence.
+ * However, we only emit (fromUnicode) half-width Katakana according to the
+ * definition of each variant.
+ *
+ * When including fallbacks,
+ * we need to include half-width Katakana Unicode code points for all JP variants because
+ * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
*/
/* include half-width Katakana for JP */
sa->addRange(sa->set, HWKANA_START, HWKANA_END);
@@ -3580,6 +3651,12 @@
* corresponding to JIS X 0208.
*/
filter=UCNV_SET_FILTER_SJIS;
+ } else if(i==KSC5601) {
+ /*
+ * Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables)
+ * are broader than GR94.
+ */
+ filter=UCNV_SET_FILTER_GR94DBCS;
} else {
filter=UCNV_SET_FILTER_NONE;
}
@@ -3595,6 +3672,9 @@
sa->remove(sa->set, 0x0e);
sa->remove(sa->set, 0x0f);
sa->remove(sa->set, 0x1b);
+
+ /* ISO 2022 converters do not convert C1 controls either */
+ sa->removeRange(sa->set, 0x80, 0x9f);
}
static const UConverterImpl _ISO2022Impl={
« no previous file with comments | « third_party/icu38/source/common/ucnv.c ('k') | third_party/icu38/source/common/ucnv_bld.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698