| OLD | NEW |
| 1 diff -ru trie.clean/source/common/ucnv2022.c chrome.canonical/source/common/ucnv
2022.c | 1 --- r22777/source/test/cintltst/nucnvtst.c 2007-10-11 14:52:29.172174000 -0
700 |
| 2 --- trie.clean/source/common/ucnv2022.c 2007-11-07 17:39:05.057870000 -0800 | 2 +++ chrome.canonical/source/test/cintltst/nucnvtst.c 2009-03-23 12:42:01.1062
92000 -0700 |
| 3 +++ chrome.canonical/source/common/ucnv2022.c 2008-10-29 12:52:22.517453000 -0
700 | 3 @@ -17,6 +17,7 @@ |
| 4 @@ -752,6 +752,7 @@ | 4 #include "unicode/uloc.h" |
| 5 #include "unicode/ucnv.h" |
| 6 #include "unicode/ucnv_err.h" |
| 7 +#include "unicode/ucnv_cb.h" |
| 8 #include "cintltst.h" |
| 9 #include "unicode/utypes.h" |
| 10 #include "unicode/ustring.h" |
| 11 @@ -81,6 +82,7 @@ |
| 12 static void TestJitterbug2411(void); |
| 13 static void TestJB5275(void); |
| 14 static void TestJB5275_1(void); |
| 15 +static void TestJitterbug6175(void); |
| 16 #endif |
| 17 |
| 18 static void TestRoundTrippingAllUTF(void); |
| 19 @@ -297,6 +299,7 @@ |
| 20 #if !UCONFIG_NO_LEGACY_CONVERSION |
| 21 addTest(root, &TestJitterbug2346, "tsconv/nucnvtst/TestJitterbug2346"); |
| 22 addTest(root, &TestJitterbug2411, "tsconv/nucnvtst/TestJitterbug2411"); |
| 23 + addTest(root, &TestJitterbug6175, "tsconv/nucnvtst/TestJitterbug6175"); |
| 24 #endif |
| 25 |
| 26 } |
| 27 @@ -2606,7 +2609,7 @@ |
| 28 TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceL
imit <= source"); |
| 29 /*Test for the condition where there is an invalid character*/ |
| 30 { |
| 31 - static const uint8_t source2[]={0xa1, 0x01}; |
| 32 + static const uint8_t source2[]={0xa1, 0x80}; |
| 33 TestNextUCharError(cnv, (const char*)source2, (const char*)source2+size
of(source2), U_ZERO_ERROR, "an invalid character"); |
| 34 } |
| 35 /*Test for the condition where we have a truncated char*/ |
| 36 @@ -3899,11 +3902,11 @@ |
| 37 TestISO_2022_KR() { |
| 38 /* test input */ |
| 39 static const uint16_t in[]={ |
| 40 - 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x0
00A,0x000D |
| 41 - ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xA
C02,0xAC04 |
| 42 + 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x0
00D |
| 43 + ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xA
C04 |
| 44 ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0
028,0x0029 |
| 45 ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x5
3CA,0x53CB |
| 46 - ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x5
3E1,0x53E2 |
| 47 + ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x5
3E2 |
| 48 ,0x53E3,0x53E4,0x000A,0x000D}; |
| 49 const UChar* uSource; |
| 50 const UChar* uSourceLimit; |
| 51 @@ -4456,6 +4459,70 @@ |
| 52 free(offsets); |
| 53 } |
| 54 |
| 55 +/* Tests for empty segments in ISO-2022-JP/KR/CN, HZ, check that UConverterCall
backReason is UCNV_IRREGULAR */ |
| 56 +typedef struct { |
| 57 + const char * converterName; |
| 58 + const char * inputText; |
| 59 + int inputTextLength; |
| 60 +} EmptySegmentTest; |
| 61 + |
| 62 +/* Callback for TestJitterbug6175, should only get called for empty segment err
ors */ |
| 63 +static void UCNV_TO_U_CALLBACK_EMPTYSEGMENT( const void *context, UConverterToU
nicodeArgs *toArgs, const char* codeUnits, |
| 64 + int32_t length, UConverterCallback
Reason reason, UErrorCode * err ) { |
| 65 + if (reason > UCNV_IRREGULAR) { |
| 66 + return; |
| 67 + } |
| 68 + if (reason != UCNV_IRREGULAR) { |
| 69 + log_err("toUnicode callback invoked for empty segment but reason is not
UCNV_IRREGULAR\n"); |
| 70 + } |
| 71 + /* Standard stuff below from UCNV_TO_U_CALLBACK_SUBSTITUTE */ |
| 72 + *err = U_ZERO_ERROR; |
| 73 + ucnv_cbToUWriteSub(toArgs,0,err); |
| 74 +} |
| 75 + |
| 76 +enum { kEmptySegmentToUCharsMax = 64 }; |
| 77 +static void TestJitterbug6175(void) { |
| 78 + static const char iso2022jp_a[] = { 0x61, 0x62, 0x1B,0x24,0x42, 0x1B,0x28,
0x42, 0x63, 0x64, 0x0D, 0x0A }; |
| 79 + static const char iso2022kr_a[] = { 0x1B,0x24,0x29,0x43, 0x61, 0x0E, 0x0F,
0x62, 0x0D, 0x0A }; |
| 80 + static const char iso2022cn_a[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E,
0x0F, 0x1B,0x24,0x2A,0x48, 0x1B,0x4E, 0x6A,0x65, 0x63, 0x0D, 0x0A }; |
| 81 + static const char iso2022cn_b[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E,
0x1B,0x24,0x29,0x47, 0x68,0x64, 0x0F, 0x63, 0x0D, 0x0A }; |
| 82 + static const char hzGB2312_a[] = { 0x61, 0x62, 0x7E,0x7B, 0x7E,0x7D, 0x63
, 0x64 }; |
| 83 + static const EmptySegmentTest emptySegmentTests[] = { |
| 84 + /* converterName inputText inputTextLength */ |
| 85 + { "ISO-2022-JP", iso2022jp_a, sizeof(iso2022jp_a) }, |
| 86 + { "ISO-2022-KR", iso2022kr_a, sizeof(iso2022kr_a) }, |
| 87 + { "ISO-2022-CN", iso2022cn_a, sizeof(iso2022cn_a) }, |
| 88 + { "ISO-2022-CN", iso2022cn_b, sizeof(iso2022cn_b) }, |
| 89 + { "HZ-GB-2312", hzGB2312_a, sizeof(hzGB2312_a) }, |
| 90 + /* terminator: */ |
| 91 + { NULL, NULL, 0, } |
| 92 + }; |
| 93 + const EmptySegmentTest * testPtr; |
| 94 + for (testPtr = emptySegmentTests; testPtr->converterName != NULL; ++testPtr
) { |
| 95 + UErrorCode err = U_ZERO_ERROR; |
| 96 + UConverter * cnv = ucnv_open(testPtr->converterName, &err); |
| 97 + if (U_FAILURE(err)) { |
| 98 + log_data_err("Unable to open %s converter: %s\n", testPtr->converte
rName, u_errorName(err)); |
| 99 + return; |
| 100 + } |
| 101 + ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_EMPTYSEGMENT, NULL, NULL, N
ULL, &err); |
| 102 + if (U_FAILURE(err)) { |
| 103 + log_data_err("Unable to setToUCallBack for %s converter: %s\n", tes
tPtr->converterName, u_errorName(err)); |
| 104 + ucnv_close(cnv); |
| 105 + return; |
| 106 + } |
| 107 + { |
| 108 + UChar toUChars[kEmptySegmentToUCharsMax]; |
| 109 + UChar * toUCharsPtr = toUChars; |
| 110 + const UChar * toUCharsLimit = toUCharsPtr + kEmptySegmentToUCharsMa
x; |
| 111 + const char * inCharsPtr = testPtr->inputText; |
| 112 + const char * inCharsLimit = inCharsPtr + testPtr->inputTextLength; |
| 113 + ucnv_toUnicode(cnv, &toUCharsPtr, toUCharsLimit, &inCharsPtr, inCha
rsLimit, NULL, TRUE, &err); |
| 114 + } |
| 115 + ucnv_close(cnv); |
| 116 + } |
| 117 +} |
| 118 + |
| 119 static void |
| 120 TestEBCDIC_STATEFUL() { |
| 121 /* test input */ |
| 122 --- r22777/source/test/cintltst/ncnvtst.c 2007-01-24 15:27:45.575224000 -0
800 |
| 123 +++ chrome.canonical/source/test/cintltst/ncnvtst.c 2009-03-23 12:30:17.2910
31000 -0700 |
| 124 @@ -1928,7 +1928,7 @@ |
| 125 #if !UCONFIG_NO_LEGACY_CONVERSION |
| 126 { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff }, |
| 127 { "windows-1251", 0, 0x7f, 0x410, 0x44f, 0x3000, 0xd7ff }, |
| 128 - { "HZ", 0x410, 0x44f, 0x4e00, 0x4eff, 0xac00, 0xd7ff }, |
| 129 + /* HZ test case fixed and moved to intltest's conversion.txt, ticket #6
002 */ |
| 130 { "shift-jis", 0x3041, 0x3093, 0x30a1, 0x30f3, 0x900, 0x1cff } |
| 131 #else |
| 132 { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff } |
| 133 --- r22777/source/test/intltest/convtest.h 2007-07-26 20:12:12.288784000 -0
700 |
| 134 +++ chrome.canonical/source/test/intltest/convtest.h 2009-03-23 12:30:09.4451
94000 -0700 |
| 135 @@ -72,6 +72,7 @@ |
| 136 void TestToUnicode(); |
| 137 void TestFromUnicode(); |
| 138 void TestGetUnicodeSet(); |
| 139 + void TestGetUnicodeSet2(); |
| 140 |
| 141 private: |
| 142 UBool |
| 143 --- r22777/source/test/intltest/convtest.cpp 2007-03-08 16:28:01.852223000 -0
800 |
| 144 +++ chrome.canonical/source/test/intltest/convtest.cpp 2009-03-23 12:30:40.1618
68000 -0700 |
| 145 @@ -70,6 +70,7 @@ |
| 146 case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break; |
| 147 case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break; |
| 148 case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break; |
| 149 + case 3: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); brea
k; |
| 150 default: name=""; break; //needed to end loop |
| 151 } |
| 152 } |
| 153 @@ -465,6 +466,183 @@ |
| 154 } |
| 155 } |
| 156 |
| 157 +U_CDECL_BEGIN |
| 158 +static void U_CALLCONV |
| 159 +getUnicodeSetCallback(const void *context, |
| 160 + UConverterFromUnicodeArgs *fromUArgs, |
| 161 + const UChar* codeUnits, |
| 162 + int32_t length, |
| 163 + UChar32 codePoint, |
| 164 + UConverterCallbackReason reason, |
| 165 + UErrorCode *pErrorCode) { |
| 166 + if(reason<=UCNV_IRREGULAR) { |
| 167 + ((UnicodeSet *)context)->remove(codePoint); // the converter cannot co
nvert this code point |
| 168 + *pErrorCode=U_ZERO_ERROR; // skip |
| 169 + } // else ignore the reset, close and clone calls. |
| 170 +} |
| 171 +U_CDECL_END |
| 172 + |
| 173 +// Compare ucnv_getUnicodeSet() with the set of characters that can be converte
d. |
| 174 +void |
| 175 +ConversionTest::TestGetUnicodeSet2() { |
| 176 + // Build a string with all code points. |
| 177 + UChar32 cpLimit; |
| 178 + int32_t s0Length; |
| 179 + if(quick) { |
| 180 + cpLimit=s0Length=0x10000; // BMP only |
| 181 + } else { |
| 182 + cpLimit=0x110000; |
| 183 + s0Length=0x10000+0x200000; // BMP + surrogate pairs |
| 184 + } |
| 185 + UChar *s0=new UChar[s0Length]; |
| 186 + if(s0==NULL) { |
| 187 + return; |
| 188 + } |
| 189 + UChar *s=s0; |
| 190 + UChar32 c; |
| 191 + UChar c2; |
| 192 + // low BMP |
| 193 + for(c=0; c<=0xd7ff; ++c) { |
| 194 + *s++=(UChar)c; |
| 195 + } |
| 196 + // trail surrogates |
| 197 + for(c=0xdc00; c<=0xdfff; ++c) { |
| 198 + *s++=(UChar)c; |
| 199 + } |
| 200 + // lead surrogates |
| 201 + // (after trails so that there is not even one surrogate pair in between) |
| 202 + for(c=0xd800; c<=0xdbff; ++c) { |
| 203 + *s++=(UChar)c; |
| 204 + } |
| 205 + // high BMP |
| 206 + for(c=0xe000; c<=0xffff; ++c) { |
| 207 + *s++=(UChar)c; |
| 208 + } |
| 209 + // supplementary code points = surrogate pairs |
| 210 + if(cpLimit==0x110000) { |
| 211 + for(c=0xd800; c<=0xdbff; ++c) { |
| 212 + for(c2=0xdc00; c2<=0xdfff; ++c2) { |
| 213 + *s++=(UChar)c; |
| 214 + *s++=c2; |
| 215 + } |
| 216 + } |
| 217 + } |
| 218 + |
| 219 + static const char *const cnvNames[]={ |
| 220 + "UTF-8", |
| 221 + "UTF-7", |
| 222 + "UTF-16", |
| 223 + "US-ASCII", |
| 224 + "ISO-8859-1", |
| 225 + "windows-1252", |
| 226 + "Shift-JIS", |
| 227 + "ibm-1390", // EBCDIC_STATEFUL table |
| 228 + "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL tab
le |
| 229 + "HZ", |
| 230 + "ISO-2022-JP", |
| 231 + "JIS7", |
| 232 + "ISO-2022-CN", |
| 233 + "ISO-2022-CN-EXT", |
| 234 + "LMBCS" |
| 235 + }; |
| 236 + char buffer[1024]; |
| 237 + int32_t i; |
| 238 + for(i=0; i<LENGTHOF(cnvNames); ++i) { |
| 239 + UErrorCode errorCode=U_ZERO_ERROR; |
| 240 + UConverter *cnv=cnv_open(cnvNames[i], errorCode); |
| 241 + if(U_FAILURE(errorCode)) { |
| 242 + errln("failed to open converter %s - %s", cnvNames[i], u_errorName(
errorCode)); |
| 243 + continue; |
| 244 + } |
| 245 + UnicodeSet expected; |
| 246 + ucnv_setFromUCallBack(cnv, getUnicodeSetCallback, &expected, NULL, NULL
, &errorCode); |
| 247 + if(U_FAILURE(errorCode)) { |
| 248 + errln("failed to set the callback on converter %s - %s", cnvNames[i
], u_errorName(errorCode)); |
| 249 + ucnv_close(cnv); |
| 250 + continue; |
| 251 + } |
| 252 + UConverterUnicodeSet which; |
| 253 + for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUn
icodeSet)((int)which+1)) { |
| 254 + if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { |
| 255 + ucnv_setFallback(cnv, TRUE); |
| 256 + } |
| 257 + expected.add(0, cpLimit-1); |
| 258 + s=s0; |
| 259 + UBool flush; |
| 260 + do { |
| 261 + char *t=buffer; |
| 262 + flush=(UBool)(s==s0+s0Length); |
| 263 + ucnv_fromUnicode(cnv, &t, buffer+sizeof(buffer), (const UChar *
*)&s, s0+s0Length, NULL, flush, &errorCode); |
| 264 + if(U_FAILURE(errorCode)) { |
| 265 + if(errorCode==U_BUFFER_OVERFLOW_ERROR) { |
| 266 + errorCode=U_ZERO_ERROR; |
| 267 + continue; |
| 268 + } else { |
| 269 + break; // unexpected error, should not occur |
| 270 + } |
| 271 + } |
| 272 + } while(!flush); |
| 273 + UnicodeSet set; |
| 274 + ucnv_getUnicodeSet(cnv, (USet *)&set, which, &errorCode); |
| 275 + if(cpLimit<0x110000) { |
| 276 + set.remove(cpLimit, 0x10ffff); |
| 277 + } |
| 278 + if(which==UCNV_ROUNDTRIP_SET) { |
| 279 + // ignore PUA code points because they will be converted even i
f they |
| 280 + // are fallbacks and when other fallbacks are turned off, |
| 281 + // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true rou
ndtrips |
| 282 + expected.remove(0xe000, 0xf8ff); |
| 283 + expected.remove(0xf0000, 0xffffd); |
| 284 + expected.remove(0x100000, 0x10fffd); |
| 285 + set.remove(0xe000, 0xf8ff); |
| 286 + set.remove(0xf0000, 0xffffd); |
| 287 + set.remove(0x100000, 0x10fffd); |
| 288 + } |
| 289 + if(set!=expected) { |
| 290 + // First try to see if we have different sets because ucnv_getU
nicodeSet() |
| 291 + // added strings: The above conversion method does not tell us
what strings might be convertible. |
| 292 + // Remove strings from the set and compare again. |
| 293 + // Unfortunately, there are no good, direct set methods for fin
ding out whether there are strings |
| 294 + // in the set, nor for enumerating or removing just them. |
| 295 + // Intersect all code points with the set. The intersection wil
l not contain strings. |
| 296 + UnicodeSet temp(0, 0x10ffff); |
| 297 + temp.retainAll(set); |
| 298 + set=temp; |
| 299 + } |
| 300 + if(set!=expected) { |
| 301 + UnicodeSet diffSet; |
| 302 + UnicodeString out; |
| 303 + |
| 304 + // are there items that must be in the set but are not? |
| 305 + (diffSet=expected).removeAll(set); |
| 306 + if(!diffSet.isEmpty()) { |
| 307 + diffSet.toPattern(out, TRUE); |
| 308 + if(out.length()>100) { |
| 309 + out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsi
s)); |
| 310 + } |
| 311 + errln("error: ucnv_getUnicodeSet(\"%s\") is missing items -
which set: %d", |
| 312 + cnvNames[i], which); |
| 313 + errln(out); |
| 314 + } |
| 315 + |
| 316 + // are there items that must not be in the set but are? |
| 317 + (diffSet=set).removeAll(expected); |
| 318 + if(!diffSet.isEmpty()) { |
| 319 + diffSet.toPattern(out, TRUE); |
| 320 + if(out.length()>100) { |
| 321 + out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsi
s)); |
| 322 + } |
| 323 + errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpecte
d items - which set: %d", |
| 324 + cnvNames[i], which); |
| 325 + errln(out); |
| 326 + } |
| 327 + } |
| 328 + } |
| 329 + } |
| 330 + |
| 331 + delete [] s0; |
| 332 +} |
| 333 + |
| 334 // open testdata or ICU data converter ------------------------------------- **
* |
| 335 |
| 336 UConverter * |
| 337 --- r22777/source/test/testdata/testdata.mak 2007-07-26 20:12:12.288784000 -0
700 |
| 338 +++ chrome.canonical/source/test/testdata/testdata.mak 2009-03-23 12:31:04.4246
45000 -0700 |
| 339 @@ -28,7 +28,7 @@ |
| 340 |
| 341 TEST_RES_FILES = $(TEST_RES_SOURCE:.txt=.res) |
| 342 |
| 343 -"$(TESTDATAOUT)\testdata.dat" : $(TEST_RES_FILES) "$(TESTDATABLD)\casing.res" "
$(TESTDATABLD)\conversion.res" "$(TESTDATABLD)\icuio.res" "$(TESTDATABLD)\mc.res
" "$(TESTDATABLD)\structLocale.res" "$(TESTDATABLD)\root.res" "$(TESTDATABLD)\sh
.res" "$(TESTDATABLD)\sh_YU.res" "$(TESTDATABLD)\te.res" "$(TESTDATABLD)\te_IN.
res" "$(TESTDATABLD)\te_IN_REVISED.res" "$(TESTDATABLD)\testaliases.res" "$(TEST
DATABLD)\testtypes.res" "$(TESTDATABLD)\testempty.res" "$(TESTDATABLD)\iscii.res
" "$(TESTDATABLD)\idna_rules.res" "$(TESTDATABLD)\DataDrivenCollationTest.res" "
$(TESTDATABLD)\test.icu" "$(TESTDATABLD)\testtable32.res" "$(TESTDATABLD)\test1.
cnv" "$(TESTDATABLD)\test3.cnv" "$(TESTDATABLD)\test4.cnv" "$(TESTDATABLD)\test4
x.cnv" "$(TESTDATABLD)\ibm9027.cnv" "$(TESTDATABLD)\nfscsi.spp" "$(TESTDATABLD)\
nfscss.spp" "$(TESTDATABLD)\nfscis.spp" "$(TESTDATABLD)\nfsmxs.spp" "$(TESTDATAB
LD)\nfsmxp.spp" |
| 344 +"$(TESTDATAOUT)\testdata.dat" : $(TEST_RES_FILES) "$(TESTDATABLD)\casing.res" "
$(TESTDATABLD)\conversion.res" "$(TESTDATABLD)\icuio.res" "$(TESTDATABLD)\mc.res
" "$(TESTDATABLD)\structLocale.res" "$(TESTDATABLD)\root.res" "$(TESTDATABLD)\sh
.res" "$(TESTDATABLD)\sh_YU.res" "$(TESTDATABLD)\te.res" "$(TESTDATABLD)\te_IN.
res" "$(TESTDATABLD)\te_IN_REVISED.res" "$(TESTDATABLD)\testaliases.res" "$(TEST
DATABLD)\testtypes.res" "$(TESTDATABLD)\testempty.res" "$(TESTDATABLD)\iscii.res
" "$(TESTDATABLD)\idna_rules.res" "$(TESTDATABLD)\DataDrivenCollationTest.res" "
$(TESTDATABLD)\test.icu" "$(TESTDATABLD)\testtable32.res" "$(TESTDATABLD)\test1.
cnv" "$(TESTDATABLD)\test1bmp.cnv" "$(TESTDATABLD)\test3.cnv" "$(TESTDATABLD)\te
st4.cnv" "$(TESTDATABLD)\test4x.cnv" "$(TESTDATABLD)\ibm9027.cnv" "$(TESTDATABLD
)\nfscsi.spp" "$(TESTDATABLD)\nfscss.spp" "$(TESTDATABLD)\nfscis.spp" "$(TESTDAT
ABLD)\nfsmxs.spp" "$(TESTDATABLD)\nfsmxp.spp" |
| 345 @echo Building test data |
| 346 @copy "$(TESTDATABLD)\te.res" "$(TESTDATAOUT)\$(TESTDT)\nam.typ" |
| 347 @copy "$(TESTDATA)\icu26_testtypes.res" "$(TESTDATABLD)" |
| 348 @@ -54,6 +54,7 @@ |
| 349 iscii.res |
| 350 test.icu |
| 351 test1.cnv |
| 352 +test1bmp.cnv |
| 353 test3.cnv |
| 354 test4.cnv |
| 355 test4x.cnv |
| 356 @@ -126,6 +127,10 @@ |
| 357 @echo Building $@ |
| 358 @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $** |
| 359 |
| 360 +"$(TESTDATABLD)\test1bmp.cnv": "$(TESTDATA)\test1bmp.ucm" |
| 361 + @echo Building $@ |
| 362 + @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $** |
| 363 + |
| 364 "$(TESTDATABLD)\test3.cnv": "$(TESTDATA)\test3.ucm" |
| 365 @echo Building $@ |
| 366 @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $** |
| 367 --- r22777/source/test/testdata/Makefile.in 2007-08-21 13:15:55.267002000 -0
700 |
| 368 +++ chrome.canonical/source/test/testdata/Makefile.in 2009-03-23 12:31:04.4356
35000 -0700 |
| 369 @@ -117,7 +117,7 @@ |
| 370 TEST_DAT_FILES=$(TESTBUILDDIR)/test.icu |
| 371 TEST_SPP_FILES=$(TESTBUILDDIR)/nfscsi.spp $(TESTBUILDDIR)/nfscss.spp $(TESTBUIL
DDIR)/nfscis.spp $(TESTBUILDDIR)/nfsmxs.spp $(TESTBUILDDIR)/nfsmxp.spp |
| 372 |
| 373 -TEST_UCM_SOURCE= test1.ucm test3.ucm test4.ucm test4x.ucm ibm9027.ucm |
| 374 +TEST_UCM_SOURCE= test1.ucm test1bmp.ucm test3.ucm test4.ucm test4x.ucm ibm9027.
ucm |
| 375 TEST_UCM_FILES=$(TEST_UCM_SOURCE:%=$(TESTSRCDATADIR)/data/%) |
| 376 TEST_CNV_FILES=$(TEST_UCM_SOURCE:%.ucm=$(TESTBUILDDIR)/%.cnv) |
| 377 |
| 378 --- r22777/source/test/testdata/conversion.txt 2007-10-11 14:31:32.196532000 -0
700 |
| 379 +++ chrome.canonical/source/test/testdata/conversion.txt 2009-03-23 12:42
:01.119267000 -0700 |
| 380 @@ -1,6 +1,6 @@ |
| 381 //*****************************************************************************
** |
| 382 // |
| 383 -// Copyright (C) 2003-2007, International Business Machines |
| 384 +// Copyright (C) 2003-2008, International Business Machines |
| 385 // Corporation and others. All Rights Reserved. |
| 386 // |
| 387 // file name: conversion.txt |
| 388 @@ -48,13 +48,161 @@ |
| 389 toUnicode { |
| 390 Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks",
"errorCode", "callback", "invalidChars" } |
| 391 Cases { |
| 392 + // Test ticket 5691: consistent illegal sequences |
| 393 + // The following test cases are for illegal character byte sequences. |
| 394 + // |
| 395 + // Unfortunately, we cannot use the Shift-JIS examples from the ticket |
| 396 + // comments because our Shift-JIS table is Windows-compatible and |
| 397 + // therefore has no illegal single bytes. Same for GBK. |
| 398 + // Instead, we use the stricter GB 18030 also for 2-byte examples. |
| 399 + // The byte sequences are generally slightly different from the ticket |
| 400 + // comment, simply using assigned characters rather than just |
| 401 + // theoretically valid sequences. |
| 402 + { |
| 403 + "gb18030", |
| 404 + :bin{ 618140813c81ff7a }, |
| 405 + "a\u4e02\\x81<\\x81\\xFFz", |
| 406 + :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 }, |
| 407 + :int{1}, :int{0}, "", "&C", :bin{""} |
| 408 + } |
| 409 + { |
| 410 + "EUC-JP", |
| 411 + :bin{ 618fb0a98fb03c8f3cb0a97a }, |
| 412 + "a\u4e28\\x8F\\xB0<\\x8F<\u9022z", |
| 413 + :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 }, |
| 414 + :int{1}, :int{0}, "", "&C", :bin{""} |
| 415 + } |
| 416 + { |
| 417 + "gb18030", |
| 418 + :bin{ 618130fc318130fc8181303c3e813cfc817a }, |
| 419 + "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z", |
| 420 + :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 }, |
| 421 + :int{1}, :int{0}, "", "&C", :bin{""} |
| 422 + } |
| 423 + { |
| 424 + "UTF-8", |
| 425 + :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a }, |
| 426 + "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1
\\xFF<>z", |
| 427 + :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,1
2,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,
21 }, |
| 428 + :int{1}, :int{0}, "", "&C", :bin{""} |
| 429 + } |
| 430 + { |
| 431 + "ISO-2022-JP", |
| 432 + :bin{ 1b24424141af4142affe41431b2842 }, |
| 433 + "\u758f\\xAF\u758e\\xAF\\xFE\u790e", |
| 434 + :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 }, |
| 435 + :int{1}, :int{0}, "", "&C", :bin{""} |
| 436 + } |
| 437 + { |
| 438 + "ibm-25546", |
| 439 + :bin{ 411b242943420e4141af4142affe41430f5a }, |
| 440 + "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ", |
| 441 + :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, |
| 442 + :int{1}, :int{0}, "", "&C", :bin{""} |
| 443 + } |
| 444 + { |
| 445 + "ISO-2022-KR", |
| 446 + :bin{ 411b242943420e4141af4142affe41430f5a }, |
| 447 + "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ", |
| 448 + :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, |
| 449 + :int{1}, :int{0}, "", "&C", :bin{""} |
| 450 + } |
| 451 + { |
| 452 + "ISO-2022-CN", |
| 453 + :bin{ 411b242941420e4141af4142affe41430f5a }, |
| 454 + "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z", |
| 455 + :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, |
| 456 + :int{1}, :int{0}, "", "&C", :bin{""} |
| 457 + } |
| 458 + { |
| 459 + "HZ", |
| 460 + :bin{ 417e7b4141af4142affe41437e7d5a }, |
| 461 + "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z", |
| 462 + :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 }, |
| 463 + :int{1}, :int{0}, "", "&C", :bin{""} |
| 464 + } |
| 465 + // Test ticket 5691: consistent illegal sequences |
| 466 + // The following test cases are for illegal escape/designator/shift seq
uences. |
| 467 + // |
| 468 + // ISO-2022-JP and -CN with illegal escape sequences. |
| 469 + { |
| 470 + "ISO-2022-JP", |
| 471 + :bin{ 611b24201b244241411b283f1b28427a }, |
| 472 + "a\\x1B$ \u758f\\x1B\u2538z", |
| 473 + :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 }, |
| 474 + :int{1}, :int{0}, "", "&C", :bin{""} |
| 475 + } |
| 476 + { |
| 477 + "ISO-2022-CN", |
| 478 + :bin{ 611b2429201b2429410e41410f7a }, |
| 479 + "a\\x1B$) \u4eaez", |
| 480 + :intvector{ 0,1,1,1,1,2,3,4,10,13 }, |
| 481 + :int{1}, :int{0}, "", "&C", :bin{""} |
| 482 + } |
| 483 + // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS
3 sequences. |
| 484 + // The first ESC N comes before its designator sequence, the last seque
nce is ESC+space. |
| 485 + { |
| 486 + "ISO-2022-JP-2", |
| 487 + :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e }, |
| 488 + "N\\x1BNNN\xceN\\x1B N", |
| 489 + :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 }, |
| 490 + :int{1}, :int{0}, "", "&C", :bin{""} |
| 491 + } |
| 492 + { |
| 493 + "ISO-2022-CN-EXT", |
| 494 + :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e }, |
| 495 + "N\\x1BNNN\u8f0eN\\x1B N", |
| 496 + :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 }, |
| 497 + :int{1}, :int{0}, "", "&C", :bin{""} |
| 498 + } |
| 499 + { |
| 500 + "ISO-2022-CN-EXT", |
| 501 + :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f }, |
| 502 + "O\\x1BOOO\u492bO\\x1B O", |
| 503 + :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 }, |
| 504 + :int{1}, :int{0}, "", "&C", :bin{""} |
| 505 + } |
| 506 + // Test ticket 5691: HZ with illegal tilde sequences. |
| 507 + { |
| 508 + "HZ", |
| 509 + :bin{ 417e20427e21437e80447e7b41417e207e41427e7f41437e7d5a }, |
| 510 + "A\\x7E B\\x7E!C\\x7E\\x80D\u4eae\\x7E\\x20\\x7E\u8c05\\x7E\\x7F\u64a
9Z", |
| 511 + :intvector{ 0,1,1,1,1,2,3,4,4,4,4,5,6,7,7,7,7,7,7,7,7,9,
// SBCS |
| 512 + 12,14,14,14,14,14,14,14,14,16,16,16,16,17,19,19,19,19,19,
19,19,19,21, // DBCS |
| 513 + 25 },
// SBCS |
| 514 + :int{1}, :int{0}, "", "&C", :bin{""} |
| 515 + } |
| 516 + // Test ticket 5691: Example from Peter Edberg. |
| 517 + { |
| 518 + "ISO-2022-JP", |
| 519 + :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 }, |
| 520 + "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda", |
| 521 + :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 }, |
| 522 + :int{1}, :int{0}, "", "?", :bin{""} |
| 523 + } |
| 524 + // Test bug 6071 (2:1 Unicode:charset SBCS mapping). |
| 525 + { |
| 526 + "*test1bmp", |
| 527 + :bin{ 050008 }, |
| 528 + "e@uv", |
| 529 + :intvector{ 0,1,2,2 }, |
| 530 + :int{1}, :int{1}, "", "?", :bin{""} |
| 531 + } |
| 532 + // test that HZ limits its byte values to lead bytes 21..7d and trail b
ytes 21..7e |
| 533 + { |
| 534 + "HZ", |
| 535 + :bin{ 7e7b21212120217e217f772100007e217e7e7d207e7e807e0a2b }, |
| 536 + "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd\u3013 ~\ufffd+", |
| 537 + :intvector{ 2,4,6,8,10,12,14,15,19,20,22,25 }, |
| 538 + :int{1}, :int{1}, "", "?", :bin{""} |
| 539 + } |
| 540 // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201
and |
| 541 // using the Shift-JIS table for JIS X 0208 (ticket #5797) |
| 542 { |
| 543 "ISO-2022-JP", |
| 544 :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b284
2 }, |
| 545 - "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6
f3e", |
| 546 - :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 }, |
| 547 + "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\uf
ffd\u6f3e", |
| 548 + :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 }, |
| 549 :int{1}, :int{1}, "", "?", :bin{""} |
| 550 } |
| 551 // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBM
PWithOffsets() |
| 552 @@ -191,6 +339,21 @@ |
| 553 :intvector{ 0, 5, 7, 9, 9, 9, 9, 9, 9, 9, 9, 12 }, |
| 554 :int{1}, :int{1}, "", "&", :bin{""} |
| 555 } |
| 556 + // empty segment (using substitution and stop) |
| 557 + { |
| 558 + "ISO-2022-KR", |
| 559 + :bin{ 1b242943610e0f620d0a }, |
| 560 + "a\uFFFDb\u000D\u000A", |
| 561 + :intvector{ 4, 6, 7, 8, 9 }, |
| 562 + :int{1}, :int{1}, "", "?", :bin{""} |
| 563 + } |
| 564 + { |
| 565 + "ISO-2022-KR", |
| 566 + :bin{ 1b242943610e0f620d0a }, |
| 567 + "a", |
| 568 + :intvector{ 4 }, |
| 569 + :int{1}, :int{1}, "illesc", ".", :bin{"0f"} |
| 570 + } |
| 571 |
| 572 // ISO-2022-JP |
| 573 |
| 574 @@ -241,6 +404,21 @@ |
| 575 :bin{ 41c15c1b284a5cc242 }, "A\uff81\\\xa5\uff82B", :intvector{ 0, 1,
2, 6, 7, 8 }, |
| 576 :int{1}, :int{1}, "", ".", :bin{""} |
| 577 } |
| 578 + // empty segment (using substitution and stop) |
| 579 + { |
| 580 + "ISO-2022-JP", |
| 581 + :bin{ 61621b24421b284263640d0a }, |
| 582 + "ab\uFFFDcd\u000D\u000A", |
| 583 + :intvector{ 0, 1, 5, 8, 9, 10, 11 }, |
| 584 + :int{1}, :int{1}, "", "?", :bin{""} |
| 585 + } |
| 586 + { |
| 587 + "ISO-2022-JP", |
| 588 + :bin{ 61621b24421b284263640d0a }, |
| 589 + "ab", |
| 590 + :intvector{ 0, 1 }, |
| 591 + :int{1}, :int{1}, "illesc", ".", :bin{"1b2842"} |
| 592 + } |
| 593 |
| 594 // ISO-2022-CN |
| 595 |
| 596 @@ -303,7 +481,7 @@ |
| 597 { |
| 598 "ISO-2022-CN-EXT", |
| 599 :bin{ 411b4e2121 }, "\x41", :intvector{ 0 }, |
| 600 - :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e } |
| 601 + :int{1}, :int{1}, "illesc", ".", :bin{ 1b } |
| 602 } |
| 603 // G3 designator: recognized, but not supported for -CN (only for -CN-E
XT) |
| 604 { |
| 605 @@ -311,6 +489,36 @@ |
| 606 :bin{ 411b242b491b4f2121 }, "\x41", :intvector{ 0 }, |
| 607 :int{1}, :int{1}, "unsuppesc", ".", :bin{ 1b242b49 } |
| 608 } |
| 609 + // empty segment 1 (using substitution and stop) |
| 610 + { |
| 611 + "ISO-2022-CN", |
| 612 + :bin{ 611b242941620e0f1b242a481b4e6a65630d0a }, |
| 613 + "ab\uFFFD\u994Cc\u000D\u000A", |
| 614 + :intvector{ 0, 5, 7, 14, 16, 17, 18 }, |
| 615 + :int{1}, :int{1}, "", "?", :bin{""} |
| 616 + } |
| 617 + { |
| 618 + "ISO-2022-CN", |
| 619 + :bin{ 611b242941620e0f1b242a481b4e6a65630d0a }, |
| 620 + "ab", |
| 621 + :intvector{ 0, 5 }, |
| 622 + :int{1}, :int{1}, "illesc", ".", :bin{"0f"} |
| 623 + } |
| 624 + // empty segment 2 (using substitution and stop) |
| 625 + { |
| 626 + "ISO-2022-CN", |
| 627 + :bin{ 611b242941620e1b24294768640f630d0a }, |
| 628 + "ab\uFFFD\u5F70c\u000D\u000A", |
| 629 + :intvector{ 0, 5, 7, 11, 14, 15, 16 }, |
| 630 + :int{1}, :int{1}, "", "?", :bin{""} |
| 631 + } |
| 632 + { |
| 633 + "ISO-2022-CN", |
| 634 + :bin{ 611b242941620e1b24294768640f630d0a }, |
| 635 + "ab", |
| 636 + :intvector{ 0, 5 }, |
| 637 + :int{1}, :int{1}, "illesc", ".", :bin{"1b242947"} |
| 638 + } |
| 639 |
| 640 // ISO-2022 SBCS |
| 641 // [U_ENABLE_GENERIC_ISO_2022] |
| 642 @@ -325,6 +533,39 @@ |
| 643 // :int{1}, :int{1}, "", ".", :bin{""} |
| 644 //} |
| 645 |
| 646 + // HZ-GB-2312 |
| 647 + |
| 648 + // empty segment 1 (using substitution and stop) |
| 649 + { |
| 650 + "HZ-GB-2312", |
| 651 + :bin{ 61627e7b7e7d6364 }, |
| 652 + "ab\uFFFDcd", |
| 653 + :intvector{ 0, 1, 4, 6, 7 }, |
| 654 + :int{1}, :int{1}, "", "?", :bin{""} |
| 655 + } |
| 656 + { |
| 657 + "HZ-GB-2312", |
| 658 + :bin{ 61627e7b7e7d63640d0a }, |
| 659 + "ab", |
| 660 + :intvector{ 0, 1 }, |
| 661 + :int{1}, :int{1}, "illesc", ".", :bin{"7e7d"} |
| 662 + } |
| 663 + // empty segment 2 & legal redundant switches (using substitution and s
top) |
| 664 + { |
| 665 + "HZ-GB-2312", |
| 666 + :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d }, |
| 667 + "ab\u4E0D\u7A7A\uFFFD\u4E00cdef\uFFFD", |
| 668 + :intvector{ 0, 1, 4, 6, 10, 12, 16, 17, 20, 21, 24 }, |
| 669 + :int{1}, :int{1}, "", "?", :bin{""} |
| 670 + } |
| 671 + { |
| 672 + "HZ-GB-2312", |
| 673 + :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d }, |
| 674 + "ab\u4E0D\u7A7A", |
| 675 + :intvector{ 0, 1, 4, 6 }, |
| 676 + :int{1}, :int{1}, "illesc", ".", :bin{"7e7b"} |
| 677 + } |
| 678 + |
| 679 // DBCS-only extensions |
| 680 { |
| 681 "ibm-970", |
| 682 @@ -496,6 +737,14 @@ |
| 683 :intvector{ 0, 4, 8, 12 }, |
| 684 :int{1}, :int{0}, "", "?", :bin{""} |
| 685 } |
| 686 + // Test iso-2022-jp-2 miscellaneous symbols |
| 687 + { |
| 688 + "iso-2022-jp-2", |
| 689 + :bin{ 1b242843224f224e1b2842 }, |
| 690 + "\u260E\u260F", |
| 691 + :intvector{ 4, 6 }, |
| 692 + :int{1}, :int{0}, "", ".", :bin{""} |
| 693 + } |
| 694 } |
| 695 } |
| 696 |
| 697 @@ -504,6 +753,14 @@ |
| 698 fromUnicode { |
| 699 Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks",
"errorCode", "callback", "invalidUChars" } |
| 700 Cases { |
| 701 + // Test bug 6071 (1:2 Unicode:charset SBCS mapping). |
| 702 + { |
| 703 + "*test1bmp", |
| 704 + "e@t", |
| 705 + :bin{ 05000709 }, |
| 706 + :intvector{ 0,1,2,2 }, |
| 707 + :int{1}, :int{0}, "", "?", "" |
| 708 + } |
| 709 // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201
and |
| 710 // using the Shift-JIS table for JIS X 0208 (ticket #5797) |
| 711 { |
| 712 @@ -1311,16 +1568,29 @@ |
| 713 // versions of ISO-2022-JP |
| 714 { |
| 715 "ISO-2022-JP", |
| 716 - "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e
00\u4e01\uffe5]", |
| 717 - "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\uf
a2d\uffe6-\U0010ffff]", |
| 718 + "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2015\u203e\u4e00\u4e0
1\uffe5]", |
| 719 + "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u2014\u301c\u4e02\u4e27-\u4e
29\u4fe0\u663b\u9eb5\ufa0e-\ufa2d\uff61-\uff9f\uffe4\uffe6-\U0010ffff]", |
| 720 :int{0} |
| 721 - } |
| 722 + } |
| 723 { |
| 724 "ISO-2022-JP-2", |
| 725 - "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a
1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]", |
| 726 - "[\x0e\x0f\x1b\uffe7-\U0010ffff]", |
| 727 + "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0
390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uffe6]", |
| 728 + "[\x0e\x0f\x1b\uff61-\uff9f\uffe4\uffe7-\U0010ffff]", |
| 729 + :int{0} |
| 730 + } |
| 731 + { |
| 732 + "JIS7", |
| 733 + "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0
390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uff61-\uff9f\uffe6]", |
| 734 + "[\x0e\x0f\x1b\uffe4\uffe7-\U0010ffff]", |
| 735 :int{0} |
| 736 } |
| 737 + // with fallbacks |
| 738 + { |
| 739 + "ISO-2022-JP", |
| 740 + "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2014\u2015\u203e\u301
c\u4e00\u4e01\u4fe0\u9eb5\uff61-\uff9f\uffe5]", |
| 741 + "[\x0e\x0f\x1b\xa6\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\u663b
\ufa0e-\ufa2d\uffe4\uffe6-\U0010ffff]", |
| 742 + :int{1} |
| 743 + } |
| 744 |
| 745 // versions of ISO-2022-CN |
| 746 { |
| 747 @@ -1336,6 +1606,22 @@ |
| 748 :int{0} |
| 749 } |
| 750 |
| 751 + // HZ |
| 752 + { |
| 753 + "HZ", |
| 754 + "[\u0410-\u044f\u4e00\u4e01\u4e03]", |
| 755 + "[\u4e02\u4e04-\u4e06\uac00-\ud7ff]", |
| 756 + :int{0} |
| 757 + } |
| 758 + |
| 759 + // LMBCS |
| 760 + { |
| 761 + "LMBCS", |
| 762 + "[\x00-\U0010ffff]", |
| 763 + "[]", |
| 764 + :int{0} |
| 765 + } |
| 766 + |
| 767 // DBCS-only |
| 768 { |
| 769 "ibm-971", |
| 770 --- r22777/source/common/ucnv_ext.h 2007-08-22 22:46:49.525855000 -0700 |
| 771 +++ chrome.canonical/source/common/ucnv_ext.h 2009-03-23 12:30:09.644121000 -0
700 |
| 772 @@ -382,10 +382,20 @@ |
| 773 UConverterFromUnicodeArgs *pArgs, int32_t srcIndex, |
| 774 UErrorCode *pErrorCode); |
| 775 |
| 776 +/* |
| 777 + * Add code points and strings to the set according to the extension mappings. |
| 778 + * Limitation on the UConverterSetFilter: |
| 779 + * The filters currently assume that they are used with 1:1 mappings. |
| 780 + * They only apply to single input code points, and then they pass through |
| 781 + * only mappings with single-charset-code results. |
| 782 + * For example, the Shift-JIS filter only works for 2-byte results and tests |
| 783 + * that those 2 bytes are in the JIS X 0208 range of Shift-JIS. |
| 784 + */ |
| 785 U_CFUNC void |
| 786 ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, |
| 787 const USetAdder *sa, |
| 788 UConverterUnicodeSet which, |
| 789 + UConverterSetFilter filter, |
| 790 UErrorCode *pErrorCode); |
| 791 |
| 792 /* toUnicode helpers -------------------------------------------------------- *
/ |
| 793 --- r22777/source/common/ucnvmbcs.c 2007-10-11 14:31:32.196532000 -0700 |
| 794 +++ chrome.canonical/source/common/ucnvmbcs.c 2009-03-23 12:42:01.150242000 -0
700 |
| 795 @@ -1,7 +1,7 @@ |
| 796 /* |
| 797 ****************************************************************************** |
| 798 * |
| 799 -* Copyright (C) 2000-2007, International Business Machines |
| 800 +* Copyright (C) 2000-2008, International Business Machines |
| 801 * Corporation and others. All Rights Reserved. |
| 802 * |
| 803 ****************************************************************************** |
| 804 @@ -485,9 +485,23 @@ |
| 805 |
| 806 if(mbcsTable->outputType==MBCS_OUTPUT_1) { |
| 807 const uint16_t *stage2, *stage3, *results; |
| 808 + uint16_t minValue; |
| 809 |
| 810 results=(const uint16_t *)mbcsTable->fromUnicodeBytes; |
| 811 |
| 812 + /* |
| 813 + * Set a threshold variable for selecting which mappings to use. |
| 814 + * See ucnv_MBCSSingleFromBMPWithOffsets() and |
| 815 + * MBCS_SINGLE_RESULT_FROM_U() for details. |
| 816 + */ |
| 817 + if(which==UCNV_ROUNDTRIP_SET) { |
| 818 + /* use only roundtrips */ |
| 819 + minValue=0xf00; |
| 820 + } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ { |
| 821 + /* use all roundtrip and fallback results */ |
| 822 + minValue=0x800; |
| 823 + } |
| 824 + |
| 825 for(st1=0; st1<maxStage1; ++st1) { |
| 826 st2=table[st1]; |
| 827 if(st2>maxStage1) { |
| 828 @@ -497,15 +511,8 @@ |
| 829 /* read the stage 3 block */ |
| 830 stage3=results+st3; |
| 831 |
| 832 - /* |
| 833 - * Add code points for which the roundtrip flag is set. |
| 834 - * Once we get a set for fallback mappings, we have to
use |
| 835 - * a threshold variable with a value of 0x800. |
| 836 - * See ucnv_MBCSSingleFromBMPWithOffsets() and |
| 837 - * MBCS_SINGLE_RESULT_FROM_U() for details. |
| 838 - */ |
| 839 do { |
| 840 - if(*stage3++>=0xf00) { |
| 841 + if(*stage3++>=minValue) { |
| 842 sa->add(sa->set, c); |
| 843 } |
| 844 } while((++c&0xf)!=0); |
| 845 @@ -522,9 +529,12 @@ |
| 846 const uint8_t *stage3, *bytes; |
| 847 uint32_t st3Multiplier; |
| 848 uint32_t value; |
| 849 + UBool useFallback; |
| 850 |
| 851 bytes=mbcsTable->fromUnicodeBytes; |
| 852 |
| 853 + useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); |
| 854 + |
| 855 switch(mbcsTable->outputType) { |
| 856 case MBCS_OUTPUT_3: |
| 857 case MBCS_OUTPUT_4_EUC: |
| 858 @@ -551,9 +561,8 @@ |
| 859 st3>>=16; |
| 860 |
| 861 /* |
| 862 - * Add code points for which the roundtrip flag is set. |
| 863 - * Once we get a set for fallback mappings, we have to
check |
| 864 - * non-roundtrip stage 3 results for whether they are 0
. |
| 865 + * Add code points for which the roundtrip flag is set, |
| 866 + * or which map to non-zero bytes if we use fallbacks. |
| 867 * See ucnv_MBCSFromUnicodeWithOffsets() for details. |
| 868 */ |
| 869 switch(filter) { |
| 870 @@ -561,6 +570,23 @@ |
| 871 do { |
| 872 if(st3&1) { |
| 873 sa->add(sa->set, c); |
| 874 + stage3+=st3Multiplier; |
| 875 + } else if(useFallback) { |
| 876 + uint8_t b=0; |
| 877 + switch(st3Multiplier) { |
| 878 + case 4: |
| 879 + b|=*stage3++; |
| 880 + case 3: |
| 881 + b|=*stage3++; |
| 882 + case 2: |
| 883 + b|=stage3[0]|stage3[1]; |
| 884 + stage3+=2; |
| 885 + default: |
| 886 + break; |
| 887 + } |
| 888 + if(b!=0) { |
| 889 + sa->add(sa->set, c); |
| 890 + } |
| 891 } |
| 892 st3>>=1; |
| 893 } while((++c&0xf)!=0); |
| 894 @@ -568,7 +594,7 @@ |
| 895 case UCNV_SET_FILTER_DBCS_ONLY: |
| 896 /* Ignore single-byte results (<0x100). */ |
| 897 do { |
| 898 - if((st3&1)!=0 && *((const uint16_t *)stage3)>=0
x100) { |
| 899 + if(((st3&1)!=0 || useFallback) && *((const uint
16_t *)stage3)>=0x100) { |
| 900 sa->add(sa->set, c); |
| 901 } |
| 902 st3>>=1; |
| 903 @@ -578,7 +604,7 @@ |
| 904 case UCNV_SET_FILTER_2022_CN: |
| 905 /* Only add code points that map to CNS 11643 plan
es 1 & 2 for non-EXT ISO-2022-CN. */ |
| 906 do { |
| 907 - if((st3&1)!=0 && ((value=*stage3)==0x81 || valu
e==0x82)) { |
| 908 + if(((st3&1)!=0 || useFallback) && ((value=*stag
e3)==0x81 || value==0x82)) { |
| 909 sa->add(sa->set, c); |
| 910 } |
| 911 st3>>=1; |
| 912 @@ -588,7 +614,33 @@ |
| 913 case UCNV_SET_FILTER_SJIS: |
| 914 /* Only add code points that map to Shift-JIS code
s corresponding to JIS X 0208. */ |
| 915 do { |
| 916 - if((st3&1)!=0 && (value=*((const uint16_t *)sta
ge3))>=0x8140 && value<=0xeffc) { |
| 917 + if(((st3&1)!=0 || useFallback) && (value=*((con
st uint16_t *)stage3))>=0x8140 && value<=0xeffc) { |
| 918 + sa->add(sa->set, c); |
| 919 + } |
| 920 + st3>>=1; |
| 921 + stage3+=2; /* +=st3Multiplier */ |
| 922 + } while((++c&0xf)!=0); |
| 923 + break; |
| 924 + case UCNV_SET_FILTER_GR94DBCS: |
| 925 + /* Only add code points that map to ISO 2022 GR 94
DBCS codes (each byte A1..FE). */ |
| 926 + do { |
| 927 + if( ((st3&1)!=0 || useFallback) && |
| 928 + (uint16_t)((value=*((const uint16_t *)stage
3)) - 0xa1a1)<=(0xfefe - 0xa1a1) && |
| 929 + (uint8_t)(value-0xa1)<=(0xfe - 0xa1) |
| 930 + ) { |
| 931 + sa->add(sa->set, c); |
| 932 + } |
| 933 + st3>>=1; |
| 934 + stage3+=2; /* +=st3Multiplier */ |
| 935 + } while((++c&0xf)!=0); |
| 936 + break; |
| 937 + case UCNV_SET_FILTER_HZ: |
| 938 + /* Only add code points that are suitable for HZ DB
CS (lead byte A1..FD). */ |
| 939 + do { |
| 940 + if( ((st3&1)!=0 || useFallback) && |
| 941 + (uint16_t)((value=*((const uint16_t *)stage
3))-0xa1a1)<=(0xfdfe - 0xa1a1) && |
| 942 + (uint8_t)(value-0xa1)<=(0xfe - 0xa1) |
| 943 + ) { |
| 944 sa->add(sa->set, c); |
| 945 } |
| 946 st3>>=1; |
| 947 @@ -609,7 +661,7 @@ |
| 948 } |
| 949 } |
| 950 |
| 951 - ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode); |
| 952 + ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode); |
| 953 } |
| 954 |
| 955 U_CFUNC void |
| 956 @@ -1694,7 +1746,7 @@ |
| 957 cnv->toUBytes[0]=*(source-1); |
| 958 cnv->toULength=_extToU(cnv, cnv->sharedData, |
| 959 1, &source, sourceLimit, |
| 960 - &target, target+targetCapacity, |
| 961 + &target, pArgs->targetLimit, |
| 962 &offsets, sourceIndex, |
| 963 pArgs->flush, |
| 964 pErrorCode); |
| 965 @@ -1739,6 +1791,65 @@ |
| 966 pArgs->offsets=offsets; |
| 967 } |
| 968 |
| 969 +static UBool |
| 970 +hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) { |
| 971 + const int32_t *row=stateTable[state]; |
| 972 + int32_t b, entry; |
| 973 + /* First test for final entries in this state for some commonly valid byte
values. */ |
| 974 + entry=row[0xa1]; |
| 975 + if( !MBCS_ENTRY_IS_TRANSITION(entry) && |
| 976 + MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL |
| 977 + ) { |
| 978 + return TRUE; |
| 979 + } |
| 980 + entry=row[0x41]; |
| 981 + if( !MBCS_ENTRY_IS_TRANSITION(entry) && |
| 982 + MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL |
| 983 + ) { |
| 984 + return TRUE; |
| 985 + } |
| 986 + /* Then test for final entries in this state. */ |
| 987 + for(b=0; b<=0xff; ++b) { |
| 988 + entry=row[b]; |
| 989 + if( !MBCS_ENTRY_IS_TRANSITION(entry) && |
| 990 + MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL |
| 991 + ) { |
| 992 + return TRUE; |
| 993 + } |
| 994 + } |
| 995 + /* Then recurse for transition entries. */ |
| 996 + for(b=0; b<=0xff; ++b) { |
| 997 + entry=row[b]; |
| 998 + if( MBCS_ENTRY_IS_TRANSITION(entry) && |
| 999 + hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE
(entry)) |
| 1000 + ) { |
| 1001 + return TRUE; |
| 1002 + } |
| 1003 + } |
| 1004 + return FALSE; |
| 1005 +} |
| 1006 + |
| 1007 +/* |
| 1008 + * Is byte b a single/lead byte in this state? |
| 1009 + * Recurse for transition states, because here we don't want to say that |
| 1010 + * b is a lead byte if all byte sequences that start with b are illegal. |
| 1011 + */ |
| 1012 +static UBool |
| 1013 +isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnl
y, uint8_t b) { |
| 1014 + const int32_t *row=stateTable[state]; |
| 1015 + int32_t entry=row[b]; |
| 1016 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */ |
| 1017 + return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_ST
ATE(entry)); |
| 1018 + } else { |
| 1019 + uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); |
| 1020 + if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) { |
| 1021 + return FALSE; /* SI/SO are illegal for DBCS-only conversion */ |
| 1022 + } else { |
| 1023 + return action!=MBCS_STATE_ILLEGAL; |
| 1024 + } |
| 1025 + } |
| 1026 +} |
| 1027 + |
| 1028 U_CFUNC void |
| 1029 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
| 1030 UErrorCode *pErrorCode) { |
| 1031 @@ -2094,6 +2205,34 @@ |
| 1032 sourceIndex=nextSourceIndex; |
| 1033 } else if(U_FAILURE(*pErrorCode)) { |
| 1034 /* callback(illegal) */ |
| 1035 + if(byteIndex>1) { |
| 1036 + /* |
| 1037 + * Ticket 5691: consistent illegal sequences: |
| 1038 + * - We include at least the first byte in the illegal sequence
. |
| 1039 + * - If any of the non-initial bytes could be the start of a ch
aracter, |
| 1040 + * we stop the illegal sequence before the first one of those
. |
| 1041 + */ |
| 1042 + UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0
); |
| 1043 + int8_t i; |
| 1044 + for(i=1; |
| 1045 + i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnl
y, bytes[i]); |
| 1046 + ++i) {} |
| 1047 + if(i<byteIndex) { |
| 1048 + /* Back out some bytes. */ |
| 1049 + int8_t backOutDistance=byteIndex-i; |
| 1050 + int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t
*)pArgs->source); |
| 1051 + byteIndex=i; /* length of reported illegal byte sequence *
/ |
| 1052 + if(backOutDistance<=bytesFromThisBuffer) { |
| 1053 + source-=backOutDistance; |
| 1054 + } else { |
| 1055 + /* Back out bytes from the previous buffer: Need to rep
lay them. */ |
| 1056 + cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutD
istance); |
| 1057 + /* preToULength is negative! */ |
| 1058 + uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength); |
| 1059 + source=(const uint8_t *)pArgs->source; |
| 1060 + } |
| 1061 + } |
| 1062 + } |
| 1063 break; |
| 1064 } else /* unassigned sequences indicated with byteIndex>0 */ { |
| 1065 /* try an extension mapping */ |
| 1066 @@ -2104,7 +2243,7 @@ |
| 1067 &offsets, sourceIndex, |
| 1068 pArgs->flush, |
| 1069 pErrorCode); |
| 1070 - sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs
->source); |
| 1071 + sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArg
s->source); |
| 1072 |
| 1073 if(U_FAILURE(*pErrorCode)) { |
| 1074 /* not mappable or buffer overflow */ |
| 1075 @@ -2395,15 +2534,37 @@ |
| 1076 |
| 1077 if(c<0) { |
| 1078 if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source)
{ |
| 1079 - *pErrorCode=U_TRUNCATED_CHAR_FOUND; |
| 1080 - } |
| 1081 - if(U_FAILURE(*pErrorCode)) { |
| 1082 /* incomplete character byte sequence */ |
| 1083 uint8_t *bytes=cnv->toUBytes; |
| 1084 cnv->toULength=(int8_t)(source-lastSource); |
| 1085 do { |
| 1086 *bytes++=*lastSource++; |
| 1087 } while(lastSource<source); |
| 1088 + *pErrorCode=U_TRUNCATED_CHAR_FOUND; |
| 1089 + } else if(U_FAILURE(*pErrorCode)) { |
| 1090 + /* callback(illegal) */ |
| 1091 + /* |
| 1092 + * Ticket 5691: consistent illegal sequences: |
| 1093 + * - We include at least the first byte in the illegal sequence. |
| 1094 + * - If any of the non-initial bytes could be the start of a charac
ter, |
| 1095 + * we stop the illegal sequence before the first one of those. |
| 1096 + */ |
| 1097 + UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); |
| 1098 + uint8_t *bytes=cnv->toUBytes; |
| 1099 + *bytes++=*lastSource++; /* first byte */ |
| 1100 + if(lastSource==source) { |
| 1101 + cnv->toULength=1; |
| 1102 + } else /* lastSource<source: multi-byte character */ { |
| 1103 + int8_t i; |
| 1104 + for(i=1; |
| 1105 + lastSource<source && !isSingleOrLead(stateTable, state, isD
BCSOnly, *lastSource); |
| 1106 + ++i |
| 1107 + ) { |
| 1108 + *bytes++=*lastSource++; |
| 1109 + } |
| 1110 + cnv->toULength=i; |
| 1111 + source=lastSource; |
| 1112 + } |
| 1113 } else { |
| 1114 /* no output because of empty input or only state changes */ |
| 1115 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
| 1116 @@ -3237,7 +3398,7 @@ |
| 1117 lastSource=source; |
| 1118 c=_extFromU(cnv, cnv->sharedData, |
| 1119 c, &source, sourceLimit, |
| 1120 - &target, target+targetCapacity, |
| 1121 + &target, (const uint8_t *)(pArgs->targetLimit), |
| 1122 &offsets, sourceIndex, |
| 1123 pArgs->flush, |
| 1124 pErrorCode); |
| 1125 --- r22777/source/common/ucnvmbcs.h 2007-10-11 14:31:32.196532000 -0700 |
| 1126 +++ chrome.canonical/source/common/ucnvmbcs.h 2009-03-23 12:30:17.315007000 -0
700 |
| 1127 @@ -492,6 +492,8 @@ |
| 1128 UCNV_SET_FILTER_DBCS_ONLY, |
| 1129 UCNV_SET_FILTER_2022_CN, |
| 1130 UCNV_SET_FILTER_SJIS, |
| 1131 + UCNV_SET_FILTER_GR94DBCS, |
| 1132 + UCNV_SET_FILTER_HZ, |
| 1133 UCNV_SET_FILTER_COUNT |
| 1134 } UConverterSetFilter; |
| 1135 |
| 1136 --- r22777/source/common/ucnv.c 2007-08-31 12:39:14.294200000 -0700 |
| 1137 +++ chrome.canonical/source/common/ucnv.c 2009-03-23 12:40:10.566608000 -0
700 |
| 1138 @@ -1528,11 +1528,14 @@ |
| 1139 cnv->toULength=0; |
| 1140 |
| 1141 /* call the callback function */ |
| 1142 + if(cnv->toUCallbackReason==UCNV_ILLEGAL && *err==U_INVALID_CHAR_FOU
ND) { |
| 1143 + cnv->toUCallbackReason = UCNV_UNASSIGNED; |
| 1144 + } |
| 1145 cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, |
| 1146 cnv->invalidCharBuffer, errorInputLength, |
| 1147 - (*err==U_INVALID_CHAR_FOUND || *err==U_UNSUPPORTED_ESCAPE_SEQUE
NCE) ? |
| 1148 - UCNV_UNASSIGNED : UCNV_ILLEGAL, |
| 1149 + cnv->toUCallbackReason, |
| 1150 err); |
| 1151 + cnv->toUCallbackReason = UCNV_ILLEGAL; /* reset to default value */ |
| 1152 |
| 1153 /* |
| 1154 * loop back to the offset handling |
| 1155 --- r22777/source/common/uset_imp.h 2007-07-24 19:51:25.692061000 -0700 |
| 1156 +++ chrome.canonical/source/common/uset_imp.h 2009-03-23 12:30:09.893067000 -0
700 |
| 1157 @@ -36,6 +36,9 @@ |
| 1158 typedef void U_CALLCONV |
| 1159 USetRemove(USet *set, UChar32 c); |
| 1160 |
| 1161 +typedef void U_CALLCONV |
| 1162 +USetRemoveRange(USet *set, UChar32 start, UChar32 end); |
| 1163 + |
| 1164 /** |
| 1165 * Interface for adding items to a USet, to keep low-level code from |
| 1166 * statically depending on the USet implementation. |
| 1167 @@ -47,6 +50,7 @@ |
| 1168 USetAddRange *addRange; |
| 1169 USetAddString *addString; |
| 1170 USetRemove *remove; |
| 1171 + USetRemoveRange *removeRange; |
| 1172 }; |
| 1173 typedef struct USetAdder USetAdder; |
| 1174 |
| 1175 --- r22777/source/common/ucnv2022.c 2007-10-11 14:31:32.196532000 -0700 |
| 1176 +++ chrome.canonical/source/common/ucnv2022.c 2009-03-23 12:57:38.398368000 -0
700 |
| 1177 @@ -201,6 +201,7 @@ |
| 1178 #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 1179 UBool isFirstBuffer; |
| 1180 #endif |
| 1181 + UBool isEmptySegment; |
| 1182 char name[30]; |
| 1183 char locale[3]; |
| 1184 }UConverterDataISO2022; |
| 1185 @@ -609,6 +610,7 @@ |
| 1186 if(choice<=UCNV_RESET_TO_UNICODE) { |
| 1187 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); |
| 1188 myConverterData->key = 0; |
| 1189 + myConverterData->isEmptySegment = FALSE; |
| 1190 } |
| 1191 if(choice!=UCNV_RESET_TO_UNICODE) { |
| 1192 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); |
| 1193 @@ -752,6 +754,7 @@ |
| 5 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraIn
fo); | 1194 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraIn
fo); |
| 6 uint32_t key = myData2022->key; | 1195 uint32_t key = myData2022->key; |
| 7 int32_t offset = 0; | 1196 int32_t offset = 0; |
| 8 + int8_t initialToULength = _this->toULength; | 1197 + int8_t initialToULength = _this->toULength; |
| 9 char c; | 1198 char c; |
| 10 | 1199 |
| 11 value = VALID_NON_TERMINAL_2022; | 1200 value = VALID_NON_TERMINAL_2022; |
| 12 @@ -804,7 +805,6 @@ | 1201 @@ -804,7 +807,6 @@ |
| 13 return; | 1202 return; |
| 14 } else if (value == INVALID_2022 ) { | 1203 } else if (value == INVALID_2022 ) { |
| 15 *err = U_ILLEGAL_ESCAPE_SEQUENCE; | 1204 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 16 - return; | 1205 - return; |
| 17 } else /* value == VALID_TERMINAL_2022 */ { | 1206 } else /* value == VALID_TERMINAL_2022 */ { |
| 18 switch(var){ | 1207 switch(var){ |
| 19 #ifdef U_ENABLE_GENERIC_ISO_2022 | 1208 #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 20 @@ -935,6 +935,35 @@ | 1209 @@ -814,6 +816,7 @@ |
| 1210 if(chosenConverterName == NULL) { |
| 1211 /* SS2 or SS3 */ |
| 1212 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| 1213 + _this->toUCallbackReason = UCNV_UNASSIGNED; |
| 1214 return; |
| 1215 } |
| 1216 |
| 1217 @@ -935,6 +938,37 @@ |
| 21 } | 1218 } |
| 22 if(U_SUCCESS(*err)) { | 1219 if(U_SUCCESS(*err)) { |
| 23 _this->toULength = 0; | 1220 _this->toULength = 0; |
| 24 + } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { | 1221 + } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { |
| 25 + if(_this->toULength>1) { | 1222 + if(_this->toULength>1) { |
| 26 + /* | 1223 + /* |
| 27 + * Ticket 5691: consistent illegal sequences: | 1224 + * Ticket 5691: consistent illegal sequences: |
| 28 + * - We include at least the first byte (ESC) in the illegal sequen
ce. | 1225 + * - We include at least the first byte (ESC) in the illegal sequen
ce. |
| 29 + * - If any of the non-initial bytes could be the start of a charac
ter, | 1226 + * - If any of the non-initial bytes could be the start of a charac
ter, |
| 30 + * we stop the illegal sequence before the first one of those. | 1227 + * we stop the illegal sequence before the first one of those. |
| (...skipping 12 matching lines...) Expand all Loading... |
| 43 + } else { | 1240 + } else { |
| 44 + /* Back out bytes from the previous buffer: Need to replay them
. */ | 1241 + /* Back out bytes from the previous buffer: Need to replay them
. */ |
| 45 + _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistanc
e); | 1242 + _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistanc
e); |
| 46 + /* same as -(initialToULength-1) */ | 1243 + /* same as -(initialToULength-1) */ |
| 47 + /* preToULength is negative! */ | 1244 + /* preToULength is negative! */ |
| 48 + uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULen
gth); | 1245 + uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULen
gth); |
| 49 + *source-=bytesFromThisBuffer; | 1246 + *source-=bytesFromThisBuffer; |
| 50 + } | 1247 + } |
| 51 + _this->toULength=1; | 1248 + _this->toULength=1; |
| 52 + } | 1249 + } |
| 1250 + } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { |
| 1251 + _this->toUCallbackReason = UCNV_UNASSIGNED; |
| 53 } | 1252 } |
| 54 } | 1253 } |
| 55 | 1254 |
| 56 @@ -1097,6 +1126,24 @@ | 1255 @@ -1113,6 +1147,24 @@ |
| 1256 } |
| 57 } | 1257 } |
| 58 | 1258 |
| 59 /* | 1259 +#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after
that. */ |
| 60 + * * Check that the result is a 2-byte value with each byte in the range A1..F
E | 1260 +/* |
| 61 + * * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byt
e | 1261 + * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code poi
nt, it returns the |
| 62 + * * to move it to the ISO 2022 range 21..7E. | 1262 + * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns
the 2022 code point |
| 63 + * * Return 0 if out of range. | 1263 + * unchanged. |
| 64 + * */ | 1264 + */ |
| 65 +static U_INLINE uint32_t | 1265 +static U_INLINE uint32_t |
| 66 +_2022FromGR94DBCS(uint32_t value) { | 1266 +_2022ToGR94DBCS(uint32_t value) { |
| 67 + if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && | 1267 + uint32_t returnValue = value + 0x8080; |
| 68 + (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) | 1268 + if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) && |
| 69 + ) { | 1269 + (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) { |
| 70 + return value - 0x8080; /* shift down to 21..7e byte range */ | 1270 + return returnValue; |
| 71 + } else { | 1271 + } else { |
| 72 + return 0; /* not valid for ISO 2022 */ | 1272 + return value; |
| 73 + } | 1273 + } |
| 74 +} | 1274 +} |
| 1275 +#endif |
| 75 + | 1276 + |
| 76 +#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after
that. */ | |
| 77 +/* | |
| 78 * Check that the result is a 2-byte value with each byte in the range A1..FE | |
| 79 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte | |
| 80 * to move it to the ISO 2022 range 21..7E. | |
| 81 @@ -1112,6 +1159,7 @@ | |
| 82 return 0; /* not valid for ISO 2022 */ | |
| 83 } | |
| 84 } | |
| 85 +#endif | |
| 86 | |
| 87 #ifdef U_ENABLE_GENERIC_ISO_2022 | 1277 #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 88 | 1278 |
| 89 @@ -1953,6 +2001,7 @@ | 1279 /******************************************************************************
**** |
| 1280 @@ -1436,7 +1488,7 @@ |
| 1281 c2 = 0; /* invalid */ |
| 1282 } |
| 1283 } else { |
| 1284 - if((uint8_t)(c2-0x21) <= (0x7e-0x21)) { |
| 1285 + if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) { |
| 1286 c2 += 0x7e; |
| 1287 } else { |
| 1288 c2 = 0; /* invalid */ |
| 1289 @@ -1953,6 +2005,7 @@ |
| 90 const char *mySourceLimit = args->sourceLimit; | 1290 const char *mySourceLimit = args->sourceLimit; |
| 91 uint32_t targetUniChar = 0x0000; | 1291 uint32_t targetUniChar = 0x0000; |
| 92 uint32_t mySourceChar = 0x0000; | 1292 uint32_t mySourceChar = 0x0000; |
| 93 + uint32_t tmpSourceChar = 0x0000; | 1293 + uint32_t tmpSourceChar = 0x0000; |
| 94 UConverterDataISO2022* myData; | 1294 UConverterDataISO2022* myData; |
| 95 ISO2022State *pToU2022State; | 1295 ISO2022State *pToU2022State; |
| 96 StateEnum cs; | 1296 StateEnum cs; |
| 97 @@ -1968,6 +2017,7 @@ | 1297 @@ -1968,6 +2021,7 @@ |
| 98 mySourceChar = args->converter->toUBytes[0]; | 1298 mySourceChar = args->converter->toUBytes[0]; |
| 99 args->converter->toULength = 0; | 1299 args->converter->toULength = 0; |
| 100 cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; | 1300 cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
| 101 + targetUniChar = missingCharMarker; | 1301 + targetUniChar = missingCharMarker; |
| 102 goto getTrailByte; | 1302 goto getTrailByte; |
| 103 } | 1303 } |
| 104 | 1304 |
| 105 @@ -2077,17 +2127,44 @@ | 1305 @@ -1986,6 +2040,7 @@ |
| 1306 continue; |
| 1307 } else { |
| 1308 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ |
| 1309 + myData->isEmptySegment = FALSE;» /* reset this, we have a
different error */ |
| 1310 break; |
| 1311 } |
| 1312 |
| 1313 @@ -1997,21 +2052,39 @@ |
| 1314 continue; |
| 1315 } else { |
| 1316 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ |
| 1317 + myData->isEmptySegment = FALSE;» /* reset this, we have a
different error */ |
| 1318 break; |
| 1319 } |
| 1320 |
| 1321 case ESC_2022: |
| 1322 mySource--; |
| 1323 escape: |
| 1324 - changeState_2022(args->converter,&(mySource), |
| 1325 - mySourceLimit, ISO_2022_JP,err); |
| 1326 + { |
| 1327 + const char * mySourceBefore = mySource; |
| 1328 + int8_t toULengthBefore = args->converter->toULength; |
| 1329 + |
| 1330 + changeState_2022(args->converter,&(mySource), |
| 1331 + mySourceLimit, ISO_2022_JP,err); |
| 1332 + |
| 1333 + /* If in ISO-2022-JP only and we successully completed an e
scape sequence, but previous segment was empty, create an error */ |
| 1334 + if(myData->version==0 && myData->key==0 && U_SUCCESS(*err)
&& myData->isEmptySegment) { |
| 1335 + *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 1336 + args->converter->toUCallbackReason = UCNV_IRREGULAR; |
| 1337 + args->converter->toULength = toULengthBefore + (mySourc
e - mySourceBefore); |
| 1338 + } |
| 1339 + } |
| 1340 |
| 1341 /* invalid or illegal escape sequence */ |
| 1342 if(U_FAILURE(*err)){ |
| 1343 args->target = myTarget; |
| 1344 args->source = mySource; |
| 1345 + myData->isEmptySegment = FALSE;» /* Reset to avoid future
spurious errors */ |
| 1346 return; |
| 1347 } |
| 1348 + /* If we successfully completed an escape sequence, we begin a
new segment, empty so far */ |
| 1349 + if(myData->key==0) { |
| 1350 + myData->isEmptySegment = TRUE; |
| 1351 + } |
| 1352 continue; |
| 1353 |
| 1354 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ |
| 1355 @@ -2028,6 +2101,7 @@ |
| 1356 /* falls through */ |
| 1357 default: |
| 1358 /* convert one or two bytes */ |
| 1359 + myData->isEmptySegment = FALSE; |
| 1360 cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
| 1361 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->
version==4 && |
| 1362 !IS_JP_DBCS(cs) |
| 1363 @@ -2077,17 +2151,44 @@ |
| 106 default: | 1364 default: |
| 107 /* G0 DBCS */ | 1365 /* G0 DBCS */ |
| 108 if(mySource < mySourceLimit) { | 1366 if(mySource < mySourceLimit) { |
| 109 - char trailByte; | 1367 - char trailByte; |
| 110 + int leadIsOk, trailIsOk; | 1368 + int leadIsOk, trailIsOk; |
| 111 + uint8_t trailByte; | 1369 + uint8_t trailByte; |
| 112 getTrailByte: | 1370 getTrailByte: |
| 113 - trailByte = *mySource++; | 1371 - trailByte = *mySource++; |
| 114 - if(cs == JISX208) { | 1372 - if(cs == JISX208) { |
| 115 - _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailBy
te, tempBuf); | 1373 - _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailBy
te, tempBuf); |
| (...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 149 + /* report a pair of illegal bytes if the second byt
e is not a DBCS starter */ | 1407 + /* report a pair of illegal bytes if the second byt
e is not a DBCS starter */ |
| 150 + ++mySource; | 1408 + ++mySource; |
| 151 + /* add another bit so that the code below writes 2
bytes in case of error */ | 1409 + /* add another bit so that the code below writes 2
bytes in case of error */ |
| 152 + mySourceChar = 0x10000 | (mySourceChar << 8) | trai
lByte; | 1410 + mySourceChar = 0x10000 | (mySourceChar << 8) | trai
lByte; |
| 153 } | 1411 } |
| 154 - mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByt
e); | 1412 - mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByt
e); |
| 155 - targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myC
onverterArray[cs], tempBuf, 2, FALSE); | 1413 - targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myC
onverterArray[cs], tempBuf, 2, FALSE); |
| 156 } else { | 1414 } else { |
| 157 args->converter->toUBytes[0] = (uint8_t)mySourceChar; | 1415 args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
| 158 args->converter->toULength = 1; | 1416 args->converter->toULength = 1; |
| 159 @@ -2229,7 +2306,12 @@ | 1417 @@ -2229,7 +2330,12 @@ |
| 160 } | 1418 } |
| 161 /* only DBCS or SBCS characters are expected*/ | 1419 /* only DBCS or SBCS characters are expected*/ |
| 162 /* DB characters with high bit set to 1 are expected */ | 1420 /* DB characters with high bit set to 1 are expected */ |
| 163 - if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)
&& length==2)){ | 1421 - if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)
&& length==2)){ |
| 164 + if( length > 2 || length==0 || | 1422 + if( length > 2 || length==0 || |
| 165 + (length == 1 && targetByteUnit > 0x7f) || | 1423 + (length == 1 && targetByteUnit > 0x7f) || |
| 166 + (length == 2 && | 1424 + (length == 2 && |
| 167 + ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || | 1425 + ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || |
| 168 + (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) | 1426 + (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) |
| 169 + ) { | 1427 + ) { |
| 170 targetByteUnit=missingCharMarker; | 1428 targetByteUnit=missingCharMarker; |
| 171 } | 1429 } |
| 172 if (targetByteUnit != missingCharMarker){ | 1430 if (targetByteUnit != missingCharMarker){ |
| 173 @@ -2545,17 +2627,34 @@ | 1431 @@ -2524,15 +2630,27 @@ |
| 174 | 1432 |
| 1433 if(mySourceChar==UCNV_SI){ |
| 1434 myData->toU2022State.g = 0; |
| 1435 + if (myData->isEmptySegment) { |
| 1436 + myData->isEmptySegment = FALSE; /* we are handling it, r
eset to avoid future spurious errors */ |
| 1437 + *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 1438 + args->converter->toUCallbackReason = UCNV_IRREGULAR; |
| 1439 + args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
| 1440 + args->converter->toULength = 1; |
| 1441 + args->target = myTarget; |
| 1442 + args->source = mySource; |
| 1443 + return; |
| 1444 + } |
| 1445 /*consume the source */ |
| 1446 continue; |
| 1447 }else if(mySourceChar==UCNV_SO){ |
| 1448 myData->toU2022State.g = 1; |
| 1449 + myData->isEmptySegment = TRUE; /* Begin a new segment, empty so
far */ |
| 1450 /*consume the source */ |
| 1451 continue; |
| 1452 }else if(mySourceChar==ESC_2022){ |
| 1453 mySource--; |
| 1454 escape: |
| 1455 + myData->isEmptySegment = FALSE; /* Any invalid ESC seque
nces will be detected separately, so just reset this */ |
| 1456 changeState_2022(args->converter,&(mySource), |
| 1457 mySourceLimit, ISO_2022_KR, err); |
| 1458 if(U_FAILURE(*err)){ |
| 1459 @@ -2543,19 +2661,37 @@ |
| 1460 continue; |
| 1461 } |
| 1462 |
| 1463 + myData->isEmptySegment = FALSE; /* Any invalid char errors will
be detected separately, so just reset this */ |
| 175 if(myData->toU2022State.g == 1) { | 1464 if(myData->toU2022State.g == 1) { |
| 176 if(mySource < mySourceLimit) { | 1465 if(mySource < mySourceLimit) { |
| 177 - char trailByte; | 1466 - char trailByte; |
| 178 + int leadIsOk, trailIsOk; | 1467 + int leadIsOk, trailIsOk; |
| 179 + uint8_t trailByte; | 1468 + uint8_t trailByte; |
| 180 getTrailByte: | 1469 getTrailByte: |
| 181 - trailByte = *mySource++; | 1470 - trailByte = *mySource++; |
| 182 - tempBuf[0] = (char)(mySourceChar + 0x80); | 1471 - tempBuf[0] = (char)(mySourceChar + 0x80); |
| 183 - tempBuf[1] = (char)(trailByte + 0x80); | 1472 - tempBuf[1] = (char)(trailByte + 0x80); |
| 184 - mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); | 1473 - mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); |
| (...skipping 22 matching lines...) Expand all Loading... |
| 207 - targetUniChar = missingCharMarker; | 1496 - targetUniChar = missingCharMarker; |
| 208 + mySourceChar = (mySourceChar << 8) | trailByte; | 1497 + mySourceChar = (mySourceChar << 8) | trailByte; |
| 209 + } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { | 1498 + } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { |
| 210 + /* report a pair of illegal bytes if the second byte is
not a DBCS starter */ | 1499 + /* report a pair of illegal bytes if the second byte is
not a DBCS starter */ |
| 211 + ++mySource; | 1500 + ++mySource; |
| 212 + /* add another bit so that the code below writes 2 byte
s in case of error */ | 1501 + /* add another bit so that the code below writes 2 byte
s in case of error */ |
| 213 + mySourceChar = 0x10000 | (mySourceChar << 8) | trailByt
e; | 1502 + mySourceChar = 0x10000 | (mySourceChar << 8) | trailByt
e; |
| 214 } | 1503 } |
| 215 } else { | 1504 } else { |
| 216 args->converter->toUBytes[0] = (uint8_t)mySourceChar; | 1505 args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
| 217 @@ -2563,8 +2662,10 @@ | 1506 @@ -2563,8 +2699,10 @@ |
| 218 break; | 1507 break; |
| 219 } | 1508 } |
| 220 } | 1509 } |
| 221 - else{ | 1510 - else{ |
| 222 + else if(mySourceChar <= 0x7f) { | 1511 + else if(mySourceChar <= 0x7f) { |
| 223 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySourc
e - 1, 1, useFallback); | 1512 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySourc
e - 1, 1, useFallback); |
| 224 + } else { | 1513 + } else { |
| 225 + targetUniChar = 0xffff; | 1514 + targetUniChar = 0xffff; |
| 226 } | 1515 } |
| 227 if(targetUniChar < 0xfffe){ | 1516 if(targetUniChar < 0xfffe){ |
| 228 if(args->offsets) { | 1517 if(args->offsets) { |
| 229 @@ -3061,6 +3162,7 @@ | 1518 @@ -3061,6 +3199,7 @@ |
| 230 /* continue with a partial double-byte character */ | 1519 /* continue with a partial double-byte character */ |
| 231 mySourceChar = args->converter->toUBytes[0]; | 1520 mySourceChar = args->converter->toUBytes[0]; |
| 232 args->converter->toULength = 0; | 1521 args->converter->toULength = 0; |
| 233 + targetUniChar = missingCharMarker; | 1522 + targetUniChar = missingCharMarker; |
| 234 goto getTrailByte; | 1523 goto getTrailByte; |
| 235 } | 1524 } |
| 236 | 1525 |
| 237 @@ -3114,29 +3216,50 @@ | 1526 @@ -3075,27 +3214,52 @@ |
| 1527 switch(mySourceChar){ |
| 1528 case UCNV_SI: |
| 1529 pToU2022State->g=0; |
| 1530 + if (myData->isEmptySegment) { |
| 1531 + myData->isEmptySegment = FALSE;» /* we are handling it, r
eset to avoid future spurious errors */ |
| 1532 + *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 1533 + args->converter->toUCallbackReason = UCNV_IRREGULAR; |
| 1534 + args->converter->toUBytes[0] = mySourceChar; |
| 1535 + args->converter->toULength = 1; |
| 1536 + args->target = myTarget; |
| 1537 + args->source = mySource; |
| 1538 + return; |
| 1539 + } |
| 1540 continue; |
| 1541 |
| 1542 case UCNV_SO: |
| 1543 if(pToU2022State->cs[1] != 0) { |
| 1544 pToU2022State->g=1; |
| 1545 + myData->isEmptySegment = TRUE;» /* Begin a new segment,
empty so far */ |
| 1546 continue; |
| 1547 } else { |
| 1548 /* illegal to have SO before a matching designator */ |
| 1549 + myData->isEmptySegment = FALSE;» /* Handling a different
error, reset this to avoid future spurious errs */ |
| 1550 break; |
| 1551 } |
| 1552 |
| 1553 case ESC_2022: |
| 1554 mySource--; |
| 1555 escape: |
| 1556 - changeState_2022(args->converter,&(mySource), |
| 1557 - mySourceLimit, ISO_2022_CN,err); |
| 1558 + { |
| 1559 + const char * mySourceBefore = mySource; |
| 1560 + int8_t toULengthBefore = args->converter->toULength; |
| 1561 + |
| 1562 + changeState_2022(args->converter,&(mySource), |
| 1563 + mySourceLimit, ISO_2022_CN,err); |
| 1564 + |
| 1565 + /* After SO there must be at least one character before a d
esignator (designator error handled separately) */ |
| 1566 + if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegm
ent) { |
| 1567 + *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 1568 + args->converter->toUCallbackReason = UCNV_IRREGULAR; |
| 1569 + args->converter->toULength = toULengthBefore + (mySourc
e - mySourceBefore); |
| 1570 + } |
| 1571 + } |
| 1572 |
| 1573 /* invalid or illegal escape sequence */ |
| 1574 if(U_FAILURE(*err)){ |
| 1575 args->target = myTarget; |
| 1576 args->source = mySource; |
| 1577 + myData->isEmptySegment = FALSE;» /* Reset to avoid future
spurious errors */ |
| 1578 return; |
| 1579 } |
| 1580 continue; |
| 1581 @@ -3109,34 +3273,56 @@ |
| 1582 /* falls through */ |
| 1583 default: |
| 1584 /* convert one or two bytes */ |
| 1585 + myData->isEmptySegment = FALSE; |
| 1586 if(pToU2022State->g != 0) { |
| 1587 if(mySource < mySourceLimit) { |
| 238 UConverterSharedData *cnv; | 1588 UConverterSharedData *cnv; |
| 239 StateEnum tempState; | 1589 StateEnum tempState; |
| 240 int32_t tempBufLen; | 1590 int32_t tempBufLen; |
| 241 - char trailByte; | 1591 - char trailByte; |
| 242 + int leadIsOk, trailIsOk; | 1592 + int leadIsOk, trailIsOk; |
| 243 + uint8_t trailByte; | 1593 + uint8_t trailByte; |
| 244 getTrailByte: | 1594 getTrailByte: |
| 245 - trailByte = *mySource++; | 1595 - trailByte = *mySource++; |
| 246 - tempState = (StateEnum)pToU2022State->cs[pToU2022State-
>g]; | 1596 - tempState = (StateEnum)pToU2022State->cs[pToU2022State-
>g]; |
| 247 - if(tempState > CNS_11643_0) { | 1597 - if(tempState > CNS_11643_0) { |
| (...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 295 } | 1645 } |
| 296 - mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByt
e); | 1646 - mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByt
e); |
| 297 if(pToU2022State->g>=2) { | 1647 if(pToU2022State->g>=2) { |
| 298 /* return from a single-shift state to the previous
one */ | 1648 /* return from a single-shift state to the previous
one */ |
| 299 pToU2022State->g=pToU2022State->prevG; | 1649 pToU2022State->g=pToU2022State->prevG; |
| 300 } | 1650 } |
| 301 - targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBu
f, tempBufLen, FALSE); | 1651 - targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBu
f, tempBufLen, FALSE); |
| 302 } else { | 1652 } else { |
| 303 args->converter->toUBytes[0] = (uint8_t)mySourceChar; | 1653 args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
| 304 args->converter->toULength = 1; | 1654 args->converter->toULength = 1; |
| 305 diff -ru trie.clean/source/common/ucnvmbcs.c chrome.canonical/source/common/ucnv
mbcs.c | 1655 @@ -3399,11 +3585,19 @@ |
| 306 --- trie.clean/source/common/ucnvmbcs.c 2007-11-07 17:39:05.057870000 -0800 | 1656 /* include ASCII for JP */ |
| 307 +++ chrome.canonical/source/common/ucnvmbcs.c 2008-10-29 11:34:34.648518000 -0
700 | 1657 sa->addRange(sa->set, 0, 0x7f); |
| 1658 } |
| 1659 - if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) { |
| 1660 + if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_
AND_FALLBACK_SET) { |
| 1661 /* |
| 1662 - * TODO(markus): If and when ucnv_getUnicodeSet() supports fallback
s, |
| 1663 - * we need to include half-width Katakana for all JP variants becau
se |
| 1664 - * JIS X 0208 has hardcoded fallbacks for them. |
| 1665 + * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!
=0 |
| 1666 + * because the bit is on for all JP versions although only versions
3 & 4 (JIS7 & JIS8) |
| 1667 + * use half-width Katakana. |
| 1668 + * This is because all ISO-2022-JP variants are lenient in that the
y accept (in toUnicode) |
| 1669 + * half-width Katakana via the ESC ( I sequence. |
| 1670 + * However, we only emit (fromUnicode) half-width Katakana accordin
g to the |
| 1671 + * definition of each variant. |
| 1672 + * |
| 1673 + * When including fallbacks, |
| 1674 + * we need to include half-width Katakana Unicode code points for a
ll JP variants because |
| 1675 + * JIS X 0208 has hardcoded fallbacks for them (which map to full-w
idth Katakana). |
| 1676 */ |
| 1677 /* include half-width Katakana for JP */ |
| 1678 sa->addRange(sa->set, HWKANA_START, HWKANA_END); |
| 1679 @@ -3457,6 +3651,12 @@ |
| 1680 * corresponding to JIS X 0208. |
| 1681 */ |
| 1682 filter=UCNV_SET_FILTER_SJIS; |
| 1683 + } else if(i==KSC5601) { |
| 1684 + /* |
| 1685 + * Some of the KSC 5601 tables (convrtrs.txt has this aliases o
n multiple tables) |
| 1686 + * are broader than GR94. |
| 1687 + */ |
| 1688 + filter=UCNV_SET_FILTER_GR94DBCS; |
| 1689 } else { |
| 1690 filter=UCNV_SET_FILTER_NONE; |
| 1691 } |
| 1692 @@ -3472,6 +3672,9 @@ |
| 1693 sa->remove(sa->set, 0x0e); |
| 1694 sa->remove(sa->set, 0x0f); |
| 1695 sa->remove(sa->set, 0x1b); |
| 1696 + |
| 1697 + /* ISO 2022 converters do not convert C1 controls either */ |
| 1698 + sa->removeRange(sa->set, 0x80, 0x9f); |
| 1699 } |
| 1700 |
| 1701 static const UConverterImpl _ISO2022Impl={ |
| 1702 --- r22777/source/common/ucnv_lmb.c 2006-08-19 14:27:08.000000000 -0700 |
| 1703 +++ chrome.canonical/source/common/ucnv_lmb.c 2009-03-23 12:30:26.043293000 -0
700 |
| 1704 @@ -1,6 +1,6 @@ |
| 1705 /* |
| 1706 ********************************************************************** |
| 1707 -* Copyright (C) 2000-2006, International Business Machines |
| 1708 +* Copyright (C) 2000-2007, International Business Machines |
| 1709 * Corporation and others. All Rights Reserved. |
| 1710 ********************************************************************** |
| 1711 * file name: ucnv_lmb.cpp |
| 1712 @@ -536,7 +536,7 @@ |
| 1713 NULL,\ |
| 1714 NULL,\ |
| 1715 _LMBCSSafeClone,\ |
| 1716 - _LMBCSGetUnicodeSet\ |
| 1717 + ucnv_getCompleteUnicodeSet\ |
| 1718 };\ |
| 1719 static const UConverterStaticData _LMBCSStaticData##n={\ |
| 1720 sizeof(UConverterStaticData),\ |
| 1721 @@ -662,15 +662,14 @@ |
| 1722 return &newLMBCS->cnv; |
| 1723 } |
| 1724 |
| 1725 -static void |
| 1726 -_LMBCSGetUnicodeSet(const UConverter *cnv, |
| 1727 - const USetAdder *sa, |
| 1728 - UConverterUnicodeSet which, |
| 1729 - UErrorCode *pErrorCode) { |
| 1730 - /* all but U+F6xx, see LMBCS explanation above (search for F6xx) */ |
| 1731 - sa->addRange(sa->set, 0, 0xf5ff); |
| 1732 - sa->addRange(sa->set, 0xf700, 0x10ffff); |
| 1733 -} |
| 1734 +/* |
| 1735 + * There used to be a _LMBCSGetUnicodeSet() function here (up to svn revision 2
0117) |
| 1736 + * which added all code points except for U+F6xx |
| 1737 + * because those cannot be represented in the Unicode group. |
| 1738 + * However, it turns out that windows-950 has roundtrips for all of U+F6xx |
| 1739 + * which means that LMBCS can convert all Unicode code points after all. |
| 1740 + * We now simply use ucnv_getCompleteUnicodeSet(). |
| 1741 + */ |
| 1742 |
| 1743 /* |
| 1744 Here's the basic helper function that we use when converting from |
| 1745 --- r22777/source/common/ucnvhz.c 2006-07-05 16:08:50.000000000 -0700 |
| 1746 +++ chrome.canonical/source/common/ucnvhz.c 2009-03-23 12:42:01.208181000 -0
700 |
| 1747 @@ -1,6 +1,6 @@ |
| 1748 /* |
| 1749 ********************************************************************** |
| 1750 -* Copyright (C) 2000-2006, International Business Machines |
| 1751 +* Copyright (C) 2000-2007, International Business Machines |
| 1752 * Corporation and others. All Rights Reserved. |
| 1753 ********************************************************************** |
| 1754 * file name: ucnvhz.c |
| 1755 @@ -59,6 +59,7 @@ |
| 1756 UBool isEscapeAppended; |
| 1757 UBool isStateDBCS; |
| 1758 UBool isTargetUCharDBCS; |
| 1759 + UBool isEmptySegment; |
| 1760 }UConverterDataHZ; |
| 1761 |
| 1762 |
| 1763 @@ -72,7 +73,7 @@ |
| 1764 cnv->extraInfo = uprv_malloc(sizeof(UConverterDataHZ)); |
| 1765 if(cnv->extraInfo != NULL){ |
| 1766 uprv_memset(cnv->extraInfo, 0, sizeof(UConverterDataHZ)); |
| 1767 - ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386"
,errorCode); |
| 1768 + ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("GBK",erro
rCode); |
| 1769 } |
| 1770 else { |
| 1771 *errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 1772 @@ -98,6 +99,7 @@ |
| 1773 cnv->mode=0; |
| 1774 if(cnv->extraInfo != NULL){ |
| 1775 ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE; |
| 1776 + ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE; |
| 1777 } |
| 1778 } |
| 1779 if(choice!=UCNV_RESET_TO_UNICODE) { |
| 1780 @@ -130,6 +132,10 @@ |
| 1781 * from-GB code '~}' ($7E7D) is outside the defined GB range.) |
| 1782 * |
| 1783 * Source: RFC 1842 |
| 1784 +* |
| 1785 +* Note that the formal syntax in RFC 1842 is invalid. I assume that the |
| 1786 +* intended definition of single-byte-segment is as follows (pedberg): |
| 1787 +* single-byte-segment = single-byte-seq 1*single-byte-char |
| 1788 */ |
| 1789 |
| 1790 |
| 1791 @@ -141,7 +147,7 @@ |
| 1792 UChar *myTarget = args->target; |
| 1793 const char *mySourceLimit = args->sourceLimit; |
| 1794 UChar32 targetUniChar = 0x0000; |
| 1795 - UChar mySourceChar = 0x0000; |
| 1796 + int32_t mySourceChar = 0x0000; |
| 1797 UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo); |
| 1798 tempBuf[0]=0; |
| 1799 tempBuf[1]=0; |
| 1800 @@ -156,90 +162,123 @@ |
| 1801 |
| 1802 mySourceChar= (unsigned char) *mySource++; |
| 1803 |
| 1804 - switch(mySourceChar){ |
| 1805 + if(args->converter->mode == UCNV_TILDE) { |
| 1806 + /* second byte after ~ */ |
| 1807 + args->converter->mode=0; |
| 1808 + switch(mySourceChar) { |
| 1809 case 0x0A: |
| 1810 - if(args->converter->mode ==UCNV_TILDE){ |
| 1811 - args->converter->mode=0; |
| 1812 - |
| 1813 - } |
| 1814 - *(myTarget++)=(UChar)mySourceChar; |
| 1815 + /* no output for ~\n (line-continuation marker) */ |
| 1816 continue; |
| 1817 - |
| 1818 case UCNV_TILDE: |
| 1819 - if(args->converter->mode ==UCNV_TILDE){ |
| 1820 - *(myTarget++)=(UChar)mySourceChar; |
| 1821 - args->converter->mode=0; |
| 1822 - continue; |
| 1823 - |
| 1824 + if(args->offsets) { |
| 1825 + args->offsets[myTarget - args->target]=(int32_t)(mySour
ce - args->source - 2); |
| 1826 } |
| 1827 - else if(args->converter->toUnicodeStatus !=0){ |
| 1828 - args->converter->mode=0; |
| 1829 - break; |
| 1830 - } |
| 1831 - else{ |
| 1832 - args->converter->mode = UCNV_TILDE; |
| 1833 - continue; |
| 1834 - } |
| 1835 - |
| 1836 - |
| 1837 + *(myTarget++)=(UChar)mySourceChar; |
| 1838 + myData->isEmptySegment = FALSE; |
| 1839 + continue; |
| 1840 case UCNV_OPEN_BRACE: |
| 1841 - if(args->converter->mode == UCNV_TILDE){ |
| 1842 - args->converter->mode=0; |
| 1843 - myData->isStateDBCS = TRUE; |
| 1844 - continue; |
| 1845 - } |
| 1846 - else{ |
| 1847 - break; |
| 1848 - } |
| 1849 - |
| 1850 - |
| 1851 case UCNV_CLOSE_BRACE: |
| 1852 - if(args->converter->mode == UCNV_TILDE){ |
| 1853 - args->converter->mode=0; |
| 1854 - myData->isStateDBCS = FALSE; |
| 1855 - continue; |
| 1856 - } |
| 1857 - else{ |
| 1858 - break; |
| 1859 + myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE); |
| 1860 + if (myData->isEmptySegment) { |
| 1861 + myData->isEmptySegment = FALSE; /* we are handling it,
reset to avoid future spurious errors */ |
| 1862 + *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 1863 + args->converter->toUCallbackReason = UCNV_IRREGULAR; |
| 1864 + args->converter->toUBytes[0] = UCNV_TILDE; |
| 1865 + args->converter->toUBytes[1] = mySourceChar; |
| 1866 + args->converter->toULength = 2; |
| 1867 + args->target = myTarget; |
| 1868 + args->source = mySource; |
| 1869 + return; |
| 1870 } |
| 1871 - |
| 1872 + myData->isEmptySegment = TRUE; |
| 1873 + continue; |
| 1874 default: |
| 1875 /* if the first byte is equal to TILDE and the trail byte |
| 1876 * is not a valid byte then it is an error condition |
| 1877 */ |
| 1878 - if(args->converter->mode == UCNV_TILDE){ |
| 1879 - args->converter->mode=0; |
| 1880 - mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySo
urceChar & 0x00ff)+0x80)); |
| 1881 - goto SAVE_STATE; |
| 1882 - } |
| 1883 - |
| 1884 - break; |
| 1885 - |
| 1886 - } |
| 1887 - |
| 1888 - if(myData->isStateDBCS){ |
| 1889 + /* |
| 1890 + * Ticket 5691: consistent illegal sequences: |
| 1891 + * - We include at least the first byte in the illegal sequ
ence. |
| 1892 + * - If any of the non-initial bytes could be the start of
a character, |
| 1893 + * we stop the illegal sequence before the first one of t
hose. |
| 1894 + */ |
| 1895 + myData->isEmptySegment = FALSE; /* different error here, re
set this to avoid spurious future error */ |
| 1896 + *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 1897 + args->converter->toUBytes[0] = UCNV_TILDE; |
| 1898 + if( myData->isStateDBCS ? |
| 1899 + (0x21 <= mySourceChar && mySourceChar <= 0x7e) : |
| 1900 + mySourceChar <= 0x7f |
| 1901 + ) { |
| 1902 + /* The current byte could be the start of a character:
Back it out. */ |
| 1903 + args->converter->toULength = 1; |
| 1904 + --mySource; |
| 1905 + } else { |
| 1906 + /* Include the current byte in the illegal sequence. */ |
| 1907 + args->converter->toUBytes[1] = mySourceChar; |
| 1908 + args->converter->toULength = 2; |
| 1909 + } |
| 1910 + args->target = myTarget; |
| 1911 + args->source = mySource; |
| 1912 + return; |
| 1913 + } |
| 1914 + } else if(myData->isStateDBCS) { |
| 1915 if(args->converter->toUnicodeStatus == 0x00){ |
| 1916 - args->converter->toUnicodeStatus = (UChar) mySourceChar; |
| 1917 + /* lead byte */ |
| 1918 + if(mySourceChar == UCNV_TILDE) { |
| 1919 + args->converter->mode = UCNV_TILDE; |
| 1920 + } else { |
| 1921 + /* add another bit to distinguish a 0 byte from not hav
ing seen a lead byte */ |
| 1922 + args->converter->toUnicodeStatus = (uint32_t) (mySource
Char | 0x100); |
| 1923 + myData->isEmptySegment = FALSE; /* the segment has some
thing, either valid or will produce a different error, so reset this */ |
| 1924 + } |
| 1925 continue; |
| 1926 } |
| 1927 else{ |
| 1928 - tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80)
; |
| 1929 - tempBuf[1] = (char) (mySourceChar+0x80); |
| 1930 - mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x
80) << 8) | ((mySourceChar & 0x00ff)+0x80)); |
| 1931 + /* trail byte */ |
| 1932 + int leadIsOk, trailIsOk; |
| 1933 + uint32_t leadByte = args->converter->toUnicodeStatus & 0xff
; |
| 1934 + targetUniChar = 0xffff; |
| 1935 + /* |
| 1936 + * Ticket 5691: consistent illegal sequences: |
| 1937 + * - We include at least the first byte in the illegal sequ
ence. |
| 1938 + * - If any of the non-initial bytes could be the start of
a character, |
| 1939 + * we stop the illegal sequence before the first one of t
hose. |
| 1940 + * |
| 1941 + * In HZ DBCS, if the second byte is in the 21..7e range, |
| 1942 + * we report only the first byte as the illegal sequence. |
| 1943 + * Otherwise we convert or report the pair of bytes. |
| 1944 + */ |
| 1945 + leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21); |
| 1946 + trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21)
; |
| 1947 + if (leadIsOk && trailIsOk) { |
| 1948 + tempBuf[0] = (char) (leadByte+0x80) ; |
| 1949 + tempBuf[1] = (char) (mySourceChar+0x80); |
| 1950 + targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbC
onverter->sharedData, |
| 1951 + tempBuf, 2, args->converter->useFallback); |
| 1952 + mySourceChar= (leadByte << 8) | mySourceChar; |
| 1953 + } else if (trailIsOk) { |
| 1954 + /* report a single illegal byte and continue with the f
ollowing DBCS starter byte */ |
| 1955 + --mySource; |
| 1956 + mySourceChar = (int32_t)leadByte; |
| 1957 + } else { |
| 1958 + /* report a pair of illegal bytes if the second byte is
not a DBCS starter */ |
| 1959 + /* add another bit so that the code below writes 2 byte
s in case of error */ |
| 1960 + mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar; |
| 1961 + } |
| 1962 args->converter->toUnicodeStatus =0x00; |
| 1963 - targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConve
rter->sharedData, |
| 1964 - tempBuf, 2, args->converter->useFallback); |
| 1965 } |
| 1966 } |
| 1967 else{ |
| 1968 - if(args->converter->fromUnicodeStatus == 0x00){ |
| 1969 - targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConve
rter->sharedData, |
| 1970 - mySource - 1, 1, args->converter->useFallback); |
| 1971 - } |
| 1972 - else{ |
| 1973 - goto SAVE_STATE; |
| 1974 + if(mySourceChar == UCNV_TILDE) { |
| 1975 + args->converter->mode = UCNV_TILDE; |
| 1976 + continue; |
| 1977 + } else if(mySourceChar <= 0x7f) { |
| 1978 + targetUniChar = (UChar)mySourceChar; /* ASCII */ |
| 1979 + myData->isEmptySegment = FALSE; /* the segment has somethin
g valid */ |
| 1980 + } else { |
| 1981 + targetUniChar = 0xffff; |
| 1982 + myData->isEmptySegment = FALSE; /* different error here, re
set this to avoid spurious future error */ |
| 1983 } |
| 1984 - |
| 1985 } |
| 1986 if(targetUniChar < 0xfffe){ |
| 1987 if(args->offsets) { |
| 1988 @@ -248,26 +287,17 @@ |
| 1989 |
| 1990 *(myTarget++)=(UChar)targetUniChar; |
| 1991 } |
| 1992 - else if(targetUniChar>=0xfffe){ |
| 1993 -SAVE_STATE: |
| 1994 + else /* targetUniChar>=0xfffe */ { |
| 1995 if(targetUniChar == 0xfffe){ |
| 1996 *err = U_INVALID_CHAR_FOUND; |
| 1997 } |
| 1998 else{ |
| 1999 *err = U_ILLEGAL_CHAR_FOUND; |
| 2000 } |
| 2001 - if(myData->isStateDBCS){ |
| 2002 - /* this should never occur since isStateDBCS is set to true
|
| 2003 - * only after tempBuf[0] and tempBuf[1] |
| 2004 - * are set to the input .. just to please BEAM |
| 2005 - */ |
| 2006 - if(tempBuf[0]==0 || tempBuf[1]==0){ |
| 2007 - *err = U_INTERNAL_PROGRAM_ERROR; |
| 2008 - }else{ |
| 2009 - args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x8
0); |
| 2010 - args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x8
0); |
| 2011 - args->converter->toULength=2; |
| 2012 - } |
| 2013 + if(mySourceChar > 0xff){ |
| 2014 + args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8)
; |
| 2015 + args->converter->toUBytes[1] = (uint8_t)mySourceChar; |
| 2016 + args->converter->toULength=2; |
| 2017 } |
| 2018 else{ |
| 2019 args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
| 2020 @@ -328,16 +358,21 @@ |
| 2021 escSeq = TILDE_ESCAPE; |
| 2022 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,e
rr,len,mySourceIndex); |
| 2023 continue; |
| 2024 - } |
| 2025 - else{ |
| 2026 + } else if(mySourceChar <= 0x7f) { |
| 2027 + length = 1; |
| 2028 + targetUniChar = mySourceChar; |
| 2029 + } else { |
| 2030 length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->shar
edData, |
| 2031 mySourceChar,&targetUniChar,args->converter->useFallback); |
| 2032 - |
| 2033 - } |
| 2034 - /* only DBCS or SBCS characters are expected*/ |
| 2035 - /* DB haracters with high bit set to 1 are expected */ |
| 2036 - if(length > 2 || length==0 ||(((targetUniChar & 0x8080) != 0x8080)&
& length==2)){ |
| 2037 - targetUniChar= missingCharMarker; |
| 2038 + /* we can only use lead bytes 21..7D and trail bytes 21..7E */ |
| 2039 + if( length == 2 && |
| 2040 + (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) && |
| 2041 + (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1) |
| 2042 + ) { |
| 2043 + targetUniChar -= 0x8080; |
| 2044 + } else { |
| 2045 + targetUniChar = missingCharMarker; |
| 2046 + } |
| 2047 } |
| 2048 if (targetUniChar != missingCharMarker){ |
| 2049 myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)
(targetUniChar>0x00FF); |
| 2050 @@ -360,22 +395,22 @@ |
| 2051 |
| 2052 if(isTargetUCharDBCS){ |
| 2053 if( myTargetIndex <targetLength){ |
| 2054 - myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8)
-0x80); |
| 2055 + myTarget[myTargetIndex++] =(char) (targetUniChar >> 8); |
| 2056 if(offsets){ |
| 2057 *(offsets++) = mySourceIndex-1; |
| 2058 } |
| 2059 if(myTargetIndex < targetLength){ |
| 2060 - myTarget[myTargetIndex++] =(char) ((targetUniChar &
0x00FF) -0x80); |
| 2061 + myTarget[myTargetIndex++] =(char) targetUniChar; |
| 2062 if(offsets){ |
| 2063 *(offsets++) = mySourceIndex-1; |
| 2064 } |
| 2065 }else{ |
| 2066 - args->converter->charErrorBuffer[args->converter->c
harErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80); |
| 2067 + args->converter->charErrorBuffer[args->converter->c
harErrorBufferLength++] = (char) targetUniChar; |
| 2068 *err = U_BUFFER_OVERFLOW_ERROR; |
| 2069 } |
| 2070 }else{ |
| 2071 - args->converter->charErrorBuffer[args->converter->charE
rrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80); |
| 2072 - args->converter->charErrorBuffer[args->converter->charE
rrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80); |
| 2073 + args->converter->charErrorBuffer[args->converter->charE
rrorBufferLength++] =(char) (targetUniChar >> 8); |
| 2074 + args->converter->charErrorBuffer[args->converter->charE
rrorBufferLength++] = (char) targetUniChar; |
| 2075 *err = U_BUFFER_OVERFLOW_ERROR; |
| 2076 } |
| 2077 |
| 2078 @@ -524,14 +559,14 @@ |
| 2079 const USetAdder *sa, |
| 2080 UConverterUnicodeSet which, |
| 2081 UErrorCode *pErrorCode) { |
| 2082 - /* the tilde '~' is hardcoded in the converter */ |
| 2083 - sa->add(sa->set, 0x7e); |
| 2084 + /* HZ converts all of ASCII */ |
| 2085 + sa->addRange(sa->set, 0, 0x7f); |
| 2086 |
| 2087 /* add all of the code points that the sub-converter handles */ |
| 2088 - ((UConverterDataHZ*)cnv->extraInfo)-> |
| 2089 - gbConverter->sharedData->impl-> |
| 2090 - getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, |
| 2091 - sa, which, pErrorCode); |
| 2092 + ucnv_MBCSGetFilteredUnicodeSetForUnicode( |
| 2093 + ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData, |
| 2094 + sa, which, UCNV_SET_FILTER_HZ, |
| 2095 + pErrorCode); |
| 2096 } |
| 2097 |
| 2098 static const UConverterImpl _HZImpl={ |
| 2099 --- r22777/source/common/ucnv_set.c 2005-06-03 13:17:54.000000000 -0700 |
| 2100 +++ chrome.canonical/source/common/ucnv_set.c 2009-03-23 12:30:09.917043000 -0
700 |
| 308 @@ -1,7 +1,7 @@ | 2101 @@ -1,7 +1,7 @@ |
| 309 /* | 2102 /* |
| 310 ****************************************************************************** | 2103 ******************************************************************************* |
| 311 * | 2104 * |
| 312 -* Copyright (C) 2000-2007, International Business Machines | 2105 -* Copyright (C) 2003-2005, International Business Machines |
| 313 +* Copyright (C) 2000-2008, International Business Machines | 2106 +* Copyright (C) 2003-2007, International Business Machines |
| 314 * Corporation and others. All Rights Reserved. | 2107 * Corporation and others. All Rights Reserved. |
| 315 * | 2108 * |
| 316 ****************************************************************************** | 2109 ******************************************************************************* |
| 317 @@ -1739,6 +1739,65 @@ | 2110 @@ -52,7 +52,8 @@ |
| 318 pArgs->offsets=offsets; | 2111 uset_add, |
| 319 } | 2112 uset_addRange, |
| 320 | 2113 uset_addString, |
| 321 +static UBool | 2114 - uset_remove |
| 322 +hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) { | 2115 + uset_remove, |
| 323 + const int32_t *row=stateTable[state]; | 2116 + uset_removeRange |
| 324 + int32_t b, entry; | 2117 }; |
| 325 + /* First test for final entries in this state for some commonly valid byte
values. */ | 2118 sa.set=setFillIn; |
| 326 + entry=row[0xa1]; | 2119 |
| 327 + if( !MBCS_ENTRY_IS_TRANSITION(entry) && | 2120 --- r22777/source/common/ucnv_bld.c» 2007-08-24 02:44:10.880047000 -0700 |
| 328 + MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL | 2121 +++ chrome.canonical/source/common/ucnv_bld.c» 2009-03-23 12:40:10.653507000 -0
700 |
| 2122 @@ -932,6 +932,7 @@ |
| 2123 myUConverter->subCharLen = mySharedConverterData->staticData->subCharLen; |
| 2124 myUConverter->subChars = (uint8_t *)myUConverter->subUChars; |
| 2125 uprv_memcpy(myUConverter->subChars, mySharedConverterData->staticData->subC
har, myUConverter->subCharLen); |
| 2126 + myUConverter->toUCallbackReason = UCNV_ILLEGAL; /* default reason to invoke
(*fromCharErrorBehaviour) */ |
| 2127 |
| 2128 if(mySharedConverterData->impl->open != NULL) { |
| 2129 mySharedConverterData->impl->open(myUConverter, realName, locale, optio
ns, err); |
| 2130 --- r22777/source/common/ucnv_bld.h» 2006-07-05 16:08:50.000000000 -0700 |
| 2131 +++ chrome.canonical/source/common/ucnv_bld.h» 2009-03-23 12:40:10.680507000 -0
700 |
| 2132 @@ -1,6 +1,6 @@ |
| 2133 /* |
| 2134 ********************************************************************** |
| 2135 -* Copyright (C) 1999-2006, International Business Machines |
| 2136 +* Copyright (C) 1999-2006,2008 International Business Machines |
| 2137 * Corporation and others. All Rights Reserved. |
| 2138 ********************************************************************** |
| 2139 * |
| 2140 @@ -226,6 +226,9 @@ |
| 2141 char preToU[UCNV_EXT_MAX_BYTES]; |
| 2142 int8_t preFromULength, preToULength; /* negative: replay */ |
| 2143 int8_t preToUFirstLength; /* length of first character */ |
| 2144 + |
| 2145 + /* new fields for ICU 4.0 */ |
| 2146 + UConverterCallbackReason toUCallbackReason; /* (*fromCharErrorBehaviour) re
ason, set when error is detected */ |
| 2147 }; |
| 2148 |
| 2149 U_CDECL_END /* end of UConverter */ |
| 2150 --- r22777/source/common/ucnv_ext.c» 2007-08-22 22:46:49.525855000 -0700 |
| 2151 +++ chrome.canonical/source/common/ucnv_ext.c» 2009-03-23 12:30:33.135573000 -0
700 |
| 2152 @@ -946,7 +946,7 @@ |
| 2153 ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData, |
| 2154 const int32_t *cx, |
| 2155 const USetAdder *sa, |
| 2156 - UConverterUnicodeSet which, |
| 2157 + UBool useFallback, |
| 2158 int32_t minLength, |
| 2159 UChar32 c, |
| 2160 UChar s[UCNV_EXT_MAX_UCHARS], int32_t length, |
| 2161 @@ -966,7 +966,7 @@ |
| 2162 value=*fromUSectionValues++; |
| 2163 |
| 2164 if( value!=0 && |
| 2165 - UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) && |
| 2166 + (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) && |
| 2167 UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength |
| 2168 ) { |
| 2169 if(c>=0) { |
| 2170 @@ -987,12 +987,14 @@ |
| 2171 /* no mapping, do nothing */ |
| 2172 } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { |
| 2173 ucnv_extGetUnicodeSetString( |
| 2174 - sharedData, cx, sa, which, minLength, |
| 2175 + sharedData, cx, sa, useFallback, minLength, |
| 2176 U_SENTINEL, s, length+1, |
| 2177 (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value), |
| 2178 pErrorCode); |
| 2179 - } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESER
VED_MASK))== |
| 2180 - UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) && |
| 2181 + } else if((useFallback ? |
| 2182 + (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 : |
| 2183 + ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_R
ESERVED_MASK))== |
| 2184 + UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) && |
| 2185 UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength |
| 2186 ) { |
| 2187 sa->addString(sa->set, s, length+1); |
| 2188 @@ -1004,6 +1006,7 @@ |
| 2189 ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData, |
| 2190 const USetAdder *sa, |
| 2191 UConverterUnicodeSet which, |
| 2192 + UConverterSetFilter filter, |
| 2193 UErrorCode *pErrorCode) { |
| 2194 const int32_t *cx; |
| 2195 const uint16_t *stage12, *stage3, *ps2, *ps3; |
| 2196 @@ -1011,6 +1014,7 @@ |
| 2197 |
| 2198 uint32_t value; |
| 2199 int32_t st1, stage1Length, st2, st3, minLength; |
| 2200 + UBool useFallback; |
| 2201 |
| 2202 UChar s[UCNV_EXT_MAX_UCHARS]; |
| 2203 UChar32 c; |
| 2204 @@ -1027,10 +1031,16 @@ |
| 2205 |
| 2206 stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]; |
| 2207 |
| 2208 + useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET); |
| 2209 + |
| 2210 /* enumerate the from-Unicode trie table */ |
| 2211 c=0; /* keep track of the current code point while enumerating */ |
| 2212 |
| 2213 - if(sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) { |
| 2214 + if(filter==UCNV_SET_FILTER_2022_CN) { |
| 2215 + minLength=3; |
| 2216 + } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY || |
| 2217 + filter!=UCNV_SET_FILTER_NONE |
| 329 + ) { | 2218 + ) { |
| 330 + return TRUE; | 2219 /* DBCS-only, ignore single-byte results */ |
| 331 + } | 2220 minLength=2; |
| 332 + entry=row[0x41]; | 2221 } else { |
| 333 + if( !MBCS_ENTRY_IS_TRANSITION(entry) && | 2222 @@ -1064,14 +1074,48 @@ |
| 334 + MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL | 2223 length=0; |
| 335 + ) { | 2224 U16_APPEND_UNSAFE(s, length, c); |
| 336 + return TRUE; | 2225 ucnv_extGetUnicodeSetString( |
| 337 + } | 2226 - sharedData, cx, sa, which, minLength, |
| 338 + /* Then test for final entries in this state. */ | 2227 + sharedData, cx, sa, useFallback, minLength, |
| 339 + for(b=0; b<=0xff; ++b) { | 2228 c, s, length, |
| 340 + entry=row[b]; | 2229 (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(valu
e), |
| 341 + if( !MBCS_ENTRY_IS_TRANSITION(entry) && | 2230 pErrorCode); |
| 342 + MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL | 2231 - } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_
EXT_FROM_U_RESERVED_MASK))== |
| 343 + ) { | 2232 - UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) && |
| 344 + return TRUE; | 2233 + } else if((useFallback ? |
| 345 + } | 2234 + (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0
: |
| 346 + } | 2235 + ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|U
CNV_EXT_FROM_U_RESERVED_MASK))== |
| 347 + /* Then recurse for transition entries. */ | 2236 + UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) && |
| 348 + for(b=0; b<=0xff; ++b) { | 2237 UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength |
| 349 + entry=row[b]; | 2238 ) { |
| 350 + if( MBCS_ENTRY_IS_TRANSITION(entry) && | 2239 + switch(filter) { |
| 351 + hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE
(entry)) | 2240 + case UCNV_SET_FILTER_2022_CN: |
| 352 + ) { | 2241 + if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UC
NV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) { |
| 353 + return TRUE; | 2242 + continue; |
| 354 + } | 2243 + } |
| 355 + } | 2244 + break; |
| 356 + return FALSE; | 2245 + case UCNV_SET_FILTER_SJIS: |
| 357 +} | 2246 + if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (v
alue=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) { |
| 358 + | 2247 + continue; |
| 359 +/* | 2248 + } |
| 360 + * Is byte b a single/lead byte in this state? | 2249 + break; |
| 361 + * Recurse for transition states, because here we don't want to say that | 2250 + case UCNV_SET_FILTER_GR94DBCS: |
| 362 + * b is a lead byte if all byte sequences that start with b are illegal. | 2251 + if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && |
| 363 + */ | 2252 + (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA
(value))-0xa1a1)<=(0xfefe - 0xa1a1) && |
| 364 +static UBool | 2253 + (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) { |
| 365 +isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnl
y, uint8_t b) { | 2254 + continue; |
| 366 + const int32_t *row=stateTable[state]; | 2255 + } |
| 367 + int32_t entry=row[b]; | 2256 + break; |
| 368 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */ | 2257 + case UCNV_SET_FILTER_HZ: |
| 369 + return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_ST
ATE(entry)); | 2258 + if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && |
| 370 + } else { | 2259 + (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA
(value))-0xa1a1)<=(0xfdfe - 0xa1a1) && |
| 371 + uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); | 2260 + (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) { |
| 372 + if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) { | 2261 + continue; |
| 373 + return FALSE; /* SI/SO are illegal for DBCS-only conversion */ | 2262 + } |
| 374 + } else { | 2263 + break; |
| 375 + return action!=MBCS_STATE_ILLEGAL; | 2264 + default: |
| 376 + } | 2265 + /* |
| 377 + } | 2266 + * UCNV_SET_FILTER_NONE, |
| 378 +} | 2267 + * or UCNV_SET_FILTER_DBCS_ONLY which is handle
d via minLength |
| 379 + | 2268 + */ |
| 380 U_CFUNC void | 2269 + break; |
| 381 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, | 2270 + } |
| 382 UErrorCode *pErrorCode) { | 2271 sa->add(sa->set, c); |
| 383 @@ -2094,6 +2153,34 @@ | 2272 } |
| 384 sourceIndex=nextSourceIndex; | 2273 } while((++c&0xf)!=0); |
| 385 } else if(U_FAILURE(*pErrorCode)) { | |
| 386 /* callback(illegal) */ | |
| 387 + if(byteIndex>1) { | |
| 388 + /* | |
| 389 + * Ticket 5691: consistent illegal sequences: | |
| 390 + * - We include at least the first byte in the illegal sequence
. | |
| 391 + * - If any of the non-initial bytes could be the start of a ch
aracter, | |
| 392 + * we stop the illegal sequence before the first one of those
. | |
| 393 + */ | |
| 394 + UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0
); | |
| 395 + int8_t i; | |
| 396 + for(i=1; | |
| 397 + i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnl
y, bytes[i]); | |
| 398 + ++i) {} | |
| 399 + if(i<byteIndex) { | |
| 400 + /* Back out some bytes. */ | |
| 401 + int8_t backOutDistance=byteIndex-i; | |
| 402 + int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t
*)pArgs->source); | |
| 403 + byteIndex=i; /* length of reported illegal byte sequence *
/ | |
| 404 + if(backOutDistance<=bytesFromThisBuffer) { | |
| 405 + source-=backOutDistance; | |
| 406 + } else { | |
| 407 + /* Back out bytes from the previous buffer: Need to rep
lay them. */ | |
| 408 + cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutD
istance); | |
| 409 + /* preToULength is negative! */ | |
| 410 + uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength); | |
| 411 + source=(const uint8_t *)pArgs->source; | |
| 412 + } | |
| 413 + } | |
| 414 + } | |
| 415 break; | |
| 416 } else /* unassigned sequences indicated with byteIndex>0 */ { | |
| 417 /* try an extension mapping */ | |
| 418 @@ -2104,7 +2191,7 @@ | |
| 419 &offsets, sourceIndex, | |
| 420 pArgs->flush, | |
| 421 pErrorCode); | |
| 422 - sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs
->source); | |
| 423 + sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArg
s->source); | |
| 424 | |
| 425 if(U_FAILURE(*pErrorCode)) { | |
| 426 /* not mappable or buffer overflow */ | |
| 427 @@ -2395,15 +2482,37 @@ | |
| 428 | |
| 429 if(c<0) { | |
| 430 if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source)
{ | |
| 431 - *pErrorCode=U_TRUNCATED_CHAR_FOUND; | |
| 432 - } | |
| 433 - if(U_FAILURE(*pErrorCode)) { | |
| 434 /* incomplete character byte sequence */ | |
| 435 uint8_t *bytes=cnv->toUBytes; | |
| 436 cnv->toULength=(int8_t)(source-lastSource); | |
| 437 do { | |
| 438 *bytes++=*lastSource++; | |
| 439 } while(lastSource<source); | |
| 440 + *pErrorCode=U_TRUNCATED_CHAR_FOUND; | |
| 441 + } else if(U_FAILURE(*pErrorCode)) { | |
| 442 + /* callback(illegal) */ | |
| 443 + /* | |
| 444 + * Ticket 5691: consistent illegal sequences: | |
| 445 + * - We include at least the first byte in the illegal sequence. | |
| 446 + * - If any of the non-initial bytes could be the start of a charac
ter, | |
| 447 + * we stop the illegal sequence before the first one of those. | |
| 448 + */ | |
| 449 + UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0); | |
| 450 + uint8_t *bytes=cnv->toUBytes; | |
| 451 + *bytes++=*lastSource++; /* first byte */ | |
| 452 + if(lastSource==source) { | |
| 453 + cnv->toULength=1; | |
| 454 + } else /* lastSource<source: multi-byte character */ { | |
| 455 + int8_t i; | |
| 456 + for(i=1; | |
| 457 + lastSource<source && !isSingleOrLead(stateTable, state, isD
BCSOnly, *lastSource); | |
| 458 + ++i | |
| 459 + ) { | |
| 460 + *bytes++=*lastSource++; | |
| 461 + } | |
| 462 + cnv->toULength=i; | |
| 463 + source=lastSource; | |
| 464 + } | |
| 465 } else { | |
| 466 /* no output because of empty input or only state changes */ | |
| 467 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
| 468 diff -ru trie.clean/source/test/cintltst/nccbtst.c chrome.canonical/source/test/
cintltst/nccbtst.c | |
| 469 --- trie.clean/source/test/cintltst/nccbtst.c 2007-09-19 09:45:00.986804000 -0
700 | |
| 470 +++ chrome.canonical/source/test/cintltst/nccbtst.c 2008-10-29 11:08:51.1023
76000 -0700 | |
| 471 @@ -1,6 +1,6 @@ | |
| 472 /******************************************************************** | |
| 473 * COPYRIGHT: | |
| 474 - * Copyright (c) 1997-2007, International Business Machines Corporation and | |
| 475 + * Copyright (c) 1997-2008, International Business Machines Corporation and | |
| 476 * others. All Rights Reserved. | |
| 477 ********************************************************************/ | |
| 478 /* | |
| 479 @@ -2530,13 +2530,13 @@ | |
| 480 | |
| 481 | |
| 482 static const uint8_t text943[] = { | |
| 483 - 0x82, 0xa9, 0x82, 0x20, /*0xc8,*/ 0x61, 0x8a, 0xbf, 0x8e, 0x9a }; | |
| 484 - static const UChar toUnicode943sub[] = { 0x304b, 0xfffd, /*0xff88,*/ 0x0061
, 0x6f22, 0x5b57}; | |
| 485 - static const UChar toUnicode943skip[]= { 0x304b, /*0xff88,*/ 0x0061, 0x6f22
, 0x5b57}; | |
| 486 + 0x82, 0xa9, 0x82, 0x20, 0x61, 0x8a, 0xbf, 0x8e, 0x9a }; | |
| 487 + static const UChar toUnicode943sub[] = { 0x304b, 0x1a, 0x20, 0x0061, 0x6f22
, 0x5b57 }; | |
| 488 + static const UChar toUnicode943skip[]= { 0x304b, 0x20, 0x0061, 0x6f22, 0x5
b57 }; | |
| 489 static const UChar toUnicode943stop[]= { 0x304b}; | |
| 490 | |
| 491 - static const int32_t fromIBM943Offssub[] = {0, 2, 4, 5, 7}; | |
| 492 - static const int32_t fromIBM943Offsskip[] = { 0, 4, 5, 7}; | |
| 493 + static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 7 }; | |
| 494 + static const int32_t fromIBM943Offsskip[] = { 0, 3, 4, 5, 7 }; | |
| 495 static const int32_t fromIBM943Offsstop[] = { 0}; | |
| 496 | |
| 497 gInBufferSize = inputsize; | |
| 498 @@ -2570,9 +2570,9 @@ | |
| 499 { | |
| 500 static const uint8_t sampleText[] = { | |
| 501 0x82, 0xa9, 0x61, 0x62, 0x63 , 0x82, | |
| 502 - 0xff, /*0x82, 0xa9,*/ 0x32, 0x33}; | |
| 503 - static const UChar toUnicode943sub[] = {0x304b, 0x0061, 0x0062, 0x0063, 0x
fffd,/*0x304b,*/ 0x0032, 0x0033}; | |
| 504 - static const int32_t fromIBM943Offssub[] = {0, 2, 3, 4, 5, 7, 8}; | |
| 505 + 0xff, 0x32, 0x33}; | |
| 506 + static const UChar toUnicode943sub[] = { 0x304b, 0x0061, 0x0062, 0x0063, 0x
1a, 0x1a, 0x0032, 0x0033 }; | |
| 507 + static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 6, 7, 8 }; | |
| 508 /*checking illegal value for ibm-943 with substitute*/ | |
| 509 gInBufferSize = inputsize; | |
| 510 gOutBufferSize = outputsize; | |
| 511 diff -ru trie.clean/source/test/cintltst/nucnvtst.c chrome.canonical/source/test
/cintltst/nucnvtst.c | |
| 512 --- trie.clean/source/test/cintltst/nucnvtst.c 2007-10-11 14:52:29.172174000 -0
700 | |
| 513 +++ chrome.canonical/source/test/cintltst/nucnvtst.c 2008-10-29 11:08:51.1942
86000 -0700 | |
| 514 @@ -2606,7 +2606,7 @@ | |
| 515 TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceL
imit <= source"); | |
| 516 /*Test for the condition where there is an invalid character*/ | |
| 517 { | |
| 518 - static const uint8_t source2[]={0xa1, 0x01}; | |
| 519 + static const uint8_t source2[]={0xa1, 0x80}; | |
| 520 TestNextUCharError(cnv, (const char*)source2, (const char*)source2+size
of(source2), U_ZERO_ERROR, "an invalid character"); | |
| 521 } | |
| 522 /*Test for the condition where we have a truncated char*/ | |
| 523 @@ -3899,11 +3899,11 @@ | |
| 524 TestISO_2022_KR() { | |
| 525 /* test input */ | |
| 526 static const uint16_t in[]={ | |
| 527 - 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x0
00A,0x000D | |
| 528 - ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xA
C02,0xAC04 | |
| 529 + 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x0
00D | |
| 530 + ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xA
C04 | |
| 531 ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0
028,0x0029 | |
| 532 ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x5
3CA,0x53CB | |
| 533 - ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x5
3E1,0x53E2 | |
| 534 + ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x5
3E2 | |
| 535 ,0x53E3,0x53E4,0x000A,0x000D}; | |
| 536 const UChar* uSource; | |
| 537 const UChar* uSourceLimit; | |
| 538 diff -ru trie.clean/source/test/testdata/conversion.txt chrome.canonical/source/
test/testdata/conversion.txt | |
| 539 --- trie.clean/source/test/testdata/conversion.txt 2007-10-11 14:31:32.1965
32000 -0700 | |
| 540 +++ chrome.canonical/source/test/testdata/conversion.txt 2008-10-29 11:37
:09.419716000 -0700 | |
| 541 @@ -48,13 +48,135 @@ | |
| 542 toUnicode { | |
| 543 Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks",
"errorCode", "callback", "invalidChars" } | |
| 544 Cases { | |
| 545 + // Test ticket 5691: consistent illegal sequences | |
| 546 + // The following test cases are for illegal character byte sequences. | |
| 547 + // | |
| 548 + // Unfortunately, we cannot use the Shift-JIS examples from the ticket | |
| 549 + // comments because our Shift-JIS table is Windows-compatible and | |
| 550 + // therefore has no illegal single bytes. Same for GBK. | |
| 551 + // Instead, we use the stricter GB 18030 also for 2-byte examples. | |
| 552 + // The byte sequences are generally slightly different from the ticket | |
| 553 + // comment, simply using assigned characters rather than just | |
| 554 + // theoretically valid sequences. | |
| 555 + { | |
| 556 + "gb18030", | |
| 557 + :bin{ 618140813c81ff7a }, | |
| 558 + "a\u4e02\\x81<\\x81\\xFFz", | |
| 559 + :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 }, | |
| 560 + :int{1}, :int{0}, "", "&C", :bin{""} | |
| 561 + } | |
| 562 + { | |
| 563 + "EUC-JP", | |
| 564 + :bin{ 618fb0a98fb03c8f3cb0a97a }, | |
| 565 + "a\u4e28\\x8F\\xB0<\\x8F<\u9022z", | |
| 566 + :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 }, | |
| 567 + :int{1}, :int{0}, "", "&C", :bin{""} | |
| 568 + } | |
| 569 + { | |
| 570 + "gb18030", | |
| 571 + :bin{ 618130fc318130fc8181303c3e813cfc817a }, | |
| 572 + "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z", | |
| 573 + :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 }, | |
| 574 + :int{1}, :int{0}, "", "&C", :bin{""} | |
| 575 + } | |
| 576 + { | |
| 577 + "UTF-8", | |
| 578 + :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a }, | |
| 579 + "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1
\\xFF<>z", | |
| 580 + :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,1
2,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,
21 }, | |
| 581 + :int{1}, :int{0}, "", "&C", :bin{""} | |
| 582 + } | |
| 583 + { | |
| 584 + "ISO-2022-JP", | |
| 585 + :bin{ 1b24424141af4142affe41431b2842 }, | |
| 586 + "\u758f\\xAF\u758e\\xAF\\xFE\u790e", | |
| 587 + :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 }, | |
| 588 + :int{1}, :int{0}, "", "&C", :bin{""} | |
| 589 + } | |
| 590 + { | |
| 591 + "ibm-25546", | |
| 592 + :bin{ 411b242943420e4141af4142affe41430f5a }, | |
| 593 + "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ", | |
| 594 + :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, | |
| 595 + :int{1}, :int{0}, "", "&C", :bin{""} | |
| 596 + } | |
| 597 + { | |
| 598 + "ISO-2022-KR", | |
| 599 + :bin{ 411b242943420e4141af4142affe41430f5a }, | |
| 600 + "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ", | |
| 601 + :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, | |
| 602 + :int{1}, :int{0}, "", "&C", :bin{""} | |
| 603 + } | |
| 604 + { | |
| 605 + "ISO-2022-CN", | |
| 606 + :bin{ 411b242941420e4141af4142affe41430f5a }, | |
| 607 + "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z", | |
| 608 + :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, | |
| 609 + :int{1}, :int{0}, "", "&C", :bin{""} | |
| 610 + } | |
| 611 + { | |
| 612 + "HZ", | |
| 613 + :bin{ 417e7b4141af4142affe41437e7d5a }, | |
| 614 + "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z", | |
| 615 + :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 }, | |
| 616 + :int{1}, :int{0}, "", "&C", :bin{""} | |
| 617 + } | |
| 618 + // Test ticket 5691: consistent illegal sequences | |
| 619 + // The following test cases are for illegal escape/designator/shift seq
uences. | |
| 620 + // | |
| 621 + // ISO-2022-JP and -CN with illegal escape sequences. | |
| 622 + { | |
| 623 + "ISO-2022-JP", | |
| 624 + :bin{ 611b24201b244241411b283f1b28427a }, | |
| 625 + "a\\x1B$ \u758f\\x1B\u2538z", | |
| 626 + :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 }, | |
| 627 + :int{1}, :int{0}, "", "&C", :bin{""} | |
| 628 + } | |
| 629 + { | |
| 630 + "ISO-2022-CN", | |
| 631 + :bin{ 611b2429201b2429410e41410f7a }, | |
| 632 + "a\\x1B$) \u4eaez", | |
| 633 + :intvector{ 0,1,1,1,1,2,3,4,10,13 }, | |
| 634 + :int{1}, :int{0}, "", "&C", :bin{""} | |
| 635 + } | |
| 636 + // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS
3 sequences. | |
| 637 + // The first ESC N comes before its designator sequence, the last seque
nce is ESC+space. | |
| 638 + { | |
| 639 + "ISO-2022-JP-2", | |
| 640 + :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e }, | |
| 641 + "N\\x1BNNN\xceN\\x1B N", | |
| 642 + :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 }, | |
| 643 + :int{1}, :int{0}, "", "&C", :bin{""} | |
| 644 + } | |
| 645 + { | |
| 646 + "ISO-2022-CN-EXT", | |
| 647 + :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e }, | |
| 648 + "N\\x1BNNN\u8f0eN\\x1B N", | |
| 649 + :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 }, | |
| 650 + :int{1}, :int{0}, "", "&C", :bin{""} | |
| 651 + } | |
| 652 + { | |
| 653 + "ISO-2022-CN-EXT", | |
| 654 + :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f }, | |
| 655 + "O\\x1BOOO\u492bO\\x1B O", | |
| 656 + :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 }, | |
| 657 + :int{1}, :int{0}, "", "&C", :bin{""} | |
| 658 + } | |
| 659 + // Test ticket 5691: Example from Peter Edberg. | |
| 660 + { | |
| 661 + "ISO-2022-JP", | |
| 662 + :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 }, | |
| 663 + "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda", | |
| 664 + :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 }, | |
| 665 + :int{1}, :int{0}, "", "?", :bin{""} | |
| 666 + } | |
| 667 // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201
and | |
| 668 // using the Shift-JIS table for JIS X 0208 (ticket #5797) | |
| 669 { | |
| 670 "ISO-2022-JP", | |
| 671 :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b284
2 }, | |
| 672 - "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6
f3e", | |
| 673 - :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 }, | |
| 674 + "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\uf
ffd\u6f3e", | |
| 675 + :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 }, | |
| 676 :int{1}, :int{1}, "", "?", :bin{""} | |
| 677 } | |
| 678 // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBM
PWithOffsets() | |
| 679 @@ -303,7 +425,7 @@ | |
| 680 { | |
| 681 "ISO-2022-CN-EXT", | |
| 682 :bin{ 411b4e2121 }, "\x41", :intvector{ 0 }, | |
| 683 - :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e } | |
| 684 + :int{1}, :int{1}, "illesc", ".", :bin{ 1b } | |
| 685 } | |
| 686 // G3 designator: recognized, but not supported for -CN (only for -CN-E
XT) | |
| 687 { | |
| OLD | NEW |