third_party/icu38/uconv.security.patch - Issue 52030: Apply ICU patches for ICU tickets 6175 (ISO-2022 and ...

Side by Side Diff: third_party/icu38/uconv.security.patch

Issue 52030: Apply ICU patches for ICU tickets 6175 (ISO-2022 and ... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 11 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 diff -ru trie.clean/source/common/ucnv2022.c chrome.canonical/source/common/ucnv2022.c	1 --- r22777/source/test/cintltst/nucnvtst.c 2007-10-11 14:52:29.172174000 -0700

2 --- trie.clean/source/common/ucnv2022.c 2007-11-07 17:39:05.057870000 -0800	2 +++ chrome.canonical/source/test/cintltst/nucnvtst.c 2009-03-23 12:42:01.106292000 -0700

3 +++ chrome.canonical/source/common/ucnv2022.c 2008-10-29 12:52:22.517453000 -0700	3 @@ -17,6 +17,7 @@

4 @@ -752,6 +752,7 @@	4 #include "unicode/uloc.h"

	5 #include "unicode/ucnv.h"

	6 #include "unicode/ucnv_err.h"

	7 +#include "unicode/ucnv_cb.h"

	8 #include "cintltst.h"

	9 #include "unicode/utypes.h"

	10 #include "unicode/ustring.h"

	11 @@ -81,6 +82,7 @@

	12 static void TestJitterbug2411(void);

	13 static void TestJB5275(void);

	14 static void TestJB5275_1(void);

	15 +static void TestJitterbug6175(void);

	16 #endif

	17

	18 static void TestRoundTrippingAllUTF(void);

	19 @@ -297,6 +299,7 @@

	20 #if !UCONFIG_NO_LEGACY_CONVERSION

	21 addTest(root, &TestJitterbug2346, "tsconv/nucnvtst/TestJitterbug2346");

	22 addTest(root, &TestJitterbug2411, "tsconv/nucnvtst/TestJitterbug2411");

	23 + addTest(root, &TestJitterbug6175, "tsconv/nucnvtst/TestJitterbug6175");

	24 #endif

	25

	26 }

	27 @@ -2606,7 +2609,7 @@

	28 TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");

	29 /*Test for the condition where there is an invalid character*/

	30 {

	31 - static const uint8_t source2[]={0xa1, 0x01};

	32 + static const uint8_t source2[]={0xa1, 0x80};

	33 TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_ZERO_ERROR, "an invalid character");

	34 }

	35 /*Test for the condition where we have a truncated char*/

	36 @@ -3899,11 +3902,11 @@

	37 TestISO_2022_KR() {

	38 /* test input */

	39 static const uint16_t in[]={

	40 - 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x000A,0x000D

	41 - ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC02,0xAC04

	42 + 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x000D

	43 + ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC04

	44 ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0028,0x0029

	45 ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x53CA,0x53CB

	46 - ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x53E1,0x53E2

	47 + ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x53E2

	48 ,0x53E3,0x53E4,0x000A,0x000D};

	49 const UChar* uSource;

	50 const UChar* uSourceLimit;

	51 @@ -4456,6 +4459,70 @@

	52 free(offsets);

	53 }

	54

	55 +/* Tests for empty segments in ISO-2022-JP/KR/CN, HZ, check that UConverterCallbackReason is UCNV_IRREGULAR */

	56 +typedef struct {

	57 + const char * converterName;

	58 + const char * inputText;

	59 + int inputTextLength;

	60 +} EmptySegmentTest;

	61 +

	62 +/* Callback for TestJitterbug6175, should only get called for empty segment errors */

	63 +static void UCNV_TO_U_CALLBACK_EMPTYSEGMENT( const void *context, UConverterToUnicodeArgs *toArgs, const char* codeUnits,

	64 + int32_t length, UConverterCallbackReason reason, UErrorCode * err ) {

	65 + if (reason > UCNV_IRREGULAR) {

	66 + return;

	67 + }

	68 + if (reason != UCNV_IRREGULAR) {

	69 + log_err("toUnicode callback invoked for empty segment but reason is not UCNV_IRREGULAR\n");

	70 + }

	71 + /* Standard stuff below from UCNV_TO_U_CALLBACK_SUBSTITUTE */

	72 + *err = U_ZERO_ERROR;

	73 + ucnv_cbToUWriteSub(toArgs,0,err);

	74 +}

	75 +

	76 +enum { kEmptySegmentToUCharsMax = 64 };

	77 +static void TestJitterbug6175(void) {

	78 + static const char iso2022jp_a[] = { 0x61, 0x62, 0x1B,0x24,0x42, 0x1B,0x28, 0x42, 0x63, 0x64, 0x0D, 0x0A };

	79 + static const char iso2022kr_a[] = { 0x1B,0x24,0x29,0x43, 0x61, 0x0E, 0x0F, 0x62, 0x0D, 0x0A };

	80 + static const char iso2022cn_a[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x0F, 0x1B,0x24,0x2A,0x48, 0x1B,0x4E, 0x6A,0x65, 0x63, 0x0D, 0x0A };

	81 + static const char iso2022cn_b[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x1B,0x24,0x29,0x47, 0x68,0x64, 0x0F, 0x63, 0x0D, 0x0A };

	82 + static const char hzGB2312_a[] = { 0x61, 0x62, 0x7E,0x7B, 0x7E,0x7D, 0x63 , 0x64 };

	83 + static const EmptySegmentTest emptySegmentTests[] = {

	84 + /* converterName inputText inputTextLength */

	85 + { "ISO-2022-JP", iso2022jp_a, sizeof(iso2022jp_a) },

	86 + { "ISO-2022-KR", iso2022kr_a, sizeof(iso2022kr_a) },

	87 + { "ISO-2022-CN", iso2022cn_a, sizeof(iso2022cn_a) },

	88 + { "ISO-2022-CN", iso2022cn_b, sizeof(iso2022cn_b) },

	89 + { "HZ-GB-2312", hzGB2312_a, sizeof(hzGB2312_a) },

	90 + /* terminator: */

	91 + { NULL, NULL, 0, }

	92 + };

	93 + const EmptySegmentTest * testPtr;

	94 + for (testPtr = emptySegmentTests; testPtr->converterName != NULL; ++testPtr ) {

	95 + UErrorCode err = U_ZERO_ERROR;

	96 + UConverter * cnv = ucnv_open(testPtr->converterName, &err);

	97 + if (U_FAILURE(err)) {

	98 + log_data_err("Unable to open %s converter: %s\n", testPtr->converterName, u_errorName(err));

	99 + return;

	100 + }

	101 + ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_EMPTYSEGMENT, NULL, NULL, NULL, &err);

	102 + if (U_FAILURE(err)) {

	103 + log_data_err("Unable to setToUCallBack for %s converter: %s\n", testPtr->converterName, u_errorName(err));

	104 + ucnv_close(cnv);

	105 + return;

	106 + }

	107 + {

	108 + UChar toUChars[kEmptySegmentToUCharsMax];

	109 + UChar * toUCharsPtr = toUChars;

	110 + const UChar * toUCharsLimit = toUCharsPtr + kEmptySegmentToUCharsMax;

	111 + const char * inCharsPtr = testPtr->inputText;

	112 + const char * inCharsLimit = inCharsPtr + testPtr->inputTextLength;

	113 + ucnv_toUnicode(cnv, &toUCharsPtr, toUCharsLimit, &inCharsPtr, inCharsLimit, NULL, TRUE, &err);

	114 + }

	115 + ucnv_close(cnv);

	116 + }

	117 +}

	118 +

	119 static void

	120 TestEBCDIC_STATEFUL() {

	121 /* test input */

	122 --- r22777/source/test/cintltst/ncnvtst.c 2007-01-24 15:27:45.575224000 -0 800

	123 +++ chrome.canonical/source/test/cintltst/ncnvtst.c 2009-03-23 12:30:17.2910 31000 -0700

	124 @@ -1928,7 +1928,7 @@

	125 #if !UCONFIG_NO_LEGACY_CONVERSION

	126 { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff },

	127 { "windows-1251", 0, 0x7f, 0x410, 0x44f, 0x3000, 0xd7ff },

	128 - { "HZ", 0x410, 0x44f, 0x4e00, 0x4eff, 0xac00, 0xd7ff },

	129 + /* HZ test case fixed and moved to intltest's conversion.txt, ticket #6 002 */

	130 { "shift-jis", 0x3041, 0x3093, 0x30a1, 0x30f3, 0x900, 0x1cff }

	131 #else

	132 { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff }

	133 --- r22777/source/test/intltest/convtest.h 2007-07-26 20:12:12.288784000 -0 700

	134 +++ chrome.canonical/source/test/intltest/convtest.h 2009-03-23 12:30:09.4451 94000 -0700

	135 @@ -72,6 +72,7 @@

	136 void TestToUnicode();

	137 void TestFromUnicode();

	138 void TestGetUnicodeSet();

	139 + void TestGetUnicodeSet2();

	140

	141 private:

	142 UBool

	143 --- r22777/source/test/intltest/convtest.cpp 2007-03-08 16:28:01.852223000 -0 800

	144 +++ chrome.canonical/source/test/intltest/convtest.cpp 2009-03-23 12:30:40.1618 68000 -0700

	145 @@ -70,6 +70,7 @@

	146 case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;

	147 case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;

	148 case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;

	149 + case 3: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;

	150 default: name=""; break; //needed to end loop

	151 }

	152 }

	153 @@ -465,6 +466,183 @@

	154 }

	155 }

	156

	157 +U_CDECL_BEGIN

	158 +static void U_CALLCONV

	159 +getUnicodeSetCallback(const void *context,

	160 + UConverterFromUnicodeArgs *fromUArgs,

	161 + const UChar* codeUnits,

	162 + int32_t length,

	163 + UChar32 codePoint,

	164 + UConverterCallbackReason reason,

	165 + UErrorCode *pErrorCode) {

	166 + if(reason<=UCNV_IRREGULAR) {

	167 + ((UnicodeSet *)context)->remove(codePoint); // the converter cannot convert this code point

	168 + *pErrorCode=U_ZERO_ERROR; // skip

	169 + } // else ignore the reset, close and clone calls.

	170 +}

	171 +U_CDECL_END

	172 +

	173 +// Compare ucnv_getUnicodeSet() with the set of characters that can be converted.

	174 +void

	175 +ConversionTest::TestGetUnicodeSet2() {

	176 + // Build a string with all code points.

	177 + UChar32 cpLimit;

	178 + int32_t s0Length;

	179 + if(quick) {

	180 + cpLimit=s0Length=0x10000; // BMP only

	181 + } else {

	182 + cpLimit=0x110000;

	183 + s0Length=0x10000+0x200000; // BMP + surrogate pairs

	184 + }

	185 + UChar *s0=new UChar[s0Length];

	186 + if(s0==NULL) {

	187 + return;

	188 + }

	189 + UChar *s=s0;

	190 + UChar32 c;

	191 + UChar c2;

	192 + // low BMP

	193 + for(c=0; c<=0xd7ff; ++c) {

	194 + *s++=(UChar)c;

	195 + }

	196 + // trail surrogates

	197 + for(c=0xdc00; c<=0xdfff; ++c) {

	198 + *s++=(UChar)c;

	199 + }

	200 + // lead surrogates

	201 + // (after trails so that there is not even one surrogate pair in between)

	202 + for(c=0xd800; c<=0xdbff; ++c) {

	203 + *s++=(UChar)c;

	204 + }

	205 + // high BMP

	206 + for(c=0xe000; c<=0xffff; ++c) {

	207 + *s++=(UChar)c;

	208 + }

	209 + // supplementary code points = surrogate pairs

	210 + if(cpLimit==0x110000) {

	211 + for(c=0xd800; c<=0xdbff; ++c) {

	212 + for(c2=0xdc00; c2<=0xdfff; ++c2) {

	213 + *s++=(UChar)c;

	214 + *s++=c2;

	215 + }

	216 + }

	217 + }

	218 +

	219 + static const char *const cnvNames[]={

	220 + "UTF-8",

	221 + "UTF-7",

	222 + "UTF-16",

	223 + "US-ASCII",

	224 + "ISO-8859-1",

	225 + "windows-1252",

	226 + "Shift-JIS",

	227 + "ibm-1390", // EBCDIC_STATEFUL table

	228 + "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table

	229 + "HZ",

	230 + "ISO-2022-JP",

	231 + "JIS7",

	232 + "ISO-2022-CN",

	233 + "ISO-2022-CN-EXT",

	234 + "LMBCS"

	235 + };

	236 + char buffer[1024];

	237 + int32_t i;

	238 + for(i=0; i<LENGTHOF(cnvNames); ++i) {

	239 + UErrorCode errorCode=U_ZERO_ERROR;

	240 + UConverter *cnv=cnv_open(cnvNames[i], errorCode);

	241 + if(U_FAILURE(errorCode)) {

	242 + errln("failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode));

	243 + continue;

	244 + }

	245 + UnicodeSet expected;

	246 + ucnv_setFromUCallBack(cnv, getUnicodeSetCallback, &expected, NULL, NULL, &errorCode);

	247 + if(U_FAILURE(errorCode)) {

	248 + errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode));

	249 + ucnv_close(cnv);

	250 + continue;

	251 + }

	252 + UConverterUnicodeSet which;

	253 + for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) {

	254 + if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {

	255 + ucnv_setFallback(cnv, TRUE);

	256 + }

	257 + expected.add(0, cpLimit-1);

	258 + s=s0;

	259 + UBool flush;

	260 + do {

	261 + char *t=buffer;

	262 + flush=(UBool)(s==s0+s0Length);

	263 + ucnv_fromUnicode(cnv, &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode);

	264 + if(U_FAILURE(errorCode)) {

	265 + if(errorCode==U_BUFFER_OVERFLOW_ERROR) {

	266 + errorCode=U_ZERO_ERROR;

	267 + continue;

	268 + } else {

	269 + break; // unexpected error, should not occur

	270 + }

	271 + }

	272 + } while(!flush);

	273 + UnicodeSet set;

	274 + ucnv_getUnicodeSet(cnv, (USet *)&set, which, &errorCode);

	275 + if(cpLimit<0x110000) {

	276 + set.remove(cpLimit, 0x10ffff);

	277 + }

	278 + if(which==UCNV_ROUNDTRIP_SET) {

	279 + // ignore PUA code points because they will be converted even if they

	280 + // are fallbacks and when other fallbacks are turned off,

	281 + // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips

	282 + expected.remove(0xe000, 0xf8ff);

	283 + expected.remove(0xf0000, 0xffffd);

	284 + expected.remove(0x100000, 0x10fffd);

	285 + set.remove(0xe000, 0xf8ff);

	286 + set.remove(0xf0000, 0xffffd);

	287 + set.remove(0x100000, 0x10fffd);

	288 + }

	289 + if(set!=expected) {

	290 + // First try to see if we have different sets because ucnv_getUnicodeSet()

	291 + // added strings: The above conversion method does not tell us what strings might be convertible.

	292 + // Remove strings from the set and compare again.

	293 + // Unfortunately, there are no good, direct set methods for finding out whether there are strings

	294 + // in the set, nor for enumerating or removing just them.

	295 + // Intersect all code points with the set. The intersection will not contain strings.

	296 + UnicodeSet temp(0, 0x10ffff);

	297 + temp.retainAll(set);

	298 + set=temp;

	299 + }

	300 + if(set!=expected) {

	301 + UnicodeSet diffSet;

	302 + UnicodeString out;

	303 +

	304 + // are there items that must be in the set but are not?

	305 + (diffSet=expected).removeAll(set);

	306 + if(!diffSet.isEmpty()) {

	307 + diffSet.toPattern(out, TRUE);

	308 + if(out.length()>100) {

	309 + out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));

	310 + }

	311 + errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",

	312 + cnvNames[i], which);

	313 + errln(out);

	314 + }

	315 +

	316 + // are there items that must not be in the set but are?

	317 + (diffSet=set).removeAll(expected);

	318 + if(!diffSet.isEmpty()) {

	319 + diffSet.toPattern(out, TRUE);

	320 + if(out.length()>100) {

	321 + out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));

	322 + }

	323 + errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d",

	324 + cnvNames[i], which);

	325 + errln(out);

	326 + }

	327 + }

	328 + }

	329 + }

	330 +

	331 + delete [] s0;

	332 +}

	333 +

	334 // open testdata or ICU data converter ------------------------------------- ** *

	335

	336 UConverter *

	337 --- r22777/source/test/testdata/testdata.mak 2007-07-26 20:12:12.288784000 -0 700

	338 +++ chrome.canonical/source/test/testdata/testdata.mak 2009-03-23 12:31:04.4246 45000 -0700

	339 @@ -28,7 +28,7 @@

	340

	341 TEST_RES_FILES = $(TEST_RES_SOURCE:.txt=.res)

	342

	343 -"$(TESTDATAOUT)\testdata.dat" : $(TEST_RES_FILES) "$(TESTDATABLD)\casing.res" " $(TESTDATABLD)\conversion.res" "$(TESTDATABLD)\icuio.res" "$(TESTDATABLD)\mc.res " "$(TESTDATABLD)\structLocale.res" "$(TESTDATABLD)\root.res" "$(TESTDATABLD)\sh .res" "$(TESTDATABLD)\sh_YU.res" "$(TESTDATABLD)\te.res" "$(TESTDATABLD)\te_IN. res" "$(TESTDATABLD)\te_IN_REVISED.res" "$(TESTDATABLD)\testaliases.res" "$(TEST DATABLD)\testtypes.res" "$(TESTDATABLD)\testempty.res" "$(TESTDATABLD)\iscii.res " "$(TESTDATABLD)\idna_rules.res" "$(TESTDATABLD)\DataDrivenCollationTest.res" " $(TESTDATABLD)\test.icu" "$(TESTDATABLD)\testtable32.res" "$(TESTDATABLD)\test1. cnv" "$(TESTDATABLD)\test3.cnv" "$(TESTDATABLD)\test4.cnv" "$(TESTDATABLD)\test4 x.cnv" "$(TESTDATABLD)\ibm9027.cnv" "$(TESTDATABLD)\nfscsi.spp" "$(TESTDATABLD)\ nfscss.spp" "$(TESTDATABLD)\nfscis.spp" "$(TESTDATABLD)\nfsmxs.spp" "$(TESTDATAB LD)\nfsmxp.spp"

	344 +"$(TESTDATAOUT)\testdata.dat" : $(TEST_RES_FILES) "$(TESTDATABLD)\casing.res" " $(TESTDATABLD)\conversion.res" "$(TESTDATABLD)\icuio.res" "$(TESTDATABLD)\mc.res " "$(TESTDATABLD)\structLocale.res" "$(TESTDATABLD)\root.res" "$(TESTDATABLD)\sh .res" "$(TESTDATABLD)\sh_YU.res" "$(TESTDATABLD)\te.res" "$(TESTDATABLD)\te_IN. res" "$(TESTDATABLD)\te_IN_REVISED.res" "$(TESTDATABLD)\testaliases.res" "$(TEST DATABLD)\testtypes.res" "$(TESTDATABLD)\testempty.res" "$(TESTDATABLD)\iscii.res " "$(TESTDATABLD)\idna_rules.res" "$(TESTDATABLD)\DataDrivenCollationTest.res" " $(TESTDATABLD)\test.icu" "$(TESTDATABLD)\testtable32.res" "$(TESTDATABLD)\test1. cnv" "$(TESTDATABLD)\test1bmp.cnv" "$(TESTDATABLD)\test3.cnv" "$(TESTDATABLD)\te st4.cnv" "$(TESTDATABLD)\test4x.cnv" "$(TESTDATABLD)\ibm9027.cnv" "$(TESTDATABLD )\nfscsi.spp" "$(TESTDATABLD)\nfscss.spp" "$(TESTDATABLD)\nfscis.spp" "$(TESTDAT ABLD)\nfsmxs.spp" "$(TESTDATABLD)\nfsmxp.spp"

	345 @echo Building test data

	346 @copy "$(TESTDATABLD)\te.res" "$(TESTDATAOUT)\$(TESTDT)\nam.typ"

	347 @copy "$(TESTDATA)\icu26_testtypes.res" "$(TESTDATABLD)"

	348 @@ -54,6 +54,7 @@

	349 iscii.res

	350 test.icu

	351 test1.cnv

	352 +test1bmp.cnv

	353 test3.cnv

	354 test4.cnv

	355 test4x.cnv

	356 @@ -126,6 +127,10 @@

	357 @echo Building $@

	358 @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**

	359

	360 +"$(TESTDATABLD)\test1bmp.cnv": "$(TESTDATA)\test1bmp.ucm"

	361 + @echo Building $@

	362 + @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**

	363 +

	364 "$(TESTDATABLD)\test3.cnv": "$(TESTDATA)\test3.ucm"

	365 @echo Building $@

	366 @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**

	367 --- r22777/source/test/testdata/Makefile.in 2007-08-21 13:15:55.267002000 -0 700

	368 +++ chrome.canonical/source/test/testdata/Makefile.in 2009-03-23 12:31:04.4356 35000 -0700

	369 @@ -117,7 +117,7 @@

	370 TEST_DAT_FILES=$(TESTBUILDDIR)/test.icu

	371 TEST_SPP_FILES=$(TESTBUILDDIR)/nfscsi.spp $(TESTBUILDDIR)/nfscss.spp $(TESTBUILDDIR)/nfscis.spp $(TESTBUILDDIR)/nfsmxs.spp $(TESTBUILDDIR)/nfsmxp.spp

	372

	373 -TEST_UCM_SOURCE= test1.ucm test3.ucm test4.ucm test4x.ucm ibm9027.ucm

	374 +TEST_UCM_SOURCE= test1.ucm test1bmp.ucm test3.ucm test4.ucm test4x.ucm ibm9027.ucm

	375 TEST_UCM_FILES=$(TEST_UCM_SOURCE:%=$(TESTSRCDATADIR)/data/%)

	376 TEST_CNV_FILES=$(TEST_UCM_SOURCE:%.ucm=$(TESTBUILDDIR)/%.cnv)

	377

	378 --- r22777/source/test/testdata/conversion.txt 2007-10-11 14:31:32.196532000 -0 700

	379 +++ chrome.canonical/source/test/testdata/conversion.txt 2009-03-23 12:42 :01.119267000 -0700

	380 @@ -1,6 +1,6 @@

	381 //***************************************************************************

	382 //

	383 -// Copyright (C) 2003-2007, International Business Machines

	384 +// Copyright (C) 2003-2008, International Business Machines

	385 // Corporation and others. All Rights Reserved.

	386 //

	387 // file name: conversion.txt

	388 @@ -48,13 +48,161 @@

	389 toUnicode {

	390 Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }

	391 Cases {

	392 + // Test ticket 5691: consistent illegal sequences

	393 + // The following test cases are for illegal character byte sequences.

	394 + //

	395 + // Unfortunately, we cannot use the Shift-JIS examples from the ticket

	396 + // comments because our Shift-JIS table is Windows-compatible and

	397 + // therefore has no illegal single bytes. Same for GBK.

	398 + // Instead, we use the stricter GB 18030 also for 2-byte examples.

	399 + // The byte sequences are generally slightly different from the ticket

	400 + // comment, simply using assigned characters rather than just

	401 + // theoretically valid sequences.

	402 + {

	403 + "gb18030",

	404 + :bin{ 618140813c81ff7a },

	405 + "a\u4e02\\x81<\\x81\\xFFz",

	406 + :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 },

	407 + :int{1}, :int{0}, "", "&C", :bin{""}

	408 + }

	409 + {

	410 + "EUC-JP",

	411 + :bin{ 618fb0a98fb03c8f3cb0a97a },

	412 + "a\u4e28\\x8F\\xB0<\\x8F<\u9022z",

	413 + :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 },

	414 + :int{1}, :int{0}, "", "&C", :bin{""}

	415 + }

	416 + {

	417 + "gb18030",

	418 + :bin{ 618130fc318130fc8181303c3e813cfc817a },

	419 + "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z",

	420 + :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 },

	421 + :int{1}, :int{0}, "", "&C", :bin{""}

	422 + }

	423 + {

	424 + "UTF-8",

	425 + :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a },

	426 + "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z",

	427 + :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 },

	428 + :int{1}, :int{0}, "", "&C", :bin{""}

	429 + }

	430 + {

	431 + "ISO-2022-JP",

	432 + :bin{ 1b24424141af4142affe41431b2842 },

	433 + "\u758f\\xAF\u758e\\xAF\\xFE\u790e",

	434 + :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 },

	435 + :int{1}, :int{0}, "", "&C", :bin{""}

	436 + }

	437 + {

	438 + "ibm-25546",

	439 + :bin{ 411b242943420e4141af4142affe41430f5a },

	440 + "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",

	441 + :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },

	442 + :int{1}, :int{0}, "", "&C", :bin{""}

	443 + }

	444 + {

	445 + "ISO-2022-KR",

	446 + :bin{ 411b242943420e4141af4142affe41430f5a },

	447 + "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",

	448 + :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },

	449 + :int{1}, :int{0}, "", "&C", :bin{""}

	450 + }

	451 + {

	452 + "ISO-2022-CN",

	453 + :bin{ 411b242941420e4141af4142affe41430f5a },

	454 + "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",

	455 + :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },

	456 + :int{1}, :int{0}, "", "&C", :bin{""}

	457 + }

	458 + {

	459 + "HZ",

	460 + :bin{ 417e7b4141af4142affe41437e7d5a },

	461 + "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",

	462 + :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 },

	463 + :int{1}, :int{0}, "", "&C", :bin{""}

	464 + }

	465 + // Test ticket 5691: consistent illegal sequences

	466 + // The following test cases are for illegal escape/designator/shift sequences.

	467 + //

	468 + // ISO-2022-JP and -CN with illegal escape sequences.

	469 + {

	470 + "ISO-2022-JP",

	471 + :bin{ 611b24201b244241411b283f1b28427a },

	472 + "a\\x1B$ \u758f\\x1B\u2538z",

	473 + :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 },

	474 + :int{1}, :int{0}, "", "&C", :bin{""}

	475 + }

	476 + {

	477 + "ISO-2022-CN",

	478 + :bin{ 611b2429201b2429410e41410f7a },

	479 + "a\\x1B$) \u4eaez",

	480 + :intvector{ 0,1,1,1,1,2,3,4,10,13 },

	481 + :int{1}, :int{0}, "", "&C", :bin{""}

	482 + }

	483 + // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences.

	484 + // The first ESC N comes before its designator sequence, the last sequence is ESC+space.

	485 + {

	486 + "ISO-2022-JP-2",

	487 + :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e },

	488 + "N\\x1BNNN\xceN\\x1B N",

	489 + :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 },

	490 + :int{1}, :int{0}, "", "&C", :bin{""}

	491 + }

	492 + {

	493 + "ISO-2022-CN-EXT",

	494 + :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e },

	495 + "N\\x1BNNN\u8f0eN\\x1B N",

	496 + :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },

	497 + :int{1}, :int{0}, "", "&C", :bin{""}

	498 + }

	499 + {

	500 + "ISO-2022-CN-EXT",

	501 + :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f },

	502 + "O\\x1BOOO\u492bO\\x1B O",

	503 + :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },

	504 + :int{1}, :int{0}, "", "&C", :bin{""}

	505 + }

	506 + // Test ticket 5691: HZ with illegal tilde sequences.

	507 + {

	508 + "HZ",

	509 + :bin{ 417e20427e21437e80447e7b41417e207e41427e7f41437e7d5a },

	510 + "A\\x7E B\\x7E!C\\x7E\\x80D\u4eae\\x7E\\x20\\x7E\u8c05\\x7E\\x7F\u64a9Z",

	511 + :intvector{ 0,1,1,1,1,2,3,4,4,4,4,5,6,7,7,7,7,7,7,7,7,9, // SBCS

	512 + 12,14,14,14,14,14,14,14,14,16,16,16,16,17,19,19,19,19,19,19,19,19,21, // DBCS

	513 + 25 }, // SBCS

	514 + :int{1}, :int{0}, "", "&C", :bin{""}

	515 + }

	516 + // Test ticket 5691: Example from Peter Edberg.

	517 + {

	518 + "ISO-2022-JP",

	519 + :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 },

	520 + "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda",

	521 + :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 },

	522 + :int{1}, :int{0}, "", "?", :bin{""}

	523 + }

	524 + // Test bug 6071 (2:1 Unicode:charset SBCS mapping).

	525 + {

	526 + "*test1bmp",

	527 + :bin{ 050008 },

	528 + "e@uv",

	529 + :intvector{ 0,1,2,2 },

	530 + :int{1}, :int{1}, "", "?", :bin{""}

	531 + }

	532 + // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e

	533 + {

	534 + "HZ",

	535 + :bin{ 7e7b21212120217e217f772100007e217e7e7d207e7e807e0a2b },

	536 + "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd\u3013 ~\ufffd+",

	537 + :intvector{ 2,4,6,8,10,12,14,15,19,20,22,25 },

	538 + :int{1}, :int{1}, "", "?", :bin{""}

	539 + }

	540 // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and

	541 // using the Shift-JIS table for JIS X 0208 (ticket #5797)

	542 {

	543 "ISO-2022-JP",

	544 :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 },

	545 - "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",

	546 - :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },

	547 + "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",

	548 + :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 },

	549 :int{1}, :int{1}, "", "?", :bin{""}

	550 }

	551 // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBM PWithOffsets()

	552 @@ -191,6 +339,21 @@

	553 :intvector{ 0, 5, 7, 9, 9, 9, 9, 9, 9, 9, 9, 12 },

	554 :int{1}, :int{1}, "", "&", :bin{""}

	555 }

	556 + // empty segment (using substitution and stop)

	557 + {

	558 + "ISO-2022-KR",

	559 + :bin{ 1b242943610e0f620d0a },

	560 + "a\uFFFDb\u000D\u000A",

	561 + :intvector{ 4, 6, 7, 8, 9 },

	562 + :int{1}, :int{1}, "", "?", :bin{""}

	563 + }

	564 + {

	565 + "ISO-2022-KR",

	566 + :bin{ 1b242943610e0f620d0a },

	567 + "a",

	568 + :intvector{ 4 },

	569 + :int{1}, :int{1}, "illesc", ".", :bin{"0f"}

	570 + }

	571

	572 // ISO-2022-JP

	573

	574 @@ -241,6 +404,21 @@

	575 :bin{ 41c15c1b284a5cc242 }, "A\uff81\\\xa5\uff82B", :intvector{ 0, 1, 2, 6, 7, 8 },

	576 :int{1}, :int{1}, "", ".", :bin{""}

	577 }

	578 + // empty segment (using substitution and stop)

	579 + {

	580 + "ISO-2022-JP",

	581 + :bin{ 61621b24421b284263640d0a },

	582 + "ab\uFFFDcd\u000D\u000A",

	583 + :intvector{ 0, 1, 5, 8, 9, 10, 11 },

	584 + :int{1}, :int{1}, "", "?", :bin{""}

	585 + }

	586 + {

	587 + "ISO-2022-JP",

	588 + :bin{ 61621b24421b284263640d0a },

	589 + "ab",

	590 + :intvector{ 0, 1 },

	591 + :int{1}, :int{1}, "illesc", ".", :bin{"1b2842"}

	592 + }

	593

	594 // ISO-2022-CN

	595

	596 @@ -303,7 +481,7 @@

	597 {

	598 "ISO-2022-CN-EXT",

	599 :bin{ 411b4e2121 }, "\x41", :intvector{ 0 },

	600 - :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e }

	601 + :int{1}, :int{1}, "illesc", ".", :bin{ 1b }

	602 }

	603 // G3 designator: recognized, but not supported for -CN (only for -CN-E XT)

	604 {

	605 @@ -311,6 +489,36 @@

	606 :bin{ 411b242b491b4f2121 }, "\x41", :intvector{ 0 },

	607 :int{1}, :int{1}, "unsuppesc", ".", :bin{ 1b242b49 }

	608 }

	609 + // empty segment 1 (using substitution and stop)

	610 + {

	611 + "ISO-2022-CN",

	612 + :bin{ 611b242941620e0f1b242a481b4e6a65630d0a },

	613 + "ab\uFFFD\u994Cc\u000D\u000A",

	614 + :intvector{ 0, 5, 7, 14, 16, 17, 18 },

	615 + :int{1}, :int{1}, "", "?", :bin{""}

	616 + }

	617 + {

	618 + "ISO-2022-CN",

	619 + :bin{ 611b242941620e0f1b242a481b4e6a65630d0a },

	620 + "ab",

	621 + :intvector{ 0, 5 },

	622 + :int{1}, :int{1}, "illesc", ".", :bin{"0f"}

	623 + }

	624 + // empty segment 2 (using substitution and stop)

	625 + {

	626 + "ISO-2022-CN",

	627 + :bin{ 611b242941620e1b24294768640f630d0a },

	628 + "ab\uFFFD\u5F70c\u000D\u000A",

	629 + :intvector{ 0, 5, 7, 11, 14, 15, 16 },

	630 + :int{1}, :int{1}, "", "?", :bin{""}

	631 + }

	632 + {

	633 + "ISO-2022-CN",

	634 + :bin{ 611b242941620e1b24294768640f630d0a },

	635 + "ab",

	636 + :intvector{ 0, 5 },

	637 + :int{1}, :int{1}, "illesc", ".", :bin{"1b242947"}

	638 + }

	639

	640 // ISO-2022 SBCS

	641 // [U_ENABLE_GENERIC_ISO_2022]

	642 @@ -325,6 +533,39 @@

	643 // :int{1}, :int{1}, "", ".", :bin{""}

	644 //}

	645

	646 + // HZ-GB-2312

	647 +

	648 + // empty segment 1 (using substitution and stop)

	649 + {

	650 + "HZ-GB-2312",

	651 + :bin{ 61627e7b7e7d6364 },

	652 + "ab\uFFFDcd",

	653 + :intvector{ 0, 1, 4, 6, 7 },

	654 + :int{1}, :int{1}, "", "?", :bin{""}

	655 + }

	656 + {

	657 + "HZ-GB-2312",

	658 + :bin{ 61627e7b7e7d63640d0a },

	659 + "ab",

	660 + :intvector{ 0, 1 },

	661 + :int{1}, :int{1}, "illesc", ".", :bin{"7e7d"}

	662 + }

	663 + // empty segment 2 & legal redundant switches (using substitution and s top)

	664 + {

	665 + "HZ-GB-2312",

	666 + :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d },

	667 + "ab\u4E0D\u7A7A\uFFFD\u4E00cdef\uFFFD",

	668 + :intvector{ 0, 1, 4, 6, 10, 12, 16, 17, 20, 21, 24 },

	669 + :int{1}, :int{1}, "", "?", :bin{""}

	670 + }

	671 + {

	672 + "HZ-GB-2312",

	673 + :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d },

	674 + "ab\u4E0D\u7A7A",

	675 + :intvector{ 0, 1, 4, 6 },

	676 + :int{1}, :int{1}, "illesc", ".", :bin{"7e7b"}

	677 + }

	678 +

	679 // DBCS-only extensions

	680 {

	681 "ibm-970",

	682 @@ -496,6 +737,14 @@

	683 :intvector{ 0, 4, 8, 12 },

	684 :int{1}, :int{0}, "", "?", :bin{""}

	685 }

	686 + // Test iso-2022-jp-2 miscellaneous symbols

	687 + {

	688 + "iso-2022-jp-2",

	689 + :bin{ 1b242843224f224e1b2842 },

	690 + "\u260E\u260F",

	691 + :intvector{ 4, 6 },

	692 + :int{1}, :int{0}, "", ".", :bin{""}

	693 + }

	694 }

	695 }

	696

	697 @@ -504,6 +753,14 @@

	698 fromUnicode {

	699 Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }

	700 Cases {

	701 + // Test bug 6071 (1:2 Unicode:charset SBCS mapping).

	702 + {

	703 + "*test1bmp",

	704 + "e@t",

	705 + :bin{ 05000709 },

	706 + :intvector{ 0,1,2,2 },

	707 + :int{1}, :int{0}, "", "?", ""

	708 + }

	709 // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and

	710 // using the Shift-JIS table for JIS X 0208 (ticket #5797)

	711 {

	712 @@ -1311,16 +1568,29 @@

	713 // versions of ISO-2022-JP

	714 {

	715 "ISO-2022-JP",

	716 - "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e 00\u4e01\uffe5]",

	717 - "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\uf a2d\uffe6-\U0010ffff]",

	718 + "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2015\u203e\u4e00\u4e0 1\uffe5]",

	719 + "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u2014\u301c\u4e02\u4e27-\u4e 29\u4fe0\u663b\u9eb5\ufa0e-\ufa2d\uff61-\uff9f\uffe4\uffe6-\U0010ffff]",

	720 :int{0}

	721 - }

	722 + }

	723 {

	724 "ISO-2022-JP-2",

	725 - "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a 1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]",

	726 - "[\x0e\x0f\x1b\uffe7-\U0010ffff]",

	727 + "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0 390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uffe6]",

	728 + "[\x0e\x0f\x1b\uff61-\uff9f\uffe4\uffe7-\U0010ffff]",

	729 + :int{0}

	730 + }

	731 + {

	732 + "JIS7",

	733 + "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0 390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uff61-\uff9f\uffe6]",

	734 + "[\x0e\x0f\x1b\uffe4\uffe7-\U0010ffff]",

	735 :int{0}

	736 }

	737 + // with fallbacks

	738 + {

	739 + "ISO-2022-JP",

	740 + "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2014\u2015\u203e\u301 c\u4e00\u4e01\u4fe0\u9eb5\uff61-\uff9f\uffe5]",

	741 + "[\x0e\x0f\x1b\xa6\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\u663b \ufa0e-\ufa2d\uffe4\uffe6-\U0010ffff]",

	742 + :int{1}

	743 + }

	744

	745 // versions of ISO-2022-CN

	746 {

	747 @@ -1336,6 +1606,22 @@

	748 :int{0}

	749 }

	750

	751 + // HZ

	752 + {

	753 + "HZ",

	754 + "[\u0410-\u044f\u4e00\u4e01\u4e03]",

	755 + "[\u4e02\u4e04-\u4e06\uac00-\ud7ff]",

	756 + :int{0}

	757 + }

	758 +

	759 + // LMBCS

	760 + {

	761 + "LMBCS",

	762 + "[\x00-\U0010ffff]",

	763 + "[]",

	764 + :int{0}

	765 + }

	766 +

	767 // DBCS-only

	768 {

	769 "ibm-971",

	770 --- r22777/source/common/ucnv_ext.h 2007-08-22 22:46:49.525855000 -0700

	771 +++ chrome.canonical/source/common/ucnv_ext.h 2009-03-23 12:30:09.644121000 -0 700

	772 @@ -382,10 +382,20 @@

	773 UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,

	774 UErrorCode *pErrorCode);

	775

	776 +/*

	777 + * Add code points and strings to the set according to the extension mappings.

	778 + * Limitation on the UConverterSetFilter:

	779 + * The filters currently assume that they are used with 1:1 mappings.

	780 + * They only apply to single input code points, and then they pass through

	781 + * only mappings with single-charset-code results.

	782 + * For example, the Shift-JIS filter only works for 2-byte results and tests

	783 + * that those 2 bytes are in the JIS X 0208 range of Shift-JIS.

	784 + */

	785 U_CFUNC void

	786 ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,

	787 const USetAdder *sa,

	788 UConverterUnicodeSet which,

	789 + UConverterSetFilter filter,

	790 UErrorCode *pErrorCode);

	791

	792 /* toUnicode helpers -------------------------------------------------------- */

	793 --- r22777/source/common/ucnvmbcs.c 2007-10-11 14:31:32.196532000 -0700

	794 +++ chrome.canonical/source/common/ucnvmbcs.c 2009-03-23 12:42:01.150242000 -0 700

	795 @@ -1,7 +1,7 @@

	796 /*

	797 ******************************************************************************

	798 *

	799 -* Copyright (C) 2000-2007, International Business Machines

	800 +* Copyright (C) 2000-2008, International Business Machines

	801 * Corporation and others. All Rights Reserved.

	802 *

	803 ******************************************************************************

	804 @@ -485,9 +485,23 @@

	805

	806 if(mbcsTable->outputType==MBCS_OUTPUT_1) {

	807 const uint16_t stage2, stage3, *results;

	808 + uint16_t minValue;

	809

	810 results=(const uint16_t *)mbcsTable->fromUnicodeBytes;

	811

	812 + /*

	813 + * Set a threshold variable for selecting which mappings to use.

	814 + * See ucnv_MBCSSingleFromBMPWithOffsets() and

	815 + * MBCS_SINGLE_RESULT_FROM_U() for details.

	816 + */

	817 + if(which==UCNV_ROUNDTRIP_SET) {

	818 + /* use only roundtrips */

	819 + minValue=0xf00;

	820 + } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {

	821 + /* use all roundtrip and fallback results */

	822 + minValue=0x800;

	823 + }

	824 +

	825 for(st1=0; st1<maxStage1; ++st1) {

	826 st2=table[st1];

	827 if(st2>maxStage1) {

	828 @@ -497,15 +511,8 @@

	829 /* read the stage 3 block */

	830 stage3=results+st3;

	831

	832 - /*

	833 - * Add code points for which the roundtrip flag is set.

	834 - * Once we get a set for fallback mappings, we have to use

	835 - * a threshold variable with a value of 0x800.

	836 - * See ucnv_MBCSSingleFromBMPWithOffsets() and

	837 - * MBCS_SINGLE_RESULT_FROM_U() for details.

	838 - */

	839 do {

	840 - if(*stage3++>=0xf00) {

	841 + if(*stage3++>=minValue) {

	842 sa->add(sa->set, c);

	843 }

	844 } while((++c&0xf)!=0);

	845 @@ -522,9 +529,12 @@

	846 const uint8_t stage3, bytes;

	847 uint32_t st3Multiplier;

	848 uint32_t value;

	849 + UBool useFallback;

	850

	851 bytes=mbcsTable->fromUnicodeBytes;

	852

	853 + useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);

	854 +

	855 switch(mbcsTable->outputType) {

	856 case MBCS_OUTPUT_3:

	857 case MBCS_OUTPUT_4_EUC:

	858 @@ -551,9 +561,8 @@

	859 st3>>=16;

	860

	861 /*

	862 - * Add code points for which the roundtrip flag is set.

	863 - * Once we get a set for fallback mappings, we have to check

	864 - * non-roundtrip stage 3 results for whether they are 0 .

	865 + * Add code points for which the roundtrip flag is set,

	866 + * or which map to non-zero bytes if we use fallbacks.

	867 * See ucnv_MBCSFromUnicodeWithOffsets() for details.

	868 */

	869 switch(filter) {

	870 @@ -561,6 +570,23 @@

	871 do {

	872 if(st3&1) {

	873 sa->add(sa->set, c);

	874 + stage3+=st3Multiplier;

	875 + } else if(useFallback) {

	876 + uint8_t b=0;

	877 + switch(st3Multiplier) {

	878 + case 4:

	879 + b\|=*stage3++;

	880 + case 3:

	881 + b\|=*stage3++;

	882 + case 2:

	883 + b\|=stage3[0]\|stage3[1];

	884 + stage3+=2;

	885 + default:

	886 + break;

	887 + }

	888 + if(b!=0) {

	889 + sa->add(sa->set, c);

	890 + }

	891 }

	892 st3>>=1;

	893 } while((++c&0xf)!=0);

	894 @@ -568,7 +594,7 @@

	895 case UCNV_SET_FILTER_DBCS_ONLY:

	896 /* Ignore single-byte results (<0x100). */

	897 do {

	898 - if((st3&1)!=0 && ((const uint16_t )stage3)>=0 x100) {

	899 + if(((st3&1)!=0 \|\| useFallback) && ((const uint 16_t )stage3)>=0x100) {

	900 sa->add(sa->set, c);

	901 }

	902 st3>>=1;

	903 @@ -578,7 +604,7 @@

	904 case UCNV_SET_FILTER_2022_CN:

	905 /* Only add code points that map to CNS 11643 plan es 1 & 2 for non-EXT ISO-2022-CN. */

	906 do {

	907 - if((st3&1)!=0 && ((value=*stage3)==0x81 \|\| valu e==0x82)) {

	908 + if(((st3&1)!=0 \|\| useFallback) && ((value=*stag e3)==0x81 \|\| value==0x82)) {

	909 sa->add(sa->set, c);

	910 }

	911 st3>>=1;

	912 @@ -588,7 +614,33 @@

	913 case UCNV_SET_FILTER_SJIS:

	914 /* Only add code points that map to Shift-JIS code s corresponding to JIS X 0208. */

	915 do {

	916 - if((st3&1)!=0 && (value=((const uint16_t )sta ge3))>=0x8140 && value<=0xeffc) {

	917 + if(((st3&1)!=0 \|\| useFallback) && (value=((con st uint16_t )stage3))>=0x8140 && value<=0xeffc) {

	918 + sa->add(sa->set, c);

	919 + }

	920 + st3>>=1;

	921 + stage3+=2; /* +=st3Multiplier */

	922 + } while((++c&0xf)!=0);

	923 + break;

	924 + case UCNV_SET_FILTER_GR94DBCS:

	925 + /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */

	926 + do {

	927 + if( ((st3&1)!=0 \|\| useFallback) &&

	928 + (uint16_t)((value=((const uint16_t )stage 3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&

	929 + (uint8_t)(value-0xa1)<=(0xfe - 0xa1)

	930 + ) {

	931 + sa->add(sa->set, c);

	932 + }

	933 + st3>>=1;

	934 + stage3+=2; /* +=st3Multiplier */

	935 + } while((++c&0xf)!=0);

	936 + break;

	937 + case UCNV_SET_FILTER_HZ:

	938 + /* Only add code points that are suitable for HZ DB CS (lead byte A1..FD). */

	939 + do {

	940 + if( ((st3&1)!=0 \|\| useFallback) &&

	941 + (uint16_t)((value=((const uint16_t )stage 3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&

	942 + (uint8_t)(value-0xa1)<=(0xfe - 0xa1)

	943 + ) {

	944 sa->add(sa->set, c);

	945 }

	946 st3>>=1;

	947 @@ -609,7 +661,7 @@

	948 }

	949 }

	950

	951 - ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode);

	952 + ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);

	953 }

	954

	955 U_CFUNC void

	956 @@ -1694,7 +1746,7 @@

	957 cnv->toUBytes[0]=*(source-1);

	958 cnv->toULength=_extToU(cnv, cnv->sharedData,

	959 1, &source, sourceLimit,

	960 - &target, target+targetCapacity,

	961 + &target, pArgs->targetLimit,

	962 &offsets, sourceIndex,

	963 pArgs->flush,

	964 pErrorCode);

	965 @@ -1739,6 +1791,65 @@

	966 pArgs->offsets=offsets;

	967 }

	968

	969 +static UBool

	970 +hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {

	971 + const int32_t *row=stateTable[state];

	972 + int32_t b, entry;

	973 + /* First test for final entries in this state for some commonly valid byte values. */

	974 + entry=row[0xa1];

	975 + if( !MBCS_ENTRY_IS_TRANSITION(entry) &&

	976 + MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL

	977 + ) {

	978 + return TRUE;

	979 + }

	980 + entry=row[0x41];

	981 + if( !MBCS_ENTRY_IS_TRANSITION(entry) &&

	982 + MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL

	983 + ) {

	984 + return TRUE;

	985 + }

	986 + /* Then test for final entries in this state. */

	987 + for(b=0; b<=0xff; ++b) {

	988 + entry=row[b];

	989 + if( !MBCS_ENTRY_IS_TRANSITION(entry) &&

	990 + MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL

	991 + ) {

	992 + return TRUE;

	993 + }

	994 + }

	995 + /* Then recurse for transition entries. */

	996 + for(b=0; b<=0xff; ++b) {

	997 + entry=row[b];

	998 + if( MBCS_ENTRY_IS_TRANSITION(entry) &&

	999 + hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE (entry))

	1000 + ) {

	1001 + return TRUE;

	1002 + }

	1003 + }

	1004 + return FALSE;

	1005 +}

	1006 +

	1007 +/*

	1008 + * Is byte b a single/lead byte in this state?

	1009 + * Recurse for transition states, because here we don't want to say that

	1010 + * b is a lead byte if all byte sequences that start with b are illegal.

	1011 + */

	1012 +static UBool

	1013 +isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnl y, uint8_t b) {

	1014 + const int32_t *row=stateTable[state];

	1015 + int32_t entry=row[b];

	1016 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */

	1017 + return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_ST ATE(entry));

	1018 + } else {

	1019 + uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));

	1020 + if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {

	1021 + return FALSE; /* SI/SO are illegal for DBCS-only conversion */

	1022 + } else {

	1023 + return action!=MBCS_STATE_ILLEGAL;

	1024 + }

	1025 + }

	1026 +}

	1027 +

	1028 U_CFUNC void

	1029 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

	1030 UErrorCode *pErrorCode) {

	1031 @@ -2094,6 +2205,34 @@

	1032 sourceIndex=nextSourceIndex;

	1033 } else if(U_FAILURE(*pErrorCode)) {

	1034 /* callback(illegal) */

	1035 + if(byteIndex>1) {

	1036 + /*

	1037 + * Ticket 5691: consistent illegal sequences:

	1038 + * - We include at least the first byte in the illegal sequence.

	1039 + * - If any of the non-initial bytes could be the start of a character,

	1040 + * we stop the illegal sequence before the first one of those.

	1041 + */

	1042 + UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0 );

	1043 + int8_t i;

	1044 + for(i=1;

	1045 + i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnl y, bytes[i]);

	1046 + ++i) {}

	1047 + if(i<byteIndex) {

	1048 + /* Back out some bytes. */

	1049 + int8_t backOutDistance=byteIndex-i;

	1050 + int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);

	1051 + byteIndex=i; /* length of reported illegal byte sequence */

	1052 + if(backOutDistance<=bytesFromThisBuffer) {

	1053 + source-=backOutDistance;

	1054 + } else {

	1055 + /* Back out bytes from the previous buffer: Need to rep lay them. */

	1056 + cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutD istance);

	1057 + /* preToULength is negative! */

	1058 + uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);

	1059 + source=(const uint8_t *)pArgs->source;

	1060 + }

	1061 + }

	1062 + }

	1063 break;

	1064 } else /* unassigned sequences indicated with byteIndex>0 */ {

	1065 /* try an extension mapping */

	1066 @@ -2104,7 +2243,7 @@

	1067 &offsets, sourceIndex,

	1068 pArgs->flush,

	1069 pErrorCode);

	1070 - sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs ->source);

	1071 + sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArg s->source);

	1072

	1073 if(U_FAILURE(*pErrorCode)) {

	1074 /* not mappable or buffer overflow */

	1075 @@ -2395,15 +2534,37 @@

	1076

	1077 if(c<0) {

	1078 if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {

	1079 - *pErrorCode=U_TRUNCATED_CHAR_FOUND;

	1080 - }

	1081 - if(U_FAILURE(*pErrorCode)) {

	1082 /* incomplete character byte sequence */

	1083 uint8_t *bytes=cnv->toUBytes;

	1084 cnv->toULength=(int8_t)(source-lastSource);

	1085 do {

	1086 bytes++=lastSource++;

	1087 } while(lastSource<source);

	1088 + *pErrorCode=U_TRUNCATED_CHAR_FOUND;

	1089 + } else if(U_FAILURE(*pErrorCode)) {

	1090 + /* callback(illegal) */

	1091 + /*

	1092 + * Ticket 5691: consistent illegal sequences:

	1093 + * - We include at least the first byte in the illegal sequence.

	1094 + * - If any of the non-initial bytes could be the start of a character,

	1095 + * we stop the illegal sequence before the first one of those.

	1096 + */

	1097 + UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);

	1098 + uint8_t *bytes=cnv->toUBytes;

	1099 + bytes++=lastSource++; /* first byte */

	1100 + if(lastSource==source) {

	1101 + cnv->toULength=1;

	1102 + } else /* lastSource<source: multi-byte character */ {

	1103 + int8_t i;

	1104 + for(i=1;

	1105 + lastSource<source && !isSingleOrLead(stateTable, state, isD BCSOnly, *lastSource);

	1106 + ++i

	1107 + ) {

	1108 + bytes++=lastSource++;

	1109 + }

	1110 + cnv->toULength=i;

	1111 + source=lastSource;

	1112 + }

	1113 } else {

	1114 /* no output because of empty input or only state changes */

	1115 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;

	1116 @@ -3237,7 +3398,7 @@

	1117 lastSource=source;

	1118 c=_extFromU(cnv, cnv->sharedData,

	1119 c, &source, sourceLimit,

	1120 - &target, target+targetCapacity,

	1121 + &target, (const uint8_t *)(pArgs->targetLimit),

	1122 &offsets, sourceIndex,

	1123 pArgs->flush,

	1124 pErrorCode);

	1125 --- r22777/source/common/ucnvmbcs.h 2007-10-11 14:31:32.196532000 -0700

	1126 +++ chrome.canonical/source/common/ucnvmbcs.h 2009-03-23 12:30:17.315007000 -0 700

	1127 @@ -492,6 +492,8 @@

	1128 UCNV_SET_FILTER_DBCS_ONLY,

	1129 UCNV_SET_FILTER_2022_CN,

	1130 UCNV_SET_FILTER_SJIS,

	1131 + UCNV_SET_FILTER_GR94DBCS,

	1132 + UCNV_SET_FILTER_HZ,

	1133 UCNV_SET_FILTER_COUNT

	1134 } UConverterSetFilter;

	1135

	1136 --- r22777/source/common/ucnv.c 2007-08-31 12:39:14.294200000 -0700

	1137 +++ chrome.canonical/source/common/ucnv.c 2009-03-23 12:40:10.566608000 -0 700

	1138 @@ -1528,11 +1528,14 @@

	1139 cnv->toULength=0;

	1140

	1141 /* call the callback function */

	1142 + if(cnv->toUCallbackReason==UCNV_ILLEGAL && *err==U_INVALID_CHAR_FOU ND) {

	1143 + cnv->toUCallbackReason = UCNV_UNASSIGNED;

	1144 + }

	1145 cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs,

	1146 cnv->invalidCharBuffer, errorInputLength,

	1147 - (err==U_INVALID_CHAR_FOUND \|\| err==U_UNSUPPORTED_ESCAPE_SEQUE NCE) ?

	1148 - UCNV_UNASSIGNED : UCNV_ILLEGAL,

	1149 + cnv->toUCallbackReason,

	1150 err);

	1151 + cnv->toUCallbackReason = UCNV_ILLEGAL; /* reset to default value */

	1152

	1153 /*

	1154 * loop back to the offset handling

	1155 --- r22777/source/common/uset_imp.h 2007-07-24 19:51:25.692061000 -0700

	1156 +++ chrome.canonical/source/common/uset_imp.h 2009-03-23 12:30:09.893067000 -0 700

	1157 @@ -36,6 +36,9 @@

	1158 typedef void U_CALLCONV

	1159 USetRemove(USet *set, UChar32 c);

	1160

	1161 +typedef void U_CALLCONV

	1162 +USetRemoveRange(USet *set, UChar32 start, UChar32 end);

	1163 +

	1164 /**

	1165 * Interface for adding items to a USet, to keep low-level code from

	1166 * statically depending on the USet implementation.

	1167 @@ -47,6 +50,7 @@

	1168 USetAddRange *addRange;

	1169 USetAddString *addString;

	1170 USetRemove *remove;

	1171 + USetRemoveRange *removeRange;

	1172 };

	1173 typedef struct USetAdder USetAdder;

	1174

	1175 --- r22777/source/common/ucnv2022.c 2007-10-11 14:31:32.196532000 -0700

	1176 +++ chrome.canonical/source/common/ucnv2022.c 2009-03-23 12:57:38.398368000 -0 700

	1177 @@ -201,6 +201,7 @@

	1178 #ifdef U_ENABLE_GENERIC_ISO_2022

	1179 UBool isFirstBuffer;

	1180 #endif

	1181 + UBool isEmptySegment;

	1182 char name[30];

	1183 char locale[3];

	1184 }UConverterDataISO2022;

	1185 @@ -609,6 +610,7 @@

	1186 if(choice<=UCNV_RESET_TO_UNICODE) {

	1187 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));

	1188 myConverterData->key = 0;

	1189 + myConverterData->isEmptySegment = FALSE;

	1190 }

	1191 if(choice!=UCNV_RESET_TO_UNICODE) {

	1192 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));

	1193 @@ -752,6 +754,7 @@

5 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraIn fo);	1194 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraIn fo);

6 uint32_t key = myData2022->key;	1195 uint32_t key = myData2022->key;

7 int32_t offset = 0;	1196 int32_t offset = 0;

8 + int8_t initialToULength = _this->toULength;	1197 + int8_t initialToULength = _this->toULength;

9 char c;	1198 char c;

10	1199

11 value = VALID_NON_TERMINAL_2022;	1200 value = VALID_NON_TERMINAL_2022;

12 @@ -804,7 +805,6 @@	1201 @@ -804,7 +807,6 @@

13 return;	1202 return;

14 } else if (value == INVALID_2022 ) {	1203 } else if (value == INVALID_2022 ) {

15 *err = U_ILLEGAL_ESCAPE_SEQUENCE;	1204 *err = U_ILLEGAL_ESCAPE_SEQUENCE;

16 - return;	1205 - return;

17 } else /* value == VALID_TERMINAL_2022 */ {	1206 } else /* value == VALID_TERMINAL_2022 */ {

18 switch(var){	1207 switch(var){

19 #ifdef U_ENABLE_GENERIC_ISO_2022	1208 #ifdef U_ENABLE_GENERIC_ISO_2022

20 @@ -935,6 +935,35 @@	1209 @@ -814,6 +816,7 @@

	1210 if(chosenConverterName == NULL) {

	1211 /* SS2 or SS3 */

	1212 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;

	1213 + _this->toUCallbackReason = UCNV_UNASSIGNED;

	1214 return;

	1215 }

	1216

	1217 @@ -935,6 +938,37 @@

21 }	1218 }

22 if(U_SUCCESS(*err)) {	1219 if(U_SUCCESS(*err)) {

23 _this->toULength = 0;	1220 _this->toULength = 0;

24 + } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {	1221 + } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {

25 + if(_this->toULength>1) {	1222 + if(_this->toULength>1) {

26 + /*	1223 + /*

27 + * Ticket 5691: consistent illegal sequences:	1224 + * Ticket 5691: consistent illegal sequences:

28 + * - We include at least the first byte (ESC) in the illegal sequen ce.	1225 + * - We include at least the first byte (ESC) in the illegal sequen ce.

29 + * - If any of the non-initial bytes could be the start of a charac ter,	1226 + * - If any of the non-initial bytes could be the start of a charac ter,

30 + * we stop the illegal sequence before the first one of those.	1227 + * we stop the illegal sequence before the first one of those.

(...skipping 12 matching lines...) Expand all Loading...
43 + } else {	1240 + } else {

44 + /* Back out bytes from the previous buffer: Need to replay them . */	1241 + /* Back out bytes from the previous buffer: Need to replay them . */

45 + _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistanc e);	1242 + _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistanc e);

46 + /* same as -(initialToULength-1) */	1243 + /* same as -(initialToULength-1) */

47 + /* preToULength is negative! */	1244 + /* preToULength is negative! */

48 + uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULen gth);	1245 + uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULen gth);

49 + *source-=bytesFromThisBuffer;	1246 + *source-=bytesFromThisBuffer;

50 + }	1247 + }

51 + _this->toULength=1;	1248 + _this->toULength=1;

52 + }	1249 + }

	1250 + } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {

	1251 + _this->toUCallbackReason = UCNV_UNASSIGNED;

53 }	1252 }

54 }	1253 }

55	1254

56 @@ -1097,6 +1126,24 @@	1255 @@ -1113,6 +1147,24 @@

	1256 }

57 }	1257 }

58	1258

59 /*	1259 +#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */

60 + * * Check that the result is a 2-byte value with each byte in the range A1..F E	1260 +/*

61 + * * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byt e	1261 + * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code poi nt, it returns the

62 + * * to move it to the ISO 2022 range 21..7E.	1262 + * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point

63 + * * Return 0 if out of range.	1263 + * unchanged.

64 + * */	1264 + */

65 +static U_INLINE uint32_t	1265 +static U_INLINE uint32_t

66 +_2022FromGR94DBCS(uint32_t value) {	1266 +_2022ToGR94DBCS(uint32_t value) {

67 + if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&	1267 + uint32_t returnValue = value + 0x8080;

68 + (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)	1268 + if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&

69 + ) {	1269 + (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {

70 + return value - 0x8080; /* shift down to 21..7e byte range */	1270 + return returnValue;

71 + } else {	1271 + } else {

72 + return 0; /* not valid for ISO 2022 */	1272 + return value;

73 + }	1273 + }

74 +}	1274 +}

	1275 +#endif

75 +	1276 +

76 +#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */

77 +/*

78 * Check that the result is a 2-byte value with each byte in the range A1..FE

79 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte

80 * to move it to the ISO 2022 range 21..7E.

81 @@ -1112,6 +1159,7 @@

82 return 0; /* not valid for ISO 2022 */

83 }

84 }

85 +#endif

86

87 #ifdef U_ENABLE_GENERIC_ISO_2022	1277 #ifdef U_ENABLE_GENERIC_ISO_2022

88	1278

89 @@ -1953,6 +2001,7 @@	1279 /**************************************************************************** **

	1280 @@ -1436,7 +1488,7 @@

	1281 c2 = 0; /* invalid */

	1282 }

	1283 } else {

	1284 - if((uint8_t)(c2-0x21) <= (0x7e-0x21)) {

	1285 + if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {

	1286 c2 += 0x7e;

	1287 } else {

	1288 c2 = 0; /* invalid */

	1289 @@ -1953,6 +2005,7 @@

90 const char *mySourceLimit = args->sourceLimit;	1290 const char *mySourceLimit = args->sourceLimit;

91 uint32_t targetUniChar = 0x0000;	1291 uint32_t targetUniChar = 0x0000;

92 uint32_t mySourceChar = 0x0000;	1292 uint32_t mySourceChar = 0x0000;

93 + uint32_t tmpSourceChar = 0x0000;	1293 + uint32_t tmpSourceChar = 0x0000;

94 UConverterDataISO2022* myData;	1294 UConverterDataISO2022* myData;

95 ISO2022State *pToU2022State;	1295 ISO2022State *pToU2022State;

96 StateEnum cs;	1296 StateEnum cs;

97 @@ -1968,6 +2017,7 @@	1297 @@ -1968,6 +2021,7 @@

98 mySourceChar = args->converter->toUBytes[0];	1298 mySourceChar = args->converter->toUBytes[0];

99 args->converter->toULength = 0;	1299 args->converter->toULength = 0;

100 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];	1300 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];

101 + targetUniChar = missingCharMarker;	1301 + targetUniChar = missingCharMarker;

102 goto getTrailByte;	1302 goto getTrailByte;

103 }	1303 }

104	1304

105 @@ -2077,17 +2127,44 @@	1305 @@ -1986,6 +2040,7 @@

	1306 continue;

	1307 } else {

	1308 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */

	1309 + myData->isEmptySegment = FALSE;» /* reset this, we have a different error */

	1310 break;

	1311 }

	1312

	1313 @@ -1997,21 +2052,39 @@

	1314 continue;

	1315 } else {

	1316 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */

	1317 + myData->isEmptySegment = FALSE;» /* reset this, we have a different error */

	1318 break;

	1319 }

	1320

	1321 case ESC_2022:

	1322 mySource--;

	1323 escape:

	1324 - changeState_2022(args->converter,&(mySource),

	1325 - mySourceLimit, ISO_2022_JP,err);

	1326 + {

	1327 + const char * mySourceBefore = mySource;

	1328 + int8_t toULengthBefore = args->converter->toULength;

	1329 +

	1330 + changeState_2022(args->converter,&(mySource),

	1331 + mySourceLimit, ISO_2022_JP,err);

	1332 +

	1333 + /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */

	1334 + if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {

	1335 + *err = U_ILLEGAL_ESCAPE_SEQUENCE;

	1336 + args->converter->toUCallbackReason = UCNV_IRREGULAR;

	1337 + args->converter->toULength = toULengthBefore + (mySourc e - mySourceBefore);

	1338 + }

	1339 + }

	1340

	1341 /* invalid or illegal escape sequence */

	1342 if(U_FAILURE(*err)){

	1343 args->target = myTarget;

	1344 args->source = mySource;

	1345 + myData->isEmptySegment = FALSE;» /* Reset to avoid future spurious errors */

	1346 return;

	1347 }

	1348 + /* If we successfully completed an escape sequence, we begin a new segment, empty so far */

	1349 + if(myData->key==0) {

	1350 + myData->isEmptySegment = TRUE;

	1351 + }

	1352 continue;

	1353

	1354 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */

	1355 @@ -2028,6 +2101,7 @@

	1356 /* falls through */

	1357 default:

	1358 /* convert one or two bytes */

	1359 + myData->isEmptySegment = FALSE;

	1360 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];

	1361 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData-> version==4 &&

	1362 !IS_JP_DBCS(cs)

	1363 @@ -2077,17 +2151,44 @@

106 default:	1364 default:

107 /* G0 DBCS */	1365 /* G0 DBCS */

108 if(mySource < mySourceLimit) {	1366 if(mySource < mySourceLimit) {

109 - char trailByte;	1367 - char trailByte;

110 + int leadIsOk, trailIsOk;	1368 + int leadIsOk, trailIsOk;

111 + uint8_t trailByte;	1369 + uint8_t trailByte;

112 getTrailByte:	1370 getTrailByte:

113 - trailByte = *mySource++;	1371 - trailByte = *mySource++;

114 - if(cs == JISX208) {	1372 - if(cs == JISX208) {

115 - _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailBy te, tempBuf);	1373 - _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailBy te, tempBuf);

(...skipping 33 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
149 + /* report a pair of illegal bytes if the second byt e is not a DBCS starter */	1407 + /* report a pair of illegal bytes if the second byt e is not a DBCS starter */

150 + ++mySource;	1408 + ++mySource;

151 + /* add another bit so that the code below writes 2 bytes in case of error */	1409 + /* add another bit so that the code below writes 2 bytes in case of error */

152 + mySourceChar = 0x10000 \| (mySourceChar << 8) \| trai lByte;	1410 + mySourceChar = 0x10000 \| (mySourceChar << 8) \| trai lByte;

153 }	1411 }

154 - mySourceChar = (mySourceChar << 8) \| (uint8_t)(trailByt e);	1412 - mySourceChar = (mySourceChar << 8) \| (uint8_t)(trailByt e);

155 - targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myC onverterArray[cs], tempBuf, 2, FALSE);	1413 - targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myC onverterArray[cs], tempBuf, 2, FALSE);

156 } else {	1414 } else {

157 args->converter->toUBytes[0] = (uint8_t)mySourceChar;	1415 args->converter->toUBytes[0] = (uint8_t)mySourceChar;

158 args->converter->toULength = 1;	1416 args->converter->toULength = 1;

159 @@ -2229,7 +2306,12 @@	1417 @@ -2229,7 +2330,12 @@

160 }	1418 }

161 /* only DBCS or SBCS characters are expected*/	1419 /* only DBCS or SBCS characters are expected*/

162 /* DB characters with high bit set to 1 are expected */	1420 /* DB characters with high bit set to 1 are expected */

163 - if(length > 2 \|\| length==0 \|\|(((targetByteUnit & 0x8080) != 0x8080) && length==2)){	1421 - if(length > 2 \|\| length==0 \|\|(((targetByteUnit & 0x8080) != 0x8080) && length==2)){

164 + if( length > 2 \|\| length==0 \|\|	1422 + if( length > 2 \|\| length==0 \|\|

165 + (length == 1 && targetByteUnit > 0x7f) \|\|	1423 + (length == 1 && targetByteUnit > 0x7f) \|\|

166 + (length == 2 &&	1424 + (length == 2 &&

167 + ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) \|\|	1425 + ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) \|\|

168 + (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))	1426 + (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))

169 + ) {	1427 + ) {

170 targetByteUnit=missingCharMarker;	1428 targetByteUnit=missingCharMarker;

171 }	1429 }

172 if (targetByteUnit != missingCharMarker){	1430 if (targetByteUnit != missingCharMarker){

173 @@ -2545,17 +2627,34 @@	1431 @@ -2524,15 +2630,27 @@

174	1432

	1433 if(mySourceChar==UCNV_SI){

	1434 myData->toU2022State.g = 0;

	1435 + if (myData->isEmptySegment) {

	1436 + myData->isEmptySegment = FALSE; /* we are handling it, r eset to avoid future spurious errors */

	1437 + *err = U_ILLEGAL_ESCAPE_SEQUENCE;

	1438 + args->converter->toUCallbackReason = UCNV_IRREGULAR;

	1439 + args->converter->toUBytes[0] = (uint8_t)mySourceChar;

	1440 + args->converter->toULength = 1;

	1441 + args->target = myTarget;

	1442 + args->source = mySource;

	1443 + return;

	1444 + }

	1445 /*consume the source */

	1446 continue;

	1447 }else if(mySourceChar==UCNV_SO){

	1448 myData->toU2022State.g = 1;

	1449 + myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */

	1450 /*consume the source */

	1451 continue;

	1452 }else if(mySourceChar==ESC_2022){

	1453 mySource--;

	1454 escape:

	1455 + myData->isEmptySegment = FALSE; /* Any invalid ESC seque nces will be detected separately, so just reset this */

	1456 changeState_2022(args->converter,&(mySource),

	1457 mySourceLimit, ISO_2022_KR, err);

	1458 if(U_FAILURE(*err)){

	1459 @@ -2543,19 +2661,37 @@

	1460 continue;

	1461 }

	1462

	1463 + myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */

175 if(myData->toU2022State.g == 1) {	1464 if(myData->toU2022State.g == 1) {

176 if(mySource < mySourceLimit) {	1465 if(mySource < mySourceLimit) {

177 - char trailByte;	1466 - char trailByte;

178 + int leadIsOk, trailIsOk;	1467 + int leadIsOk, trailIsOk;

179 + uint8_t trailByte;	1468 + uint8_t trailByte;

180 getTrailByte:	1469 getTrailByte:

181 - trailByte = *mySource++;	1470 - trailByte = *mySource++;

182 - tempBuf[0] = (char)(mySourceChar + 0x80);	1471 - tempBuf[0] = (char)(mySourceChar + 0x80);

183 - tempBuf[1] = (char)(trailByte + 0x80);	1472 - tempBuf[1] = (char)(trailByte + 0x80);

184 - mySourceChar = (mySourceChar << 8) \| (uint8_t)(trailByte);	1473 - mySourceChar = (mySourceChar << 8) \| (uint8_t)(trailByte);

(...skipping 22 matching lines...) Expand all Loading...
207 - targetUniChar = missingCharMarker;	1496 - targetUniChar = missingCharMarker;

208 + mySourceChar = (mySourceChar << 8) \| trailByte;	1497 + mySourceChar = (mySourceChar << 8) \| trailByte;

209 + } else if (!(trailIsOk \|\| IS_2022_CONTROL(trailByte))) {	1498 + } else if (!(trailIsOk \|\| IS_2022_CONTROL(trailByte))) {

210 + /* report a pair of illegal bytes if the second byte is not a DBCS starter */	1499 + /* report a pair of illegal bytes if the second byte is not a DBCS starter */

211 + ++mySource;	1500 + ++mySource;

212 + /* add another bit so that the code below writes 2 byte s in case of error */	1501 + /* add another bit so that the code below writes 2 byte s in case of error */

213 + mySourceChar = 0x10000 \| (mySourceChar << 8) \| trailByt e;	1502 + mySourceChar = 0x10000 \| (mySourceChar << 8) \| trailByt e;

214 }	1503 }

215 } else {	1504 } else {

216 args->converter->toUBytes[0] = (uint8_t)mySourceChar;	1505 args->converter->toUBytes[0] = (uint8_t)mySourceChar;

217 @@ -2563,8 +2662,10 @@	1506 @@ -2563,8 +2699,10 @@

218 break;	1507 break;

219 }	1508 }

220 }	1509 }

221 - else{	1510 - else{

222 + else if(mySourceChar <= 0x7f) {	1511 + else if(mySourceChar <= 0x7f) {

223 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySourc e - 1, 1, useFallback);	1512 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySourc e - 1, 1, useFallback);

224 + } else {	1513 + } else {

225 + targetUniChar = 0xffff;	1514 + targetUniChar = 0xffff;

226 }	1515 }

227 if(targetUniChar < 0xfffe){	1516 if(targetUniChar < 0xfffe){

228 if(args->offsets) {	1517 if(args->offsets) {

229 @@ -3061,6 +3162,7 @@	1518 @@ -3061,6 +3199,7 @@

230 /* continue with a partial double-byte character */	1519 /* continue with a partial double-byte character */

231 mySourceChar = args->converter->toUBytes[0];	1520 mySourceChar = args->converter->toUBytes[0];

232 args->converter->toULength = 0;	1521 args->converter->toULength = 0;

233 + targetUniChar = missingCharMarker;	1522 + targetUniChar = missingCharMarker;

234 goto getTrailByte;	1523 goto getTrailByte;

235 }	1524 }

236	1525

237 @@ -3114,29 +3216,50 @@	1526 @@ -3075,27 +3214,52 @@

	1527 switch(mySourceChar){

	1528 case UCNV_SI:

	1529 pToU2022State->g=0;

	1530 + if (myData->isEmptySegment) {

	1531 + myData->isEmptySegment = FALSE;» /* we are handling it, r eset to avoid future spurious errors */

	1532 + *err = U_ILLEGAL_ESCAPE_SEQUENCE;

	1533 + args->converter->toUCallbackReason = UCNV_IRREGULAR;

	1534 + args->converter->toUBytes[0] = mySourceChar;

	1535 + args->converter->toULength = 1;

	1536 + args->target = myTarget;

	1537 + args->source = mySource;

	1538 + return;

	1539 + }

	1540 continue;

	1541

	1542 case UCNV_SO:

	1543 if(pToU2022State->cs[1] != 0) {

	1544 pToU2022State->g=1;

	1545 + myData->isEmptySegment = TRUE;» /* Begin a new segment, empty so far */

	1546 continue;

	1547 } else {

	1548 /* illegal to have SO before a matching designator */

	1549 + myData->isEmptySegment = FALSE;» /* Handling a different error, reset this to avoid future spurious errs */

	1550 break;

	1551 }

	1552

	1553 case ESC_2022:

	1554 mySource--;

	1555 escape:

	1556 - changeState_2022(args->converter,&(mySource),

	1557 - mySourceLimit, ISO_2022_CN,err);

	1558 + {

	1559 + const char * mySourceBefore = mySource;

	1560 + int8_t toULengthBefore = args->converter->toULength;

	1561 +

	1562 + changeState_2022(args->converter,&(mySource),

	1563 + mySourceLimit, ISO_2022_CN,err);

	1564 +

	1565 + /* After SO there must be at least one character before a d esignator (designator error handled separately) */

	1566 + if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegm ent) {

	1567 + *err = U_ILLEGAL_ESCAPE_SEQUENCE;

	1568 + args->converter->toUCallbackReason = UCNV_IRREGULAR;

	1569 + args->converter->toULength = toULengthBefore + (mySourc e - mySourceBefore);

	1570 + }

	1571 + }

	1572

	1573 /* invalid or illegal escape sequence */

	1574 if(U_FAILURE(*err)){

	1575 args->target = myTarget;

	1576 args->source = mySource;

	1577 + myData->isEmptySegment = FALSE;» /* Reset to avoid future spurious errors */

	1578 return;

	1579 }

	1580 continue;

	1581 @@ -3109,34 +3273,56 @@

	1582 /* falls through */

	1583 default:

	1584 /* convert one or two bytes */

	1585 + myData->isEmptySegment = FALSE;

	1586 if(pToU2022State->g != 0) {

	1587 if(mySource < mySourceLimit) {

238 UConverterSharedData *cnv;	1588 UConverterSharedData *cnv;

239 StateEnum tempState;	1589 StateEnum tempState;

240 int32_t tempBufLen;	1590 int32_t tempBufLen;

241 - char trailByte;	1591 - char trailByte;

242 + int leadIsOk, trailIsOk;	1592 + int leadIsOk, trailIsOk;

243 + uint8_t trailByte;	1593 + uint8_t trailByte;

244 getTrailByte:	1594 getTrailByte:

245 - trailByte = *mySource++;	1595 - trailByte = *mySource++;

246 - tempState = (StateEnum)pToU2022State->cs[pToU2022State- >g];	1596 - tempState = (StateEnum)pToU2022State->cs[pToU2022State- >g];

247 - if(tempState > CNS_11643_0) {	1597 - if(tempState > CNS_11643_0) {

(...skipping 47 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
295 }	1645 }

296 - mySourceChar = (mySourceChar << 8) \| (uint8_t)(trailByt e);	1646 - mySourceChar = (mySourceChar << 8) \| (uint8_t)(trailByt e);

297 if(pToU2022State->g>=2) {	1647 if(pToU2022State->g>=2) {

298 /* return from a single-shift state to the previous one */	1648 /* return from a single-shift state to the previous one */

299 pToU2022State->g=pToU2022State->prevG;	1649 pToU2022State->g=pToU2022State->prevG;

300 }	1650 }

301 - targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBu f, tempBufLen, FALSE);	1651 - targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBu f, tempBufLen, FALSE);

302 } else {	1652 } else {

303 args->converter->toUBytes[0] = (uint8_t)mySourceChar;	1653 args->converter->toUBytes[0] = (uint8_t)mySourceChar;

304 args->converter->toULength = 1;	1654 args->converter->toULength = 1;

305 diff -ru trie.clean/source/common/ucnvmbcs.c chrome.canonical/source/common/ucnv mbcs.c	1655 @@ -3399,11 +3585,19 @@

306 --- trie.clean/source/common/ucnvmbcs.c 2007-11-07 17:39:05.057870000 -0800	1656 /* include ASCII for JP */

307 +++ chrome.canonical/source/common/ucnvmbcs.c 2008-10-29 11:34:34.648518000 -0 700	1657 sa->addRange(sa->set, 0, 0x7f);

	1658 }

	1659 - if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {

	1660 + if(cnvData->version==3 \|\| cnvData->version==4 \|\| which==UCNV_ROUNDTRIP_ AND_FALLBACK_SET) {

	1661 /*

	1662 - * TODO(markus): If and when ucnv_getUnicodeSet() supports fallback s,

	1663 - * we need to include half-width Katakana for all JP variants becau se

	1664 - * JIS X 0208 has hardcoded fallbacks for them.

	1665 + * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))! =0

	1666 + * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)

	1667 + * use half-width Katakana.

	1668 + * This is because all ISO-2022-JP variants are lenient in that the y accept (in toUnicode)

	1669 + * half-width Katakana via the ESC ( I sequence.

	1670 + * However, we only emit (fromUnicode) half-width Katakana accordin g to the

	1671 + * definition of each variant.

	1672 + *

	1673 + * When including fallbacks,

	1674 + * we need to include half-width Katakana Unicode code points for a ll JP variants because

	1675 + * JIS X 0208 has hardcoded fallbacks for them (which map to full-w idth Katakana).

	1676 */

	1677 /* include half-width Katakana for JP */

	1678 sa->addRange(sa->set, HWKANA_START, HWKANA_END);

	1679 @@ -3457,6 +3651,12 @@

	1680 * corresponding to JIS X 0208.

	1681 */

	1682 filter=UCNV_SET_FILTER_SJIS;

	1683 + } else if(i==KSC5601) {

	1684 + /*

	1685 + * Some of the KSC 5601 tables (convrtrs.txt has this aliases o n multiple tables)

	1686 + * are broader than GR94.

	1687 + */

	1688 + filter=UCNV_SET_FILTER_GR94DBCS;

	1689 } else {

	1690 filter=UCNV_SET_FILTER_NONE;

	1691 }

	1692 @@ -3472,6 +3672,9 @@

	1693 sa->remove(sa->set, 0x0e);

	1694 sa->remove(sa->set, 0x0f);

	1695 sa->remove(sa->set, 0x1b);

	1696 +

	1697 + /* ISO 2022 converters do not convert C1 controls either */

	1698 + sa->removeRange(sa->set, 0x80, 0x9f);

	1699 }

	1700

	1701 static const UConverterImpl _ISO2022Impl={

	1702 --- r22777/source/common/ucnv_lmb.c 2006-08-19 14:27:08.000000000 -0700

	1703 +++ chrome.canonical/source/common/ucnv_lmb.c 2009-03-23 12:30:26.043293000 -0 700

	1704 @@ -1,6 +1,6 @@

	1705 /*

	1706 **********************************************************************

	1707 -* Copyright (C) 2000-2006, International Business Machines

	1708 +* Copyright (C) 2000-2007, International Business Machines

	1709 * Corporation and others. All Rights Reserved.

	1710 **********************************************************************

	1711 * file name: ucnv_lmb.cpp

	1712 @@ -536,7 +536,7 @@

	1713 NULL,\

	1714 NULL,\

	1715 _LMBCSSafeClone,\

	1716 - _LMBCSGetUnicodeSet\

	1717 + ucnv_getCompleteUnicodeSet\

	1718 };\

	1719 static const UConverterStaticData _LMBCSStaticData##n={\

	1720 sizeof(UConverterStaticData),\

	1721 @@ -662,15 +662,14 @@

	1722 return &newLMBCS->cnv;

	1723 }

	1724

	1725 -static void

	1726 -_LMBCSGetUnicodeSet(const UConverter *cnv,

	1727 - const USetAdder *sa,

	1728 - UConverterUnicodeSet which,

	1729 - UErrorCode *pErrorCode) {

	1730 - /* all but U+F6xx, see LMBCS explanation above (search for F6xx) */

	1731 - sa->addRange(sa->set, 0, 0xf5ff);

	1732 - sa->addRange(sa->set, 0xf700, 0x10ffff);

	1733 -}

	1734 +/*

	1735 + * There used to be a _LMBCSGetUnicodeSet() function here (up to svn revision 2 0117)

	1736 + * which added all code points except for U+F6xx

	1737 + * because those cannot be represented in the Unicode group.

	1738 + * However, it turns out that windows-950 has roundtrips for all of U+F6xx

	1739 + * which means that LMBCS can convert all Unicode code points after all.

	1740 + * We now simply use ucnv_getCompleteUnicodeSet().

	1741 + */

	1742

	1743 /*

	1744 Here's the basic helper function that we use when converting from

	1745 --- r22777/source/common/ucnvhz.c 2006-07-05 16:08:50.000000000 -0700

	1746 +++ chrome.canonical/source/common/ucnvhz.c 2009-03-23 12:42:01.208181000 -0 700

	1747 @@ -1,6 +1,6 @@

	1748 /*

	1749 **********************************************************************

	1750 -* Copyright (C) 2000-2006, International Business Machines

	1751 +* Copyright (C) 2000-2007, International Business Machines

	1752 * Corporation and others. All Rights Reserved.

	1753 **********************************************************************

	1754 * file name: ucnvhz.c

	1755 @@ -59,6 +59,7 @@

	1756 UBool isEscapeAppended;

	1757 UBool isStateDBCS;

	1758 UBool isTargetUCharDBCS;

	1759 + UBool isEmptySegment;

	1760 }UConverterDataHZ;

	1761

	1762

	1763 @@ -72,7 +73,7 @@

	1764 cnv->extraInfo = uprv_malloc(sizeof(UConverterDataHZ));

	1765 if(cnv->extraInfo != NULL){

	1766 uprv_memset(cnv->extraInfo, 0, sizeof(UConverterDataHZ));

	1767 - ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386" ,errorCode);

	1768 + ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("GBK",erro rCode);

	1769 }

	1770 else {

	1771 *errorCode = U_MEMORY_ALLOCATION_ERROR;

	1772 @@ -98,6 +99,7 @@

	1773 cnv->mode=0;

	1774 if(cnv->extraInfo != NULL){

	1775 ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;

	1776 + ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE;

	1777 }

	1778 }

	1779 if(choice!=UCNV_RESET_TO_UNICODE) {

	1780 @@ -130,6 +132,10 @@

	1781 * from-GB code '~}' ($7E7D) is outside the defined GB range.)

	1782 *

	1783 * Source: RFC 1842

	1784 +*

	1785 +* Note that the formal syntax in RFC 1842 is invalid. I assume that the

	1786 +* intended definition of single-byte-segment is as follows (pedberg):

	1787 +* single-byte-segment = single-byte-seq 1*single-byte-char

	1788 */

	1789

	1790

	1791 @@ -141,7 +147,7 @@

	1792 UChar *myTarget = args->target;

	1793 const char *mySourceLimit = args->sourceLimit;

	1794 UChar32 targetUniChar = 0x0000;

	1795 - UChar mySourceChar = 0x0000;

	1796 + int32_t mySourceChar = 0x0000;

	1797 UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);

	1798 tempBuf[0]=0;

	1799 tempBuf[1]=0;

	1800 @@ -156,90 +162,123 @@

	1801

	1802 mySourceChar= (unsigned char) *mySource++;

	1803

	1804 - switch(mySourceChar){

	1805 + if(args->converter->mode == UCNV_TILDE) {

	1806 + /* second byte after ~ */

	1807 + args->converter->mode=0;

	1808 + switch(mySourceChar) {

	1809 case 0x0A:

	1810 - if(args->converter->mode ==UCNV_TILDE){

	1811 - args->converter->mode=0;

	1812 -

	1813 - }

	1814 - *(myTarget++)=(UChar)mySourceChar;

	1815 + /* no output for ~\n (line-continuation marker) */

	1816 continue;

	1817 -

	1818 case UCNV_TILDE:

	1819 - if(args->converter->mode ==UCNV_TILDE){

	1820 - *(myTarget++)=(UChar)mySourceChar;

	1821 - args->converter->mode=0;

	1822 - continue;

	1823 -

	1824 + if(args->offsets) {

	1825 + args->offsets[myTarget - args->target]=(int32_t)(mySour ce - args->source - 2);

	1826 }

	1827 - else if(args->converter->toUnicodeStatus !=0){

	1828 - args->converter->mode=0;

	1829 - break;

	1830 - }

	1831 - else{

	1832 - args->converter->mode = UCNV_TILDE;

	1833 - continue;

	1834 - }

	1835 -

	1836 -

	1837 + *(myTarget++)=(UChar)mySourceChar;

	1838 + myData->isEmptySegment = FALSE;

	1839 + continue;

	1840 case UCNV_OPEN_BRACE:

	1841 - if(args->converter->mode == UCNV_TILDE){

	1842 - args->converter->mode=0;

	1843 - myData->isStateDBCS = TRUE;

	1844 - continue;

	1845 - }

	1846 - else{

	1847 - break;

	1848 - }

	1849 -

	1850 -

	1851 case UCNV_CLOSE_BRACE:

	1852 - if(args->converter->mode == UCNV_TILDE){

	1853 - args->converter->mode=0;

	1854 - myData->isStateDBCS = FALSE;

	1855 - continue;

	1856 - }

	1857 - else{

	1858 - break;

	1859 + myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE);

	1860 + if (myData->isEmptySegment) {

	1861 + myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */

	1862 + *err = U_ILLEGAL_ESCAPE_SEQUENCE;

	1863 + args->converter->toUCallbackReason = UCNV_IRREGULAR;

	1864 + args->converter->toUBytes[0] = UCNV_TILDE;

	1865 + args->converter->toUBytes[1] = mySourceChar;

	1866 + args->converter->toULength = 2;

	1867 + args->target = myTarget;

	1868 + args->source = mySource;

	1869 + return;

	1870 }

	1871 -

	1872 + myData->isEmptySegment = TRUE;

	1873 + continue;

	1874 default:

	1875 /* if the first byte is equal to TILDE and the trail byte

	1876 * is not a valid byte then it is an error condition

	1877 */

	1878 - if(args->converter->mode == UCNV_TILDE){

	1879 - args->converter->mode=0;

	1880 - mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) \| ((mySo urceChar & 0x00ff)+0x80));

	1881 - goto SAVE_STATE;

	1882 - }

	1883 -

	1884 - break;

	1885 -

	1886 - }

	1887 -

	1888 - if(myData->isStateDBCS){

	1889 + /*

	1890 + * Ticket 5691: consistent illegal sequences:

	1891 + * - We include at least the first byte in the illegal sequ ence.

	1892 + * - If any of the non-initial bytes could be the start of a character,

	1893 + * we stop the illegal sequence before the first one of t hose.

	1894 + */

	1895 + myData->isEmptySegment = FALSE; /* different error here, re set this to avoid spurious future error */

	1896 + *err = U_ILLEGAL_ESCAPE_SEQUENCE;

	1897 + args->converter->toUBytes[0] = UCNV_TILDE;

	1898 + if( myData->isStateDBCS ?

	1899 + (0x21 <= mySourceChar && mySourceChar <= 0x7e) :

	1900 + mySourceChar <= 0x7f

	1901 + ) {

	1902 + /* The current byte could be the start of a character: Back it out. */

	1903 + args->converter->toULength = 1;

	1904 + --mySource;

	1905 + } else {

	1906 + /* Include the current byte in the illegal sequence. */

	1907 + args->converter->toUBytes[1] = mySourceChar;

	1908 + args->converter->toULength = 2;

	1909 + }

	1910 + args->target = myTarget;

	1911 + args->source = mySource;

	1912 + return;

	1913 + }

	1914 + } else if(myData->isStateDBCS) {

	1915 if(args->converter->toUnicodeStatus == 0x00){

	1916 - args->converter->toUnicodeStatus = (UChar) mySourceChar;

	1917 + /* lead byte */

	1918 + if(mySourceChar == UCNV_TILDE) {

	1919 + args->converter->mode = UCNV_TILDE;

	1920 + } else {

	1921 + /* add another bit to distinguish a 0 byte from not hav ing seen a lead byte */

	1922 + args->converter->toUnicodeStatus = (uint32_t) (mySource Char \| 0x100);

	1923 + myData->isEmptySegment = FALSE; /* the segment has some thing, either valid or will produce a different error, so reset this */

	1924 + }

	1925 continue;

	1926 }

	1927 else{

	1928 - tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ;

	1929 - tempBuf[1] = (char) (mySourceChar+0x80);

	1930 - mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x 80) << 8) \| ((mySourceChar & 0x00ff)+0x80));

	1931 + /* trail byte */

	1932 + int leadIsOk, trailIsOk;

	1933 + uint32_t leadByte = args->converter->toUnicodeStatus & 0xff ;

	1934 + targetUniChar = 0xffff;

	1935 + /*

	1936 + * Ticket 5691: consistent illegal sequences:

	1937 + * - We include at least the first byte in the illegal sequ ence.

	1938 + * - If any of the non-initial bytes could be the start of a character,

	1939 + * we stop the illegal sequence before the first one of t hose.

	1940 + *

	1941 + * In HZ DBCS, if the second byte is in the 21..7e range,

	1942 + * we report only the first byte as the illegal sequence.

	1943 + * Otherwise we convert or report the pair of bytes.

	1944 + */

	1945 + leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21);

	1946 + trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21) ;

	1947 + if (leadIsOk && trailIsOk) {

	1948 + tempBuf[0] = (char) (leadByte+0x80) ;

	1949 + tempBuf[1] = (char) (mySourceChar+0x80);

	1950 + targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbC onverter->sharedData,

	1951 + tempBuf, 2, args->converter->useFallback);

	1952 + mySourceChar= (leadByte << 8) \| mySourceChar;

	1953 + } else if (trailIsOk) {

	1954 + /* report a single illegal byte and continue with the f ollowing DBCS starter byte */

	1955 + --mySource;

	1956 + mySourceChar = (int32_t)leadByte;

	1957 + } else {

	1958 + /* report a pair of illegal bytes if the second byte is not a DBCS starter */

	1959 + /* add another bit so that the code below writes 2 byte s in case of error */

	1960 + mySourceChar= 0x10000 \| (leadByte << 8) \| mySourceChar;

	1961 + }

	1962 args->converter->toUnicodeStatus =0x00;

	1963 - targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConve rter->sharedData,

	1964 - tempBuf, 2, args->converter->useFallback);

	1965 }

	1966 }

	1967 else{

	1968 - if(args->converter->fromUnicodeStatus == 0x00){

	1969 - targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConve rter->sharedData,

	1970 - mySource - 1, 1, args->converter->useFallback);

	1971 - }

	1972 - else{

	1973 - goto SAVE_STATE;

	1974 + if(mySourceChar == UCNV_TILDE) {

	1975 + args->converter->mode = UCNV_TILDE;

	1976 + continue;

	1977 + } else if(mySourceChar <= 0x7f) {

	1978 + targetUniChar = (UChar)mySourceChar; /* ASCII */

	1979 + myData->isEmptySegment = FALSE; /* the segment has somethin g valid */

	1980 + } else {

	1981 + targetUniChar = 0xffff;

	1982 + myData->isEmptySegment = FALSE; /* different error here, re set this to avoid spurious future error */

	1983 }

	1984 -

	1985 }

	1986 if(targetUniChar < 0xfffe){

	1987 if(args->offsets) {

	1988 @@ -248,26 +287,17 @@

	1989

	1990 *(myTarget++)=(UChar)targetUniChar;

	1991 }

	1992 - else if(targetUniChar>=0xfffe){

	1993 -SAVE_STATE:

	1994 + else /* targetUniChar>=0xfffe */ {

	1995 if(targetUniChar == 0xfffe){

	1996 *err = U_INVALID_CHAR_FOUND;

	1997 }

	1998 else{

	1999 *err = U_ILLEGAL_CHAR_FOUND;

	2000 }

	2001 - if(myData->isStateDBCS){

	2002 - /* this should never occur since isStateDBCS is set to true

	2003 - * only after tempBuf[0] and tempBuf[1]

	2004 - * are set to the input .. just to please BEAM

	2005 - */

	2006 - if(tempBuf[0]==0 \|\| tempBuf[1]==0){

	2007 - *err = U_INTERNAL_PROGRAM_ERROR;

	2008 - }else{

	2009 - args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x8 0);

	2010 - args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x8 0);

	2011 - args->converter->toULength=2;

	2012 - }

	2013 + if(mySourceChar > 0xff){

	2014 + args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8) ;

	2015 + args->converter->toUBytes[1] = (uint8_t)mySourceChar;

	2016 + args->converter->toULength=2;

	2017 }

	2018 else{

	2019 args->converter->toUBytes[0] = (uint8_t)mySourceChar;

	2020 @@ -328,16 +358,21 @@

	2021 escSeq = TILDE_ESCAPE;

	2022 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,e rr,len,mySourceIndex);

	2023 continue;

	2024 - }

	2025 - else{

	2026 + } else if(mySourceChar <= 0x7f) {

	2027 + length = 1;

	2028 + targetUniChar = mySourceChar;

	2029 + } else {

	2030 length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->shar edData,

	2031 mySourceChar,&targetUniChar,args->converter->useFallback);

	2032 -

	2033 - }

	2034 - /* only DBCS or SBCS characters are expected*/

	2035 - /* DB haracters with high bit set to 1 are expected */

	2036 - if(length > 2 \|\| length==0 \|\|(((targetUniChar & 0x8080) != 0x8080)& & length==2)){

	2037 - targetUniChar= missingCharMarker;

	2038 + /* we can only use lead bytes 21..7D and trail bytes 21..7E */

	2039 + if( length == 2 &&

	2040 + (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) &&

	2041 + (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1)

	2042 + ) {

	2043 + targetUniChar -= 0x8080;

	2044 + } else {

	2045 + targetUniChar = missingCharMarker;

	2046 + }

	2047 }

	2048 if (targetUniChar != missingCharMarker){

	2049 myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool) (targetUniChar>0x00FF);

	2050 @@ -360,22 +395,22 @@

	2051

	2052 if(isTargetUCharDBCS){

	2053 if( myTargetIndex <targetLength){

	2054 - myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80);

	2055 + myTarget[myTargetIndex++] =(char) (targetUniChar >> 8);

	2056 if(offsets){

	2057 *(offsets++) = mySourceIndex-1;

	2058 }

	2059 if(myTargetIndex < targetLength){

	2060 - myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80);

	2061 + myTarget[myTargetIndex++] =(char) targetUniChar;

	2062 if(offsets){

	2063 *(offsets++) = mySourceIndex-1;

	2064 }

	2065 }else{

	2066 - args->converter->charErrorBuffer[args->converter->c harErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);

	2067 + args->converter->charErrorBuffer[args->converter->c harErrorBufferLength++] = (char) targetUniChar;

	2068 *err = U_BUFFER_OVERFLOW_ERROR;

	2069 }

	2070 }else{

	2071 - args->converter->charErrorBuffer[args->converter->charE rrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80);

	2072 - args->converter->charErrorBuffer[args->converter->charE rrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);

	2073 + args->converter->charErrorBuffer[args->converter->charE rrorBufferLength++] =(char) (targetUniChar >> 8);

	2074 + args->converter->charErrorBuffer[args->converter->charE rrorBufferLength++] = (char) targetUniChar;

	2075 *err = U_BUFFER_OVERFLOW_ERROR;

	2076 }

	2077

	2078 @@ -524,14 +559,14 @@

	2079 const USetAdder *sa,

	2080 UConverterUnicodeSet which,

	2081 UErrorCode *pErrorCode) {

	2082 - /* the tilde '~' is hardcoded in the converter */

	2083 - sa->add(sa->set, 0x7e);

	2084 + /* HZ converts all of ASCII */

	2085 + sa->addRange(sa->set, 0, 0x7f);

	2086

	2087 /* add all of the code points that the sub-converter handles */

	2088 - ((UConverterDataHZ*)cnv->extraInfo)->

	2089 - gbConverter->sharedData->impl->

	2090 - getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,

	2091 - sa, which, pErrorCode);

	2092 + ucnv_MBCSGetFilteredUnicodeSetForUnicode(

	2093 + ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData,

	2094 + sa, which, UCNV_SET_FILTER_HZ,

	2095 + pErrorCode);

	2096 }

	2097

	2098 static const UConverterImpl _HZImpl={

	2099 --- r22777/source/common/ucnv_set.c 2005-06-03 13:17:54.000000000 -0700

	2100 +++ chrome.canonical/source/common/ucnv_set.c 2009-03-23 12:30:09.917043000 -0 700

308 @@ -1,7 +1,7 @@	2101 @@ -1,7 +1,7 @@

309 /*	2102 /*

310 ******************************************************************************	2103 *******************************************************************************

311 *	2104 *

312 -* Copyright (C) 2000-2007, International Business Machines	2105 -* Copyright (C) 2003-2005, International Business Machines

313 +* Copyright (C) 2000-2008, International Business Machines	2106 +* Copyright (C) 2003-2007, International Business Machines

314 * Corporation and others. All Rights Reserved.	2107 * Corporation and others. All Rights Reserved.

315 *	2108 *

316 ******************************************************************************	2109 *******************************************************************************

317 @@ -1739,6 +1739,65 @@	2110 @@ -52,7 +52,8 @@

318 pArgs->offsets=offsets;	2111 uset_add,

319 }	2112 uset_addRange,

320	2113 uset_addString,

321 +static UBool	2114 - uset_remove

322 +hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {	2115 + uset_remove,

323 + const int32_t *row=stateTable[state];	2116 + uset_removeRange

324 + int32_t b, entry;	2117 };

325 + /* First test for final entries in this state for some commonly valid byte values. */	2118 sa.set=setFillIn;

326 + entry=row[0xa1];	2119

327 + if( !MBCS_ENTRY_IS_TRANSITION(entry) &&	2120 --- r22777/source/common/ucnv_bld.c» 2007-08-24 02:44:10.880047000 -0700

328 + MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL	2121 +++ chrome.canonical/source/common/ucnv_bld.c» 2009-03-23 12:40:10.653507000 -0 700

	2122 @@ -932,6 +932,7 @@

	2123 myUConverter->subCharLen = mySharedConverterData->staticData->subCharLen;

	2124 myUConverter->subChars = (uint8_t *)myUConverter->subUChars;

	2125 uprv_memcpy(myUConverter->subChars, mySharedConverterData->staticData->subC har, myUConverter->subCharLen);

	2126 + myUConverter->toUCallbackReason = UCNV_ILLEGAL; /* default reason to invoke (*fromCharErrorBehaviour) */

	2127

	2128 if(mySharedConverterData->impl->open != NULL) {

	2129 mySharedConverterData->impl->open(myUConverter, realName, locale, optio ns, err);

	2130 --- r22777/source/common/ucnv_bld.h» 2006-07-05 16:08:50.000000000 -0700

	2131 +++ chrome.canonical/source/common/ucnv_bld.h» 2009-03-23 12:40:10.680507000 -0 700

	2132 @@ -1,6 +1,6 @@

	2133 /*

	2134 **********************************************************************

	2135 -* Copyright (C) 1999-2006, International Business Machines

	2136 +* Copyright (C) 1999-2006,2008 International Business Machines

	2137 * Corporation and others. All Rights Reserved.

	2138 **********************************************************************

	2139 *

	2140 @@ -226,6 +226,9 @@

	2141 char preToU[UCNV_EXT_MAX_BYTES];

	2142 int8_t preFromULength, preToULength; /* negative: replay */

	2143 int8_t preToUFirstLength; /* length of first character */

	2144 +

	2145 + /* new fields for ICU 4.0 */

	2146 + UConverterCallbackReason toUCallbackReason; /* (*fromCharErrorBehaviour) re ason, set when error is detected */

	2147 };

	2148

	2149 U_CDECL_END /* end of UConverter */

	2150 --- r22777/source/common/ucnv_ext.c» 2007-08-22 22:46:49.525855000 -0700

	2151 +++ chrome.canonical/source/common/ucnv_ext.c» 2009-03-23 12:30:33.135573000 -0 700

	2152 @@ -946,7 +946,7 @@

	2153 ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,

	2154 const int32_t *cx,

	2155 const USetAdder *sa,

	2156 - UConverterUnicodeSet which,

	2157 + UBool useFallback,

	2158 int32_t minLength,

	2159 UChar32 c,

	2160 UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,

	2161 @@ -966,7 +966,7 @@

	2162 value=*fromUSectionValues++;

	2163

	2164 if( value!=0 &&

	2165 - UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) &&

	2166 + (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) \|\| useFallback) &&

	2167 UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength

	2168 ) {

	2169 if(c>=0) {

	2170 @@ -987,12 +987,14 @@

	2171 /* no mapping, do nothing */

	2172 } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {

	2173 ucnv_extGetUnicodeSetString(

	2174 - sharedData, cx, sa, which, minLength,

	2175 + sharedData, cx, sa, useFallback, minLength,

	2176 U_SENTINEL, s, length+1,

	2177 (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),

	2178 pErrorCode);

	2179 - } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG\|UCNV_EXT_FROM_U_RESER VED_MASK))==

	2180 - UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&

	2181 + } else if((useFallback ?

	2182 + (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :

	2183 + ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG\|UCNV_EXT_FROM_U_R ESERVED_MASK))==

	2184 + UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&

	2185 UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength

	2186 ) {

	2187 sa->addString(sa->set, s, length+1);

	2188 @@ -1004,6 +1006,7 @@

	2189 ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,

	2190 const USetAdder *sa,

	2191 UConverterUnicodeSet which,

	2192 + UConverterSetFilter filter,

	2193 UErrorCode *pErrorCode) {

	2194 const int32_t *cx;

	2195 const uint16_t *stage12, *stage3, *ps2, *ps3;

	2196 @@ -1011,6 +1014,7 @@

	2197

	2198 uint32_t value;

	2199 int32_t st1, stage1Length, st2, st3, minLength;

	2200 + UBool useFallback;

	2201

	2202 UChar s[UCNV_EXT_MAX_UCHARS];

	2203 UChar32 c;

	2204 @@ -1027,10 +1031,16 @@

	2205

	2206 stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];

	2207

	2208 + useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);

	2209 +

	2210 /* enumerate the from-Unicode trie table */

	2211 c=0; /* keep track of the current code point while enumerating */

	2212

	2213 - if(sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) {

	2214 + if(filter==UCNV_SET_FILTER_2022_CN) {

	2215 + minLength=3;

	2216 + } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY \|\|

	2217 + filter!=UCNV_SET_FILTER_NONE

329 + ) {	2218 + ) {

330 + return TRUE;	2219 /* DBCS-only, ignore single-byte results */

331 + }	2220 minLength=2;

332 + entry=row[0x41];	2221 } else {

333 + if( !MBCS_ENTRY_IS_TRANSITION(entry) &&	2222 @@ -1064,14 +1074,48 @@

334 + MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL	2223 length=0;

335 + ) {	2224 U16_APPEND_UNSAFE(s, length, c);

336 + return TRUE;	2225 ucnv_extGetUnicodeSetString(

337 + }	2226 - sharedData, cx, sa, which, minLength,

338 + /* Then test for final entries in this state. */	2227 + sharedData, cx, sa, useFallback, minLength,

339 + for(b=0; b<=0xff; ++b) {	2228 c, s, length,

340 + entry=row[b];	2229 (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(valu e),

341 + if( !MBCS_ENTRY_IS_TRANSITION(entry) &&	2230 pErrorCode);

342 + MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL	2231 - } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG\|UCNV_ EXT_FROM_U_RESERVED_MASK))==

343 + ) {	2232 - UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&

344 + return TRUE;	2233 + } else if((useFallback ?

345 + }	2234 + (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :

346 + }	2235 + ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG\|U CNV_EXT_FROM_U_RESERVED_MASK))==

347 + /* Then recurse for transition entries. */	2236 + UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&

348 + for(b=0; b<=0xff; ++b) {	2237 UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength

349 + entry=row[b];	2238 ) {

350 + if( MBCS_ENTRY_IS_TRANSITION(entry) &&	2239 + switch(filter) {

351 + hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE (entry))	2240 + case UCNV_SET_FILTER_2022_CN:

352 + ) {	2241 + if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UC NV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) {

353 + return TRUE;	2242 + continue;

354 + }	2243 + }

355 + }	2244 + break;

356 + return FALSE;	2245 + case UCNV_SET_FILTER_SJIS:

357 +}	2246 + if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (v alue=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) {

358 +	2247 + continue;

359 +/*	2248 + }

360 + * Is byte b a single/lead byte in this state?	2249 + break;

361 + * Recurse for transition states, because here we don't want to say that	2250 + case UCNV_SET_FILTER_GR94DBCS:

362 + * b is a lead byte if all byte sequences that start with b are illegal.	2251 + if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&

363 + */	2252 + (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA (value))-0xa1a1)<=(0xfefe - 0xa1a1) &&

364 +static UBool	2253 + (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {

365 +isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnl y, uint8_t b) {	2254 + continue;

366 + const int32_t *row=stateTable[state];	2255 + }

367 + int32_t entry=row[b];	2256 + break;

368 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */	2257 + case UCNV_SET_FILTER_HZ:

369 + return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_ST ATE(entry));	2258 + if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&

370 + } else {	2259 + (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA (value))-0xa1a1)<=(0xfdfe - 0xa1a1) &&

371 + uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));	2260 + (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {

372 + if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {	2261 + continue;

373 + return FALSE; /* SI/SO are illegal for DBCS-only conversion */	2262 + }

374 + } else {	2263 + break;

375 + return action!=MBCS_STATE_ILLEGAL;	2264 + default:

376 + }	2265 + /*

377 + }	2266 + * UCNV_SET_FILTER_NONE,

378 +}	2267 + * or UCNV_SET_FILTER_DBCS_ONLY which is handle d via minLength

379 +	2268 + */

380 U_CFUNC void	2269 + break;

381 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,	2270 + }

382 UErrorCode *pErrorCode) {	2271 sa->add(sa->set, c);

383 @@ -2094,6 +2153,34 @@	2272 }

384 sourceIndex=nextSourceIndex;	2273 } while((++c&0xf)!=0);

385 } else if(U_FAILURE(*pErrorCode)) {

386 /* callback(illegal) */

387 + if(byteIndex>1) {

388 + /*

389 + * Ticket 5691: consistent illegal sequences:

390 + * - We include at least the first byte in the illegal sequence .

391 + * - If any of the non-initial bytes could be the start of a ch aracter,

392 + * we stop the illegal sequence before the first one of those .

393 + */

394 + UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0 );

395 + int8_t i;

396 + for(i=1;

397 + i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnl y, bytes[i]);

398 + ++i) {}

399 + if(i<byteIndex) {

400 + /* Back out some bytes. */

401 + int8_t backOutDistance=byteIndex-i;

402 + int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);

403 + byteIndex=i; /* length of reported illegal byte sequence */

404 + if(backOutDistance<=bytesFromThisBuffer) {

405 + source-=backOutDistance;

406 + } else {

407 + /* Back out bytes from the previous buffer: Need to replay them. */

408 + cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutD istance);

409 + /* preToULength is negative! */

410 + uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);

411 + source=(const uint8_t *)pArgs->source;

412 + }

413 + }

414 + }

415 break;

416 } else /* unassigned sequences indicated with byteIndex>0 */ {

417 /* try an extension mapping */

418 @@ -2104,7 +2191,7 @@

419 &offsets, sourceIndex,

420 pArgs->flush,

421 pErrorCode);

422 - sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs ->source);

423 + sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArg s->source);

424

425 if(U_FAILURE(*pErrorCode)) {

426 /* not mappable or buffer overflow */

427 @@ -2395,15 +2482,37 @@

428

429 if(c<0) {

430 if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {

431 - *pErrorCode=U_TRUNCATED_CHAR_FOUND;

432 - }

433 - if(U_FAILURE(*pErrorCode)) {

434 /* incomplete character byte sequence */

435 uint8_t *bytes=cnv->toUBytes;

436 cnv->toULength=(int8_t)(source-lastSource);

437 do {

438 bytes++=lastSource++;

439 } while(lastSource<source);

440 + *pErrorCode=U_TRUNCATED_CHAR_FOUND;

441 + } else if(U_FAILURE(*pErrorCode)) {

442 + /* callback(illegal) */

443 + /*

444 + * Ticket 5691: consistent illegal sequences:

445 + * - We include at least the first byte in the illegal sequence.

446 + * - If any of the non-initial bytes could be the start of a character,

447 + * we stop the illegal sequence before the first one of those.

448 + */

449 + UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);

450 + uint8_t *bytes=cnv->toUBytes;

451 + bytes++=lastSource++; /* first byte */

452 + if(lastSource==source) {

453 + cnv->toULength=1;

454 + } else /* lastSource<source: multi-byte character */ {

455 + int8_t i;

456 + for(i=1;

457 + lastSource<source && !isSingleOrLead(stateTable, state, isD BCSOnly, *lastSource);

458 + ++i

459 + ) {

460 + bytes++=lastSource++;

461 + }

462 + cnv->toULength=i;

463 + source=lastSource;

464 + }

465 } else {

466 /* no output because of empty input or only state changes */

467 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;

468 diff -ru trie.clean/source/test/cintltst/nccbtst.c chrome.canonical/source/test/cintltst/nccbtst.c

469 --- trie.clean/source/test/cintltst/nccbtst.c 2007-09-19 09:45:00.986804000 -0700

470 +++ chrome.canonical/source/test/cintltst/nccbtst.c 2008-10-29 11:08:51.102376000 -0700

471 @@ -1,6 +1,6 @@

472 /********************************************************************

473 * COPYRIGHT:

474 - * Copyright (c) 1997-2007, International Business Machines Corporation and

475 + * Copyright (c) 1997-2008, International Business Machines Corporation and

476 * others. All Rights Reserved.

477 ********************************************************************/

478 /*

479 @@ -2530,13 +2530,13 @@

480

481

482 static const uint8_t text943[] = {

483 - 0x82, 0xa9, 0x82, 0x20, /0xc8,/ 0x61, 0x8a, 0xbf, 0x8e, 0x9a };

484 - static const UChar toUnicode943sub[] = { 0x304b, 0xfffd, /0xff88,/ 0x0061 , 0x6f22, 0x5b57};

485 - static const UChar toUnicode943skip[]= { 0x304b, /0xff88,/ 0x0061, 0x6f22 , 0x5b57};

486 + 0x82, 0xa9, 0x82, 0x20, 0x61, 0x8a, 0xbf, 0x8e, 0x9a };

487 + static const UChar toUnicode943sub[] = { 0x304b, 0x1a, 0x20, 0x0061, 0x6f22 , 0x5b57 };

488 + static const UChar toUnicode943skip[]= { 0x304b, 0x20, 0x0061, 0x6f22, 0x5 b57 };

489 static const UChar toUnicode943stop[]= { 0x304b};

490

491 - static const int32_t fromIBM943Offssub[] = {0, 2, 4, 5, 7};

492 - static const int32_t fromIBM943Offsskip[] = { 0, 4, 5, 7};

493 + static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 7 };

494 + static const int32_t fromIBM943Offsskip[] = { 0, 3, 4, 5, 7 };

495 static const int32_t fromIBM943Offsstop[] = { 0};

496

497 gInBufferSize = inputsize;

498 @@ -2570,9 +2570,9 @@

499 {

500 static const uint8_t sampleText[] = {

501 0x82, 0xa9, 0x61, 0x62, 0x63 , 0x82,

502 - 0xff, /0x82, 0xa9,/ 0x32, 0x33};

503 - static const UChar toUnicode943sub[] = {0x304b, 0x0061, 0x0062, 0x0063, 0x fffd,/0x304b,/ 0x0032, 0x0033};

504 - static const int32_t fromIBM943Offssub[] = {0, 2, 3, 4, 5, 7, 8};

505 + 0xff, 0x32, 0x33};

506 + static const UChar toUnicode943sub[] = { 0x304b, 0x0061, 0x0062, 0x0063, 0x 1a, 0x1a, 0x0032, 0x0033 };

507 + static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 6, 7, 8 };

508 /*checking illegal value for ibm-943 with substitute*/

509 gInBufferSize = inputsize;

510 gOutBufferSize = outputsize;

511 diff -ru trie.clean/source/test/cintltst/nucnvtst.c chrome.canonical/source/test/cintltst/nucnvtst.c

512 --- trie.clean/source/test/cintltst/nucnvtst.c 2007-10-11 14:52:29.172174000 -0700

513 +++ chrome.canonical/source/test/cintltst/nucnvtst.c 2008-10-29 11:08:51.194286000 -0700

514 @@ -2606,7 +2606,7 @@

515 TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceL imit <= source");

516 /*Test for the condition where there is an invalid character*/

517 {

518 - static const uint8_t source2[]={0xa1, 0x01};

519 + static const uint8_t source2[]={0xa1, 0x80};

520 TestNextUCharError(cnv, (const char)source2, (const char)source2+size of(source2), U_ZERO_ERROR, "an invalid character");

521 }

522 /*Test for the condition where we have a truncated char*/

523 @@ -3899,11 +3899,11 @@

524 TestISO_2022_KR() {

525 /* test input */

526 static const uint16_t in[]={

527 - 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x0 00A,0x000D

528 - ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xA C02,0xAC04

529 + 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x0 00D

530 + ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xA C04

531 ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0 028,0x0029

532 ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x5 3CA,0x53CB

533 - ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x5 3E1,0x53E2

534 + ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x5 3E2

535 ,0x53E3,0x53E4,0x000A,0x000D};

536 const UChar* uSource;

537 const UChar* uSourceLimit;

538 diff -ru trie.clean/source/test/testdata/conversion.txt chrome.canonical/source/test/testdata/conversion.txt

539 --- trie.clean/source/test/testdata/conversion.txt 2007-10-11 14:31:32.196532000 -0700

540 +++ chrome.canonical/source/test/testdata/conversion.txt 2008-10-29 11:37:09.419716000 -0700

541 @@ -48,13 +48,135 @@

542 toUnicode {

543 Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }

544 Cases {

545 + // Test ticket 5691: consistent illegal sequences

546 + // The following test cases are for illegal character byte sequences.

547 + //

548 + // Unfortunately, we cannot use the Shift-JIS examples from the ticket

549 + // comments because our Shift-JIS table is Windows-compatible and

550 + // therefore has no illegal single bytes. Same for GBK.

551 + // Instead, we use the stricter GB 18030 also for 2-byte examples.

552 + // The byte sequences are generally slightly different from the ticket

553 + // comment, simply using assigned characters rather than just

554 + // theoretically valid sequences.

555 + {

556 + "gb18030",

557 + :bin{ 618140813c81ff7a },

558 + "a\u4e02\\x81<\\x81\\xFFz",

559 + :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 },

560 + :int{1}, :int{0}, "", "&C", :bin{""}

561 + }

562 + {

563 + "EUC-JP",

564 + :bin{ 618fb0a98fb03c8f3cb0a97a },

565 + "a\u4e28\\x8F\\xB0<\\x8F<\u9022z",

566 + :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 },

567 + :int{1}, :int{0}, "", "&C", :bin{""}

568 + }

569 + {

570 + "gb18030",

571 + :bin{ 618130fc318130fc8181303c3e813cfc817a },

572 + "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z",

573 + :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 },

574 + :int{1}, :int{0}, "", "&C", :bin{""}

575 + }

576 + {

577 + "UTF-8",

578 + :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a },

579 + "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1 \\xFF<>z",

580 + :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,1 2,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20, 21 },

581 + :int{1}, :int{0}, "", "&C", :bin{""}

582 + }

583 + {

584 + "ISO-2022-JP",

585 + :bin{ 1b24424141af4142affe41431b2842 },

586 + "\u758f\\xAF\u758e\\xAF\\xFE\u790e",

587 + :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 },

588 + :int{1}, :int{0}, "", "&C", :bin{""}

589 + }

590 + {

591 + "ibm-25546",

592 + :bin{ 411b242943420e4141af4142affe41430f5a },

593 + "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",

594 + :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },

595 + :int{1}, :int{0}, "", "&C", :bin{""}

596 + }

597 + {

598 + "ISO-2022-KR",

599 + :bin{ 411b242943420e4141af4142affe41430f5a },

600 + "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",

601 + :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },

602 + :int{1}, :int{0}, "", "&C", :bin{""}

603 + }

604 + {

605 + "ISO-2022-CN",

606 + :bin{ 411b242941420e4141af4142affe41430f5a },

607 + "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",

608 + :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },

609 + :int{1}, :int{0}, "", "&C", :bin{""}

610 + }

611 + {

612 + "HZ",

613 + :bin{ 417e7b4141af4142affe41437e7d5a },

614 + "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",

615 + :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 },

616 + :int{1}, :int{0}, "", "&C", :bin{""}

617 + }

618 + // Test ticket 5691: consistent illegal sequences

619 + // The following test cases are for illegal escape/designator/shift sequences.

620 + //

621 + // ISO-2022-JP and -CN with illegal escape sequences.

622 + {

623 + "ISO-2022-JP",

624 + :bin{ 611b24201b244241411b283f1b28427a },

625 + "a\\x1B$ \u758f\\x1B\u2538z",

626 + :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 },

627 + :int{1}, :int{0}, "", "&C", :bin{""}

628 + }

629 + {

630 + "ISO-2022-CN",

631 + :bin{ 611b2429201b2429410e41410f7a },

632 + "a\\x1B$) \u4eaez",

633 + :intvector{ 0,1,1,1,1,2,3,4,10,13 },

634 + :int{1}, :int{0}, "", "&C", :bin{""}

635 + }

636 + // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences.

637 + // The first ESC N comes before its designator sequence, the last sequence is ESC+space.

638 + {

639 + "ISO-2022-JP-2",

640 + :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e },

641 + "N\\x1BNNN\xceN\\x1B N",

642 + :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 },

643 + :int{1}, :int{0}, "", "&C", :bin{""}

644 + }

645 + {

646 + "ISO-2022-CN-EXT",

647 + :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e },

648 + "N\\x1BNNN\u8f0eN\\x1B N",

649 + :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },

650 + :int{1}, :int{0}, "", "&C", :bin{""}

651 + }

652 + {

653 + "ISO-2022-CN-EXT",

654 + :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f },

655 + "O\\x1BOOO\u492bO\\x1B O",

656 + :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },

657 + :int{1}, :int{0}, "", "&C", :bin{""}

658 + }

659 + // Test ticket 5691: Example from Peter Edberg.

660 + {

661 + "ISO-2022-JP",

662 + :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 },

663 + "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda",

664 + :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 },

665 + :int{1}, :int{0}, "", "?", :bin{""}

666 + }

667 // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and

668 // using the Shift-JIS table for JIS X 0208 (ticket #5797)

669 {

670 "ISO-2022-JP",

671 :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b284 2 },

672 - "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6 f3e",

673 - :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },

674 + "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\uf ffd\u6f3e",

675 + :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 },

676 :int{1}, :int{1}, "", "?", :bin{""}

677 }

678 // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets()

679 @@ -303,7 +425,7 @@

680 {

681 "ISO-2022-CN-EXT",

682 :bin{ 411b4e2121 }, "\x41", :intvector{ 0 },

683 - :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e }

684 + :int{1}, :int{1}, "illesc", ".", :bin{ 1b }

685 }

686 // G3 designator: recognized, but not supported for -CN (only for -CN-E XT)

687 {

OLD	NEW

« no previous file with comments | « third_party/icu38/source/test/testdata/testdata.mak ('k') | third_party/icu38/uconv.security.header.patch » ('j') | no next file with comments »