third_party/icu38/uconv.security.patch - Issue 52030: Apply ICU patches for ICU tickets 6175 (ISO-2022 and ...

Unified Diff: third_party/icu38/uconv.security.patch

Issue 52030: Apply ICU patches for ICU tickets 6175 (ISO-2022 and ... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 11 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: third_party/icu38/uconv.security.patch

===================================================================

--- third_party/icu38/uconv.security.patch (revision 10949)

+++ third_party/icu38/uconv.security.patch (working copy)

@@ -1,7 +1,1196 @@

-diff -ru trie.clean/source/common/ucnv2022.c chrome.canonical/source/common/ucnv2022.c

---- trie.clean/source/common/ucnv2022.c 2007-11-07 17:39:05.057870000 -0800

-+++ chrome.canonical/source/common/ucnv2022.c 2008-10-29 12:52:22.517453000 -0700

-@@ -752,6 +752,7 @@

+--- r22777/source/test/cintltst/nucnvtst.c 2007-10-11 14:52:29.172174000 -0700

++++ chrome.canonical/source/test/cintltst/nucnvtst.c 2009-03-23 12:42:01.106292000 -0700

+@@ -17,6 +17,7 @@

+ #include "unicode/uloc.h"

+ #include "unicode/ucnv.h"

+ #include "unicode/ucnv_err.h"

++#include "unicode/ucnv_cb.h"

+ #include "cintltst.h"

+ #include "unicode/utypes.h"

+ #include "unicode/ustring.h"

+@@ -81,6 +82,7 @@

+ static void TestJitterbug2411(void);

+ static void TestJB5275(void);

+ static void TestJB5275_1(void);

++static void TestJitterbug6175(void);

+ #endif

+ static void TestRoundTrippingAllUTF(void);

+@@ -297,6 +299,7 @@

+ #if !UCONFIG_NO_LEGACY_CONVERSION

+ addTest(root, &TestJitterbug2346, "tsconv/nucnvtst/TestJitterbug2346");

+ addTest(root, &TestJitterbug2411, "tsconv/nucnvtst/TestJitterbug2411");

++ addTest(root, &TestJitterbug6175, "tsconv/nucnvtst/TestJitterbug6175");

+ #endif

+ }

+@@ -2606,7 +2609,7 @@

+ TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");

+ /*Test for the condition where there is an invalid character*/

+ {

+- static const uint8_t source2[]={0xa1, 0x01};

++ static const uint8_t source2[]={0xa1, 0x80};

+ TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_ZERO_ERROR, "an invalid character");

+ }

+ /*Test for the condition where we have a truncated char*/

+@@ -3899,11 +3902,11 @@

+ TestISO_2022_KR() {

+ /* test input */

+ static const uint16_t in[]={

+- 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x000A,0x000D

+- ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC02,0xAC04

++ 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x000D

++ ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC04

+ ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0028,0x0029

+ ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x53CA,0x53CB

+- ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x53E1,0x53E2

++ ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x53E2

+ ,0x53E3,0x53E4,0x000A,0x000D};

+ const UChar* uSource;

+ const UChar* uSourceLimit;

+@@ -4456,6 +4459,70 @@

+ free(offsets);

+ }

++/* Tests for empty segments in ISO-2022-JP/KR/CN and HZ: check that the UConverterCallbackReason passed to the toUnicode callback is UCNV_IRREGULAR */

++typedef struct {

++ const char * converterName;

++ const char * inputText;

++ int inputTextLength;

++} EmptySegmentTest;

++/* Callback for TestJitterbug6175; it should only be invoked for empty-segment errors, and it logs a test error if called with any other reason up to UCNV_IRREGULAR */

++static void UCNV_TO_U_CALLBACK_EMPTYSEGMENT( const void *context, UConverterToUnicodeArgs *toArgs, const char* codeUnits,

++ int32_t length, UConverterCallbackReason reason, UErrorCode * err ) {

++ if (reason > UCNV_IRREGULAR) {

++ return;

++ }

++ if (reason != UCNV_IRREGULAR) {

++ log_err("toUnicode callback invoked for empty segment but reason is not UCNV_IRREGULAR\n");

++ }

++ /* Same handling as UCNV_TO_U_CALLBACK_SUBSTITUTE: clear the error, then write the substitution character */

++ *err = U_ZERO_ERROR;

++ ucnv_cbToUWriteSub(toArgs,0,err);

++}

++enum { kEmptySegmentToUCharsMax = 64 };

++static void TestJitterbug6175(void) {

++ static const char iso2022jp_a[] = { 0x61, 0x62, 0x1B,0x24,0x42, 0x1B,0x28,0x42, 0x63, 0x64, 0x0D, 0x0A };

++ static const char iso2022kr_a[] = { 0x1B,0x24,0x29,0x43, 0x61, 0x0E, 0x0F, 0x62, 0x0D, 0x0A };

++ static const char iso2022cn_a[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x0F, 0x1B,0x24,0x2A,0x48, 0x1B,0x4E, 0x6A,0x65, 0x63, 0x0D, 0x0A };

++ static const char iso2022cn_b[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x1B,0x24,0x29,0x47, 0x68,0x64, 0x0F, 0x63, 0x0D, 0x0A };

++ static const char hzGB2312_a[] = { 0x61, 0x62, 0x7E,0x7B, 0x7E,0x7D, 0x63, 0x64 };

++ static const EmptySegmentTest emptySegmentTests[] = {

++ /* converterName inputText inputTextLength */

++ { "ISO-2022-JP", iso2022jp_a, sizeof(iso2022jp_a) },

++ { "ISO-2022-KR", iso2022kr_a, sizeof(iso2022kr_a) },

++ { "ISO-2022-CN", iso2022cn_a, sizeof(iso2022cn_a) },

++ { "ISO-2022-CN", iso2022cn_b, sizeof(iso2022cn_b) },

++ { "HZ-GB-2312", hzGB2312_a, sizeof(hzGB2312_a) },

++ /* terminator: */

++ { NULL, NULL, 0, }

++ };

++ const EmptySegmentTest * testPtr;

++ for (testPtr = emptySegmentTests; testPtr->converterName != NULL; ++testPtr) {

++ UErrorCode err = U_ZERO_ERROR;

++ UConverter * cnv = ucnv_open(testPtr->converterName, &err);

++ if (U_FAILURE(err)) {

++ log_data_err("Unable to open %s converter: %s\n", testPtr->converterName, u_errorName(err));

++ return;

++ }

++ ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_EMPTYSEGMENT, NULL, NULL, NULL, &err);

++ if (U_FAILURE(err)) {

++ log_data_err("Unable to setToUCallBack for %s converter: %s\n", testPtr->converterName, u_errorName(err));

++ ucnv_close(cnv);

++ return;

++ }

++ {

++ UChar toUChars[kEmptySegmentToUCharsMax];

++ UChar * toUCharsPtr = toUChars;

++ const UChar * toUCharsLimit = toUCharsPtr + kEmptySegmentToUCharsMax;

++ const char * inCharsPtr = testPtr->inputText;

++ const char * inCharsLimit = inCharsPtr + testPtr->inputTextLength;

++ ucnv_toUnicode(cnv, &toUCharsPtr, toUCharsLimit, &inCharsPtr, inCharsLimit, NULL, TRUE, &err);

++ }

++ ucnv_close(cnv);

++ }

++}

+ static void

+ TestEBCDIC_STATEFUL() {

+ /* test input */

+--- r22777/source/test/cintltst/ncnvtst.c 2007-01-24 15:27:45.575224000 -0800

++++ chrome.canonical/source/test/cintltst/ncnvtst.c 2009-03-23 12:30:17.291031000 -0700

+@@ -1928,7 +1928,7 @@

+ #if !UCONFIG_NO_LEGACY_CONVERSION

+ { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff },

+ { "windows-1251", 0, 0x7f, 0x410, 0x44f, 0x3000, 0xd7ff },

+- { "HZ", 0x410, 0x44f, 0x4e00, 0x4eff, 0xac00, 0xd7ff },

++ /* HZ test case fixed and moved to intltest's conversion.txt, ticket #6002 */

+ { "shift-jis", 0x3041, 0x3093, 0x30a1, 0x30f3, 0x900, 0x1cff }

+ #else

+ { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff }

+--- r22777/source/test/intltest/convtest.h 2007-07-26 20:12:12.288784000 -0700

++++ chrome.canonical/source/test/intltest/convtest.h 2009-03-23 12:30:09.445194000 -0700

+@@ -72,6 +72,7 @@

+ void TestToUnicode();

+ void TestFromUnicode();

+ void TestGetUnicodeSet();

++ void TestGetUnicodeSet2();

+ private:

+ UBool

+--- r22777/source/test/intltest/convtest.cpp 2007-03-08 16:28:01.852223000 -0800

++++ chrome.canonical/source/test/intltest/convtest.cpp 2009-03-23 12:30:40.161868000 -0700

+@@ -70,6 +70,7 @@

+ case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;

+ case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;

+ case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;

++ case 3: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;

+ default: name=""; break; //needed to end loop

+ }

+@@ -465,6 +466,183 @@

+ }

++U_CDECL_BEGIN

++static void U_CALLCONV

++getUnicodeSetCallback(const void *context,

++ UConverterFromUnicodeArgs *fromUArgs,

++ const UChar* codeUnits,

++ int32_t length,

++ UChar32 codePoint,

++ UConverterCallbackReason reason,

++ UErrorCode *pErrorCode) {

++ if(reason<=UCNV_IRREGULAR) {

++ ((UnicodeSet *)context)->remove(codePoint); // the converter cannot convert this code point

++ *pErrorCode=U_ZERO_ERROR; // skip

++ } // else ignore the reset, close and clone calls.

++}

++U_CDECL_END

++// Compare ucnv_getUnicodeSet() with the set of characters that can be converted.

++void

++ConversionTest::TestGetUnicodeSet2() {

++ // Build a string with all code points.

++ UChar32 cpLimit;

++ int32_t s0Length;

++ if(quick) {

++ cpLimit=s0Length=0x10000; // BMP only

++ } else {

++ cpLimit=0x110000;

++ s0Length=0x10000+0x200000; // BMP + surrogate pairs

++ }

++ UChar *s0=new UChar[s0Length];

++ if(s0==NULL) {

++ return;

++ }

++ UChar *s=s0;

++ UChar32 c;

++ UChar c2;

++ // low BMP

++ for(c=0; c<=0xd7ff; ++c) {

++ *s++=(UChar)c;

++ }

++ // trail surrogates

++ for(c=0xdc00; c<=0xdfff; ++c) {

++ *s++=(UChar)c;

++ }

++ // lead surrogates

++ // (after trails so that there is not even one surrogate pair in between)

++ for(c=0xd800; c<=0xdbff; ++c) {

++ *s++=(UChar)c;

++ }

++ // high BMP

++ for(c=0xe000; c<=0xffff; ++c) {

++ *s++=(UChar)c;

++ }

++ // supplementary code points = surrogate pairs

++ if(cpLimit==0x110000) {

++ for(c=0xd800; c<=0xdbff; ++c) {

++ for(c2=0xdc00; c2<=0xdfff; ++c2) {

++ *s++=(UChar)c;

++ *s++=c2;

++ }

++ static const char *const cnvNames[]={

++ "UTF-8",

++ "UTF-7",

++ "UTF-16",

++ "US-ASCII",

++ "ISO-8859-1",

++ "windows-1252",

++ "Shift-JIS",

++ "ibm-1390", // EBCDIC_STATEFUL table

++ "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table

++ "HZ",

++ "ISO-2022-JP",

++ "JIS7",

++ "ISO-2022-CN",

++ "ISO-2022-CN-EXT",

++ "LMBCS"

++ };

++ char buffer[1024];

++ int32_t i;

++ for(i=0; i<LENGTHOF(cnvNames); ++i) {

++ UErrorCode errorCode=U_ZERO_ERROR;

++ UConverter *cnv=cnv_open(cnvNames[i], errorCode);

++ if(U_FAILURE(errorCode)) {

++ errln("failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode));

++ continue;

++ }

++ UnicodeSet expected;

++ ucnv_setFromUCallBack(cnv, getUnicodeSetCallback, &expected, NULL, NULL, &errorCode);

++ if(U_FAILURE(errorCode)) {

++ errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode));

++ ucnv_close(cnv);

++ continue;

++ }

++ UConverterUnicodeSet which;

++ for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) {

++ if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {

++ ucnv_setFallback(cnv, TRUE);

++ }

++ expected.add(0, cpLimit-1);

++ s=s0;

++ UBool flush;

++ do {

++ char *t=buffer;

++ flush=(UBool)(s==s0+s0Length);

++ ucnv_fromUnicode(cnv, &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode);

++ if(U_FAILURE(errorCode)) {

++ if(errorCode==U_BUFFER_OVERFLOW_ERROR) {

++ errorCode=U_ZERO_ERROR;

++ continue;

++ } else {

++ break; // unexpected error, should not occur

++ }

++ } while(!flush);

++ UnicodeSet set;

++ ucnv_getUnicodeSet(cnv, (USet *)&set, which, &errorCode);

++ if(cpLimit<0x110000) {

++ set.remove(cpLimit, 0x10ffff);

++ }

++ if(which==UCNV_ROUNDTRIP_SET) {

++ // ignore PUA code points because they will be converted even if they

++ // are fallbacks and when other fallbacks are turned off,

++ // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips

++ expected.remove(0xe000, 0xf8ff);

++ expected.remove(0xf0000, 0xffffd);

++ expected.remove(0x100000, 0x10fffd);

++ set.remove(0xe000, 0xf8ff);

++ set.remove(0xf0000, 0xffffd);

++ set.remove(0x100000, 0x10fffd);

++ }

++ if(set!=expected) {

++ // First try to see if we have different sets because ucnv_getUnicodeSet()

++ // added strings: The above conversion method does not tell us what strings might be convertible.

++ // Remove strings from the set and compare again.

++ // Unfortunately, there are no good, direct set methods for finding out whether there are strings

++ // in the set, nor for enumerating or removing just them.

++ // Intersect all code points with the set. The intersection will not contain strings.

++ UnicodeSet temp(0, 0x10ffff);

++ temp.retainAll(set);

++ set=temp;

++ }

++ if(set!=expected) {

++ UnicodeSet diffSet;

++ UnicodeString out;

++ // are there items that must be in the set but are not?

++ (diffSet=expected).removeAll(set);

++ if(!diffSet.isEmpty()) {

++ diffSet.toPattern(out, TRUE);

++ if(out.length()>100) {

++ out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));

++ }

++ errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",

++ cnvNames[i], which);

++ errln(out);

++ }

++ // are there items that must not be in the set but are?

++ (diffSet=set).removeAll(expected);

++ if(!diffSet.isEmpty()) {

++ diffSet.toPattern(out, TRUE);

++ if(out.length()>100) {

++ out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));

++ }

++ errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d",

++ cnvNames[i], which);

++ errln(out);

++ }

++ delete [] s0;

++}

+ // open testdata or ICU data converter ------------------------------------- ***

+ UConverter *

+--- r22777/source/test/testdata/testdata.mak 2007-07-26 20:12:12.288784000 -0700

++++ chrome.canonical/source/test/testdata/testdata.mak 2009-03-23 12:31:04.424645000 -0700

+@@ -28,7 +28,7 @@

+ TEST_RES_FILES = $(TEST_RES_SOURCE:.txt=.res)

+-"$(TESTDATAOUT)\testdata.dat" : $(TEST_RES_FILES) "$(TESTDATABLD)\casing.res" "$(TESTDATABLD)\conversion.res" "$(TESTDATABLD)\icuio.res" "$(TESTDATABLD)\mc.res" "$(TESTDATABLD)\structLocale.res" "$(TESTDATABLD)\root.res" "$(TESTDATABLD)\sh.res" "$(TESTDATABLD)\sh_YU.res" "$(TESTDATABLD)\te.res" "$(TESTDATABLD)\te_IN.res" "$(TESTDATABLD)\te_IN_REVISED.res" "$(TESTDATABLD)\testaliases.res" "$(TESTDATABLD)\testtypes.res" "$(TESTDATABLD)\testempty.res" "$(TESTDATABLD)\iscii.res" "$(TESTDATABLD)\idna_rules.res" "$(TESTDATABLD)\DataDrivenCollationTest.res" "$(TESTDATABLD)\test.icu" "$(TESTDATABLD)\testtable32.res" "$(TESTDATABLD)\test1.cnv" "$(TESTDATABLD)\test3.cnv" "$(TESTDATABLD)\test4.cnv" "$(TESTDATABLD)\test4x.cnv" "$(TESTDATABLD)\ibm9027.cnv" "$(TESTDATABLD)\nfscsi.spp" "$(TESTDATABLD)\nfscss.spp" "$(TESTDATABLD)\nfscis.spp" "$(TESTDATABLD)\nfsmxs.spp" "$(TESTDATABLD)\nfsmxp.spp"

++"$(TESTDATAOUT)\testdata.dat" : $(TEST_RES_FILES) "$(TESTDATABLD)\casing.res" "$(TESTDATABLD)\conversion.res" "$(TESTDATABLD)\icuio.res" "$(TESTDATABLD)\mc.res" "$(TESTDATABLD)\structLocale.res" "$(TESTDATABLD)\root.res" "$(TESTDATABLD)\sh.res" "$(TESTDATABLD)\sh_YU.res" "$(TESTDATABLD)\te.res" "$(TESTDATABLD)\te_IN.res" "$(TESTDATABLD)\te_IN_REVISED.res" "$(TESTDATABLD)\testaliases.res" "$(TESTDATABLD)\testtypes.res" "$(TESTDATABLD)\testempty.res" "$(TESTDATABLD)\iscii.res" "$(TESTDATABLD)\idna_rules.res" "$(TESTDATABLD)\DataDrivenCollationTest.res" "$(TESTDATABLD)\test.icu" "$(TESTDATABLD)\testtable32.res" "$(TESTDATABLD)\test1.cnv" "$(TESTDATABLD)\test1bmp.cnv" "$(TESTDATABLD)\test3.cnv" "$(TESTDATABLD)\test4.cnv" "$(TESTDATABLD)\test4x.cnv" "$(TESTDATABLD)\ibm9027.cnv" "$(TESTDATABLD)\nfscsi.spp" "$(TESTDATABLD)\nfscss.spp" "$(TESTDATABLD)\nfscis.spp" "$(TESTDATABLD)\nfsmxs.spp" "$(TESTDATABLD)\nfsmxp.spp"

+ @echo Building test data

+ @copy "$(TESTDATABLD)\te.res" "$(TESTDATAOUT)\$(TESTDT)\nam.typ"

+ @copy "$(TESTDATA)\icu26_testtypes.res" "$(TESTDATABLD)"

+@@ -54,6 +54,7 @@

+ iscii.res

+ test.icu

+ test1.cnv

++test1bmp.cnv

+ test3.cnv

+ test4.cnv

+ test4x.cnv

+@@ -126,6 +127,10 @@

+ @echo Building $@

+ @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**

++"$(TESTDATABLD)\test1bmp.cnv": "$(TESTDATA)\test1bmp.ucm"

++ @echo Building $@

++ @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**

+ "$(TESTDATABLD)\test3.cnv": "$(TESTDATA)\test3.ucm"

+ @echo Building $@

+ @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**

+--- r22777/source/test/testdata/Makefile.in 2007-08-21 13:15:55.267002000 -0700

++++ chrome.canonical/source/test/testdata/Makefile.in 2009-03-23 12:31:04.435635000 -0700

+@@ -117,7 +117,7 @@

+ TEST_DAT_FILES=$(TESTBUILDDIR)/test.icu

+ TEST_SPP_FILES=$(TESTBUILDDIR)/nfscsi.spp $(TESTBUILDDIR)/nfscss.spp $(TESTBUILDDIR)/nfscis.spp $(TESTBUILDDIR)/nfsmxs.spp $(TESTBUILDDIR)/nfsmxp.spp

+-TEST_UCM_SOURCE= test1.ucm test3.ucm test4.ucm test4x.ucm ibm9027.ucm

++TEST_UCM_SOURCE= test1.ucm test1bmp.ucm test3.ucm test4.ucm test4x.ucm ibm9027.ucm

+ TEST_UCM_FILES=$(TEST_UCM_SOURCE:%=$(TESTSRCDATADIR)/data/%)

+ TEST_CNV_FILES=$(TEST_UCM_SOURCE:%.ucm=$(TESTBUILDDIR)/%.cnv)

+--- r22777/source/test/testdata/conversion.txt 2007-10-11 14:31:32.196532000 -0700

++++ chrome.canonical/source/test/testdata/conversion.txt 2009-03-23 12:42:01.119267000 -0700

+@@ -1,6 +1,6 @@

+ //*******************************************************************************

+ //

+ // file name: conversion.txt

+@@ -48,13 +48,161 @@

+ toUnicode {

+ Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }

+ Cases {

++ // Test ticket 5691: consistent illegal sequences

++ // The following test cases are for illegal character byte sequences.

++ //

++ // Unfortunately, we cannot use the Shift-JIS examples from the ticket

++ // comments because our Shift-JIS table is Windows-compatible and

++ // therefore has no illegal single bytes. Same for GBK.

++ // Instead, we use the stricter GB 18030 also for 2-byte examples.

++ // The byte sequences are generally slightly different from the ticket

++ // comment, simply using assigned characters rather than just

++ // theoretically valid sequences.

++ {

++ "gb18030",

++ :bin{ 618140813c81ff7a },

++ "a\u4e02\\x81<\\x81\\xFFz",

++ :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 },

++ :int{1}, :int{0}, "", "&C", :bin{""}

++ }

++ {

++ "EUC-JP",

++ :bin{ 618fb0a98fb03c8f3cb0a97a },

++ "a\u4e28\\x8F\\xB0<\\x8F<\u9022z",

++ :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 },

++ :int{1}, :int{0}, "", "&C", :bin{""}

++ }

++ {

++ "gb18030",

++ :bin{ 618130fc318130fc8181303c3e813cfc817a },

++ "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z",

++ :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 },

++ :int{1}, :int{0}, "", "&C", :bin{""}

++ }

++ {

++ "UTF-8",

++ :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a },

++ "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z",

++ :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 },

++ :int{1}, :int{0}, "", "&C", :bin{""}

++ }

++ {

++ "ISO-2022-JP",

++ :bin{ 1b24424141af4142affe41431b2842 },

++ "\u758f\\xAF\u758e\\xAF\\xFE\u790e",

++ :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 },

++ :int{1}, :int{0}, "", "&C", :bin{""}

++ }

++ {

++ "ibm-25546",

++ :bin{ 411b242943420e4141af4142affe41430f5a },

++ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",

++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },

++ :int{1}, :int{0}, "", "&C", :bin{""}

++ }

++ {

++ "ISO-2022-KR",

++ :bin{ 411b242943420e4141af4142affe41430f5a },

++ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",

++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },

++ :int{1}, :int{0}, "", "&C", :bin{""}

++ }

++ {

++ "ISO-2022-CN",

++ :bin{ 411b242941420e4141af4142affe41430f5a },

++ "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",

++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },

++ :int{1}, :int{0}, "", "&C", :bin{""}

++ }

++ {

++ "HZ",

++ :bin{ 417e7b4141af4142affe41437e7d5a },

++ "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",

++ :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 },

++ :int{1}, :int{0}, "", "&C", :bin{""}

++ }

++ // Test ticket 5691: consistent illegal sequences

++ // The following test cases are for illegal escape/designator/shift sequences.

++ //

++ // ISO-2022-JP and -CN with illegal escape sequences.

++ {

++ "ISO-2022-JP",

++ :bin{ 611b24201b244241411b283f1b28427a },

++ "a\\x1B$ \u758f\\x1B\u2538z",

++ :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 },

++ :int{1}, :int{0}, "", "&C", :bin{""}

++ }

++ {

++ "ISO-2022-CN",

++ :bin{ 611b2429201b2429410e41410f7a },

++ "a\\x1B$) \u4eaez",

++ :intvector{ 0,1,1,1,1,2,3,4,10,13 },

++ :int{1}, :int{0}, "", "&C", :bin{""}

++ }

++ // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences.

++ // The first ESC N comes before its designator sequence, the last sequence is ESC+space.

++ {

++ "ISO-2022-JP-2",

++ :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e },

++ "N\\x1BNNN\xceN\\x1B N",

++ :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 },

++ :int{1}, :int{0}, "", "&C", :bin{""}

++ }

++ {

++ "ISO-2022-CN-EXT",

++ :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e },

++ "N\\x1BNNN\u8f0eN\\x1B N",

++ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },

++ :int{1}, :int{0}, "", "&C", :bin{""}

++ }

++ {

++ "ISO-2022-CN-EXT",

++ :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f },

++ "O\\x1BOOO\u492bO\\x1B O",

++ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },

++ :int{1}, :int{0}, "", "&C", :bin{""}

++ }

++ // Test ticket 5691: HZ with illegal tilde sequences.

++ {

++ "HZ",

++ :bin{ 417e20427e21437e80447e7b41417e207e41427e7f41437e7d5a },

++ "A\\x7E B\\x7E!C\\x7E\\x80D\u4eae\\x7E\\x20\\x7E\u8c05\\x7E\\x7F\u64a9Z",

++ :intvector{ 0,1,1,1,1,2,3,4,4,4,4,5,6,7,7,7,7,7,7,7,7,9, // SBCS

++ 12,14,14,14,14,14,14,14,14,16,16,16,16,17,19,19,19,19,19,19,19,19,21, // DBCS

++ 25 }, // SBCS

++ :int{1}, :int{0}, "", "&C", :bin{""}

++ }

++ // Test ticket 5691: Example from Peter Edberg.

++ {

++ "ISO-2022-JP",

++ :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 },

++ "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda",

++ :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 },

++ :int{1}, :int{0}, "", "?", :bin{""}

++ }

++ // Test bug 6071 (2:1 Unicode:charset SBCS mapping).

++ {

++ "*test1bmp",

++ :bin{ 050008 },

++ "e@uv",

++ :intvector{ 0,1,2,2 },

++ :int{1}, :int{1}, "", "?", :bin{""}

++ }

++ // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e

++ {

++ "HZ",

++ :bin{ 7e7b21212120217e217f772100007e217e7e7d207e7e807e0a2b },

++ "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd\u3013 ~\ufffd+",

++ :intvector{ 2,4,6,8,10,12,14,15,19,20,22,25 },

++ :int{1}, :int{1}, "", "?", :bin{""}

++ }

+ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and

+ // using the Shift-JIS table for JIS X 0208 (ticket #5797)

+ {

+ "ISO-2022-JP",

+ :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 },

+- "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",

+- :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },

++ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",

++ :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 },

+ :int{1}, :int{1}, "", "?", :bin{""}

+ }

+ // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets()

+@@ -191,6 +339,21 @@

+ :intvector{ 0, 5, 7, 9, 9, 9, 9, 9, 9, 9, 9, 12 },

+ :int{1}, :int{1}, "", "&", :bin{""}

+ }

++ // empty segment (using substitution and stop)

++ {

++ "ISO-2022-KR",

++ :bin{ 1b242943610e0f620d0a },

++ "a\uFFFDb\u000D\u000A",

++ :intvector{ 4, 6, 7, 8, 9 },

++ :int{1}, :int{1}, "", "?", :bin{""}

++ }

++ {

++ "ISO-2022-KR",

++ :bin{ 1b242943610e0f620d0a },

++ "a",

++ :intvector{ 4 },

++ :int{1}, :int{1}, "illesc", ".", :bin{"0f"}

++ }

+ // ISO-2022-JP

+@@ -241,6 +404,21 @@

+ :bin{ 41c15c1b284a5cc242 }, "A\uff81\\\xa5\uff82B", :intvector{ 0, 1, 2, 6, 7, 8 },

+ :int{1}, :int{1}, "", ".", :bin{""}

+ }

++ // empty segment (using substitution and stop)

++ {

++ "ISO-2022-JP",

++ :bin{ 61621b24421b284263640d0a },

++ "ab\uFFFDcd\u000D\u000A",

++ :intvector{ 0, 1, 5, 8, 9, 10, 11 },

++ :int{1}, :int{1}, "", "?", :bin{""}

++ }

++ {

++ "ISO-2022-JP",

++ :bin{ 61621b24421b284263640d0a },

++ "ab",

++ :intvector{ 0, 1 },

++ :int{1}, :int{1}, "illesc", ".", :bin{"1b2842"}

++ }

+ // ISO-2022-CN

+@@ -303,7 +481,7 @@

+ {

+ "ISO-2022-CN-EXT",

+ :bin{ 411b4e2121 }, "\x41", :intvector{ 0 },

+- :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e }

++ :int{1}, :int{1}, "illesc", ".", :bin{ 1b }

+ }

+ // G3 designator: recognized, but not supported for -CN (only for -CN-EXT)

+ {

+@@ -311,6 +489,36 @@

+ :bin{ 411b242b491b4f2121 }, "\x41", :intvector{ 0 },

+ :int{1}, :int{1}, "unsuppesc", ".", :bin{ 1b242b49 }

+ }

++ // empty segment 1 (using substitution and stop)

++ {

++ "ISO-2022-CN",

++ :bin{ 611b242941620e0f1b242a481b4e6a65630d0a },

++ "ab\uFFFD\u994Cc\u000D\u000A",

++ :intvector{ 0, 5, 7, 14, 16, 17, 18 },

++ :int{1}, :int{1}, "", "?", :bin{""}

++ }

++ {

++ "ISO-2022-CN",

++ :bin{ 611b242941620e0f1b242a481b4e6a65630d0a },

++ "ab",

++ :intvector{ 0, 5 },

++ :int{1}, :int{1}, "illesc", ".", :bin{"0f"}

++ }

++ // empty segment 2 (using substitution and stop)

++ {

++ "ISO-2022-CN",

++ :bin{ 611b242941620e1b24294768640f630d0a },

++ "ab\uFFFD\u5F70c\u000D\u000A",

++ :intvector{ 0, 5, 7, 11, 14, 15, 16 },

++ :int{1}, :int{1}, "", "?", :bin{""}

++ }

++ {

++ "ISO-2022-CN",

++ :bin{ 611b242941620e1b24294768640f630d0a },

++ "ab",

++ :intvector{ 0, 5 },

++ :int{1}, :int{1}, "illesc", ".", :bin{"1b242947"}

++ }

+ // ISO-2022 SBCS

+ // [U_ENABLE_GENERIC_ISO_2022]

+@@ -325,6 +533,39 @@

+ // :int{1}, :int{1}, "", ".", :bin{""}

+ //}

++ // HZ-GB-2312

++ // empty segment 1 (using substitution and stop)

++ {

++ "HZ-GB-2312",

++ :bin{ 61627e7b7e7d6364 },

++ "ab\uFFFDcd",

++ :intvector{ 0, 1, 4, 6, 7 },

++ :int{1}, :int{1}, "", "?", :bin{""}

++ }

++ {

++ "HZ-GB-2312",

++ :bin{ 61627e7b7e7d63640d0a },

++ "ab",

++ :intvector{ 0, 1 },

++ :int{1}, :int{1}, "illesc", ".", :bin{"7e7d"}

++ }

++ // empty segment 2 & legal redundant switches (using substitution and stop)

++ {

++ "HZ-GB-2312",

++ :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d },

++ "ab\u4E0D\u7A7A\uFFFD\u4E00cdef\uFFFD",

++ :intvector{ 0, 1, 4, 6, 10, 12, 16, 17, 20, 21, 24 },

++ :int{1}, :int{1}, "", "?", :bin{""}

++ }

++ {

++ "HZ-GB-2312",

++ :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d },

++ "ab\u4E0D\u7A7A",

++ :intvector{ 0, 1, 4, 6 },

++ :int{1}, :int{1}, "illesc", ".", :bin{"7e7b"}

++ }

+ // DBCS-only extensions

+ {

+ "ibm-970",

+@@ -496,6 +737,14 @@

+ :intvector{ 0, 4, 8, 12 },

+ :int{1}, :int{0}, "", "?", :bin{""}

+ }

++ // Test iso-2022-jp-2 miscellaneous symbols

++ {

++ "iso-2022-jp-2",

++ :bin{ 1b242843224f224e1b2842 },

++ "\u260E\u260F",

++ :intvector{ 4, 6 },

++ :int{1}, :int{0}, "", ".", :bin{""}

++ }

+ }

+@@ -504,6 +753,14 @@

+ fromUnicode {

+ Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }

+ Cases {

++ // Test bug 6071 (1:2 Unicode:charset SBCS mapping).

++ {

++ "*test1bmp",

++ "e@t",

++ :bin{ 05000709 },

++ :intvector{ 0,1,2,2 },

++ :int{1}, :int{0}, "", "?", ""

++ }

+ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and

+ // using the Shift-JIS table for JIS X 0208 (ticket #5797)

+ {

+@@ -1311,16 +1568,29 @@

+ // versions of ISO-2022-JP

+ {

+ "ISO-2022-JP",

+- "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]",

+- "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]",

++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2015\u203e\u4e00\u4e01\uffe5]",

++ "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u2014\u301c\u4e02\u4e27-\u4e29\u4fe0\u663b\u9eb5\ufa0e-\ufa2d\uff61-\uff9f\uffe4\uffe6-\U0010ffff]",

+ :int{0}

+- }

++ }

+ {

+ "ISO-2022-JP-2",

+- "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]",

+- "[\x0e\x0f\x1b\uffe7-\U0010ffff]",

++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uffe6]",

++ "[\x0e\x0f\x1b\uff61-\uff9f\uffe4\uffe7-\U0010ffff]",

++ :int{0}

++ }

++ {

++ "JIS7",

++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uff61-\uff9f\uffe6]",

++ "[\x0e\x0f\x1b\uffe4\uffe7-\U0010ffff]",

+ :int{0}

+ }

++ // with fallbacks

++ {

++ "ISO-2022-JP",

++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2014\u2015\u203e\u301c\u4e00\u4e01\u4fe0\u9eb5\uff61-\uff9f\uffe5]",

++ "[\x0e\x0f\x1b\xa6\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\u663b\ufa0e-\ufa2d\uffe4\uffe6-\U0010ffff]",

++ :int{1}

++ }

+ // versions of ISO-2022-CN

+ {

+@@ -1336,6 +1606,22 @@

+ :int{0}

+ }

++ // HZ

++ {

++ "HZ",

++ "[\u0410-\u044f\u4e00\u4e01\u4e03]",

++ "[\u4e02\u4e04-\u4e06\uac00-\ud7ff]",

++ :int{0}

++ }

++ // LMBCS

++ {

++ "LMBCS",

++ "[\x00-\U0010ffff]",

++ "[]",

++ :int{0}

++ }

+ // DBCS-only

+ {

+ "ibm-971",

+--- r22777/source/common/ucnv_ext.h 2007-08-22 22:46:49.525855000 -0700

++++ chrome.canonical/source/common/ucnv_ext.h 2009-03-23 12:30:09.644121000 -0700

+@@ -382,10 +382,20 @@

+ UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,

+ UErrorCode *pErrorCode);

++/*

++ * Add code points and strings to the set according to the extension mappings.

++ * Limitation on the UConverterSetFilter:

++ * The filters currently assume that they are used with 1:1 mappings.

++ * They only apply to single input code points, and then they pass through

++ * only mappings with single-charset-code results.

++ * For example, the Shift-JIS filter only works for 2-byte results and tests

++ * that those 2 bytes are in the JIS X 0208 range of Shift-JIS.

++ */

+ U_CFUNC void

+ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,

+ const USetAdder *sa,

+ UConverterUnicodeSet which,

++ UConverterSetFilter filter,

+ UErrorCode *pErrorCode);

+ /* toUnicode helpers -------------------------------------------------------- */

+--- r22777/source/common/ucnvmbcs.c 2007-10-11 14:31:32.196532000 -0700

++++ chrome.canonical/source/common/ucnvmbcs.c 2009-03-23 12:42:01.150242000 -0700

+@@ -1,7 +1,7 @@

+ /*

+ ******************************************************************************

+ *

+ ******************************************************************************

+@@ -485,9 +485,23 @@

+ if(mbcsTable->outputType==MBCS_OUTPUT_1) {

+ const uint16_t *stage2, *stage3, *results;

++ uint16_t minValue;

+ results=(const uint16_t *)mbcsTable->fromUnicodeBytes;

++ /*

++ * Set a threshold variable for selecting which mappings to use.

++ * See ucnv_MBCSSingleFromBMPWithOffsets() and

++ * MBCS_SINGLE_RESULT_FROM_U() for details.

++ */

++ if(which==UCNV_ROUNDTRIP_SET) {

++ /* use only roundtrips */

++ minValue=0xf00;

++ } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {

++ /* use all roundtrip and fallback results */

++ minValue=0x800;

++ }

+ for(st1=0; st1<maxStage1; ++st1) {

+ st2=table[st1];

+ if(st2>maxStage1) {

+@@ -497,15 +511,8 @@

+ /* read the stage 3 block */

+ stage3=results+st3;

+- /*

+- * Add code points for which the roundtrip flag is set.

+- * Once we get a set for fallback mappings, we have to use

+- * a threshold variable with a value of 0x800.

+- * See ucnv_MBCSSingleFromBMPWithOffsets() and

+- * MBCS_SINGLE_RESULT_FROM_U() for details.

+- */

+ do {

+- if(*stage3++>=0xf00) {

++ if(*stage3++>=minValue) {

+ sa->add(sa->set, c);

+ }

+ } while((++c&0xf)!=0);

+@@ -522,9 +529,12 @@

+ const uint8_t *stage3, *bytes;

+ uint32_t st3Multiplier;

+ uint32_t value;

++ UBool useFallback;

+ bytes=mbcsTable->fromUnicodeBytes;

++ useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);

+ switch(mbcsTable->outputType) {

+ case MBCS_OUTPUT_3:

+ case MBCS_OUTPUT_4_EUC:

+@@ -551,9 +561,8 @@

+ st3>>=16;

+ /*

+- * Add code points for which the roundtrip flag is set.

+- * Once we get a set for fallback mappings, we have to check

+- * non-roundtrip stage 3 results for whether they are 0.

++ * Add code points for which the roundtrip flag is set,

++ * or which map to non-zero bytes if we use fallbacks.

+ * See ucnv_MBCSFromUnicodeWithOffsets() for details.

+ */

+ switch(filter) {

+@@ -561,6 +570,23 @@

+ do {

+ if(st3&1) {

+ sa->add(sa->set, c);

++ stage3+=st3Multiplier;

++ } else if(useFallback) {

++ uint8_t b=0;

++ switch(st3Multiplier) {

++ case 4:

++ b|=*stage3++;

++ case 3:

++ b|=*stage3++;

++ case 2:

++ b|=stage3[0]|stage3[1];

++ stage3+=2;

++ default:

++ break;

++ }

++ if(b!=0) {

++ sa->add(sa->set, c);

++ }

+ }

+ st3>>=1;

+ } while((++c&0xf)!=0);

+@@ -568,7 +594,7 @@

+ case UCNV_SET_FILTER_DBCS_ONLY:

+ /* Ignore single-byte results (<0x100). */

+ do {

+- if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) {

++ if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {

+ sa->add(sa->set, c);

+ }

+ st3>>=1;

+@@ -578,7 +604,7 @@

+ case UCNV_SET_FILTER_2022_CN:

+ /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */

+ do {

+- if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) {

++ if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {

+ sa->add(sa->set, c);

+ }

+ st3>>=1;

+@@ -588,7 +614,33 @@

+ case UCNV_SET_FILTER_SJIS:

+ /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */

+ do {

+- if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {

++ if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {

++ sa->add(sa->set, c);

++ }

++ st3>>=1;

++ stage3+=2; /* +=st3Multiplier */

++ } while((++c&0xf)!=0);

++ break;

++ case UCNV_SET_FILTER_GR94DBCS:

++ /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */

++ do {

++ if( ((st3&1)!=0 || useFallback) &&

++ (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&

++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1)

++ ) {

++ sa->add(sa->set, c);

++ }

++ st3>>=1;

++ stage3+=2; /* +=st3Multiplier */

++ } while((++c&0xf)!=0);

++ break;

++ case UCNV_SET_FILTER_HZ:

++ /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */

++ do {

++ if( ((st3&1)!=0 || useFallback) &&

++ (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&

++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1)

++ ) {

+ sa->add(sa->set, c);

+ }

+ st3>>=1;

+@@ -609,7 +661,7 @@

+ }

+- ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode);

++ ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);

+ }

+ U_CFUNC void

+@@ -1694,7 +1746,7 @@

+ cnv->toUBytes[0]=*(source-1);

+ cnv->toULength=_extToU(cnv, cnv->sharedData,

+ 1, &source, sourceLimit,

+- &target, target+targetCapacity,

++ &target, pArgs->targetLimit,

+ &offsets, sourceIndex,

+ pArgs->flush,

+ pErrorCode);

+@@ -1739,6 +1791,65 @@

+ pArgs->offsets=offsets;

+ }

++static UBool

++hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {

++ const int32_t *row=stateTable[state];

++ int32_t b, entry;

++ /* First test for final entries in this state for some commonly valid byte values. */

++ entry=row[0xa1];

++ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&

++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL

++ ) {

++ return TRUE;

++ }

++ entry=row[0x41];

++ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&

++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL

++ ) {

++ return TRUE;

++ }

++ /* Then test for final entries in this state. */

++ for(b=0; b<=0xff; ++b) {

++ entry=row[b];

++ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&

++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL

++ ) {

++ return TRUE;

++ }

++ /* Then recurse for transition entries. */

++ for(b=0; b<=0xff; ++b) {

++ entry=row[b];

++ if( MBCS_ENTRY_IS_TRANSITION(entry) &&

++ hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))

++ ) {

++ return TRUE;

++ }

++ return FALSE;

++}

++/*

++ * Is byte b a single/lead byte in this state?

++ * Recurse for transition states, because here we don't want to say that

++ * b is a lead byte if all byte sequences that start with b are illegal.

++ */

++static UBool

++isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {

++ const int32_t *row=stateTable[state];

++ int32_t entry=row[b];

++ if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */

++ return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));

++ } else {

++ uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));

++ if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {

++ return FALSE; /* SI/SO are illegal for DBCS-only conversion */

++ } else {

++ return action!=MBCS_STATE_ILLEGAL;

++ }

++}

+ U_CFUNC void

+ ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

+ UErrorCode *pErrorCode) {

+@@ -2094,6 +2205,34 @@

+ sourceIndex=nextSourceIndex;

+ } else if(U_FAILURE(*pErrorCode)) {

+ /* callback(illegal) */

++ if(byteIndex>1) {

++ /*

++ * Ticket 5691: consistent illegal sequences:

++ * - We include at least the first byte in the illegal sequence.

++ * - If any of the non-initial bytes could be the start of a character,

++ * we stop the illegal sequence before the first one of those.

++ */

++ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);

++ int8_t i;

++ for(i=1;

++ i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);

++ ++i) {}

++ if(i<byteIndex) {

++ /* Back out some bytes. */

++ int8_t backOutDistance=byteIndex-i;

++ int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);

++ byteIndex=i; /* length of reported illegal byte sequence */

++ if(backOutDistance<=bytesFromThisBuffer) {

++ source-=backOutDistance;

++ } else {

++ /* Back out bytes from the previous buffer: Need to replay them. */

++ cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);

++ /* preToULength is negative! */

++ uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);

++ source=(const uint8_t *)pArgs->source;

++ }

+ break;

+ } else /* unassigned sequences indicated with byteIndex>0 */ {

+ /* try an extension mapping */

+@@ -2104,7 +2243,7 @@

+ &offsets, sourceIndex,

+ pArgs->flush,

+ pErrorCode);

+- sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source);

++ sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);

+ if(U_FAILURE(*pErrorCode)) {

+ /* not mappable or buffer overflow */

+@@ -2395,15 +2534,37 @@

+ if(c<0) {

+ if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {

+- *pErrorCode=U_TRUNCATED_CHAR_FOUND;

+- }

+- if(U_FAILURE(*pErrorCode)) {

+ /* incomplete character byte sequence */

+ uint8_t *bytes=cnv->toUBytes;

+ cnv->toULength=(int8_t)(source-lastSource);

+ do {

+ *bytes++=*lastSource++;

+ } while(lastSource<source);

++ *pErrorCode=U_TRUNCATED_CHAR_FOUND;

++ } else if(U_FAILURE(*pErrorCode)) {

++ /* callback(illegal) */

++ /*

++ * Ticket 5691: consistent illegal sequences:

++ * - We include at least the first byte in the illegal sequence.

++ * - If any of the non-initial bytes could be the start of a character,

++ * we stop the illegal sequence before the first one of those.

++ */

++ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);

++ uint8_t *bytes=cnv->toUBytes;

++ *bytes++=*lastSource++; /* first byte */

++ if(lastSource==source) {

++ cnv->toULength=1;

++ } else /* lastSource<source: multi-byte character */ {

++ int8_t i;

++ for(i=1;

++ lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);

++ ++i

++ ) {

++ *bytes++=*lastSource++;

++ }

++ cnv->toULength=i;

++ source=lastSource;

++ }

+ } else {

+ /* no output because of empty input or only state changes */

+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;

+@@ -3237,7 +3398,7 @@

+ lastSource=source;

+ c=_extFromU(cnv, cnv->sharedData,

+ c, &source, sourceLimit,

+- &target, target+targetCapacity,

++ &target, (const uint8_t *)(pArgs->targetLimit),

+ &offsets, sourceIndex,

+ pArgs->flush,

+ pErrorCode);

+--- r22777/source/common/ucnvmbcs.h 2007-10-11 14:31:32.196532000 -0700

++++ chrome.canonical/source/common/ucnvmbcs.h 2009-03-23 12:30:17.315007000 -0700

+@@ -492,6 +492,8 @@

+ UCNV_SET_FILTER_DBCS_ONLY,

+ UCNV_SET_FILTER_2022_CN,

+ UCNV_SET_FILTER_SJIS,

++ UCNV_SET_FILTER_GR94DBCS,

++ UCNV_SET_FILTER_HZ,

+ UCNV_SET_FILTER_COUNT

+ } UConverterSetFilter;

+--- r22777/source/common/ucnv.c 2007-08-31 12:39:14.294200000 -0700

++++ chrome.canonical/source/common/ucnv.c 2009-03-23 12:40:10.566608000 -0700

+@@ -1528,11 +1528,14 @@

+ cnv->toULength=0;

+ /* call the callback function */

++ if(cnv->toUCallbackReason==UCNV_ILLEGAL && *err==U_INVALID_CHAR_FOUND) {

++ cnv->toUCallbackReason = UCNV_UNASSIGNED;

++ }

+ cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs,

+ cnv->invalidCharBuffer, errorInputLength,

+- (*err==U_INVALID_CHAR_FOUND || *err==U_UNSUPPORTED_ESCAPE_SEQUENCE) ?

+- UCNV_UNASSIGNED : UCNV_ILLEGAL,

++ cnv->toUCallbackReason,

+ err);

++ cnv->toUCallbackReason = UCNV_ILLEGAL; /* reset to default value */

+ /*

+ * loop back to the offset handling

+--- r22777/source/common/uset_imp.h 2007-07-24 19:51:25.692061000 -0700

++++ chrome.canonical/source/common/uset_imp.h 2009-03-23 12:30:09.893067000 -0700

+@@ -36,6 +36,9 @@

+ typedef void U_CALLCONV

+ USetRemove(USet *set, UChar32 c);

++typedef void U_CALLCONV

++USetRemoveRange(USet *set, UChar32 start, UChar32 end);

+ /**

+ * Interface for adding items to a USet, to keep low-level code from

+ * statically depending on the USet implementation.

+@@ -47,6 +50,7 @@

+ USetAddRange *addRange;

+ USetAddString *addString;

+ USetRemove *remove;

++ USetRemoveRange *removeRange;

+ };

+ typedef struct USetAdder USetAdder;

+--- r22777/source/common/ucnv2022.c 2007-10-11 14:31:32.196532000 -0700

++++ chrome.canonical/source/common/ucnv2022.c 2009-03-23 12:57:38.398368000 -0700

+@@ -201,6 +201,7 @@

+ #ifdef U_ENABLE_GENERIC_ISO_2022

+ UBool isFirstBuffer;

+ #endif

++ UBool isEmptySegment;

+ char name[30];

+ char locale[3];

+ }UConverterDataISO2022;

+@@ -609,6 +610,7 @@

+ if(choice<=UCNV_RESET_TO_UNICODE) {

+ uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));

+ myConverterData->key = 0;

++ myConverterData->isEmptySegment = FALSE;

+ }

+ if(choice!=UCNV_RESET_TO_UNICODE) {

+ uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));

+@@ -752,6 +754,7 @@

UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);

uint32_t key = myData2022->key;

int32_t offset = 0;

@@ -9,7 +1198,7 @@

char c;

value = VALID_NON_TERMINAL_2022;

-@@ -804,7 +805,6 @@

+@@ -804,7 +807,6 @@

return;

} else if (value == INVALID_2022 ) {

*err = U_ILLEGAL_ESCAPE_SEQUENCE;

@@ -17,7 +1206,15 @@

} else /* value == VALID_TERMINAL_2022 */ {

switch(var){

#ifdef U_ENABLE_GENERIC_ISO_2022

-@@ -935,6 +935,35 @@

+@@ -814,6 +816,7 @@

+ if(chosenConverterName == NULL) {

+ /* SS2 or SS3 */

+ *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;

++ _this->toUCallbackReason = UCNV_UNASSIGNED;

+ return;

+ }

+@@ -935,6 +938,37 @@

}

if(U_SUCCESS(*err)) {

_this->toULength = 0;

@@ -50,43 +1247,46 @@

+ }

+ _this->toULength=1;

+ }

++ } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {

++ _this->toUCallbackReason = UCNV_UNASSIGNED;

}

-@@ -1097,6 +1126,24 @@

+@@ -1113,6 +1147,24 @@

+ }

}

- /*

-+ * * Check that the result is a 2-byte value with each byte in the range A1..FE

-+ * * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte

-+ * * to move it to the ISO 2022 range 21..7E.

-+ * * Return 0 if out of range.

-+ * */

++#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */

++/*

++ * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the

++ * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point

++ * unchanged.

++ */

+static U_INLINE uint32_t

-+_2022FromGR94DBCS(uint32_t value) {

-+ if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&

-+ (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)

-+ ) {

-+ return value - 0x8080; /* shift down to 21..7e byte range */

++_2022ToGR94DBCS(uint32_t value) {

++ uint32_t returnValue = value + 0x8080;

++ if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&

++ (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {

++ return returnValue;

+ } else {

-+ return 0; /* not valid for ISO 2022 */

++ return value;

+ }

++#endif

-+#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */

-+/*

- * Check that the result is a 2-byte value with each byte in the range A1..FE

- * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte

- * to move it to the ISO 2022 range 21..7E.

-@@ -1112,6 +1159,7 @@

- return 0; /* not valid for ISO 2022 */

- }

-+#endif

#ifdef U_ENABLE_GENERIC_ISO_2022

-@@ -1953,6 +2001,7 @@

+ /**********************************************************************************

+@@ -1436,7 +1488,7 @@

+ c2 = 0; /* invalid */

+ }

+ } else {

+- if((uint8_t)(c2-0x21) <= (0x7e-0x21)) {

++ if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {

+ c2 += 0x7e;

+ } else {

+ c2 = 0; /* invalid */

+@@ -1953,6 +2005,7 @@

const char *mySourceLimit = args->sourceLimit;

uint32_t targetUniChar = 0x0000;

uint32_t mySourceChar = 0x0000;

@@ -94,7 +1294,7 @@

UConverterDataISO2022* myData;

ISO2022State *pToU2022State;

StateEnum cs;

-@@ -1968,6 +2017,7 @@

+@@ -1968,6 +2021,7 @@

mySourceChar = args->converter->toUBytes[0];

args->converter->toULength = 0;

cs = (StateEnum)pToU2022State->cs[pToU2022State->g];

@@ -102,7 +1302,65 @@

goto getTrailByte;

}

-@@ -2077,17 +2127,44 @@

+@@ -1986,6 +2040,7 @@

+ continue;

+ } else {

+ /* only JIS7 uses SI/SO, not ISO-2022-JP-x */

++ myData->isEmptySegment = FALSE; /* reset this, we have a different error */

+ break;

+ }

+@@ -1997,21 +2052,39 @@

+ continue;

+ } else {

+ /* only JIS7 uses SI/SO, not ISO-2022-JP-x */

++ myData->isEmptySegment = FALSE; /* reset this, we have a different error */

+ break;

+ }

+ case ESC_2022:

+ mySource--;

+ escape:

+- changeState_2022(args->converter,&(mySource),

+- mySourceLimit, ISO_2022_JP,err);

++ {

++ const char * mySourceBefore = mySource;

++ int8_t toULengthBefore = args->converter->toULength;

++ changeState_2022(args->converter,&(mySource),

++ mySourceLimit, ISO_2022_JP,err);

++ /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */

++ if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {

++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;

++ args->converter->toUCallbackReason = UCNV_IRREGULAR;

++ args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);

++ }

+ /* invalid or illegal escape sequence */

+ if(U_FAILURE(*err)){

+ args->target = myTarget;

+ args->source = mySource;

++ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */

+ return;

+ }

++ /* If we successfully completed an escape sequence, we begin a new segment, empty so far */

++ if(myData->key==0) {

++ myData->isEmptySegment = TRUE;

++ }

+ continue;

+ /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */

+@@ -2028,6 +2101,7 @@

+ /* falls through */

+ default:

+ /* convert one or two bytes */

++ myData->isEmptySegment = FALSE;

+ cs = (StateEnum)pToU2022State->cs[pToU2022State->g];

+ if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&

+ !IS_JP_DBCS(cs)

+@@ -2077,17 +2151,44 @@

default:

/* G0 DBCS */

if(mySource < mySourceLimit) {

@@ -156,7 +1414,7 @@

} else {

args->converter->toUBytes[0] = (uint8_t)mySourceChar;

args->converter->toULength = 1;

-@@ -2229,7 +2306,12 @@

+@@ -2229,7 +2330,12 @@

}

/* only DBCS or SBCS characters are expected*/

/* DB characters with high bit set to 1 are expected */

@@ -170,8 +1428,39 @@

targetByteUnit=missingCharMarker;

}

if (targetByteUnit != missingCharMarker){

-@@ -2545,17 +2627,34 @@

+@@ -2524,15 +2630,27 @@

+ if(mySourceChar==UCNV_SI){

+ myData->toU2022State.g = 0;

++ if (myData->isEmptySegment) {

++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */

++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;

++ args->converter->toUCallbackReason = UCNV_IRREGULAR;

++ args->converter->toUBytes[0] = (uint8_t)mySourceChar;

++ args->converter->toULength = 1;

++ args->target = myTarget;

++ args->source = mySource;

++ return;

++ }

+ /*consume the source */

+ continue;

+ }else if(mySourceChar==UCNV_SO){

+ myData->toU2022State.g = 1;

++ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */

+ /*consume the source */

+ continue;

+ }else if(mySourceChar==ESC_2022){

+ mySource--;

+ escape:

++ myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */

+ changeState_2022(args->converter,&(mySource),

+ mySourceLimit, ISO_2022_KR, err);

+ if(U_FAILURE(*err)){

+@@ -2543,19 +2661,37 @@

+ continue;

+ }

++ myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */

if(myData->toU2022State.g == 1) {

if(mySource < mySourceLimit) {

- char trailByte;

@@ -214,7 +1503,7 @@

}

} else {

args->converter->toUBytes[0] = (uint8_t)mySourceChar;

-@@ -2563,8 +2662,10 @@

+@@ -2563,8 +2699,10 @@

break;

}

@@ -226,7 +1515,7 @@

}

if(targetUniChar < 0xfffe){

if(args->offsets) {

-@@ -3061,6 +3162,7 @@

+@@ -3061,6 +3199,7 @@

/* continue with a partial double-byte character */

mySourceChar = args->converter->toUBytes[0];

args->converter->toULength = 0;

@@ -234,7 +1523,68 @@

goto getTrailByte;

}

-@@ -3114,29 +3216,50 @@

+@@ -3075,27 +3214,52 @@

+ switch(mySourceChar){

+ case UCNV_SI:

+ pToU2022State->g=0;

++ if (myData->isEmptySegment) {

++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */

++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;

++ args->converter->toUCallbackReason = UCNV_IRREGULAR;

++ args->converter->toUBytes[0] = mySourceChar;

++ args->converter->toULength = 1;

++ args->target = myTarget;

++ args->source = mySource;

++ return;

++ }

+ continue;

+ case UCNV_SO:

+ if(pToU2022State->cs[1] != 0) {

+ pToU2022State->g=1;

++ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */

+ continue;

+ } else {

+ /* illegal to have SO before a matching designator */

++ myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */

+ break;

+ }

+ case ESC_2022:

+ mySource--;

+ escape:

+- changeState_2022(args->converter,&(mySource),

+- mySourceLimit, ISO_2022_CN,err);

++ {

++ const char * mySourceBefore = mySource;

++ int8_t toULengthBefore = args->converter->toULength;

++ changeState_2022(args->converter,&(mySource),

++ mySourceLimit, ISO_2022_CN,err);

++ /* After SO there must be at least one character before a designator (designator error handled separately) */

++ if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {

++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;

++ args->converter->toUCallbackReason = UCNV_IRREGULAR;

++ args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);

++ }

+ /* invalid or illegal escape sequence */

+ if(U_FAILURE(*err)){

+ args->target = myTarget;

+ args->source = mySource;

++ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */

+ return;

+ }

+ continue;

+@@ -3109,34 +3273,56 @@

+ /* falls through */

+ default:

+ /* convert one or two bytes */

++ myData->isEmptySegment = FALSE;

+ if(pToU2022State->g != 0) {

+ if(mySource < mySourceLimit) {

UConverterSharedData *cnv;

StateEnum tempState;

int32_t tempBufLen;

@@ -302,386 +1652,622 @@

} else {

args->converter->toUBytes[0] = (uint8_t)mySourceChar;

args->converter->toULength = 1;

-diff -ru trie.clean/source/common/ucnvmbcs.c chrome.canonical/source/common/ucnvmbcs.c

---- trie.clean/source/common/ucnvmbcs.c 2007-11-07 17:39:05.057870000 -0800

-+++ chrome.canonical/source/common/ucnvmbcs.c 2008-10-29 11:34:34.648518000 -0700

-@@ -1,7 +1,7 @@

- /*

- ******************************************************************************

- *

+@@ -3399,11 +3585,19 @@

+ /* include ASCII for JP */

+ sa->addRange(sa->set, 0, 0x7f);

+ }

+- if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {

++ if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {

+ /*

+- * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks,

+- * we need to include half-width Katakana for all JP variants because

+- * JIS X 0208 has hardcoded fallbacks for them.

++ * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0

++ * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)

++ * use half-width Katakana.

++ * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)

++ * half-width Katakana via the ESC ( I sequence.

++ * However, we only emit (fromUnicode) half-width Katakana according to the

++ * definition of each variant.

++ *

++ * When including fallbacks,

++ * we need to include half-width Katakana Unicode code points for all JP variants because

++ * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).

+ */

+ /* include half-width Katakana for JP */

+ sa->addRange(sa->set, HWKANA_START, HWKANA_END);

+@@ -3457,6 +3651,12 @@

+ * corresponding to JIS X 0208.

+ */

+ filter=UCNV_SET_FILTER_SJIS;

++ } else if(i==KSC5601) {

++ /*

++ * Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables)

++ * are broader than GR94.

++ */

++ filter=UCNV_SET_FILTER_GR94DBCS;

+ } else {

+ filter=UCNV_SET_FILTER_NONE;

+ }

+@@ -3472,6 +3672,9 @@

+ sa->remove(sa->set, 0x0e);

+ sa->remove(sa->set, 0x0f);

+ sa->remove(sa->set, 0x1b);

++ /* ISO 2022 converters do not convert C1 controls either */

++ sa->removeRange(sa->set, 0x80, 0x9f);

+ }

+ static const UConverterImpl _ISO2022Impl={

+--- r22777/source/common/ucnv_lmb.c 2006-08-19 14:27:08.000000000 -0700

++++ chrome.canonical/source/common/ucnv_lmb.c 2009-03-23 12:30:26.043293000 -0700

+@@ -1,6 +1,6 @@

+ /*

+ **********************************************************************

- *

- ******************************************************************************

-@@ -1739,6 +1739,65 @@

- pArgs->offsets=offsets;

+ **********************************************************************

+ * file name: ucnv_lmb.cpp

+@@ -536,7 +536,7 @@

+ NULL,\

+ _LMBCSSafeClone,\

+- _LMBCSGetUnicodeSet\

++ ucnv_getCompleteUnicodeSet\

+ };\

+ static const UConverterStaticData _LMBCSStaticData##n={\

+ sizeof(UConverterStaticData),\

+@@ -662,15 +662,14 @@

+ return &newLMBCS->cnv;

}

-+static UBool

-+hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {

-+ const int32_t *row=stateTable[state];

-+ int32_t b, entry;

-+ /* First test for final entries in this state for some commonly valid byte values. */

-+ entry=row[0xa1];

-+ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&

-+ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL

-+ ) {

-+ return TRUE;

-+ }

-+ entry=row[0x41];

-+ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&

-+ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL

-+ ) {

-+ return TRUE;

-+ }

-+ /* Then test for final entries in this state. */

-+ for(b=0; b<=0xff; ++b) {

-+ entry=row[b];

-+ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&

-+ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL

-+ ) {

-+ return TRUE;

-+ }

-+ /* Then recurse for transition entries. */

-+ for(b=0; b<=0xff; ++b) {

-+ entry=row[b];

-+ if( MBCS_ENTRY_IS_TRANSITION(entry) &&

-+ hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))

-+ ) {

-+ return TRUE;

-+ }

-+ return FALSE;

-+}

+-static void

+-_LMBCSGetUnicodeSet(const UConverter *cnv,

+- const USetAdder *sa,

+- UConverterUnicodeSet which,

+- UErrorCode *pErrorCode) {

+- /* all but U+F6xx, see LMBCS explanation above (search for F6xx) */

+- sa->addRange(sa->set, 0, 0xf5ff);

+- sa->addRange(sa->set, 0xf700, 0x10ffff);

+-}

+/*

-+ * Is byte b a single/lead byte in this state?

-+ * Recurse for transition states, because here we don't want to say that

-+ * b is a lead byte if all byte sequences that start with b are illegal.

++ * There used to be a _LMBCSGetUnicodeSet() function here (up to svn revision 20117)

++ * which added all code points except for U+F6xx

++ * because those cannot be represented in the Unicode group.

++ * However, it turns out that windows-950 has roundtrips for all of U+F6xx

++ * which means that LMBCS can convert all Unicode code points after all.

++ * We now simply use ucnv_getCompleteUnicodeSet().

+ */

-+static UBool

-+isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {

-+ const int32_t *row=stateTable[state];

-+ int32_t entry=row[b];

-+ if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */

-+ return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));

-+ } else {

-+ uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));

-+ if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {

-+ return FALSE; /* SI/SO are illegal for DBCS-only conversion */

-+ } else {

-+ return action!=MBCS_STATE_ILLEGAL;

-+ }

-+}

- U_CFUNC void

- ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

- UErrorCode *pErrorCode) {

-@@ -2094,6 +2153,34 @@

- sourceIndex=nextSourceIndex;

- } else if(U_FAILURE(*pErrorCode)) {

- /* callback(illegal) */

-+ if(byteIndex>1) {

-+ /*

-+ * Ticket 5691: consistent illegal sequences:

-+ * - We include at least the first byte in the illegal sequence.

-+ * - If any of the non-initial bytes could be the start of a character,

-+ * we stop the illegal sequence before the first one of those.

-+ */

-+ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);

-+ int8_t i;

-+ for(i=1;

-+ i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);

-+ ++i) {}

-+ if(i<byteIndex) {

-+ /* Back out some bytes. */

-+ int8_t backOutDistance=byteIndex-i;

-+ int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);

-+ byteIndex=i; /* length of reported illegal byte sequence */

-+ if(backOutDistance<=bytesFromThisBuffer) {

-+ source-=backOutDistance;

+ /*

+ Here's the basic helper function that we use when converting from

+--- r22777/source/common/ucnvhz.c 2006-07-05 16:08:50.000000000 -0700

++++ chrome.canonical/source/common/ucnvhz.c 2009-03-23 12:42:01.208181000 -0700

+@@ -1,6 +1,6 @@

+ /*

+ **********************************************************************

+ * file name: ucnvhz.c

+@@ -59,6 +59,7 @@

+ UBool isEscapeAppended;

+ UBool isStateDBCS;

+ UBool isTargetUCharDBCS;

++ UBool isEmptySegment;

+ }UConverterDataHZ;

+@@ -72,7 +73,7 @@

+ cnv->extraInfo = uprv_malloc(sizeof(UConverterDataHZ));

+ if(cnv->extraInfo != NULL){

+ uprv_memset(cnv->extraInfo, 0, sizeof(UConverterDataHZ));

+- ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode);

++ ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("GBK",errorCode);

+ }

+ else {

+ *errorCode = U_MEMORY_ALLOCATION_ERROR;

+@@ -98,6 +99,7 @@

+ cnv->mode=0;

+ if(cnv->extraInfo != NULL){

+ ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;

++ ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE;

+ }

+ if(choice!=UCNV_RESET_TO_UNICODE) {

+@@ -130,6 +132,10 @@

+ * from-GB code '~}' ($7E7D) is outside the defined GB range.)

+ *

+ * Source: RFC 1842

++*

++* Note that the formal syntax in RFC 1842 is invalid. I assume that the

++* intended definition of single-byte-segment is as follows (pedberg):

++* single-byte-segment = single-byte-seq 1*single-byte-char

+ */

+@@ -141,7 +147,7 @@

+ UChar *myTarget = args->target;

+ const char *mySourceLimit = args->sourceLimit;

+ UChar32 targetUniChar = 0x0000;

+- UChar mySourceChar = 0x0000;

++ int32_t mySourceChar = 0x0000;

+ UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);

+ tempBuf[0]=0;

+ tempBuf[1]=0;

+@@ -156,90 +162,123 @@

+ mySourceChar= (unsigned char) *mySource++;

+- switch(mySourceChar){

++ if(args->converter->mode == UCNV_TILDE) {

++ /* second byte after ~ */

++ args->converter->mode=0;

++ switch(mySourceChar) {

+ case 0x0A:

+- if(args->converter->mode ==UCNV_TILDE){

+- args->converter->mode=0;

+- }

+- *(myTarget++)=(UChar)mySourceChar;

++ /* no output for ~\n (line-continuation marker) */

+ continue;

+ case UCNV_TILDE:

+- if(args->converter->mode ==UCNV_TILDE){

+- *(myTarget++)=(UChar)mySourceChar;

+- args->converter->mode=0;

+- continue;

++ if(args->offsets) {

++ args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2);

+ }

+- else if(args->converter->toUnicodeStatus !=0){

+- args->converter->mode=0;

+- break;

+- }

+- else{

+- args->converter->mode = UCNV_TILDE;

+- continue;

+- }

++ *(myTarget++)=(UChar)mySourceChar;

++ myData->isEmptySegment = FALSE;

++ continue;

+ case UCNV_OPEN_BRACE:

+- if(args->converter->mode == UCNV_TILDE){

+- args->converter->mode=0;

+- myData->isStateDBCS = TRUE;

+- continue;

+- }

+- else{

+- break;

+- }

+ case UCNV_CLOSE_BRACE:

+- if(args->converter->mode == UCNV_TILDE){

+- args->converter->mode=0;

+- myData->isStateDBCS = FALSE;

+- continue;

+- }

+- else{

+- break;

++ myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE);

++ if (myData->isEmptySegment) {

++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */

++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;

++ args->converter->toUCallbackReason = UCNV_IRREGULAR;

++ args->converter->toUBytes[0] = UCNV_TILDE;

++ args->converter->toUBytes[1] = mySourceChar;

++ args->converter->toULength = 2;

++ args->target = myTarget;

++ args->source = mySource;

++ return;

+ }

++ myData->isEmptySegment = TRUE;

++ continue;

+ default:

+ /* if the first byte is equal to TILDE and the trail byte

+ * is not a valid byte then it is an error condition

+ */

+- if(args->converter->mode == UCNV_TILDE){

+- args->converter->mode=0;

+- mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));

+- goto SAVE_STATE;

+- }

+- break;

+- }

+- if(myData->isStateDBCS){

++ /*

++ * Ticket 5691: consistent illegal sequences:

++ * - We include at least the first byte in the illegal sequence.

++ * - If any of the non-initial bytes could be the start of a character,

++ * we stop the illegal sequence before the first one of those.

++ */

++ myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */

++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;

++ args->converter->toUBytes[0] = UCNV_TILDE;

++ if( myData->isStateDBCS ?

++ (0x21 <= mySourceChar && mySourceChar <= 0x7e) :

++ mySourceChar <= 0x7f

++ ) {

++ /* The current byte could be the start of a character: Back it out. */

++ args->converter->toULength = 1;

++ --mySource;

+ } else {

-+ /* Back out bytes from the previous buffer: Need to replay them. */

-+ cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);

-+ /* preToULength is negative! */

-+ uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);

-+ source=(const uint8_t *)pArgs->source;

++ /* Include the current byte in the illegal sequence. */

++ args->converter->toUBytes[1] = mySourceChar;

++ args->converter->toULength = 2;

+ }

++ args->target = myTarget;

++ args->source = mySource;

++ return;

+ }

-+ }

- break;

- } else /* unassigned sequences indicated with byteIndex>0 */ {

- /* try an extension mapping */

-@@ -2104,7 +2191,7 @@

- &offsets, sourceIndex,

- pArgs->flush,

- pErrorCode);

-- sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source);

-+ sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);

++ } else if(myData->isStateDBCS) {

+ if(args->converter->toUnicodeStatus == 0x00){

+- args->converter->toUnicodeStatus = (UChar) mySourceChar;

++ /* lead byte */

++ if(mySourceChar == UCNV_TILDE) {

++ args->converter->mode = UCNV_TILDE;

++ } else {

++ /* add another bit to distinguish a 0 byte from not having seen a lead byte */

++ args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100);

++ myData->isEmptySegment = FALSE; /* the segment has something, either valid or will produce a different error, so reset this */

++ }

+ continue;

+ }

+ else{

+- tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ;

+- tempBuf[1] = (char) (mySourceChar+0x80);

+- mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));

++ /* trail byte */

++ int leadIsOk, trailIsOk;

++ uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;

++ targetUniChar = 0xffff;

++ /*

++ * Ticket 5691: consistent illegal sequences:

++ * - We include at least the first byte in the illegal sequence.

++ * - If any of the non-initial bytes could be the start of a character,

++ * we stop the illegal sequence before the first one of those.

++ *

++ * In HZ DBCS, if the second byte is in the 21..7e range,

++ * we report only the first byte as the illegal sequence.

++ * Otherwise we convert or report the pair of bytes.

++ */

++ leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21);

++ trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);

++ if (leadIsOk && trailIsOk) {

++ tempBuf[0] = (char) (leadByte+0x80) ;

++ tempBuf[1] = (char) (mySourceChar+0x80);

++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,

++ tempBuf, 2, args->converter->useFallback);

++ mySourceChar= (leadByte << 8) | mySourceChar;

++ } else if (trailIsOk) {

++ /* report a single illegal byte and continue with the following DBCS starter byte */

++ --mySource;

++ mySourceChar = (int32_t)leadByte;

++ } else {

++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */

++ /* add another bit so that the code below writes 2 bytes in case of error */

++ mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;

++ }

+ args->converter->toUnicodeStatus =0x00;

+- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,

+- tempBuf, 2, args->converter->useFallback);

+ }

+ else{

+- if(args->converter->fromUnicodeStatus == 0x00){

+- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,

+- mySource - 1, 1, args->converter->useFallback);

+- }

+- else{

+- goto SAVE_STATE;

++ if(mySourceChar == UCNV_TILDE) {

++ args->converter->mode = UCNV_TILDE;

++ continue;

++ } else if(mySourceChar <= 0x7f) {

++ targetUniChar = (UChar)mySourceChar; /* ASCII */

++ myData->isEmptySegment = FALSE; /* the segment has something valid */

++ } else {

++ targetUniChar = 0xffff;

++ myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */

+ }

+ if(targetUniChar < 0xfffe){

+ if(args->offsets) {

+@@ -248,26 +287,17 @@

- if(U_FAILURE(*pErrorCode)) {

- /* not mappable or buffer overflow */

-@@ -2395,15 +2482,37 @@

- if(c<0) {

- if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {

-- *pErrorCode=U_TRUNCATED_CHAR_FOUND;

-- }

-- if(U_FAILURE(*pErrorCode)) {

- /* incomplete character byte sequence */

- uint8_t *bytes=cnv->toUBytes;

- cnv->toULength=(int8_t)(source-lastSource);

- do {

- *bytes++=*lastSource++;

- } while(lastSource<source);

-+ *pErrorCode=U_TRUNCATED_CHAR_FOUND;

-+ } else if(U_FAILURE(*pErrorCode)) {

-+ /* callback(illegal) */

-+ /*

-+ * Ticket 5691: consistent illegal sequences:

-+ * - We include at least the first byte in the illegal sequence.

-+ * - If any of the non-initial bytes could be the start of a character,

-+ * we stop the illegal sequence before the first one of those.

-+ */

-+ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);

-+ uint8_t *bytes=cnv->toUBytes;

-+ *bytes++=*lastSource++; /* first byte */

-+ if(lastSource==source) {

-+ cnv->toULength=1;

-+ } else /* lastSource<source: multi-byte character */ {

-+ int8_t i;

-+ for(i=1;

-+ lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);

-+ ++i

+ *(myTarget++)=(UChar)targetUniChar;

+ }

+- else if(targetUniChar>=0xfffe){

+-SAVE_STATE:

++ else /* targetUniChar>=0xfffe */ {

+ if(targetUniChar == 0xfffe){

+ *err = U_INVALID_CHAR_FOUND;

+ }

+ else{

+ *err = U_ILLEGAL_CHAR_FOUND;

+ }

+- if(myData->isStateDBCS){

+- /* this should never occur since isStateDBCS is set to true

+- * only after tempBuf[0] and tempBuf[1]

+- * are set to the input .. just to please BEAM

+- */

+- if(tempBuf[0]==0 || tempBuf[1]==0){

+- *err = U_INTERNAL_PROGRAM_ERROR;

+- }else{

+- args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80);

+- args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80);

+- args->converter->toULength=2;

+- }

++ if(mySourceChar > 0xff){

++ args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8);

++ args->converter->toUBytes[1] = (uint8_t)mySourceChar;

++ args->converter->toULength=2;

+ }

+ else{

+ args->converter->toUBytes[0] = (uint8_t)mySourceChar;

+@@ -328,16 +358,21 @@

+ escSeq = TILDE_ESCAPE;

+ CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);

+ continue;

+- }

+- else{

++ } else if(mySourceChar <= 0x7f) {

++ length = 1;

++ targetUniChar = mySourceChar;

++ } else {

+ length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData,

+ mySourceChar,&targetUniChar,args->converter->useFallback);

+- }

+- /* only DBCS or SBCS characters are expected*/

+- /* DB haracters with high bit set to 1 are expected */

+- if(length > 2 || length==0 ||(((targetUniChar & 0x8080) != 0x8080)&& length==2)){

+- targetUniChar= missingCharMarker;

++ /* we can only use lead bytes 21..7D and trail bytes 21..7E */

++ if( length == 2 &&

++ (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) &&

++ (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1)

+ ) {

-+ *bytes++=*lastSource++;

++ targetUniChar -= 0x8080;

++ } else {

++ targetUniChar = missingCharMarker;

+ }

-+ cnv->toULength=i;

-+ source=lastSource;

-+ }

- } else {

- /* no output because of empty input or only state changes */

- *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;

-diff -ru trie.clean/source/test/cintltst/nccbtst.c chrome.canonical/source/test/cintltst/nccbtst.c

---- trie.clean/source/test/cintltst/nccbtst.c 2007-09-19 09:45:00.986804000 -0700

-+++ chrome.canonical/source/test/cintltst/nccbtst.c 2008-10-29 11:08:51.102376000 -0700

+ }

+ if (targetUniChar != missingCharMarker){

+ myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF);

+@@ -360,22 +395,22 @@

+ if(isTargetUCharDBCS){

+ if( myTargetIndex <targetLength){

+- myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80);

++ myTarget[myTargetIndex++] =(char) (targetUniChar >> 8);

+ if(offsets){

+ *(offsets++) = mySourceIndex-1;

+ }

+ if(myTargetIndex < targetLength){

+- myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80);

++ myTarget[myTargetIndex++] =(char) targetUniChar;

+ if(offsets){

+ *(offsets++) = mySourceIndex-1;

+ }

+ }else{

+- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);

++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;

+ *err = U_BUFFER_OVERFLOW_ERROR;

+ }

+ }else{

+- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80);

+- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);

++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) (targetUniChar >> 8);

++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;

+ *err = U_BUFFER_OVERFLOW_ERROR;

+ }

+@@ -524,14 +559,14 @@

+ const USetAdder *sa,

+ UConverterUnicodeSet which,

+ UErrorCode *pErrorCode) {

+- /* the tilde '~' is hardcoded in the converter */

+- sa->add(sa->set, 0x7e);

++ /* HZ converts all of ASCII */

++ sa->addRange(sa->set, 0, 0x7f);

+ /* add all of the code points that the sub-converter handles */

+- ((UConverterDataHZ*)cnv->extraInfo)->

+- gbConverter->sharedData->impl->

+- getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,

+- sa, which, pErrorCode);

++ ucnv_MBCSGetFilteredUnicodeSetForUnicode(

++ ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData,

++ sa, which, UCNV_SET_FILTER_HZ,

++ pErrorCode);

+ }

+ static const UConverterImpl _HZImpl={

+--- r22777/source/common/ucnv_set.c 2005-06-03 13:17:54.000000000 -0700

++++ chrome.canonical/source/common/ucnv_set.c 2009-03-23 12:30:09.917043000 -0700

+@@ -1,7 +1,7 @@

+ /*

+ *******************************************************************************

+ *

+ *******************************************************************************

+@@ -52,7 +52,8 @@

+ uset_add,

+ uset_addRange,

+ uset_addString,

+- uset_remove

++ uset_remove,

++ uset_removeRange

+ };

+ sa.set=setFillIn;

+--- r22777/source/common/ucnv_bld.c 2007-08-24 02:44:10.880047000 -0700

++++ chrome.canonical/source/common/ucnv_bld.c 2009-03-23 12:40:10.653507000 -0700

+@@ -932,6 +932,7 @@

+ myUConverter->subCharLen = mySharedConverterData->staticData->subCharLen;

+ myUConverter->subChars = (uint8_t *)myUConverter->subUChars;

+ uprv_memcpy(myUConverter->subChars, mySharedConverterData->staticData->subChar, myUConverter->subCharLen);

++ myUConverter->toUCallbackReason = UCNV_ILLEGAL; /* default reason to invoke (*fromCharErrorBehaviour) */

+ if(mySharedConverterData->impl->open != NULL) {

+ mySharedConverterData->impl->open(myUConverter, realName, locale, options, err);

+--- r22777/source/common/ucnv_bld.h 2006-07-05 16:08:50.000000000 -0700

++++ chrome.canonical/source/common/ucnv_bld.h 2009-03-23 12:40:10.680507000 -0700

@@ -1,6 +1,6 @@

- /********************************************************************

- * COPYRIGHT:

- ********************************************************************/

-@@ -2530,13 +2530,13 @@

+ **********************************************************************

+ *

+@@ -226,6 +226,9 @@

+ char preToU[UCNV_EXT_MAX_BYTES];

+ int8_t preFromULength, preToULength; /* negative: replay */

+ int8_t preToUFirstLength; /* length of first character */

++ /* new fields for ICU 4.0 */

++ UConverterCallbackReason toUCallbackReason; /* (*fromCharErrorBehaviour) reason, set when error is detected */

+ };

+ U_CDECL_END /* end of UConverter */

+--- r22777/source/common/ucnv_ext.c 2007-08-22 22:46:49.525855000 -0700

++++ chrome.canonical/source/common/ucnv_ext.c 2009-03-23 12:30:33.135573000 -0700

+@@ -946,7 +946,7 @@

+ ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,

+ const int32_t *cx,

+ const USetAdder *sa,

+- UConverterUnicodeSet which,

++ UBool useFallback,

+ int32_t minLength,

+ UChar32 c,

+ UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,

+@@ -966,7 +966,7 @@

+ value=*fromUSectionValues++;

- static const uint8_t text943[] = {

-- 0x82, 0xa9, 0x82, 0x20, /*0xc8,*/ 0x61, 0x8a, 0xbf, 0x8e, 0x9a };

-- static const UChar toUnicode943sub[] = { 0x304b, 0xfffd, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57};

-- static const UChar toUnicode943skip[]= { 0x304b, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57};

-+ 0x82, 0xa9, 0x82, 0x20, 0x61, 0x8a, 0xbf, 0x8e, 0x9a };

-+ static const UChar toUnicode943sub[] = { 0x304b, 0x1a, 0x20, 0x0061, 0x6f22, 0x5b57 };

-+ static const UChar toUnicode943skip[]= { 0x304b, 0x20, 0x0061, 0x6f22, 0x5b57 };

- static const UChar toUnicode943stop[]= { 0x304b};

+ if( value!=0 &&

+- UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) &&

++ (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) &&

+ UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength

+ ) {

+ if(c>=0) {

+@@ -987,12 +987,14 @@

+ /* no mapping, do nothing */

+ } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {

+ ucnv_extGetUnicodeSetString(

+- sharedData, cx, sa, which, minLength,

++ sharedData, cx, sa, useFallback, minLength,

+ U_SENTINEL, s, length+1,

+ (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),

+ pErrorCode);

+- } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==

+- UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&

++ } else if((useFallback ?

++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :

++ ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==

++ UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&

+ UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength

+ ) {

+ sa->addString(sa->set, s, length+1);

+@@ -1004,6 +1006,7 @@

+ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,

+ const USetAdder *sa,

+ UConverterUnicodeSet which,

++ UConverterSetFilter filter,

+ UErrorCode *pErrorCode) {

+ const int32_t *cx;

+ const uint16_t *stage12, *stage3, *ps2, *ps3;

+@@ -1011,6 +1014,7 @@

-- static const int32_t fromIBM943Offssub[] = {0, 2, 4, 5, 7};

-- static const int32_t fromIBM943Offsskip[] = { 0, 4, 5, 7};

-+ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 7 };

-+ static const int32_t fromIBM943Offsskip[] = { 0, 3, 4, 5, 7 };

- static const int32_t fromIBM943Offsstop[] = { 0};

+ uint32_t value;

+ int32_t st1, stage1Length, st2, st3, minLength;

++ UBool useFallback;

- gInBufferSize = inputsize;

-@@ -2570,9 +2570,9 @@

- {

- static const uint8_t sampleText[] = {

- 0x82, 0xa9, 0x61, 0x62, 0x63 , 0x82,

-- 0xff, /*0x82, 0xa9,*/ 0x32, 0x33};

-- static const UChar toUnicode943sub[] = {0x304b, 0x0061, 0x0062, 0x0063, 0xfffd,/*0x304b,*/ 0x0032, 0x0033};

-- static const int32_t fromIBM943Offssub[] = {0, 2, 3, 4, 5, 7, 8};

-+ 0xff, 0x32, 0x33};

-+ static const UChar toUnicode943sub[] = { 0x304b, 0x0061, 0x0062, 0x0063, 0x1a, 0x1a, 0x0032, 0x0033 };

-+ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 6, 7, 8 };

- /*checking illegal value for ibm-943 with substitute*/

- gInBufferSize = inputsize;

- gOutBufferSize = outputsize;

-diff -ru trie.clean/source/test/cintltst/nucnvtst.c chrome.canonical/source/test/cintltst/nucnvtst.c

---- trie.clean/source/test/cintltst/nucnvtst.c 2007-10-11 14:52:29.172174000 -0700

-+++ chrome.canonical/source/test/cintltst/nucnvtst.c 2008-10-29 11:08:51.194286000 -0700

-@@ -2606,7 +2606,7 @@

- TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");

- /*Test for the condition where there is an invalid character*/

- {

-- static const uint8_t source2[]={0xa1, 0x01};

-+ static const uint8_t source2[]={0xa1, 0x80};

- TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_ZERO_ERROR, "an invalid character");

- }

- /*Test for the condition where we have a truncated char*/

-@@ -3899,11 +3899,11 @@

- TestISO_2022_KR() {

- /* test input */

- static const uint16_t in[]={

-- 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x000A,0x000D

-- ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC02,0xAC04

-+ 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x000D

-+ ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC04

- ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0028,0x0029

- ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x53CA,0x53CB

-- ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x53E1,0x53E2

-+ ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x53E2

- ,0x53E3,0x53E4,0x000A,0x000D};

- const UChar* uSource;

- const UChar* uSourceLimit;

-diff -ru trie.clean/source/test/testdata/conversion.txt chrome.canonical/source/test/testdata/conversion.txt

---- trie.clean/source/test/testdata/conversion.txt 2007-10-11 14:31:32.196532000 -0700

-+++ chrome.canonical/source/test/testdata/conversion.txt 2008-10-29 11:37:09.419716000 -0700

-@@ -48,13 +48,135 @@

- toUnicode {

- Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }

- Cases {

-+ // Test ticket 5691: consistent illegal sequences

-+ // The following test cases are for illegal character byte sequences.

-+ //

-+ // Unfortunately, we cannot use the Shift-JIS examples from the ticket

-+ // comments because our Shift-JIS table is Windows-compatible and

-+ // therefore has no illegal single bytes. Same for GBK.

-+ // Instead, we use the stricter GB 18030 also for 2-byte examples.

-+ // The byte sequences are generally slightly different from the ticket

-+ // comment, simply using assigned characters rather than just

-+ // theoretically valid sequences.

-+ {

-+ "gb18030",

-+ :bin{ 618140813c81ff7a },

-+ "a\u4e02\\x81<\\x81\\xFFz",

-+ :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 },

-+ :int{1}, :int{0}, "", "&C", :bin{""}

-+ }

-+ {

-+ "EUC-JP",

-+ :bin{ 618fb0a98fb03c8f3cb0a97a },

-+ "a\u4e28\\x8F\\xB0<\\x8F<\u9022z",

-+ :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 },

-+ :int{1}, :int{0}, "", "&C", :bin{""}

-+ }

-+ {

-+ "gb18030",

-+ :bin{ 618130fc318130fc8181303c3e813cfc817a },

-+ "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z",

-+ :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 },

-+ :int{1}, :int{0}, "", "&C", :bin{""}

-+ }

-+ {

-+ "UTF-8",

-+ :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a },

-+ "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z",

-+ :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 },

-+ :int{1}, :int{0}, "", "&C", :bin{""}

-+ }

-+ {

-+ "ISO-2022-JP",

-+ :bin{ 1b24424141af4142affe41431b2842 },

-+ "\u758f\\xAF\u758e\\xAF\\xFE\u790e",

-+ :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 },

-+ :int{1}, :int{0}, "", "&C", :bin{""}

-+ }

-+ {

-+ "ibm-25546",

-+ :bin{ 411b242943420e4141af4142affe41430f5a },

-+ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",

-+ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },

-+ :int{1}, :int{0}, "", "&C", :bin{""}

-+ }

-+ {

-+ "ISO-2022-KR",

-+ :bin{ 411b242943420e4141af4142affe41430f5a },

-+ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",

-+ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },

-+ :int{1}, :int{0}, "", "&C", :bin{""}

-+ }

-+ {

-+ "ISO-2022-CN",

-+ :bin{ 411b242941420e4141af4142affe41430f5a },

-+ "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",

-+ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },

-+ :int{1}, :int{0}, "", "&C", :bin{""}

-+ }

-+ {

-+ "HZ",

-+ :bin{ 417e7b4141af4142affe41437e7d5a },

-+ "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",

-+ :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 },

-+ :int{1}, :int{0}, "", "&C", :bin{""}

-+ }

-+ // Test ticket 5691: consistent illegal sequences

-+ // The following test cases are for illegal escape/designator/shift sequences.

-+ //

-+ // ISO-2022-JP and -CN with illegal escape sequences.

-+ {

-+ "ISO-2022-JP",

-+ :bin{ 611b24201b244241411b283f1b28427a },

-+ "a\\x1B$ \u758f\\x1B\u2538z",

-+ :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 },

-+ :int{1}, :int{0}, "", "&C", :bin{""}

-+ }

-+ {

-+ "ISO-2022-CN",

-+ :bin{ 611b2429201b2429410e41410f7a },

-+ "a\\x1B$) \u4eaez",

-+ :intvector{ 0,1,1,1,1,2,3,4,10,13 },

-+ :int{1}, :int{0}, "", "&C", :bin{""}

-+ }

-+ // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences.

-+ // The first ESC N comes before its designator sequence, the last sequence is ESC+space.

-+ {

-+ "ISO-2022-JP-2",

-+ :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e },

-+ "N\\x1BNNN\xceN\\x1B N",

-+ :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 },

-+ :int{1}, :int{0}, "", "&C", :bin{""}

-+ }

-+ {

-+ "ISO-2022-CN-EXT",

-+ :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e },

-+ "N\\x1BNNN\u8f0eN\\x1B N",

-+ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },

-+ :int{1}, :int{0}, "", "&C", :bin{""}

-+ }

-+ {

-+ "ISO-2022-CN-EXT",

-+ :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f },

-+ "O\\x1BOOO\u492bO\\x1B O",

-+ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },

-+ :int{1}, :int{0}, "", "&C", :bin{""}

-+ }

-+ // Test ticket 5691: Example from Peter Edberg.

-+ {

-+ "ISO-2022-JP",

-+ :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 },

-+ "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda",

-+ :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 },

-+ :int{1}, :int{0}, "", "?", :bin{""}

-+ }

- // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and

- // using the Shift-JIS table for JIS X 0208 (ticket #5797)

- {

- "ISO-2022-JP",

- :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 },

-- "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",

-- :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },

-+ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",

-+ :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 },

- :int{1}, :int{1}, "", "?", :bin{""}

- }

- // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets()

-@@ -303,7 +425,7 @@

- {

- "ISO-2022-CN-EXT",

- :bin{ 411b4e2121 }, "\x41", :intvector{ 0 },

-- :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e }

-+ :int{1}, :int{1}, "illesc", ".", :bin{ 1b }

- }

- // G3 designator: recognized, but not supported for -CN (only for -CN-EXT)

- {

+ UChar s[UCNV_EXT_MAX_UCHARS];

+ UChar32 c;

+@@ -1027,10 +1031,16 @@

+ stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];

++ useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);

+ /* enumerate the from-Unicode trie table */

+ c=0; /* keep track of the current code point while enumerating */

+- if(sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) {

++ if(filter==UCNV_SET_FILTER_2022_CN) {

++ minLength=3;

++ } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||

++ filter!=UCNV_SET_FILTER_NONE

++ ) {

+ /* DBCS-only, ignore single-byte results */

+ minLength=2;

+ } else {

+@@ -1064,14 +1074,48 @@

+ length=0;

+ U16_APPEND_UNSAFE(s, length, c);

+ ucnv_extGetUnicodeSetString(

+- sharedData, cx, sa, which, minLength,

++ sharedData, cx, sa, useFallback, minLength,

+ c, s, length,

+ (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),

+ pErrorCode);

+- } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==

+- UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&

++ } else if((useFallback ?

++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :

++ ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==

++ UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&

+ UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength

+ ) {

++ switch(filter) {

++ case UCNV_SET_FILTER_2022_CN:

++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) {

++ continue;

++ }

++ break;

++ case UCNV_SET_FILTER_SJIS:

++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) {

++ continue;

++ }

++ break;

++ case UCNV_SET_FILTER_GR94DBCS:

++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&

++ (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe - 0xa1a1) &&

++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {

++ continue;

++ }

++ break;

++ case UCNV_SET_FILTER_HZ:

++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&

++ (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) &&

++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {

++ continue;

++ }

++ break;

++ default:

++ /*

++ * UCNV_SET_FILTER_NONE,

++ * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength

++ */

++ break;

++ }

+ sa->add(sa->set, c);

+ }

+ } while((++c&0xf)!=0);

« no previous file with comments | « third_party/icu38/source/test/testdata/testdata.mak ('k') | third_party/icu38/uconv.security.header.patch » ('j') | no next file with comments »