Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(819)

Unified Diff: third_party/icu38/uconv.security.patch

Issue 52030: Apply ICU patches for ICU tickets 6175 (ISO-2022 and ... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 11 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/icu38/source/test/testdata/testdata.mak ('k') | third_party/icu38/uconv.security.header.patch » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/icu38/uconv.security.patch
===================================================================
--- third_party/icu38/uconv.security.patch (revision 10949)
+++ third_party/icu38/uconv.security.patch (working copy)
@@ -1,7 +1,1196 @@
-diff -ru trie.clean/source/common/ucnv2022.c chrome.canonical/source/common/ucnv2022.c
---- trie.clean/source/common/ucnv2022.c 2007-11-07 17:39:05.057870000 -0800
-+++ chrome.canonical/source/common/ucnv2022.c 2008-10-29 12:52:22.517453000 -0700
-@@ -752,6 +752,7 @@
+--- r22777/source/test/cintltst/nucnvtst.c 2007-10-11 14:52:29.172174000 -0700
++++ chrome.canonical/source/test/cintltst/nucnvtst.c 2009-03-23 12:42:01.106292000 -0700
+@@ -17,6 +17,7 @@
+ #include "unicode/uloc.h"
+ #include "unicode/ucnv.h"
+ #include "unicode/ucnv_err.h"
++#include "unicode/ucnv_cb.h"
+ #include "cintltst.h"
+ #include "unicode/utypes.h"
+ #include "unicode/ustring.h"
+@@ -81,6 +82,7 @@
+ static void TestJitterbug2411(void);
+ static void TestJB5275(void);
+ static void TestJB5275_1(void);
++static void TestJitterbug6175(void);
+ #endif
+
+ static void TestRoundTrippingAllUTF(void);
+@@ -297,6 +299,7 @@
+ #if !UCONFIG_NO_LEGACY_CONVERSION
+ addTest(root, &TestJitterbug2346, "tsconv/nucnvtst/TestJitterbug2346");
+ addTest(root, &TestJitterbug2411, "tsconv/nucnvtst/TestJitterbug2411");
++ addTest(root, &TestJitterbug6175, "tsconv/nucnvtst/TestJitterbug6175");
+ #endif
+
+ }
+@@ -2606,7 +2609,7 @@
+ TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");
+ /*Test for the condition where there is an invalid character*/
+ {
+- static const uint8_t source2[]={0xa1, 0x01};
++ static const uint8_t source2[]={0xa1, 0x80};
+ TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_ZERO_ERROR, "an invalid character");
+ }
+ /*Test for the condition where we have a truncated char*/
+@@ -3899,11 +3902,11 @@
+ TestISO_2022_KR() {
+ /* test input */
+ static const uint16_t in[]={
+- 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x000A,0x000D
+- ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC02,0xAC04
++ 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x000D
++ ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC04
+ ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0028,0x0029
+ ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x53CA,0x53CB
+- ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x53E1,0x53E2
++ ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x53E2
+ ,0x53E3,0x53E4,0x000A,0x000D};
+ const UChar* uSource;
+ const UChar* uSourceLimit;
+@@ -4456,6 +4459,70 @@
+ free(offsets);
+ }
+
++/* Tests for empty segments in ISO-2022-JP/KR/CN and HZ: check that the UConverterCallbackReason is UCNV_IRREGULAR. */
++typedef struct {
++ const char * converterName; /* name passed to ucnv_open(); NULL terminates the test table */
++ const char * inputText; /* bytes to convert with ucnv_toUnicode(); the test arrays are not NUL-terminated */
++ int inputTextLength; /* number of bytes in inputText (the table uses sizeof of the array) */
++} EmptySegmentTest;
++
++/* toUnicode callback installed by TestJitterbug6175; it should only get called for empty segment errors. context, codeUnits and length are unused. */
++static void UCNV_TO_U_CALLBACK_EMPTYSEGMENT( const void *context, UConverterToUnicodeArgs *toArgs, const char* codeUnits,
++ int32_t length, UConverterCallbackReason reason, UErrorCode * err ) {
++ if (reason > UCNV_IRREGULAR) { /* reasons above UCNV_IRREGULAR are not checked by this test */
++ return;
++ }
++ if (reason != UCNV_IRREGULAR) { /* any remaining reason other than UCNV_IRREGULAR is logged as a test error */
++ log_err("toUnicode callback invoked for empty segment but reason is not UCNV_IRREGULAR\n");
++ }
++ /* Standard stuff below from UCNV_TO_U_CALLBACK_SUBSTITUTE */
++ *err = U_ZERO_ERROR; /* clear the error that was reported to the callback */
++ ucnv_cbToUWriteSub(toArgs,0,err); /* write the substitution, with offset index 0 */
++}
++
++enum { kEmptySegmentToUCharsMax = 64 }; /* capacity, in UChars, of the output buffer used by TestJitterbug6175 */
++static void TestJitterbug6175(void) { /* converts each table entry to Unicode with UCNV_TO_U_CALLBACK_EMPTYSEGMENT installed */
++ static const char iso2022jp_a[] = { 0x61, 0x62, 0x1B,0x24,0x42, 0x1B,0x28,0x42, 0x63, 0x64, 0x0D, 0x0A }; /* a b, ESC $ B, ESC ( B, c d, CR LF */
++ static const char iso2022kr_a[] = { 0x1B,0x24,0x29,0x43, 0x61, 0x0E, 0x0F, 0x62, 0x0D, 0x0A }; /* ESC $ ) C, a, SO, SI, b, CR LF */
++ static const char iso2022cn_a[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x0F, 0x1B,0x24,0x2A,0x48, 0x1B,0x4E, 0x6A,0x65, 0x63, 0x0D, 0x0A }; /* a, ESC $ ) A, b, SO, SI, ESC $ * H, ESC N, 6A 65, c, CR LF */
++ static const char iso2022cn_b[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x1B,0x24,0x29,0x47, 0x68,0x64, 0x0F, 0x63, 0x0D, 0x0A }; /* a, ESC $ ) A, b, SO, ESC $ ) G, 68 64, SI, c, CR LF */
++ static const char hzGB2312_a[] = { 0x61, 0x62, 0x7E,0x7B, 0x7E,0x7D, 0x63, 0x64 }; /* a b, ~{, ~}, c d */
++ static const EmptySegmentTest emptySegmentTests[] = {
++ /* converterName inputText inputTextLength */
++ { "ISO-2022-JP", iso2022jp_a, sizeof(iso2022jp_a) },
++ { "ISO-2022-KR", iso2022kr_a, sizeof(iso2022kr_a) },
++ { "ISO-2022-CN", iso2022cn_a, sizeof(iso2022cn_a) },
++ { "ISO-2022-CN", iso2022cn_b, sizeof(iso2022cn_b) },
++ { "HZ-GB-2312", hzGB2312_a, sizeof(hzGB2312_a) },
++ /* terminator: */
++ { NULL, NULL, 0, }
++ };
++ const EmptySegmentTest * testPtr;
++ for (testPtr = emptySegmentTests; testPtr->converterName != NULL; ++testPtr) { /* walk the table up to the NULL terminator */
++ UErrorCode err = U_ZERO_ERROR;
++ UConverter * cnv = ucnv_open(testPtr->converterName, &err);
++ if (U_FAILURE(err)) {
++ log_data_err("Unable to open %s converter: %s\n", testPtr->converterName, u_errorName(err));
++ return; /* the remaining table entries are not run either */
++ }
++ ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_EMPTYSEGMENT, NULL, NULL, NULL, &err); /* NULL context; the previous callback is not retrieved */
++ if (U_FAILURE(err)) {
++ log_data_err("Unable to setToUCallBack for %s converter: %s\n", testPtr->converterName, u_errorName(err));
++ ucnv_close(cnv);
++ return;
++ }
++ {
++ UChar toUChars[kEmptySegmentToUCharsMax];
++ UChar * toUCharsPtr = toUChars;
++ const UChar * toUCharsLimit = toUCharsPtr + kEmptySegmentToUCharsMax;
++ const char * inCharsPtr = testPtr->inputText;
++ const char * inCharsLimit = inCharsPtr + testPtr->inputTextLength;
++ ucnv_toUnicode(cnv, &toUCharsPtr, toUCharsLimit, &inCharsPtr, inCharsLimit, NULL, TRUE, &err); /* no offsets, flush=TRUE; NOTE(review): neither err nor the output is examined afterwards, only the callback reports problems */
++ }
++ ucnv_close(cnv);
++ }
++}
++
+ static void
+ TestEBCDIC_STATEFUL() {
+ /* test input */
+--- r22777/source/test/cintltst/ncnvtst.c 2007-01-24 15:27:45.575224000 -0800
++++ chrome.canonical/source/test/cintltst/ncnvtst.c 2009-03-23 12:30:17.291031000 -0700
+@@ -1928,7 +1928,7 @@
+ #if !UCONFIG_NO_LEGACY_CONVERSION
+ { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff },
+ { "windows-1251", 0, 0x7f, 0x410, 0x44f, 0x3000, 0xd7ff },
+- { "HZ", 0x410, 0x44f, 0x4e00, 0x4eff, 0xac00, 0xd7ff },
++ /* HZ test case fixed and moved to intltest's conversion.txt, ticket #6002 */
+ { "shift-jis", 0x3041, 0x3093, 0x30a1, 0x30f3, 0x900, 0x1cff }
+ #else
+ { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff }
+--- r22777/source/test/intltest/convtest.h 2007-07-26 20:12:12.288784000 -0700
++++ chrome.canonical/source/test/intltest/convtest.h 2009-03-23 12:30:09.445194000 -0700
+@@ -72,6 +72,7 @@
+ void TestToUnicode();
+ void TestFromUnicode();
+ void TestGetUnicodeSet();
++ void TestGetUnicodeSet2();
+
+ private:
+ UBool
+--- r22777/source/test/intltest/convtest.cpp 2007-03-08 16:28:01.852223000 -0800
++++ chrome.canonical/source/test/intltest/convtest.cpp 2009-03-23 12:30:40.161868000 -0700
+@@ -70,6 +70,7 @@
+ case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
+ case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
+ case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
++ case 3: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;
+ default: name=""; break; //needed to end loop
+ }
+ }
+@@ -465,6 +466,183 @@
+ }
+ }
+
++U_CDECL_BEGIN
++static void U_CALLCONV // fromUnicode callback used by TestGetUnicodeSet2(); fromUArgs, codeUnits and length are unused
++getUnicodeSetCallback(const void *context, // the UnicodeSet from which unconvertible code points are removed
++ UConverterFromUnicodeArgs *fromUArgs,
++ const UChar* codeUnits,
++ int32_t length,
++ UChar32 codePoint, // the code point this callback invocation is about
++ UConverterCallbackReason reason,
++ UErrorCode *pErrorCode) {
++ if(reason<=UCNV_IRREGULAR) {
++ ((UnicodeSet *)context)->remove(codePoint); // the converter cannot convert this code point
++ *pErrorCode=U_ZERO_ERROR; // skip
++ } // else ignore the reset, close and clone calls.
++}
++U_CDECL_END
++
++// Compare ucnv_getUnicodeSet() with the set of characters that can be converted: convert every code point with each converter, let getUnicodeSetCallback() remove the unconvertible ones, and compare what is left with the reported set.
++void
++ConversionTest::TestGetUnicodeSet2() {
++ // Build a string with all code points.
++ UChar32 cpLimit;
++ int32_t s0Length;
++ if(quick) {
++ cpLimit=s0Length=0x10000; // BMP only
++ } else {
++ cpLimit=0x110000;
++ s0Length=0x10000+0x200000; // BMP + surrogate pairs (0x400 leads * 0x400 trails * 2 units)
++ }
++ UChar *s0=new UChar[s0Length];
++ if(s0==NULL) {
++ return;
++ }
++ UChar *s=s0;
++ UChar32 c;
++ UChar c2;
++ // low BMP
++ for(c=0; c<=0xd7ff; ++c) {
++ *s++=(UChar)c;
++ }
++ // trail surrogates
++ for(c=0xdc00; c<=0xdfff; ++c) {
++ *s++=(UChar)c;
++ }
++ // lead surrogates
++ // (after trails so that there is not even one surrogate pair in between)
++ for(c=0xd800; c<=0xdbff; ++c) {
++ *s++=(UChar)c;
++ }
++ // high BMP
++ for(c=0xe000; c<=0xffff; ++c) {
++ *s++=(UChar)c;
++ }
++ // supplementary code points = surrogate pairs
++ if(cpLimit==0x110000) {
++ for(c=0xd800; c<=0xdbff; ++c) {
++ for(c2=0xdc00; c2<=0xdfff; ++c2) {
++ *s++=(UChar)c;
++ *s++=c2;
++ }
++ }
++ }
++
++ static const char *const cnvNames[]={
++ "UTF-8",
++ "UTF-7",
++ "UTF-16",
++ "US-ASCII",
++ "ISO-8859-1",
++ "windows-1252",
++ "Shift-JIS",
++ "ibm-1390", // EBCDIC_STATEFUL table
++ "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table
++ "HZ",
++ "ISO-2022-JP",
++ "JIS7",
++ "ISO-2022-CN",
++ "ISO-2022-CN-EXT",
++ "LMBCS"
++ };
++ char buffer[1024]; // scratch output; the converted bytes themselves are never inspected
++ int32_t i;
++ for(i=0; i<LENGTHOF(cnvNames); ++i) {
++ UErrorCode errorCode=U_ZERO_ERROR;
++ UConverter *cnv=cnv_open(cnvNames[i], errorCode);
++ if(U_FAILURE(errorCode)) {
++ errln("failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode));
++ continue;
++ }
++ UnicodeSet expected;
++ ucnv_setFromUCallBack(cnv, getUnicodeSetCallback, &expected, NULL, NULL, &errorCode);
++ if(U_FAILURE(errorCode)) {
++ errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode));
++ ucnv_close(cnv);
++ continue;
++ }
++ UConverterUnicodeSet which;
++ for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) {
++ if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
++ ucnv_setFallback(cnv, TRUE);
++ }
++ expected.add(0, cpLimit-1); // start from all code points below cpLimit; the callback removes the unconvertible ones
++ s=s0;
++ UBool flush;
++ do {
++ char *t=buffer; // restart at the beginning of the scratch buffer on every call
++ flush=(UBool)(s==s0+s0Length); // flush only once all input has been consumed
++ ucnv_fromUnicode(cnv, &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode);
++ if(U_FAILURE(errorCode)) {
++ if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
++ errorCode=U_ZERO_ERROR; // scratch buffer full: clear the error and call again
++ continue;
++ } else {
++ break; // unexpected error, should not occur
++ }
++ }
++ } while(!flush);
++ UnicodeSet set;
++ ucnv_getUnicodeSet(cnv, (USet *)&set, which, &errorCode);
++ if(cpLimit<0x110000) {
++ set.remove(cpLimit, 0x10ffff);
++ }
++ if(which==UCNV_ROUNDTRIP_SET) {
++ // ignore PUA code points because they will be converted even if they
++ // are fallbacks and when other fallbacks are turned off,
++ // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips
++ expected.remove(0xe000, 0xf8ff);
++ expected.remove(0xf0000, 0xffffd);
++ expected.remove(0x100000, 0x10fffd);
++ set.remove(0xe000, 0xf8ff);
++ set.remove(0xf0000, 0xffffd);
++ set.remove(0x100000, 0x10fffd);
++ }
++ if(set!=expected) {
++ // First try to see if we have different sets because ucnv_getUnicodeSet()
++ // added strings: The above conversion method does not tell us what strings might be convertible.
++ // Remove strings from the set and compare again.
++ // Unfortunately, there are no good, direct set methods for finding out whether there are strings
++ // in the set, nor for enumerating or removing just them.
++ // Intersect all code points with the set. The intersection will not contain strings.
++ UnicodeSet temp(0, 0x10ffff);
++ temp.retainAll(set);
++ set=temp;
++ }
++ if(set!=expected) {
++ UnicodeSet diffSet;
++ UnicodeString out;
++
++ // are there items that must be in the set but are not?
++ (diffSet=expected).removeAll(set);
++ if(!diffSet.isEmpty()) {
++ diffSet.toPattern(out, TRUE);
++ if(out.length()>100) {
++ out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
++ }
++ errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",
++ cnvNames[i], which);
++ errln(out);
++ }
++
++ // are there items that must not be in the set but are?
++ (diffSet=set).removeAll(expected);
++ if(!diffSet.isEmpty()) {
++ diffSet.toPattern(out, TRUE);
++ if(out.length()>100) {
++ out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
++ }
++ errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d",
++ cnvNames[i], which);
++ errln(out);
++ }
++ }
++ }
++ ucnv_close(cnv); // every successfully opened converter is closed before the next one is opened
++ }
++ delete [] s0;
++}
++
+ // open testdata or ICU data converter ------------------------------------- ***
+
+ UConverter *
+--- r22777/source/test/testdata/testdata.mak 2007-07-26 20:12:12.288784000 -0700
++++ chrome.canonical/source/test/testdata/testdata.mak 2009-03-23 12:31:04.424645000 -0700
+@@ -28,7 +28,7 @@
+
+ TEST_RES_FILES = $(TEST_RES_SOURCE:.txt=.res)
+
+-"$(TESTDATAOUT)\testdata.dat" : $(TEST_RES_FILES) "$(TESTDATABLD)\casing.res" "$(TESTDATABLD)\conversion.res" "$(TESTDATABLD)\icuio.res" "$(TESTDATABLD)\mc.res" "$(TESTDATABLD)\structLocale.res" "$(TESTDATABLD)\root.res" "$(TESTDATABLD)\sh.res" "$(TESTDATABLD)\sh_YU.res" "$(TESTDATABLD)\te.res" "$(TESTDATABLD)\te_IN.res" "$(TESTDATABLD)\te_IN_REVISED.res" "$(TESTDATABLD)\testaliases.res" "$(TESTDATABLD)\testtypes.res" "$(TESTDATABLD)\testempty.res" "$(TESTDATABLD)\iscii.res" "$(TESTDATABLD)\idna_rules.res" "$(TESTDATABLD)\DataDrivenCollationTest.res" "$(TESTDATABLD)\test.icu" "$(TESTDATABLD)\testtable32.res" "$(TESTDATABLD)\test1.cnv" "$(TESTDATABLD)\test3.cnv" "$(TESTDATABLD)\test4.cnv" "$(TESTDATABLD)\test4x.cnv" "$(TESTDATABLD)\ibm9027.cnv" "$(TESTDATABLD)\nfscsi.spp" "$(TESTDATABLD)\nfscss.spp" "$(TESTDATABLD)\nfscis.spp" "$(TESTDATABLD)\nfsmxs.spp" "$(TESTDATABLD)\nfsmxp.spp"
++"$(TESTDATAOUT)\testdata.dat" : $(TEST_RES_FILES) "$(TESTDATABLD)\casing.res" "$(TESTDATABLD)\conversion.res" "$(TESTDATABLD)\icuio.res" "$(TESTDATABLD)\mc.res" "$(TESTDATABLD)\structLocale.res" "$(TESTDATABLD)\root.res" "$(TESTDATABLD)\sh.res" "$(TESTDATABLD)\sh_YU.res" "$(TESTDATABLD)\te.res" "$(TESTDATABLD)\te_IN.res" "$(TESTDATABLD)\te_IN_REVISED.res" "$(TESTDATABLD)\testaliases.res" "$(TESTDATABLD)\testtypes.res" "$(TESTDATABLD)\testempty.res" "$(TESTDATABLD)\iscii.res" "$(TESTDATABLD)\idna_rules.res" "$(TESTDATABLD)\DataDrivenCollationTest.res" "$(TESTDATABLD)\test.icu" "$(TESTDATABLD)\testtable32.res" "$(TESTDATABLD)\test1.cnv" "$(TESTDATABLD)\test1bmp.cnv" "$(TESTDATABLD)\test3.cnv" "$(TESTDATABLD)\test4.cnv" "$(TESTDATABLD)\test4x.cnv" "$(TESTDATABLD)\ibm9027.cnv" "$(TESTDATABLD)\nfscsi.spp" "$(TESTDATABLD)\nfscss.spp" "$(TESTDATABLD)\nfscis.spp" "$(TESTDATABLD)\nfsmxs.spp" "$(TESTDATABLD)\nfsmxp.spp"
+ @echo Building test data
+ @copy "$(TESTDATABLD)\te.res" "$(TESTDATAOUT)\$(TESTDT)\nam.typ"
+ @copy "$(TESTDATA)\icu26_testtypes.res" "$(TESTDATABLD)"
+@@ -54,6 +54,7 @@
+ iscii.res
+ test.icu
+ test1.cnv
++test1bmp.cnv
+ test3.cnv
+ test4.cnv
+ test4x.cnv
+@@ -126,6 +127,10 @@
+ @echo Building $@
+ @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**
+
++"$(TESTDATABLD)\test1bmp.cnv": "$(TESTDATA)\test1bmp.ucm"
++ @echo Building $@
++ @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**
++
+ "$(TESTDATABLD)\test3.cnv": "$(TESTDATA)\test3.ucm"
+ @echo Building $@
+ @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**
+--- r22777/source/test/testdata/Makefile.in 2007-08-21 13:15:55.267002000 -0700
++++ chrome.canonical/source/test/testdata/Makefile.in 2009-03-23 12:31:04.435635000 -0700
+@@ -117,7 +117,7 @@
+ TEST_DAT_FILES=$(TESTBUILDDIR)/test.icu
+ TEST_SPP_FILES=$(TESTBUILDDIR)/nfscsi.spp $(TESTBUILDDIR)/nfscss.spp $(TESTBUILDDIR)/nfscis.spp $(TESTBUILDDIR)/nfsmxs.spp $(TESTBUILDDIR)/nfsmxp.spp
+
+-TEST_UCM_SOURCE= test1.ucm test3.ucm test4.ucm test4x.ucm ibm9027.ucm
++TEST_UCM_SOURCE= test1.ucm test1bmp.ucm test3.ucm test4.ucm test4x.ucm ibm9027.ucm
+ TEST_UCM_FILES=$(TEST_UCM_SOURCE:%=$(TESTSRCDATADIR)/data/%)
+ TEST_CNV_FILES=$(TEST_UCM_SOURCE:%.ucm=$(TESTBUILDDIR)/%.cnv)
+
+--- r22777/source/test/testdata/conversion.txt 2007-10-11 14:31:32.196532000 -0700
++++ chrome.canonical/source/test/testdata/conversion.txt 2009-03-23 12:42:01.119267000 -0700
+@@ -1,6 +1,6 @@
+ //*******************************************************************************
+ //
+-// Copyright (C) 2003-2007, International Business Machines
++// Copyright (C) 2003-2008, International Business Machines
+ // Corporation and others. All Rights Reserved.
+ //
+ // file name: conversion.txt
+@@ -48,13 +48,161 @@
+ toUnicode {
+ Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
+ Cases {
++ // Test ticket 5691: consistent illegal sequences
++ // The following test cases are for illegal character byte sequences.
++ //
++ // Unfortunately, we cannot use the Shift-JIS examples from the ticket
++ // comments because our Shift-JIS table is Windows-compatible and
++ // therefore has no illegal single bytes. Same for GBK.
++ // Instead, we use the stricter GB 18030 also for 2-byte examples.
++ // The byte sequences are generally slightly different from the ticket
++ // comment, simply using assigned characters rather than just
++ // theoretically valid sequences.
++ {
++ "gb18030",
++ :bin{ 618140813c81ff7a },
++ "a\u4e02\\x81<\\x81\\xFFz",
++ :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "EUC-JP",
++ :bin{ 618fb0a98fb03c8f3cb0a97a },
++ "a\u4e28\\x8F\\xB0<\\x8F<\u9022z",
++ :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "gb18030",
++ :bin{ 618130fc318130fc8181303c3e813cfc817a },
++ "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z",
++ :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "UTF-8",
++ :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a },
++ "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z",
++ :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "ISO-2022-JP",
++ :bin{ 1b24424141af4142affe41431b2842 },
++ "\u758f\\xAF\u758e\\xAF\\xFE\u790e",
++ :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "ibm-25546",
++ :bin{ 411b242943420e4141af4142affe41430f5a },
++ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "ISO-2022-KR",
++ :bin{ 411b242943420e4141af4142affe41430f5a },
++ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "ISO-2022-CN",
++ :bin{ 411b242941420e4141af4142affe41430f5a },
++ "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "HZ",
++ :bin{ 417e7b4141af4142affe41437e7d5a },
++ "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
++ :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ // Test ticket 5691: consistent illegal sequences
++ // The following test cases are for illegal escape/designator/shift sequences.
++ //
++ // ISO-2022-JP and -CN with illegal escape sequences.
++ {
++ "ISO-2022-JP",
++ :bin{ 611b24201b244241411b283f1b28427a },
++ "a\\x1B$ \u758f\\x1B\u2538z",
++ :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "ISO-2022-CN",
++ :bin{ 611b2429201b2429410e41410f7a },
++ "a\\x1B$) \u4eaez",
++ :intvector{ 0,1,1,1,1,2,3,4,10,13 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences.
++ // The first ESC N comes before its designator sequence, the last sequence is ESC+space.
++ {
++ "ISO-2022-JP-2",
++ :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e },
++ "N\\x1BNNN\xceN\\x1B N",
++ :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "ISO-2022-CN-EXT",
++ :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e },
++ "N\\x1BNNN\u8f0eN\\x1B N",
++ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ {
++ "ISO-2022-CN-EXT",
++ :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f },
++ "O\\x1BOOO\u492bO\\x1B O",
++ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ // Test ticket 5691: HZ with illegal tilde sequences.
++ {
++ "HZ",
++ :bin{ 417e20427e21437e80447e7b41417e207e41427e7f41437e7d5a },
++ "A\\x7E B\\x7E!C\\x7E\\x80D\u4eae\\x7E\\x20\\x7E\u8c05\\x7E\\x7F\u64a9Z",
++ :intvector{ 0,1,1,1,1,2,3,4,4,4,4,5,6,7,7,7,7,7,7,7,7,9, // SBCS
++ 12,14,14,14,14,14,14,14,14,16,16,16,16,17,19,19,19,19,19,19,19,19,21, // DBCS
++ 25 }, // SBCS
++ :int{1}, :int{0}, "", "&C", :bin{""}
++ }
++ // Test ticket 5691: Example from Peter Edberg.
++ {
++ "ISO-2022-JP",
++ :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 },
++ "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda",
++ :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 },
++ :int{1}, :int{0}, "", "?", :bin{""}
++ }
++ // Test bug 6071 (2:1 Unicode:charset SBCS mapping).
++ {
++ "*test1bmp",
++ :bin{ 050008 },
++ "e@uv",
++ :intvector{ 0,1,2,2 },
++ :int{1}, :int{1}, "", "?", :bin{""}
++ }
++ // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e
++ {
++ "HZ",
++ :bin{ 7e7b21212120217e217f772100007e217e7e7d207e7e807e0a2b },
++ "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd\u3013 ~\ufffd+",
++ :intvector{ 2,4,6,8,10,12,14,15,19,20,22,25 },
++ :int{1}, :int{1}, "", "?", :bin{""}
++ }
+ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
+ // using the Shift-JIS table for JIS X 0208 (ticket #5797)
+ {
+ "ISO-2022-JP",
+ :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 },
+- "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
+- :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },
++ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
++ :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 },
+ :int{1}, :int{1}, "", "?", :bin{""}
+ }
+ // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets()
+@@ -191,6 +339,21 @@
+ :intvector{ 0, 5, 7, 9, 9, 9, 9, 9, 9, 9, 9, 12 },
+ :int{1}, :int{1}, "", "&", :bin{""}
+ }
++ // empty segment (using substitution and stop)
++ {
++ "ISO-2022-KR",
++ :bin{ 1b242943610e0f620d0a },
++ "a\uFFFDb\u000D\u000A",
++ :intvector{ 4, 6, 7, 8, 9 },
++ :int{1}, :int{1}, "", "?", :bin{""}
++ }
++ {
++ "ISO-2022-KR",
++ :bin{ 1b242943610e0f620d0a },
++ "a",
++ :intvector{ 4 },
++ :int{1}, :int{1}, "illesc", ".", :bin{"0f"}
++ }
+
+ // ISO-2022-JP
+
+@@ -241,6 +404,21 @@
+ :bin{ 41c15c1b284a5cc242 }, "A\uff81\\\xa5\uff82B", :intvector{ 0, 1, 2, 6, 7, 8 },
+ :int{1}, :int{1}, "", ".", :bin{""}
+ }
++ // empty segment (using substitution and stop)
++ {
++ "ISO-2022-JP",
++ :bin{ 61621b24421b284263640d0a },
++ "ab\uFFFDcd\u000D\u000A",
++ :intvector{ 0, 1, 5, 8, 9, 10, 11 },
++ :int{1}, :int{1}, "", "?", :bin{""}
++ }
++ {
++ "ISO-2022-JP",
++ :bin{ 61621b24421b284263640d0a },
++ "ab",
++ :intvector{ 0, 1 },
++ :int{1}, :int{1}, "illesc", ".", :bin{"1b2842"}
++ }
+
+ // ISO-2022-CN
+
+@@ -303,7 +481,7 @@
+ {
+ "ISO-2022-CN-EXT",
+ :bin{ 411b4e2121 }, "\x41", :intvector{ 0 },
+- :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e }
++ :int{1}, :int{1}, "illesc", ".", :bin{ 1b }
+ }
+ // G3 designator: recognized, but not supported for -CN (only for -CN-EXT)
+ {
+@@ -311,6 +489,36 @@
+ :bin{ 411b242b491b4f2121 }, "\x41", :intvector{ 0 },
+ :int{1}, :int{1}, "unsuppesc", ".", :bin{ 1b242b49 }
+ }
++ // empty segment 1 (using substitution and stop)
++ {
++ "ISO-2022-CN",
++ :bin{ 611b242941620e0f1b242a481b4e6a65630d0a },
++ "ab\uFFFD\u994Cc\u000D\u000A",
++ :intvector{ 0, 5, 7, 14, 16, 17, 18 },
++ :int{1}, :int{1}, "", "?", :bin{""}
++ }
++ {
++ "ISO-2022-CN",
++ :bin{ 611b242941620e0f1b242a481b4e6a65630d0a },
++ "ab",
++ :intvector{ 0, 5 },
++ :int{1}, :int{1}, "illesc", ".", :bin{"0f"}
++ }
++ // empty segment 2 (using substitution and stop)
++ {
++ "ISO-2022-CN",
++ :bin{ 611b242941620e1b24294768640f630d0a },
++ "ab\uFFFD\u5F70c\u000D\u000A",
++ :intvector{ 0, 5, 7, 11, 14, 15, 16 },
++ :int{1}, :int{1}, "", "?", :bin{""}
++ }
++ {
++ "ISO-2022-CN",
++ :bin{ 611b242941620e1b24294768640f630d0a },
++ "ab",
++ :intvector{ 0, 5 },
++ :int{1}, :int{1}, "illesc", ".", :bin{"1b242947"}
++ }
+
+ // ISO-2022 SBCS
+ // [U_ENABLE_GENERIC_ISO_2022]
+@@ -325,6 +533,39 @@
+ // :int{1}, :int{1}, "", ".", :bin{""}
+ //}
+
++ // HZ-GB-2312
++
++ // empty segment 1 (using substitution and stop)
++ {
++ "HZ-GB-2312",
++ :bin{ 61627e7b7e7d6364 },
++ "ab\uFFFDcd",
++ :intvector{ 0, 1, 4, 6, 7 },
++ :int{1}, :int{1}, "", "?", :bin{""}
++ }
++ {
++ "HZ-GB-2312",
++ :bin{ 61627e7b7e7d63640d0a },
++ "ab",
++ :intvector{ 0, 1 },
++ :int{1}, :int{1}, "illesc", ".", :bin{"7e7d"}
++ }
++ // empty segment 2 & legal redundant switches (using substitution and stop)
++ {
++ "HZ-GB-2312",
++ :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d },
++ "ab\u4E0D\u7A7A\uFFFD\u4E00cdef\uFFFD",
++ :intvector{ 0, 1, 4, 6, 10, 12, 16, 17, 20, 21, 24 },
++ :int{1}, :int{1}, "", "?", :bin{""}
++ }
++ {
++ "HZ-GB-2312",
++ :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d },
++ "ab\u4E0D\u7A7A",
++ :intvector{ 0, 1, 4, 6 },
++ :int{1}, :int{1}, "illesc", ".", :bin{"7e7b"}
++ }
++
+ // DBCS-only extensions
+ {
+ "ibm-970",
+@@ -496,6 +737,14 @@
+ :intvector{ 0, 4, 8, 12 },
+ :int{1}, :int{0}, "", "?", :bin{""}
+ }
++ // Test iso-2022-jp-2 miscellaneous symbols
++ {
++ "iso-2022-jp-2",
++ :bin{ 1b242843224f224e1b2842 },
++ "\u260E\u260F",
++ :intvector{ 4, 6 },
++ :int{1}, :int{0}, "", ".", :bin{""}
++ }
+ }
+ }
+
+@@ -504,6 +753,14 @@
+ fromUnicode {
+ Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }
+ Cases {
++ // Test bug 6071 (1:2 Unicode:charset SBCS mapping).
++ {
++ "*test1bmp",
++ "e@t",
++ :bin{ 05000709 },
++ :intvector{ 0,1,2,2 },
++ :int{1}, :int{0}, "", "?", ""
++ }
+ // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
+ // using the Shift-JIS table for JIS X 0208 (ticket #5797)
+ {
+@@ -1311,16 +1568,29 @@
+ // versions of ISO-2022-JP
+ {
+ "ISO-2022-JP",
+- "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]",
+- "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]",
++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2015\u203e\u4e00\u4e01\uffe5]",
++ "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u2014\u301c\u4e02\u4e27-\u4e29\u4fe0\u663b\u9eb5\ufa0e-\ufa2d\uff61-\uff9f\uffe4\uffe6-\U0010ffff]",
+ :int{0}
+- }
++ }
+ {
+ "ISO-2022-JP-2",
+- "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]",
+- "[\x0e\x0f\x1b\uffe7-\U0010ffff]",
++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uffe6]",
++ "[\x0e\x0f\x1b\uff61-\uff9f\uffe4\uffe7-\U0010ffff]",
++ :int{0}
++ }
++ {
++ "JIS7",
++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uff61-\uff9f\uffe6]",
++ "[\x0e\x0f\x1b\uffe4\uffe7-\U0010ffff]",
+ :int{0}
+ }
++ // with fallbacks
++ {
++ "ISO-2022-JP",
++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2014\u2015\u203e\u301c\u4e00\u4e01\u4fe0\u9eb5\uff61-\uff9f\uffe5]",
++ "[\x0e\x0f\x1b\xa6\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\u663b\ufa0e-\ufa2d\uffe4\uffe6-\U0010ffff]",
++ :int{1}
++ }
+
+ // versions of ISO-2022-CN
+ {
+@@ -1336,6 +1606,22 @@
+ :int{0}
+ }
+
++ // HZ
++ {
++ "HZ",
++ "[\u0410-\u044f\u4e00\u4e01\u4e03]",
++ "[\u4e02\u4e04-\u4e06\uac00-\ud7ff]",
++ :int{0}
++ }
++
++ // LMBCS
++ {
++ "LMBCS",
++ "[\x00-\U0010ffff]",
++ "[]",
++ :int{0}
++ }
++
+ // DBCS-only
+ {
+ "ibm-971",
+--- r22777/source/common/ucnv_ext.h 2007-08-22 22:46:49.525855000 -0700
++++ chrome.canonical/source/common/ucnv_ext.h 2009-03-23 12:30:09.644121000 -0700
+@@ -382,10 +382,20 @@
+ UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
+ UErrorCode *pErrorCode);
+
++/*
++ * Add code points and strings to the set according to the extension mappings.
++ * Limitation on the UConverterSetFilter:
++ * The filters currently assume that they are used with 1:1 mappings.
++ * They only apply to single input code points, and then they pass through
++ * only mappings with single-charset-code results.
++ * For example, the Shift-JIS filter only works for 2-byte results and tests
++ * that those 2 bytes are in the JIS X 0208 range of Shift-JIS.
++ */
+ U_CFUNC void
+ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
+ const USetAdder *sa,
+ UConverterUnicodeSet which,
++ UConverterSetFilter filter, /* filter applied to the mappings; subject to the limitation described above */
+ UErrorCode *pErrorCode);
+
+ /* toUnicode helpers -------------------------------------------------------- */
+--- r22777/source/common/ucnvmbcs.c 2007-10-11 14:31:32.196532000 -0700
++++ chrome.canonical/source/common/ucnvmbcs.c 2009-03-23 12:42:01.150242000 -0700
+@@ -1,7 +1,7 @@
+ /*
+ ******************************************************************************
+ *
+-* Copyright (C) 2000-2007, International Business Machines
++* Copyright (C) 2000-2008, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *
+ ******************************************************************************
+@@ -485,9 +485,23 @@
+
+ if(mbcsTable->outputType==MBCS_OUTPUT_1) {
+ const uint16_t *stage2, *stage3, *results;
++ uint16_t minValue;
+
+ results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
+
++ /*
++ * Set a threshold variable for selecting which mappings to use.
++ * See ucnv_MBCSSingleFromBMPWithOffsets() and
++ * MBCS_SINGLE_RESULT_FROM_U() for details.
++ */
++ if(which==UCNV_ROUNDTRIP_SET) {
++ /* use only roundtrips */
++ minValue=0xf00;
++ } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
++ /* use all roundtrip and fallback results */
++ minValue=0x800;
++ }
++
+ for(st1=0; st1<maxStage1; ++st1) {
+ st2=table[st1];
+ if(st2>maxStage1) {
+@@ -497,15 +511,8 @@
+ /* read the stage 3 block */
+ stage3=results+st3;
+
+- /*
+- * Add code points for which the roundtrip flag is set.
+- * Once we get a set for fallback mappings, we have to use
+- * a threshold variable with a value of 0x800.
+- * See ucnv_MBCSSingleFromBMPWithOffsets() and
+- * MBCS_SINGLE_RESULT_FROM_U() for details.
+- */
+ do {
+- if(*stage3++>=0xf00) {
++ if(*stage3++>=minValue) {
+ sa->add(sa->set, c);
+ }
+ } while((++c&0xf)!=0);
+@@ -522,9 +529,12 @@
+ const uint8_t *stage3, *bytes;
+ uint32_t st3Multiplier;
+ uint32_t value;
++ UBool useFallback;
+
+ bytes=mbcsTable->fromUnicodeBytes;
+
++ useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
++
+ switch(mbcsTable->outputType) {
+ case MBCS_OUTPUT_3:
+ case MBCS_OUTPUT_4_EUC:
+@@ -551,9 +561,8 @@
+ st3>>=16;
+
+ /*
+- * Add code points for which the roundtrip flag is set.
+- * Once we get a set for fallback mappings, we have to check
+- * non-roundtrip stage 3 results for whether they are 0.
++ * Add code points for which the roundtrip flag is set,
++ * or which map to non-zero bytes if we use fallbacks.
+ * See ucnv_MBCSFromUnicodeWithOffsets() for details.
+ */
+ switch(filter) {
+@@ -561,6 +570,23 @@
+ do {
+ if(st3&1) {
+ sa->add(sa->set, c);
++ stage3+=st3Multiplier;
++ } else if(useFallback) {
++ uint8_t b=0;
++ switch(st3Multiplier) {
++ case 4:
++ b|=*stage3++;
++ case 3:
++ b|=*stage3++;
++ case 2:
++ b|=stage3[0]|stage3[1];
++ stage3+=2;
++ default:
++ break;
++ }
++ if(b!=0) {
++ sa->add(sa->set, c);
++ }
+ }
+ st3>>=1;
+ } while((++c&0xf)!=0);
+@@ -568,7 +594,7 @@
+ case UCNV_SET_FILTER_DBCS_ONLY:
+ /* Ignore single-byte results (<0x100). */
+ do {
+- if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) {
++ if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {
+ sa->add(sa->set, c);
+ }
+ st3>>=1;
+@@ -578,7 +604,7 @@
+ case UCNV_SET_FILTER_2022_CN:
+ /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
+ do {
+- if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) {
++ if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {
+ sa->add(sa->set, c);
+ }
+ st3>>=1;
+@@ -588,7 +614,33 @@
+ case UCNV_SET_FILTER_SJIS:
+ /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
+ do {
+- if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
++ if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
++ sa->add(sa->set, c);
++ }
++ st3>>=1;
++ stage3+=2; /* +=st3Multiplier */
++ } while((++c&0xf)!=0);
++ break;
++ case UCNV_SET_FILTER_GR94DBCS:
++ /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
++ do {
++ if( ((st3&1)!=0 || useFallback) &&
++ (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
++ ) {
++ sa->add(sa->set, c);
++ }
++ st3>>=1;
++ stage3+=2; /* +=st3Multiplier */
++ } while((++c&0xf)!=0);
++ break;
++ case UCNV_SET_FILTER_HZ:
++ /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
++ do {
++ if( ((st3&1)!=0 || useFallback) &&
++ (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
++ ) {
+ sa->add(sa->set, c);
+ }
+ st3>>=1;
+@@ -609,7 +661,7 @@
+ }
+ }
+
+- ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode);
++ ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
+ }
+
+ U_CFUNC void
+@@ -1694,7 +1746,7 @@
+ cnv->toUBytes[0]=*(source-1);
+ cnv->toULength=_extToU(cnv, cnv->sharedData,
+ 1, &source, sourceLimit,
+- &target, target+targetCapacity,
++ &target, pArgs->targetLimit,
+ &offsets, sourceIndex,
+ pArgs->flush,
+ pErrorCode);
+@@ -1739,6 +1791,65 @@
+ pArgs->offsets=offsets;
+ }
+
++static UBool
++hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
++ const int32_t *row=stateTable[state];
++ int32_t b, entry;
++ /* Returns TRUE if a non-illegal final entry is reachable from this state. Shortcut first: probe the commonly valid bytes 0xa1 and 0x41. */
++ entry=row[0xa1];
++ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
++ ) {
++ return TRUE;
++ }
++ entry=row[0x41];
++ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
++ ) {
++ return TRUE;
++ }
++ /* Then scan all 256 byte values of this state for any final (non-transition) entry that is not illegal. */
++ for(b=0; b<=0xff; ++b) {
++ entry=row[b];
++ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
++ ) {
++ return TRUE;
++ }
++ }
++ /* Finally follow every transition entry and recurse into its target state; TRUE as soon as one succeeds. */
++ for(b=0; b<=0xff; ++b) {
++ entry=row[b];
++ if( MBCS_ENTRY_IS_TRANSITION(entry) &&
++ hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
++ ) {
++ return TRUE;
++ }
++ }
++ return FALSE;
++}
++
++/*
++ * Is byte b a single/lead byte in this state? A final entry counts unless its action is illegal
++ * (or is state-change-only while isDBCSOnly is set). A transition entry counts only if its target
++ * state has valid trail bytes, so b is not a lead byte if all sequences that start with b are illegal.
++ */
++static UBool
++isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
++ const int32_t *row=stateTable[state];
++ int32_t entry=row[b];
++ if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* candidate lead byte: accept only if a legal sequence can follow */
++ return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
++ } else {
++ uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
++ if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
++ return FALSE; /* SI/SO are illegal for DBCS-only conversion */
++ } else {
++ return action!=MBCS_STATE_ILLEGAL;
++ }
++ }
++}
++
+ U_CFUNC void
+ ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
+ UErrorCode *pErrorCode) {
+@@ -2094,6 +2205,34 @@
+ sourceIndex=nextSourceIndex;
+ } else if(U_FAILURE(*pErrorCode)) {
+ /* callback(illegal) */
++ if(byteIndex>1) {
++ /*
++ * Ticket 5691: consistent illegal sequences:
++ * - We include at least the first byte in the illegal sequence.
++ * - If any of the non-initial bytes could be the start of a character,
++ * we stop the illegal sequence before the first one of those.
++ */
++ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
++ int8_t i;
++ for(i=1;
++ i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
++ ++i) {}
++ if(i<byteIndex) {
++ /* Back out some bytes. */
++ int8_t backOutDistance=byteIndex-i;
++ int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
++ byteIndex=i; /* length of reported illegal byte sequence */
++ if(backOutDistance<=bytesFromThisBuffer) {
++ source-=backOutDistance;
++ } else {
++ /* Back out bytes from the previous buffer: Need to replay them. */
++ cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
++ /* preToULength is negative! */
++ uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
++ source=(const uint8_t *)pArgs->source;
++ }
++ }
++ }
+ break;
+ } else /* unassigned sequences indicated with byteIndex>0 */ {
+ /* try an extension mapping */
+@@ -2104,7 +2243,7 @@
+ &offsets, sourceIndex,
+ pArgs->flush,
+ pErrorCode);
+- sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source);
++ sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
+
+ if(U_FAILURE(*pErrorCode)) {
+ /* not mappable or buffer overflow */
+@@ -2395,15 +2534,37 @@
+
+ if(c<0) {
+ if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
+- *pErrorCode=U_TRUNCATED_CHAR_FOUND;
+- }
+- if(U_FAILURE(*pErrorCode)) {
+ /* incomplete character byte sequence */
+ uint8_t *bytes=cnv->toUBytes;
+ cnv->toULength=(int8_t)(source-lastSource);
+ do {
+ *bytes++=*lastSource++;
+ } while(lastSource<source);
++ *pErrorCode=U_TRUNCATED_CHAR_FOUND;
++ } else if(U_FAILURE(*pErrorCode)) {
++ /* callback(illegal) */
++ /*
++ * Ticket 5691: consistent illegal sequences:
++ * - We include at least the first byte in the illegal sequence.
++ * - If any of the non-initial bytes could be the start of a character,
++ * we stop the illegal sequence before the first one of those.
++ */
++ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
++ uint8_t *bytes=cnv->toUBytes;
++ *bytes++=*lastSource++; /* first byte */
++ if(lastSource==source) {
++ cnv->toULength=1;
++ } else /* lastSource<source: multi-byte character */ {
++ int8_t i;
++ for(i=1;
++ lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
++ ++i
++ ) {
++ *bytes++=*lastSource++;
++ }
++ cnv->toULength=i;
++ source=lastSource;
++ }
+ } else {
+ /* no output because of empty input or only state changes */
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+@@ -3237,7 +3398,7 @@
+ lastSource=source;
+ c=_extFromU(cnv, cnv->sharedData,
+ c, &source, sourceLimit,
+- &target, target+targetCapacity,
++ &target, (const uint8_t *)(pArgs->targetLimit),
+ &offsets, sourceIndex,
+ pArgs->flush,
+ pErrorCode);
+--- r22777/source/common/ucnvmbcs.h 2007-10-11 14:31:32.196532000 -0700
++++ chrome.canonical/source/common/ucnvmbcs.h 2009-03-23 12:30:17.315007000 -0700
+@@ -492,6 +492,8 @@
+ UCNV_SET_FILTER_DBCS_ONLY,
+ UCNV_SET_FILTER_2022_CN,
+ UCNV_SET_FILTER_SJIS,
++ UCNV_SET_FILTER_GR94DBCS,
++ UCNV_SET_FILTER_HZ,
+ UCNV_SET_FILTER_COUNT
+ } UConverterSetFilter;
+
+--- r22777/source/common/ucnv.c 2007-08-31 12:39:14.294200000 -0700
++++ chrome.canonical/source/common/ucnv.c 2009-03-23 12:40:10.566608000 -0700
+@@ -1528,11 +1528,14 @@
+ cnv->toULength=0;
+
+ /* call the callback function */
++ if(cnv->toUCallbackReason==UCNV_ILLEGAL && *err==U_INVALID_CHAR_FOUND) {
++ cnv->toUCallbackReason = UCNV_UNASSIGNED;
++ }
+ cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs,
+ cnv->invalidCharBuffer, errorInputLength,
+- (*err==U_INVALID_CHAR_FOUND || *err==U_UNSUPPORTED_ESCAPE_SEQUENCE) ?
+- UCNV_UNASSIGNED : UCNV_ILLEGAL,
++ cnv->toUCallbackReason,
+ err);
++ cnv->toUCallbackReason = UCNV_ILLEGAL; /* reset to default value */
+
+ /*
+ * loop back to the offset handling
+--- r22777/source/common/uset_imp.h 2007-07-24 19:51:25.692061000 -0700
++++ chrome.canonical/source/common/uset_imp.h 2009-03-23 12:30:09.893067000 -0700
+@@ -36,6 +36,9 @@
+ typedef void U_CALLCONV
+ USetRemove(USet *set, UChar32 c);
+
++typedef void U_CALLCONV
++USetRemoveRange(USet *set, UChar32 start, UChar32 end);
++
+ /**
+ * Interface for adding items to a USet, to keep low-level code from
+ * statically depending on the USet implementation.
+@@ -47,6 +50,7 @@
+ USetAddRange *addRange;
+ USetAddString *addString;
+ USetRemove *remove;
++ USetRemoveRange *removeRange;
+ };
+ typedef struct USetAdder USetAdder;
+
+--- r22777/source/common/ucnv2022.c 2007-10-11 14:31:32.196532000 -0700
++++ chrome.canonical/source/common/ucnv2022.c 2009-03-23 12:57:38.398368000 -0700
+@@ -201,6 +201,7 @@
+ #ifdef U_ENABLE_GENERIC_ISO_2022
+ UBool isFirstBuffer;
+ #endif
++ UBool isEmptySegment;
+ char name[30];
+ char locale[3];
+ }UConverterDataISO2022;
+@@ -609,6 +610,7 @@
+ if(choice<=UCNV_RESET_TO_UNICODE) {
+ uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
+ myConverterData->key = 0;
++ myConverterData->isEmptySegment = FALSE;
+ }
+ if(choice!=UCNV_RESET_TO_UNICODE) {
+ uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
+@@ -752,6 +754,7 @@
UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
uint32_t key = myData2022->key;
int32_t offset = 0;
@@ -9,7 +1198,7 @@
char c;
value = VALID_NON_TERMINAL_2022;
-@@ -804,7 +805,6 @@
+@@ -804,7 +807,6 @@
return;
} else if (value == INVALID_2022 ) {
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
@@ -17,7 +1206,15 @@
} else /* value == VALID_TERMINAL_2022 */ {
switch(var){
#ifdef U_ENABLE_GENERIC_ISO_2022
-@@ -935,6 +935,35 @@
+@@ -814,6 +816,7 @@
+ if(chosenConverterName == NULL) {
+ /* SS2 or SS3 */
+ *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
++ _this->toUCallbackReason = UCNV_UNASSIGNED;
+ return;
+ }
+
+@@ -935,6 +938,37 @@
}
if(U_SUCCESS(*err)) {
_this->toULength = 0;
@@ -50,43 +1247,46 @@
+ }
+ _this->toULength=1;
+ }
++ } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
++ _this->toUCallbackReason = UCNV_UNASSIGNED;
}
}
-@@ -1097,6 +1126,24 @@
+@@ -1113,6 +1147,24 @@
+ }
}
- /*
-+ * * Check that the result is a 2-byte value with each byte in the range A1..FE
-+ * * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
-+ * * to move it to the ISO 2022 range 21..7E.
-+ * * Return 0 if out of range.
-+ * */
++#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
++/*
++ * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
++ * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
++ * unchanged.
++ */
+static U_INLINE uint32_t
-+_2022FromGR94DBCS(uint32_t value) {
-+ if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
-+ (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
-+ ) {
-+ return value - 0x8080; /* shift down to 21..7e byte range */
++_2022ToGR94DBCS(uint32_t value) {
++ uint32_t returnValue = value + 0x8080;
++ if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
++ (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
++ return returnValue;
+ } else {
-+ return 0; /* not valid for ISO 2022 */
++ return value;
+ }
+}
++#endif
+
-+#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
-+/*
- * Check that the result is a 2-byte value with each byte in the range A1..FE
- * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
- * to move it to the ISO 2022 range 21..7E.
-@@ -1112,6 +1159,7 @@
- return 0; /* not valid for ISO 2022 */
- }
- }
-+#endif
-
#ifdef U_ENABLE_GENERIC_ISO_2022
-@@ -1953,6 +2001,7 @@
+ /**********************************************************************************
+@@ -1436,7 +1488,7 @@
+ c2 = 0; /* invalid */
+ }
+ } else {
+- if((uint8_t)(c2-0x21) <= (0x7e-0x21)) {
++ if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
+ c2 += 0x7e;
+ } else {
+ c2 = 0; /* invalid */
+@@ -1953,6 +2005,7 @@
const char *mySourceLimit = args->sourceLimit;
uint32_t targetUniChar = 0x0000;
uint32_t mySourceChar = 0x0000;
@@ -94,7 +1294,7 @@
UConverterDataISO2022* myData;
ISO2022State *pToU2022State;
StateEnum cs;
-@@ -1968,6 +2017,7 @@
+@@ -1968,6 +2021,7 @@
mySourceChar = args->converter->toUBytes[0];
args->converter->toULength = 0;
cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
@@ -102,7 +1302,65 @@
goto getTrailByte;
}
-@@ -2077,17 +2127,44 @@
+@@ -1986,6 +2040,7 @@
+ continue;
+ } else {
+ /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
++ myData->isEmptySegment = FALSE; /* reset this, we have a different error */
+ break;
+ }
+
+@@ -1997,21 +2052,39 @@
+ continue;
+ } else {
+ /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
++ myData->isEmptySegment = FALSE; /* reset this, we have a different error */
+ break;
+ }
+
+ case ESC_2022:
+ mySource--;
+ escape:
+- changeState_2022(args->converter,&(mySource),
+- mySourceLimit, ISO_2022_JP,err);
++ {
++ const char * mySourceBefore = mySource;
++ int8_t toULengthBefore = args->converter->toULength;
++
++ changeState_2022(args->converter,&(mySource),
++ mySourceLimit, ISO_2022_JP,err);
++
++ /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
++ if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
++ args->converter->toUCallbackReason = UCNV_IRREGULAR;
++ args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
++ }
++ }
+
+ /* invalid or illegal escape sequence */
+ if(U_FAILURE(*err)){
+ args->target = myTarget;
+ args->source = mySource;
++ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
+ return;
+ }
++ /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
++ if(myData->key==0) {
++ myData->isEmptySegment = TRUE;
++ }
+ continue;
+
+ /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
+@@ -2028,6 +2101,7 @@
+ /* falls through */
+ default:
+ /* convert one or two bytes */
++ myData->isEmptySegment = FALSE;
+ cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
+ if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
+ !IS_JP_DBCS(cs)
+@@ -2077,17 +2151,44 @@
default:
/* G0 DBCS */
if(mySource < mySourceLimit) {
@@ -156,7 +1414,7 @@
} else {
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
args->converter->toULength = 1;
-@@ -2229,7 +2306,12 @@
+@@ -2229,7 +2330,12 @@
}
/* only DBCS or SBCS characters are expected*/
/* DB characters with high bit set to 1 are expected */
@@ -170,8 +1428,39 @@
targetByteUnit=missingCharMarker;
}
if (targetByteUnit != missingCharMarker){
-@@ -2545,17 +2627,34 @@
+@@ -2524,15 +2630,27 @@
+ if(mySourceChar==UCNV_SI){
+ myData->toU2022State.g = 0;
++ if (myData->isEmptySegment) {
++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
++ args->converter->toUCallbackReason = UCNV_IRREGULAR;
++ args->converter->toUBytes[0] = (uint8_t)mySourceChar;
++ args->converter->toULength = 1;
++ args->target = myTarget;
++ args->source = mySource;
++ return;
++ }
+ /*consume the source */
+ continue;
+ }else if(mySourceChar==UCNV_SO){
+ myData->toU2022State.g = 1;
++ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
+ /*consume the source */
+ continue;
+ }else if(mySourceChar==ESC_2022){
+ mySource--;
+ escape:
++ myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
+ changeState_2022(args->converter,&(mySource),
+ mySourceLimit, ISO_2022_KR, err);
+ if(U_FAILURE(*err)){
+@@ -2543,19 +2661,37 @@
+ continue;
+ }
+
++ myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
if(myData->toU2022State.g == 1) {
if(mySource < mySourceLimit) {
- char trailByte;
@@ -214,7 +1503,7 @@
}
} else {
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
-@@ -2563,8 +2662,10 @@
+@@ -2563,8 +2699,10 @@
break;
}
}
@@ -226,7 +1515,7 @@
}
if(targetUniChar < 0xfffe){
if(args->offsets) {
-@@ -3061,6 +3162,7 @@
+@@ -3061,6 +3199,7 @@
/* continue with a partial double-byte character */
mySourceChar = args->converter->toUBytes[0];
args->converter->toULength = 0;
@@ -234,7 +1523,68 @@
goto getTrailByte;
}
-@@ -3114,29 +3216,50 @@
+@@ -3075,27 +3214,52 @@
+ switch(mySourceChar){
+ case UCNV_SI:
+ pToU2022State->g=0;
++ if (myData->isEmptySegment) {
++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
++ args->converter->toUCallbackReason = UCNV_IRREGULAR;
++ args->converter->toUBytes[0] = mySourceChar;
++ args->converter->toULength = 1;
++ args->target = myTarget;
++ args->source = mySource;
++ return;
++ }
+ continue;
+
+ case UCNV_SO:
+ if(pToU2022State->cs[1] != 0) {
+ pToU2022State->g=1;
++ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
+ continue;
+ } else {
+ /* illegal to have SO before a matching designator */
++ myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
+ break;
+ }
+
+ case ESC_2022:
+ mySource--;
+ escape:
+- changeState_2022(args->converter,&(mySource),
+- mySourceLimit, ISO_2022_CN,err);
++ {
++ const char * mySourceBefore = mySource;
++ int8_t toULengthBefore = args->converter->toULength;
++
++ changeState_2022(args->converter,&(mySource),
++ mySourceLimit, ISO_2022_CN,err);
++
++ /* After SO there must be at least one character before a designator (designator error handled separately) */
++ if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
++ args->converter->toUCallbackReason = UCNV_IRREGULAR;
++ args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
++ }
++ }
+
+ /* invalid or illegal escape sequence */
+ if(U_FAILURE(*err)){
+ args->target = myTarget;
+ args->source = mySource;
++ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
+ return;
+ }
+ continue;
+@@ -3109,34 +3273,56 @@
+ /* falls through */
+ default:
+ /* convert one or two bytes */
++ myData->isEmptySegment = FALSE;
+ if(pToU2022State->g != 0) {
+ if(mySource < mySourceLimit) {
UConverterSharedData *cnv;
StateEnum tempState;
int32_t tempBufLen;
@@ -302,386 +1652,622 @@
} else {
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
args->converter->toULength = 1;
-diff -ru trie.clean/source/common/ucnvmbcs.c chrome.canonical/source/common/ucnvmbcs.c
---- trie.clean/source/common/ucnvmbcs.c 2007-11-07 17:39:05.057870000 -0800
-+++ chrome.canonical/source/common/ucnvmbcs.c 2008-10-29 11:34:34.648518000 -0700
-@@ -1,7 +1,7 @@
- /*
- ******************************************************************************
- *
--* Copyright (C) 2000-2007, International Business Machines
-+* Copyright (C) 2000-2008, International Business Machines
+@@ -3399,11 +3585,19 @@
+ /* include ASCII for JP */
+ sa->addRange(sa->set, 0, 0x7f);
+ }
+- if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
++ if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
+ /*
+- * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks,
+- * we need to include half-width Katakana for all JP variants because
+- * JIS X 0208 has hardcoded fallbacks for them.
++ * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
++ * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
++ * use half-width Katakana.
++ * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
++ * half-width Katakana via the ESC ( I sequence.
++ * However, we only emit (fromUnicode) half-width Katakana according to the
++ * definition of each variant.
++ *
++ * When including fallbacks,
++ * we need to include half-width Katakana Unicode code points for all JP variants because
++ * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
+ */
+ /* include half-width Katakana for JP */
+ sa->addRange(sa->set, HWKANA_START, HWKANA_END);
+@@ -3457,6 +3651,12 @@
+ * corresponding to JIS X 0208.
+ */
+ filter=UCNV_SET_FILTER_SJIS;
++ } else if(i==KSC5601) {
++ /*
++ * Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables)
++ * are broader than GR94.
++ */
++ filter=UCNV_SET_FILTER_GR94DBCS;
+ } else {
+ filter=UCNV_SET_FILTER_NONE;
+ }
+@@ -3472,6 +3672,9 @@
+ sa->remove(sa->set, 0x0e);
+ sa->remove(sa->set, 0x0f);
+ sa->remove(sa->set, 0x1b);
++
++ /* ISO 2022 converters do not convert C1 controls either */
++ sa->removeRange(sa->set, 0x80, 0x9f);
+ }
+
+ static const UConverterImpl _ISO2022Impl={
+--- r22777/source/common/ucnv_lmb.c 2006-08-19 14:27:08.000000000 -0700
++++ chrome.canonical/source/common/ucnv_lmb.c 2009-03-23 12:30:26.043293000 -0700
+@@ -1,6 +1,6 @@
+ /*
+ **********************************************************************
+-* Copyright (C) 2000-2006, International Business Machines
++* Copyright (C) 2000-2007, International Business Machines
* Corporation and others. All Rights Reserved.
- *
- ******************************************************************************
-@@ -1739,6 +1739,65 @@
- pArgs->offsets=offsets;
+ **********************************************************************
+ * file name: ucnv_lmb.cpp
+@@ -536,7 +536,7 @@
+ NULL,\
+ NULL,\
+ _LMBCSSafeClone,\
+- _LMBCSGetUnicodeSet\
++ ucnv_getCompleteUnicodeSet\
+ };\
+ static const UConverterStaticData _LMBCSStaticData##n={\
+ sizeof(UConverterStaticData),\
+@@ -662,15 +662,14 @@
+ return &newLMBCS->cnv;
}
-+static UBool
-+hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
-+ const int32_t *row=stateTable[state];
-+ int32_t b, entry;
-+ /* First test for final entries in this state for some commonly valid byte values. */
-+ entry=row[0xa1];
-+ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
-+ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
-+ ) {
-+ return TRUE;
-+ }
-+ entry=row[0x41];
-+ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
-+ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
-+ ) {
-+ return TRUE;
-+ }
-+ /* Then test for final entries in this state. */
-+ for(b=0; b<=0xff; ++b) {
-+ entry=row[b];
-+ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
-+ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
-+ ) {
-+ return TRUE;
-+ }
-+ }
-+ /* Then recurse for transition entries. */
-+ for(b=0; b<=0xff; ++b) {
-+ entry=row[b];
-+ if( MBCS_ENTRY_IS_TRANSITION(entry) &&
-+ hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
-+ ) {
-+ return TRUE;
-+ }
-+ }
-+ return FALSE;
-+}
-+
+-static void
+-_LMBCSGetUnicodeSet(const UConverter *cnv,
+- const USetAdder *sa,
+- UConverterUnicodeSet which,
+- UErrorCode *pErrorCode) {
+- /* all but U+F6xx, see LMBCS explanation above (search for F6xx) */
+- sa->addRange(sa->set, 0, 0xf5ff);
+- sa->addRange(sa->set, 0xf700, 0x10ffff);
+-}
+/*
-+ * Is byte b a single/lead byte in this state?
-+ * Recurse for transition states, because here we don't want to say that
-+ * b is a lead byte if all byte sequences that start with b are illegal.
++ * There used to be a _LMBCSGetUnicodeSet() function here (up to svn revision 20117)
++ * which added all code points except for U+F6xx
++ * because those cannot be represented in the Unicode group.
++ * However, it turns out that windows-950 has roundtrips for all of U+F6xx
++ * which means that LMBCS can convert all Unicode code points after all.
++ * We now simply use ucnv_getCompleteUnicodeSet().
+ */
-+static UBool
-+isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
-+ const int32_t *row=stateTable[state];
-+ int32_t entry=row[b];
-+ if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */
-+ return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
-+ } else {
-+ uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
-+ if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
-+ return FALSE; /* SI/SO are illegal for DBCS-only conversion */
-+ } else {
-+ return action!=MBCS_STATE_ILLEGAL;
-+ }
-+ }
-+}
-+
- U_CFUNC void
- ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
- UErrorCode *pErrorCode) {
-@@ -2094,6 +2153,34 @@
- sourceIndex=nextSourceIndex;
- } else if(U_FAILURE(*pErrorCode)) {
- /* callback(illegal) */
-+ if(byteIndex>1) {
-+ /*
-+ * Ticket 5691: consistent illegal sequences:
-+ * - We include at least the first byte in the illegal sequence.
-+ * - If any of the non-initial bytes could be the start of a character,
-+ * we stop the illegal sequence before the first one of those.
-+ */
-+ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
-+ int8_t i;
-+ for(i=1;
-+ i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
-+ ++i) {}
-+ if(i<byteIndex) {
-+ /* Back out some bytes. */
-+ int8_t backOutDistance=byteIndex-i;
-+ int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
-+ byteIndex=i; /* length of reported illegal byte sequence */
-+ if(backOutDistance<=bytesFromThisBuffer) {
-+ source-=backOutDistance;
+
+ /*
+ Here's the basic helper function that we use when converting from
+--- r22777/source/common/ucnvhz.c 2006-07-05 16:08:50.000000000 -0700
++++ chrome.canonical/source/common/ucnvhz.c 2009-03-23 12:42:01.208181000 -0700
+@@ -1,6 +1,6 @@
+ /*
+ **********************************************************************
+-* Copyright (C) 2000-2006, International Business Machines
++* Copyright (C) 2000-2007, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ **********************************************************************
+ * file name: ucnvhz.c
+@@ -59,6 +59,7 @@
+ UBool isEscapeAppended;
+ UBool isStateDBCS;
+ UBool isTargetUCharDBCS;
++ UBool isEmptySegment;
+ }UConverterDataHZ;
+
+
+@@ -72,7 +73,7 @@
+ cnv->extraInfo = uprv_malloc(sizeof(UConverterDataHZ));
+ if(cnv->extraInfo != NULL){
+ uprv_memset(cnv->extraInfo, 0, sizeof(UConverterDataHZ));
+- ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode);
++ ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("GBK",errorCode);
+ }
+ else {
+ *errorCode = U_MEMORY_ALLOCATION_ERROR;
+@@ -98,6 +99,7 @@
+ cnv->mode=0;
+ if(cnv->extraInfo != NULL){
+ ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
++ ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE;
+ }
+ }
+ if(choice!=UCNV_RESET_TO_UNICODE) {
+@@ -130,6 +132,10 @@
+ * from-GB code '~}' ($7E7D) is outside the defined GB range.)
+ *
+ * Source: RFC 1842
++*
++* Note that the formal syntax in RFC 1842 is invalid. I assume that the
++* intended definition of single-byte-segment is as follows (pedberg):
++* single-byte-segment = single-byte-seq 1*single-byte-char
+ */
+
+
+@@ -141,7 +147,7 @@
+ UChar *myTarget = args->target;
+ const char *mySourceLimit = args->sourceLimit;
+ UChar32 targetUniChar = 0x0000;
+- UChar mySourceChar = 0x0000;
++ int32_t mySourceChar = 0x0000;
+ UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
+ tempBuf[0]=0;
+ tempBuf[1]=0;
+@@ -156,90 +162,123 @@
+
+ mySourceChar= (unsigned char) *mySource++;
+
+- switch(mySourceChar){
++ if(args->converter->mode == UCNV_TILDE) {
++ /* second byte after ~ */
++ args->converter->mode=0;
++ switch(mySourceChar) {
+ case 0x0A:
+- if(args->converter->mode ==UCNV_TILDE){
+- args->converter->mode=0;
+-
+- }
+- *(myTarget++)=(UChar)mySourceChar;
++ /* no output for ~\n (line-continuation marker) */
+ continue;
+-
+ case UCNV_TILDE:
+- if(args->converter->mode ==UCNV_TILDE){
+- *(myTarget++)=(UChar)mySourceChar;
+- args->converter->mode=0;
+- continue;
+-
++ if(args->offsets) {
++ args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2);
+ }
+- else if(args->converter->toUnicodeStatus !=0){
+- args->converter->mode=0;
+- break;
+- }
+- else{
+- args->converter->mode = UCNV_TILDE;
+- continue;
+- }
+-
+-
++ *(myTarget++)=(UChar)mySourceChar;
++ myData->isEmptySegment = FALSE;
++ continue;
+ case UCNV_OPEN_BRACE:
+- if(args->converter->mode == UCNV_TILDE){
+- args->converter->mode=0;
+- myData->isStateDBCS = TRUE;
+- continue;
+- }
+- else{
+- break;
+- }
+-
+-
+ case UCNV_CLOSE_BRACE:
+- if(args->converter->mode == UCNV_TILDE){
+- args->converter->mode=0;
+- myData->isStateDBCS = FALSE;
+- continue;
+- }
+- else{
+- break;
++ myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE);
++ if (myData->isEmptySegment) {
++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
++ args->converter->toUCallbackReason = UCNV_IRREGULAR;
++ args->converter->toUBytes[0] = UCNV_TILDE;
++ args->converter->toUBytes[1] = mySourceChar;
++ args->converter->toULength = 2;
++ args->target = myTarget;
++ args->source = mySource;
++ return;
+ }
+-
++ myData->isEmptySegment = TRUE;
++ continue;
+ default:
+ /* if the first byte is equal to TILDE and the trail byte
+ * is not a valid byte then it is an error condition
+ */
+- if(args->converter->mode == UCNV_TILDE){
+- args->converter->mode=0;
+- mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
+- goto SAVE_STATE;
+- }
+-
+- break;
+-
+- }
+-
+- if(myData->isStateDBCS){
++ /*
++ * Ticket 5691: consistent illegal sequences:
++ * - We include at least the first byte in the illegal sequence.
++ * - If any of the non-initial bytes could be the start of a character,
++ * we stop the illegal sequence before the first one of those.
++ */
++ myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
++ args->converter->toUBytes[0] = UCNV_TILDE;
++ if( myData->isStateDBCS ?
++ (0x21 <= mySourceChar && mySourceChar <= 0x7e) :
++ mySourceChar <= 0x7f
++ ) {
++ /* The current byte could be the start of a character: Back it out. */
++ args->converter->toULength = 1;
++ --mySource;
+ } else {
-+ /* Back out bytes from the previous buffer: Need to replay them. */
-+ cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
-+ /* preToULength is negative! */
-+ uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
-+ source=(const uint8_t *)pArgs->source;
++ /* Include the current byte in the illegal sequence. */
++ args->converter->toUBytes[1] = mySourceChar;
++ args->converter->toULength = 2;
+ }
++ args->target = myTarget;
++ args->source = mySource;
++ return;
+ }
-+ }
- break;
- } else /* unassigned sequences indicated with byteIndex>0 */ {
- /* try an extension mapping */
-@@ -2104,7 +2191,7 @@
- &offsets, sourceIndex,
- pArgs->flush,
- pErrorCode);
-- sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source);
-+ sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
++ } else if(myData->isStateDBCS) {
+ if(args->converter->toUnicodeStatus == 0x00){
+- args->converter->toUnicodeStatus = (UChar) mySourceChar;
++ /* lead byte */
++ if(mySourceChar == UCNV_TILDE) {
++ args->converter->mode = UCNV_TILDE;
++ } else {
++ /* add another bit to distinguish a 0 byte from not having seen a lead byte */
++ args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100);
++ myData->isEmptySegment = FALSE; /* the segment has something, either valid or will produce a different error, so reset this */
++ }
+ continue;
+ }
+ else{
+- tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ;
+- tempBuf[1] = (char) (mySourceChar+0x80);
+- mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
++ /* trail byte */
++ int leadIsOk, trailIsOk;
++ uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;
++ targetUniChar = 0xffff;
++ /*
++ * Ticket 5691: consistent illegal sequences:
++ * - We include at least the first byte in the illegal sequence.
++ * - If any of the non-initial bytes could be the start of a character,
++ * we stop the illegal sequence before the first one of those.
++ *
++ * In HZ DBCS, if the second byte is in the 21..7e range,
++ * we report only the first byte as the illegal sequence.
++ * Otherwise we convert or report the pair of bytes.
++ */
++ leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21);
++ trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
++ if (leadIsOk && trailIsOk) {
++ tempBuf[0] = (char) (leadByte+0x80) ;
++ tempBuf[1] = (char) (mySourceChar+0x80);
++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
++ tempBuf, 2, args->converter->useFallback);
++ mySourceChar= (leadByte << 8) | mySourceChar;
++ } else if (trailIsOk) {
++ /* report a single illegal byte and continue with the following DBCS starter byte */
++ --mySource;
++ mySourceChar = (int32_t)leadByte;
++ } else {
++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */
++ /* add another bit so that the code below writes 2 bytes in case of error */
++ mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
++ }
+ args->converter->toUnicodeStatus =0x00;
+- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
+- tempBuf, 2, args->converter->useFallback);
+ }
+ }
+ else{
+- if(args->converter->fromUnicodeStatus == 0x00){
+- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
+- mySource - 1, 1, args->converter->useFallback);
+- }
+- else{
+- goto SAVE_STATE;
++ if(mySourceChar == UCNV_TILDE) {
++ args->converter->mode = UCNV_TILDE;
++ continue;
++ } else if(mySourceChar <= 0x7f) {
++ targetUniChar = (UChar)mySourceChar; /* ASCII */
++ myData->isEmptySegment = FALSE; /* the segment has something valid */
++ } else {
++ targetUniChar = 0xffff;
++ myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
+ }
+-
+ }
+ if(targetUniChar < 0xfffe){
+ if(args->offsets) {
+@@ -248,26 +287,17 @@
- if(U_FAILURE(*pErrorCode)) {
- /* not mappable or buffer overflow */
-@@ -2395,15 +2482,37 @@
-
- if(c<0) {
- if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
-- *pErrorCode=U_TRUNCATED_CHAR_FOUND;
-- }
-- if(U_FAILURE(*pErrorCode)) {
- /* incomplete character byte sequence */
- uint8_t *bytes=cnv->toUBytes;
- cnv->toULength=(int8_t)(source-lastSource);
- do {
- *bytes++=*lastSource++;
- } while(lastSource<source);
-+ *pErrorCode=U_TRUNCATED_CHAR_FOUND;
-+ } else if(U_FAILURE(*pErrorCode)) {
-+ /* callback(illegal) */
-+ /*
-+ * Ticket 5691: consistent illegal sequences:
-+ * - We include at least the first byte in the illegal sequence.
-+ * - If any of the non-initial bytes could be the start of a character,
-+ * we stop the illegal sequence before the first one of those.
-+ */
-+ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
-+ uint8_t *bytes=cnv->toUBytes;
-+ *bytes++=*lastSource++; /* first byte */
-+ if(lastSource==source) {
-+ cnv->toULength=1;
-+ } else /* lastSource<source: multi-byte character */ {
-+ int8_t i;
-+ for(i=1;
-+ lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
-+ ++i
+ *(myTarget++)=(UChar)targetUniChar;
+ }
+- else if(targetUniChar>=0xfffe){
+-SAVE_STATE:
++ else /* targetUniChar>=0xfffe */ {
+ if(targetUniChar == 0xfffe){
+ *err = U_INVALID_CHAR_FOUND;
+ }
+ else{
+ *err = U_ILLEGAL_CHAR_FOUND;
+ }
+- if(myData->isStateDBCS){
+- /* this should never occur since isStateDBCS is set to true
+- * only after tempBuf[0] and tempBuf[1]
+- * are set to the input .. just to please BEAM
+- */
+- if(tempBuf[0]==0 || tempBuf[1]==0){
+- *err = U_INTERNAL_PROGRAM_ERROR;
+- }else{
+- args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80);
+- args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80);
+- args->converter->toULength=2;
+- }
++ if(mySourceChar > 0xff){
++ args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8);
++ args->converter->toUBytes[1] = (uint8_t)mySourceChar;
++ args->converter->toULength=2;
+ }
+ else{
+ args->converter->toUBytes[0] = (uint8_t)mySourceChar;
+@@ -328,16 +358,21 @@
+ escSeq = TILDE_ESCAPE;
+ CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
+ continue;
+- }
+- else{
++ } else if(mySourceChar <= 0x7f) {
++ length = 1;
++ targetUniChar = mySourceChar;
++ } else {
+ length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData,
+ mySourceChar,&targetUniChar,args->converter->useFallback);
+-
+- }
+- /* only DBCS or SBCS characters are expected*/
+- /* DB haracters with high bit set to 1 are expected */
+- if(length > 2 || length==0 ||(((targetUniChar & 0x8080) != 0x8080)&& length==2)){
+- targetUniChar= missingCharMarker;
++ /* we can only use lead bytes 21..7D and trail bytes 21..7E */
++ if( length == 2 &&
++ (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) &&
++ (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1)
+ ) {
-+ *bytes++=*lastSource++;
++ targetUniChar -= 0x8080;
++ } else {
++ targetUniChar = missingCharMarker;
+ }
-+ cnv->toULength=i;
-+ source=lastSource;
-+ }
- } else {
- /* no output because of empty input or only state changes */
- *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
-diff -ru trie.clean/source/test/cintltst/nccbtst.c chrome.canonical/source/test/cintltst/nccbtst.c
---- trie.clean/source/test/cintltst/nccbtst.c 2007-09-19 09:45:00.986804000 -0700
-+++ chrome.canonical/source/test/cintltst/nccbtst.c 2008-10-29 11:08:51.102376000 -0700
+ }
+ if (targetUniChar != missingCharMarker){
+ myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF);
+@@ -360,22 +395,22 @@
+
+ if(isTargetUCharDBCS){
+ if( myTargetIndex <targetLength){
+- myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80);
++ myTarget[myTargetIndex++] =(char) (targetUniChar >> 8);
+ if(offsets){
+ *(offsets++) = mySourceIndex-1;
+ }
+ if(myTargetIndex < targetLength){
+- myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80);
++ myTarget[myTargetIndex++] =(char) targetUniChar;
+ if(offsets){
+ *(offsets++) = mySourceIndex-1;
+ }
+ }else{
+- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
+ *err = U_BUFFER_OVERFLOW_ERROR;
+ }
+ }else{
+- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80);
+- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) (targetUniChar >> 8);
++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
+ *err = U_BUFFER_OVERFLOW_ERROR;
+ }
+
+@@ -524,14 +559,14 @@
+ const USetAdder *sa,
+ UConverterUnicodeSet which,
+ UErrorCode *pErrorCode) {
+- /* the tilde '~' is hardcoded in the converter */
+- sa->add(sa->set, 0x7e);
++ /* HZ converts all of ASCII */
++ sa->addRange(sa->set, 0, 0x7f);
+
+ /* add all of the code points that the sub-converter handles */
+- ((UConverterDataHZ*)cnv->extraInfo)->
+- gbConverter->sharedData->impl->
+- getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,
+- sa, which, pErrorCode);
++ ucnv_MBCSGetFilteredUnicodeSetForUnicode(
++ ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData,
++ sa, which, UCNV_SET_FILTER_HZ,
++ pErrorCode);
+ }
+
+ static const UConverterImpl _HZImpl={
+--- r22777/source/common/ucnv_set.c 2005-06-03 13:17:54.000000000 -0700
++++ chrome.canonical/source/common/ucnv_set.c 2009-03-23 12:30:09.917043000 -0700
+@@ -1,7 +1,7 @@
+ /*
+ *******************************************************************************
+ *
+-* Copyright (C) 2003-2005, International Business Machines
++* Copyright (C) 2003-2007, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *
+ *******************************************************************************
+@@ -52,7 +52,8 @@
+ uset_add,
+ uset_addRange,
+ uset_addString,
+- uset_remove
++ uset_remove,
++ uset_removeRange
+ };
+ sa.set=setFillIn;
+
+--- r22777/source/common/ucnv_bld.c 2007-08-24 02:44:10.880047000 -0700
++++ chrome.canonical/source/common/ucnv_bld.c 2009-03-23 12:40:10.653507000 -0700
+@@ -932,6 +932,7 @@
+ myUConverter->subCharLen = mySharedConverterData->staticData->subCharLen;
+ myUConverter->subChars = (uint8_t *)myUConverter->subUChars;
+ uprv_memcpy(myUConverter->subChars, mySharedConverterData->staticData->subChar, myUConverter->subCharLen);
++ myUConverter->toUCallbackReason = UCNV_ILLEGAL; /* default reason to invoke (*fromCharErrorBehaviour) */
+
+ if(mySharedConverterData->impl->open != NULL) {
+ mySharedConverterData->impl->open(myUConverter, realName, locale, options, err);
+--- r22777/source/common/ucnv_bld.h 2006-07-05 16:08:50.000000000 -0700
++++ chrome.canonical/source/common/ucnv_bld.h 2009-03-23 12:40:10.680507000 -0700
@@ -1,6 +1,6 @@
- /********************************************************************
- * COPYRIGHT:
-- * Copyright (c) 1997-2007, International Business Machines Corporation and
-+ * Copyright (c) 1997-2008, International Business Machines Corporation and
- * others. All Rights Reserved.
- ********************************************************************/
/*
-@@ -2530,13 +2530,13 @@
+ **********************************************************************
+-* Copyright (C) 1999-2006, International Business Machines
++* Copyright (C) 1999-2006,2008 International Business Machines
+ * Corporation and others. All Rights Reserved.
+ **********************************************************************
+ *
+@@ -226,6 +226,9 @@
+ char preToU[UCNV_EXT_MAX_BYTES];
+ int8_t preFromULength, preToULength; /* negative: replay */
+ int8_t preToUFirstLength; /* length of first character */
++
++ /* new fields for ICU 4.0 */
++ UConverterCallbackReason toUCallbackReason; /* (*fromCharErrorBehaviour) reason, set when error is detected */
+ };
+ U_CDECL_END /* end of UConverter */
+--- r22777/source/common/ucnv_ext.c 2007-08-22 22:46:49.525855000 -0700
++++ chrome.canonical/source/common/ucnv_ext.c 2009-03-23 12:30:33.135573000 -0700
+@@ -946,7 +946,7 @@
+ ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
+ const int32_t *cx,
+ const USetAdder *sa,
+- UConverterUnicodeSet which,
++ UBool useFallback,
+ int32_t minLength,
+ UChar32 c,
+ UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
+@@ -966,7 +966,7 @@
+ value=*fromUSectionValues++;
- static const uint8_t text943[] = {
-- 0x82, 0xa9, 0x82, 0x20, /*0xc8,*/ 0x61, 0x8a, 0xbf, 0x8e, 0x9a };
-- static const UChar toUnicode943sub[] = { 0x304b, 0xfffd, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57};
-- static const UChar toUnicode943skip[]= { 0x304b, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57};
-+ 0x82, 0xa9, 0x82, 0x20, 0x61, 0x8a, 0xbf, 0x8e, 0x9a };
-+ static const UChar toUnicode943sub[] = { 0x304b, 0x1a, 0x20, 0x0061, 0x6f22, 0x5b57 };
-+ static const UChar toUnicode943skip[]= { 0x304b, 0x20, 0x0061, 0x6f22, 0x5b57 };
- static const UChar toUnicode943stop[]= { 0x304b};
+ if( value!=0 &&
+- UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) &&
++ (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) &&
+ UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
+ ) {
+ if(c>=0) {
+@@ -987,12 +987,14 @@
+ /* no mapping, do nothing */
+ } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
+ ucnv_extGetUnicodeSetString(
+- sharedData, cx, sa, which, minLength,
++ sharedData, cx, sa, useFallback, minLength,
+ U_SENTINEL, s, length+1,
+ (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
+ pErrorCode);
+- } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
+- UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
++ } else if((useFallback ?
++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
++ ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
++ UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
+ UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
+ ) {
+ sa->addString(sa->set, s, length+1);
+@@ -1004,6 +1006,7 @@
+ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
+ const USetAdder *sa,
+ UConverterUnicodeSet which,
++ UConverterSetFilter filter,
+ UErrorCode *pErrorCode) {
+ const int32_t *cx;
+ const uint16_t *stage12, *stage3, *ps2, *ps3;
+@@ -1011,6 +1014,7 @@
-- static const int32_t fromIBM943Offssub[] = {0, 2, 4, 5, 7};
-- static const int32_t fromIBM943Offsskip[] = { 0, 4, 5, 7};
-+ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 7 };
-+ static const int32_t fromIBM943Offsskip[] = { 0, 3, 4, 5, 7 };
- static const int32_t fromIBM943Offsstop[] = { 0};
+ uint32_t value;
+ int32_t st1, stage1Length, st2, st3, minLength;
++ UBool useFallback;
- gInBufferSize = inputsize;
-@@ -2570,9 +2570,9 @@
- {
- static const uint8_t sampleText[] = {
- 0x82, 0xa9, 0x61, 0x62, 0x63 , 0x82,
-- 0xff, /*0x82, 0xa9,*/ 0x32, 0x33};
-- static const UChar toUnicode943sub[] = {0x304b, 0x0061, 0x0062, 0x0063, 0xfffd,/*0x304b,*/ 0x0032, 0x0033};
-- static const int32_t fromIBM943Offssub[] = {0, 2, 3, 4, 5, 7, 8};
-+ 0xff, 0x32, 0x33};
-+ static const UChar toUnicode943sub[] = { 0x304b, 0x0061, 0x0062, 0x0063, 0x1a, 0x1a, 0x0032, 0x0033 };
-+ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 6, 7, 8 };
- /*checking illegal value for ibm-943 with substitute*/
- gInBufferSize = inputsize;
- gOutBufferSize = outputsize;
-diff -ru trie.clean/source/test/cintltst/nucnvtst.c chrome.canonical/source/test/cintltst/nucnvtst.c
---- trie.clean/source/test/cintltst/nucnvtst.c 2007-10-11 14:52:29.172174000 -0700
-+++ chrome.canonical/source/test/cintltst/nucnvtst.c 2008-10-29 11:08:51.194286000 -0700
-@@ -2606,7 +2606,7 @@
- TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");
- /*Test for the condition where there is an invalid character*/
- {
-- static const uint8_t source2[]={0xa1, 0x01};
-+ static const uint8_t source2[]={0xa1, 0x80};
- TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_ZERO_ERROR, "an invalid character");
- }
- /*Test for the condition where we have a truncated char*/
-@@ -3899,11 +3899,11 @@
- TestISO_2022_KR() {
- /* test input */
- static const uint16_t in[]={
-- 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x000A,0x000D
-- ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC02,0xAC04
-+ 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x000D
-+ ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC04
- ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0028,0x0029
- ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x53CA,0x53CB
-- ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x53E1,0x53E2
-+ ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x53E2
- ,0x53E3,0x53E4,0x000A,0x000D};
- const UChar* uSource;
- const UChar* uSourceLimit;
-diff -ru trie.clean/source/test/testdata/conversion.txt chrome.canonical/source/test/testdata/conversion.txt
---- trie.clean/source/test/testdata/conversion.txt 2007-10-11 14:31:32.196532000 -0700
-+++ chrome.canonical/source/test/testdata/conversion.txt 2008-10-29 11:37:09.419716000 -0700
-@@ -48,13 +48,135 @@
- toUnicode {
- Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
- Cases {
-+ // Test ticket 5691: consistent illegal sequences
-+ // The following test cases are for illegal character byte sequences.
-+ //
-+ // Unfortunately, we cannot use the Shift-JIS examples from the ticket
-+ // comments because our Shift-JIS table is Windows-compatible and
-+ // therefore has no illegal single bytes. Same for GBK.
-+ // Instead, we use the stricter GB 18030 also for 2-byte examples.
-+ // The byte sequences are generally slightly different from the ticket
-+ // comment, simply using assigned characters rather than just
-+ // theoretically valid sequences.
-+ {
-+ "gb18030",
-+ :bin{ 618140813c81ff7a },
-+ "a\u4e02\\x81<\\x81\\xFFz",
-+ :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 },
-+ :int{1}, :int{0}, "", "&C", :bin{""}
-+ }
-+ {
-+ "EUC-JP",
-+ :bin{ 618fb0a98fb03c8f3cb0a97a },
-+ "a\u4e28\\x8F\\xB0<\\x8F<\u9022z",
-+ :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 },
-+ :int{1}, :int{0}, "", "&C", :bin{""}
-+ }
-+ {
-+ "gb18030",
-+ :bin{ 618130fc318130fc8181303c3e813cfc817a },
-+ "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z",
-+ :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 },
-+ :int{1}, :int{0}, "", "&C", :bin{""}
-+ }
-+ {
-+ "UTF-8",
-+ :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a },
-+ "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z",
-+ :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 },
-+ :int{1}, :int{0}, "", "&C", :bin{""}
-+ }
-+ {
-+ "ISO-2022-JP",
-+ :bin{ 1b24424141af4142affe41431b2842 },
-+ "\u758f\\xAF\u758e\\xAF\\xFE\u790e",
-+ :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 },
-+ :int{1}, :int{0}, "", "&C", :bin{""}
-+ }
-+ {
-+ "ibm-25546",
-+ :bin{ 411b242943420e4141af4142affe41430f5a },
-+ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
-+ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
-+ :int{1}, :int{0}, "", "&C", :bin{""}
-+ }
-+ {
-+ "ISO-2022-KR",
-+ :bin{ 411b242943420e4141af4142affe41430f5a },
-+ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
-+ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
-+ :int{1}, :int{0}, "", "&C", :bin{""}
-+ }
-+ {
-+ "ISO-2022-CN",
-+ :bin{ 411b242941420e4141af4142affe41430f5a },
-+ "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
-+ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
-+ :int{1}, :int{0}, "", "&C", :bin{""}
-+ }
-+ {
-+ "HZ",
-+ :bin{ 417e7b4141af4142affe41437e7d5a },
-+ "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
-+ :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 },
-+ :int{1}, :int{0}, "", "&C", :bin{""}
-+ }
-+ // Test ticket 5691: consistent illegal sequences
-+ // The following test cases are for illegal escape/designator/shift sequences.
-+ //
-+ // ISO-2022-JP and -CN with illegal escape sequences.
-+ {
-+ "ISO-2022-JP",
-+ :bin{ 611b24201b244241411b283f1b28427a },
-+ "a\\x1B$ \u758f\\x1B\u2538z",
-+ :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 },
-+ :int{1}, :int{0}, "", "&C", :bin{""}
-+ }
-+ {
-+ "ISO-2022-CN",
-+ :bin{ 611b2429201b2429410e41410f7a },
-+ "a\\x1B$) \u4eaez",
-+ :intvector{ 0,1,1,1,1,2,3,4,10,13 },
-+ :int{1}, :int{0}, "", "&C", :bin{""}
-+ }
-+ // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences.
-+ // The first ESC N comes before its designator sequence, the last sequence is ESC+space.
-+ {
-+ "ISO-2022-JP-2",
-+ :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e },
-+ "N\\x1BNNN\xceN\\x1B N",
-+ :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 },
-+ :int{1}, :int{0}, "", "&C", :bin{""}
-+ }
-+ {
-+ "ISO-2022-CN-EXT",
-+ :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e },
-+ "N\\x1BNNN\u8f0eN\\x1B N",
-+ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
-+ :int{1}, :int{0}, "", "&C", :bin{""}
-+ }
-+ {
-+ "ISO-2022-CN-EXT",
-+ :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f },
-+ "O\\x1BOOO\u492bO\\x1B O",
-+ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
-+ :int{1}, :int{0}, "", "&C", :bin{""}
-+ }
-+ // Test ticket 5691: Example from Peter Edberg.
-+ {
-+ "ISO-2022-JP",
-+ :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 },
-+ "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda",
-+ :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 },
-+ :int{1}, :int{0}, "", "?", :bin{""}
-+ }
- // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
- // using the Shift-JIS table for JIS X 0208 (ticket #5797)
- {
- "ISO-2022-JP",
- :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 },
-- "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
-- :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },
-+ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
-+ :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 },
- :int{1}, :int{1}, "", "?", :bin{""}
- }
- // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets()
-@@ -303,7 +425,7 @@
- {
- "ISO-2022-CN-EXT",
- :bin{ 411b4e2121 }, "\x41", :intvector{ 0 },
-- :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e }
-+ :int{1}, :int{1}, "illesc", ".", :bin{ 1b }
- }
- // G3 designator: recognized, but not supported for -CN (only for -CN-EXT)
- {
+ UChar s[UCNV_EXT_MAX_UCHARS];
+ UChar32 c;
+@@ -1027,10 +1031,16 @@
+
+ stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
+
++ useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
++
+ /* enumerate the from-Unicode trie table */
+ c=0; /* keep track of the current code point while enumerating */
+
+- if(sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) {
++ if(filter==UCNV_SET_FILTER_2022_CN) {
++ minLength=3;
++ } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
++ filter!=UCNV_SET_FILTER_NONE
++ ) {
+ /* DBCS-only, ignore single-byte results */
+ minLength=2;
+ } else {
+@@ -1064,14 +1074,48 @@
+ length=0;
+ U16_APPEND_UNSAFE(s, length, c);
+ ucnv_extGetUnicodeSetString(
+- sharedData, cx, sa, which, minLength,
++ sharedData, cx, sa, useFallback, minLength,
+ c, s, length,
+ (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
+ pErrorCode);
+- } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
+- UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
++ } else if((useFallback ?
++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
++ ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
++ UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
+ UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
+ ) {
++ switch(filter) {
++ case UCNV_SET_FILTER_2022_CN:
++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) {
++ continue;
++ }
++ break;
++ case UCNV_SET_FILTER_SJIS:
++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) {
++ continue;
++ }
++ break;
++ case UCNV_SET_FILTER_GR94DBCS:
++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
++ (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe - 0xa1a1) &&
++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
++ continue;
++ }
++ break;
++ case UCNV_SET_FILTER_HZ:
++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
++ (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
++ continue;
++ }
++ break;
++ default:
++ /*
++ * UCNV_SET_FILTER_NONE,
++ * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength
++ */
++ break;
++ }
+ sa->add(sa->set, c);
+ }
+ } while((++c&0xf)!=0);
« no previous file with comments | « third_party/icu38/source/test/testdata/testdata.mak ('k') | third_party/icu38/uconv.security.header.patch » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698