| Index: third_party/icu38/uconv.security.patch
|
| ===================================================================
|
| --- third_party/icu38/uconv.security.patch (revision 10949)
|
| +++ third_party/icu38/uconv.security.patch (working copy)
|
| @@ -1,7 +1,1196 @@
|
| -diff -ru trie.clean/source/common/ucnv2022.c chrome.canonical/source/common/ucnv2022.c
|
| ---- trie.clean/source/common/ucnv2022.c 2007-11-07 17:39:05.057870000 -0800
|
| -+++ chrome.canonical/source/common/ucnv2022.c 2008-10-29 12:52:22.517453000 -0700
|
| -@@ -752,6 +752,7 @@
|
| +--- r22777/source/test/cintltst/nucnvtst.c 2007-10-11 14:52:29.172174000 -0700
|
| ++++ chrome.canonical/source/test/cintltst/nucnvtst.c 2009-03-23 12:42:01.106292000 -0700
|
| +@@ -17,6 +17,7 @@
|
| + #include "unicode/uloc.h"
|
| + #include "unicode/ucnv.h"
|
| + #include "unicode/ucnv_err.h"
|
| ++#include "unicode/ucnv_cb.h"
|
| + #include "cintltst.h"
|
| + #include "unicode/utypes.h"
|
| + #include "unicode/ustring.h"
|
| +@@ -81,6 +82,7 @@
|
| + static void TestJitterbug2411(void);
|
| + static void TestJB5275(void);
|
| + static void TestJB5275_1(void);
|
| ++static void TestJitterbug6175(void);
|
| + #endif
|
| +
|
| + static void TestRoundTrippingAllUTF(void);
|
| +@@ -297,6 +299,7 @@
|
| + #if !UCONFIG_NO_LEGACY_CONVERSION
|
| + addTest(root, &TestJitterbug2346, "tsconv/nucnvtst/TestJitterbug2346");
|
| + addTest(root, &TestJitterbug2411, "tsconv/nucnvtst/TestJitterbug2411");
|
| ++ addTest(root, &TestJitterbug6175, "tsconv/nucnvtst/TestJitterbug6175");
|
| + #endif
|
| +
|
| + }
|
| +@@ -2606,7 +2609,7 @@
|
| + TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");
|
| + /*Test for the condition where there is an invalid character*/
|
| + {
|
| +- static const uint8_t source2[]={0xa1, 0x01};
|
| ++ static const uint8_t source2[]={0xa1, 0x80};
|
| + TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_ZERO_ERROR, "an invalid character");
|
| + }
|
| + /*Test for the condition where we have a truncated char*/
|
| +@@ -3899,11 +3902,11 @@
|
| + TestISO_2022_KR() {
|
| + /* test input */
|
| + static const uint16_t in[]={
|
| +- 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x000A,0x000D
|
| +- ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC02,0xAC04
|
| ++ 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x000D
|
| ++ ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC04
|
| + ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0028,0x0029
|
| + ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x53CA,0x53CB
|
| +- ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x53E1,0x53E2
|
| ++ ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x53E2
|
| + ,0x53E3,0x53E4,0x000A,0x000D};
|
| + const UChar* uSource;
|
| + const UChar* uSourceLimit;
|
| +@@ -4456,6 +4459,70 @@
|
| + free(offsets);
|
| + }
|
| +
|
| ++/* Tests for empty segments in ISO-2022-JP/KR/CN and HZ: check that the UConverterCallbackReason is UCNV_IRREGULAR. Each EmptySegmentTest pairs one converter name with the input bytes to convert with it. */
|
| ++typedef struct {
|
| ++ const char * converterName; /* name passed to ucnv_open() */
|
| ++ const char * inputText; /* bytes to convert to Unicode */
|
| ++ int inputTextLength; /* number of bytes in inputText */
|
| ++} EmptySegmentTest;
|
| ++
|
| ++/* toUnicode callback for TestJitterbug6175; it should only get called with an error reason for empty segment errors, and it logs a test error if that reason is not UCNV_IRREGULAR */
|
| ++static void UCNV_TO_U_CALLBACK_EMPTYSEGMENT( const void *context, UConverterToUnicodeArgs *toArgs, const char* codeUnits,
|
| ++ int32_t length, UConverterCallbackReason reason, UErrorCode * err ) {
|
| ++ if (reason > UCNV_IRREGULAR) { /* not an error reason (e.g. reset, close, clone): nothing to check or substitute */
|
| ++ return;
|
| ++ }
|
| ++ if (reason != UCNV_IRREGULAR) {
|
| ++ log_err("toUnicode callback invoked for empty segment but reason is not UCNV_IRREGULAR\n");
|
| ++ }
|
| ++ /* Same handling as UCNV_TO_U_CALLBACK_SUBSTITUTE: clear the error, then write the substitution character */
|
| ++ *err = U_ZERO_ERROR;
|
| ++ ucnv_cbToUWriteSub(toArgs,0,err);
|
| ++}
|
| ++
|
| ++enum { kEmptySegmentToUCharsMax = 64 }; /* capacity, in UChars, of the output buffer in TestJitterbug6175 */
|
| ++static void TestJitterbug6175(void) { /* converts each input below with UCNV_TO_U_CALLBACK_EMPTYSEGMENT installed; the callback does the checking */
|
| ++ static const char iso2022jp_a[] = { 0x61, 0x62, 0x1B,0x24,0x42, 0x1B,0x28,0x42, 0x63, 0x64, 0x0D, 0x0A }; /* empty segment: ESC $ B directly followed by ESC ( B */
|
| ++ static const char iso2022kr_a[] = { 0x1B,0x24,0x29,0x43, 0x61, 0x0E, 0x0F, 0x62, 0x0D, 0x0A }; /* empty segment: SO (0x0E) directly followed by SI (0x0F) */
|
| ++ static const char iso2022cn_a[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x0F, 0x1B,0x24,0x2A,0x48, 0x1B,0x4E, 0x6A,0x65, 0x63, 0x0D, 0x0A }; /* empty segment: SO directly followed by SI */
|
| ++ static const char iso2022cn_b[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x1B,0x24,0x29,0x47, 0x68,0x64, 0x0F, 0x63, 0x0D, 0x0A }; /* empty segment: SO directly followed by the designator ESC $ ) G */
|
| ++ static const char hzGB2312_a[] = { 0x61, 0x62, 0x7E,0x7B, 0x7E,0x7D, 0x63, 0x64 }; /* empty segment: ~{ directly followed by ~} */
|
| ++ static const EmptySegmentTest emptySegmentTests[] = {
|
| ++ /* converterName inputText inputTextLength */
|
| ++ { "ISO-2022-JP", iso2022jp_a, sizeof(iso2022jp_a) },
|
| ++ { "ISO-2022-KR", iso2022kr_a, sizeof(iso2022kr_a) },
|
| ++ { "ISO-2022-CN", iso2022cn_a, sizeof(iso2022cn_a) },
|
| ++ { "ISO-2022-CN", iso2022cn_b, sizeof(iso2022cn_b) },
|
| ++ { "HZ-GB-2312", hzGB2312_a, sizeof(hzGB2312_a) },
|
| ++ /* terminator: the loop below stops at the first entry whose converterName is NULL */
|
| ++ { NULL, NULL, 0, }
|
| ++ };
|
| ++ const EmptySegmentTest * testPtr;
|
| ++ for (testPtr = emptySegmentTests; testPtr->converterName != NULL; ++testPtr) {
|
| ++ UErrorCode err = U_ZERO_ERROR;
|
| ++ UConverter * cnv = ucnv_open(testPtr->converterName, &err);
|
| ++ if (U_FAILURE(err)) {
|
| ++ log_data_err("Unable to open %s converter: %s\n", testPtr->converterName, u_errorName(err));
|
| ++ return; /* ends the whole test: the remaining table entries are not tried */
|
| ++ }
|
| ++ ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_EMPTYSEGMENT, NULL, NULL, NULL, &err);
|
| ++ if (U_FAILURE(err)) {
|
| ++ log_data_err("Unable to setToUCallBack for %s converter: %s\n", testPtr->converterName, u_errorName(err));
|
| ++ ucnv_close(cnv);
|
| ++ return; /* ends the whole test as well, after closing the converter */
|
| ++ }
|
| ++ {
|
| ++ UChar toUChars[kEmptySegmentToUCharsMax];
|
| ++ UChar * toUCharsPtr = toUChars;
|
| ++ const UChar * toUCharsLimit = toUCharsPtr + kEmptySegmentToUCharsMax;
|
| ++ const char * inCharsPtr = testPtr->inputText;
|
| ++ const char * inCharsLimit = inCharsPtr + testPtr->inputTextLength;
|
| ++ ucnv_toUnicode(cnv, &toUCharsPtr, toUCharsLimit, &inCharsPtr, inCharsLimit, NULL, TRUE, &err); /* err and the converted output are not inspected here; all checking happens in the callback */
|
| ++ }
|
| ++ ucnv_close(cnv);
|
| ++ }
|
| ++}
|
| ++
|
| + static void
|
| + TestEBCDIC_STATEFUL() {
|
| + /* test input */
|
| +--- r22777/source/test/cintltst/ncnvtst.c 2007-01-24 15:27:45.575224000 -0800
|
| ++++ chrome.canonical/source/test/cintltst/ncnvtst.c 2009-03-23 12:30:17.291031000 -0700
|
| +@@ -1928,7 +1928,7 @@
|
| + #if !UCONFIG_NO_LEGACY_CONVERSION
|
| + { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff },
|
| + { "windows-1251", 0, 0x7f, 0x410, 0x44f, 0x3000, 0xd7ff },
|
| +- { "HZ", 0x410, 0x44f, 0x4e00, 0x4eff, 0xac00, 0xd7ff },
|
| ++ /* HZ test case fixed and moved to intltest's conversion.txt, ticket #6002 */
|
| + { "shift-jis", 0x3041, 0x3093, 0x30a1, 0x30f3, 0x900, 0x1cff }
|
| + #else
|
| + { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff }
|
| +--- r22777/source/test/intltest/convtest.h 2007-07-26 20:12:12.288784000 -0700
|
| ++++ chrome.canonical/source/test/intltest/convtest.h 2009-03-23 12:30:09.445194000 -0700
|
| +@@ -72,6 +72,7 @@
|
| + void TestToUnicode();
|
| + void TestFromUnicode();
|
| + void TestGetUnicodeSet();
|
| ++ void TestGetUnicodeSet2();
|
| +
|
| + private:
|
| + UBool
|
| +--- r22777/source/test/intltest/convtest.cpp 2007-03-08 16:28:01.852223000 -0800
|
| ++++ chrome.canonical/source/test/intltest/convtest.cpp 2009-03-23 12:30:40.161868000 -0700
|
| +@@ -70,6 +70,7 @@
|
| + case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
|
| + case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
|
| + case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
|
| ++ case 3: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break;
|
| + default: name=""; break; //needed to end loop
|
| + }
|
| + }
|
| +@@ -465,6 +466,183 @@
|
| + }
|
| + }
|
| +
|
| ++U_CDECL_BEGIN
|
| ++static void U_CALLCONV // fromUnicode callback: context is a UnicodeSet from which every code point reported as an error is removed
|
| ++getUnicodeSetCallback(const void *context,
|
| ++ UConverterFromUnicodeArgs *fromUArgs,
|
| ++ const UChar* codeUnits,
|
| ++ int32_t length,
|
| ++ UChar32 codePoint,
|
| ++ UConverterCallbackReason reason,
|
| ++ UErrorCode *pErrorCode) {
|
| ++ if(reason<=UCNV_IRREGULAR) {
|
| ++ ((UnicodeSet *)context)->remove(codePoint); // the converter cannot convert this code point
|
| ++ *pErrorCode=U_ZERO_ERROR; // skip: clear the error and write no output for this code point
|
| ++ } // else ignore the reset, close and clone calls.
|
| ++}
|
| ++U_CDECL_END
|
| ++
|
| ++// Compare ucnv_getUnicodeSet() with the set of characters that can be converted: for each converter and each UConverterUnicodeSet value, convert a string of all code points with getUnicodeSetCallback installed and compare what remains in its set with ucnv_getUnicodeSet().
|
| ++void
|
| ++ConversionTest::TestGetUnicodeSet2() {
|
| ++ // Build a string with all code points.
|
| ++ UChar32 cpLimit;
|
| ++ int32_t s0Length;
|
| ++ if(quick) {
|
| ++ cpLimit=s0Length=0x10000; // BMP only
|
| ++ } else {
|
| ++ cpLimit=0x110000;
|
| ++ s0Length=0x10000+0x200000; // BMP + surrogate pairs (0x400 leads * 0x400 trails * 2 UChars each)
|
| ++ }
|
| ++ UChar *s0=new UChar[s0Length];
|
| ++ if(s0==NULL) {
|
| ++ return; // allocation failed; the test ends without reporting anything
|
| ++ }
|
| ++ UChar *s=s0;
|
| ++ UChar32 c;
|
| ++ UChar c2;
|
| ++ // low BMP
|
| ++ for(c=0; c<=0xd7ff; ++c) {
|
| ++ *s++=(UChar)c;
|
| ++ }
|
| ++ // trail surrogates
|
| ++ for(c=0xdc00; c<=0xdfff; ++c) {
|
| ++ *s++=(UChar)c;
|
| ++ }
|
| ++ // lead surrogates
|
| ++ // (after trails so that there is not even one surrogate pair in between)
|
| ++ for(c=0xd800; c<=0xdbff; ++c) {
|
| ++ *s++=(UChar)c;
|
| ++ }
|
| ++ // high BMP
|
| ++ for(c=0xe000; c<=0xffff; ++c) {
|
| ++ *s++=(UChar)c;
|
| ++ }
|
| ++ // supplementary code points = surrogate pairs
|
| ++ if(cpLimit==0x110000) {
|
| ++ for(c=0xd800; c<=0xdbff; ++c) {
|
| ++ for(c2=0xdc00; c2<=0xdfff; ++c2) {
|
| ++ *s++=(UChar)c;
|
| ++ *s++=c2;
|
| ++ }
|
| ++ }
|
| ++ }
|
| ++
|
| ++ static const char *const cnvNames[]={
|
| ++ "UTF-8",
|
| ++ "UTF-7",
|
| ++ "UTF-16",
|
| ++ "US-ASCII",
|
| ++ "ISO-8859-1",
|
| ++ "windows-1252",
|
| ++ "Shift-JIS",
|
| ++ "ibm-1390", // EBCDIC_STATEFUL table
|
| ++ "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table
|
| ++ "HZ",
|
| ++ "ISO-2022-JP",
|
| ++ "JIS7",
|
| ++ "ISO-2022-CN",
|
| ++ "ISO-2022-CN-EXT",
|
| ++ "LMBCS"
|
| ++ };
|
| ++ char buffer[1024];
|
| ++ int32_t i;
|
| ++ for(i=0; i<LENGTHOF(cnvNames); ++i) {
|
| ++ UErrorCode errorCode=U_ZERO_ERROR;
|
| ++ UConverter *cnv=cnv_open(cnvNames[i], errorCode);
|
| ++ if(U_FAILURE(errorCode)) {
|
| ++ errln("failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode));
|
| ++ continue;
|
| ++ }
|
| ++ UnicodeSet expected;
|
| ++ ucnv_setFromUCallBack(cnv, getUnicodeSetCallback, &expected, NULL, NULL, &errorCode);
|
| ++ if(U_FAILURE(errorCode)) {
|
| ++ errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode));
|
| ++ ucnv_close(cnv);
|
| ++ continue;
|
| ++ }
|
| ++ UConverterUnicodeSet which;
|
| ++ for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) {
|
| ++ if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
|
| ++ ucnv_setFallback(cnv, TRUE);
|
| ++ }
|
| ++ expected.add(0, cpLimit-1); // start from every code point in the string; the callback removes the ones that fail to convert
|
| ++ s=s0;
|
| ++ UBool flush;
|
| ++ do {
|
| ++ char *t=buffer;
|
| ++ flush=(UBool)(s==s0+s0Length); // TRUE only once earlier calls have consumed all of the input
|
| ++ ucnv_fromUnicode(cnv, &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode);
|
| ++ if(U_FAILURE(errorCode)) {
|
| ++ if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
|
| ++ errorCode=U_ZERO_ERROR;
|
| ++ continue;
|
| ++ } else {
|
| ++ break; // unexpected error, should not occur (leaves the do-while only; nothing is reported and errorCode is not reset)
|
| ++ }
|
| ++ }
|
| ++ } while(!flush);
|
| ++ UnicodeSet set;
|
| ++ ucnv_getUnicodeSet(cnv, (USet *)&set, which, &errorCode);
|
| ++ if(cpLimit<0x110000) {
|
| ++ set.remove(cpLimit, 0x10ffff);
|
| ++ }
|
| ++ if(which==UCNV_ROUNDTRIP_SET) {
|
| ++ // ignore PUA code points because they will be converted even if they
|
| ++ // are fallbacks and when other fallbacks are turned off,
|
| ++ // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips
|
| ++ expected.remove(0xe000, 0xf8ff);
|
| ++ expected.remove(0xf0000, 0xffffd);
|
| ++ expected.remove(0x100000, 0x10fffd);
|
| ++ set.remove(0xe000, 0xf8ff);
|
| ++ set.remove(0xf0000, 0xffffd);
|
| ++ set.remove(0x100000, 0x10fffd);
|
| ++ }
|
| ++ if(set!=expected) {
|
| ++ // First try to see if we have different sets because ucnv_getUnicodeSet()
|
| ++ // added strings: The above conversion method does not tell us what strings might be convertible.
|
| ++ // Remove strings from the set and compare again.
|
| ++ // Unfortunately, there are no good, direct set methods for finding out whether there are strings
|
| ++ // in the set, nor for enumerating or removing just them.
|
| ++ // Intersect all code points with the set. The intersection will not contain strings.
|
| ++ UnicodeSet temp(0, 0x10ffff);
|
| ++ temp.retainAll(set);
|
| ++ set=temp;
|
| ++ }
|
| ++ if(set!=expected) {
|
| ++ UnicodeSet diffSet;
|
| ++ UnicodeString out;
|
| ++
|
| ++ // are there items that must be in the set but are not?
|
| ++ (diffSet=expected).removeAll(set);
|
| ++ if(!diffSet.isEmpty()) {
|
| ++ diffSet.toPattern(out, TRUE);
|
| ++ if(out.length()>100) {
|
| ++ out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
|
| ++ }
|
| ++ errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",
|
| ++ cnvNames[i], which);
|
| ++ errln(out);
|
| ++ }
|
| ++
|
| ++ // are there items that must not be in the set but are?
|
| ++ (diffSet=set).removeAll(expected);
|
| ++ if(!diffSet.isEmpty()) {
|
| ++ diffSet.toPattern(out, TRUE);
|
| ++ if(out.length()>100) {
|
| ++ out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsis));
|
| ++ }
|
| ++ errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d",
|
| ++ cnvNames[i], which);
|
| ++ errln(out);
|
| ++ }
|
| ++ }
|
| ++ }
|
| ++ } // NOTE(review): cnv is closed only on the callback-setup failure path above, not here - looks like one leaked converter per iteration; confirm and add ucnv_close(cnv) (hunk line counts must then be updated)
|
| ++
|
| ++ delete [] s0;
|
| ++}
|
| ++
|
| + // open testdata or ICU data converter ------------------------------------- ***
|
| +
|
| + UConverter *
|
| +--- r22777/source/test/testdata/testdata.mak 2007-07-26 20:12:12.288784000 -0700
|
| ++++ chrome.canonical/source/test/testdata/testdata.mak 2009-03-23 12:31:04.424645000 -0700
|
| +@@ -28,7 +28,7 @@
|
| +
|
| + TEST_RES_FILES = $(TEST_RES_SOURCE:.txt=.res)
|
| +
|
| +-"$(TESTDATAOUT)\testdata.dat" : $(TEST_RES_FILES) "$(TESTDATABLD)\casing.res" "$(TESTDATABLD)\conversion.res" "$(TESTDATABLD)\icuio.res" "$(TESTDATABLD)\mc.res" "$(TESTDATABLD)\structLocale.res" "$(TESTDATABLD)\root.res" "$(TESTDATABLD)\sh.res" "$(TESTDATABLD)\sh_YU.res" "$(TESTDATABLD)\te.res" "$(TESTDATABLD)\te_IN.res" "$(TESTDATABLD)\te_IN_REVISED.res" "$(TESTDATABLD)\testaliases.res" "$(TESTDATABLD)\testtypes.res" "$(TESTDATABLD)\testempty.res" "$(TESTDATABLD)\iscii.res" "$(TESTDATABLD)\idna_rules.res" "$(TESTDATABLD)\DataDrivenCollationTest.res" "$(TESTDATABLD)\test.icu" "$(TESTDATABLD)\testtable32.res" "$(TESTDATABLD)\test1.cnv" "$(TESTDATABLD)\test3.cnv" "$(TESTDATABLD)\test4.cnv" "$(TESTDATABLD)\test4x.cnv" "$(TESTDATABLD)\ibm9027.cnv" "$(TESTDATABLD)\nfscsi.spp" "$(TESTDATABLD)\nfscss.spp" "$(TESTDATABLD)\nfscis.spp" "$(TESTDATABLD)\nfsmxs.spp" "$(TESTDATABLD)\nfsmxp.spp"
|
| ++"$(TESTDATAOUT)\testdata.dat" : $(TEST_RES_FILES) "$(TESTDATABLD)\casing.res" "$(TESTDATABLD)\conversion.res" "$(TESTDATABLD)\icuio.res" "$(TESTDATABLD)\mc.res" "$(TESTDATABLD)\structLocale.res" "$(TESTDATABLD)\root.res" "$(TESTDATABLD)\sh.res" "$(TESTDATABLD)\sh_YU.res" "$(TESTDATABLD)\te.res" "$(TESTDATABLD)\te_IN.res" "$(TESTDATABLD)\te_IN_REVISED.res" "$(TESTDATABLD)\testaliases.res" "$(TESTDATABLD)\testtypes.res" "$(TESTDATABLD)\testempty.res" "$(TESTDATABLD)\iscii.res" "$(TESTDATABLD)\idna_rules.res" "$(TESTDATABLD)\DataDrivenCollationTest.res" "$(TESTDATABLD)\test.icu" "$(TESTDATABLD)\testtable32.res" "$(TESTDATABLD)\test1.cnv" "$(TESTDATABLD)\test1bmp.cnv" "$(TESTDATABLD)\test3.cnv" "$(TESTDATABLD)\test4.cnv" "$(TESTDATABLD)\test4x.cnv" "$(TESTDATABLD)\ibm9027.cnv" "$(TESTDATABLD)\nfscsi.spp" "$(TESTDATABLD)\nfscss.spp" "$(TESTDATABLD)\nfscis.spp" "$(TESTDATABLD)\nfsmxs.spp" "$(TESTDATABLD)\nfsmxp.spp"
|
| + @echo Building test data
|
| + @copy "$(TESTDATABLD)\te.res" "$(TESTDATAOUT)\$(TESTDT)\nam.typ"
|
| + @copy "$(TESTDATA)\icu26_testtypes.res" "$(TESTDATABLD)"
|
| +@@ -54,6 +54,7 @@
|
| + iscii.res
|
| + test.icu
|
| + test1.cnv
|
| ++test1bmp.cnv
|
| + test3.cnv
|
| + test4.cnv
|
| + test4x.cnv
|
| +@@ -126,6 +127,10 @@
|
| + @echo Building $@
|
| + @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**
|
| +
|
| ++"$(TESTDATABLD)\test1bmp.cnv": "$(TESTDATA)\test1bmp.ucm"
|
| ++ @echo Building $@
|
| ++ @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**
|
| ++
|
| + "$(TESTDATABLD)\test3.cnv": "$(TESTDATA)\test3.ucm"
|
| + @echo Building $@
|
| + @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**
|
| +--- r22777/source/test/testdata/Makefile.in 2007-08-21 13:15:55.267002000 -0700
|
| ++++ chrome.canonical/source/test/testdata/Makefile.in 2009-03-23 12:31:04.435635000 -0700
|
| +@@ -117,7 +117,7 @@
|
| + TEST_DAT_FILES=$(TESTBUILDDIR)/test.icu
|
| + TEST_SPP_FILES=$(TESTBUILDDIR)/nfscsi.spp $(TESTBUILDDIR)/nfscss.spp $(TESTBUILDDIR)/nfscis.spp $(TESTBUILDDIR)/nfsmxs.spp $(TESTBUILDDIR)/nfsmxp.spp
|
| +
|
| +-TEST_UCM_SOURCE= test1.ucm test3.ucm test4.ucm test4x.ucm ibm9027.ucm
|
| ++TEST_UCM_SOURCE= test1.ucm test1bmp.ucm test3.ucm test4.ucm test4x.ucm ibm9027.ucm
|
| + TEST_UCM_FILES=$(TEST_UCM_SOURCE:%=$(TESTSRCDATADIR)/data/%)
|
| + TEST_CNV_FILES=$(TEST_UCM_SOURCE:%.ucm=$(TESTBUILDDIR)/%.cnv)
|
| +
|
| +--- r22777/source/test/testdata/conversion.txt 2007-10-11 14:31:32.196532000 -0700
|
| ++++ chrome.canonical/source/test/testdata/conversion.txt 2009-03-23 12:42:01.119267000 -0700
|
| +@@ -1,6 +1,6 @@
|
| + //*******************************************************************************
|
| + //
|
| +-// Copyright (C) 2003-2007, International Business Machines
|
| ++// Copyright (C) 2003-2008, International Business Machines
|
| + // Corporation and others. All Rights Reserved.
|
| + //
|
| + // file name: conversion.txt
|
| +@@ -48,13 +48,161 @@
|
| + toUnicode {
|
| + Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
|
| + Cases {
|
| ++ // Test ticket 5691: consistent illegal sequences
|
| ++ // The following test cases are for illegal character byte sequences.
|
| ++ //
|
| ++ // Unfortunately, we cannot use the Shift-JIS examples from the ticket
|
| ++ // comments because our Shift-JIS table is Windows-compatible and
|
| ++ // therefore has no illegal single bytes. Same for GBK.
|
| ++ // Instead, we use the stricter GB 18030 also for 2-byte examples.
|
| ++ // The byte sequences are generally slightly different from the ticket
|
| ++ // comment, simply using assigned characters rather than just
|
| ++ // theoretically valid sequences.
|
| ++ {
|
| ++ "gb18030",
|
| ++ :bin{ 618140813c81ff7a },
|
| ++ "a\u4e02\\x81<\\x81\\xFFz",
|
| ++ :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 },
|
| ++ :int{1}, :int{0}, "", "&C", :bin{""}
|
| ++ }
|
| ++ {
|
| ++ "EUC-JP",
|
| ++ :bin{ 618fb0a98fb03c8f3cb0a97a },
|
| ++ "a\u4e28\\x8F\\xB0<\\x8F<\u9022z",
|
| ++ :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 },
|
| ++ :int{1}, :int{0}, "", "&C", :bin{""}
|
| ++ }
|
| ++ {
|
| ++ "gb18030",
|
| ++ :bin{ 618130fc318130fc8181303c3e813cfc817a },
|
| ++ "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z",
|
| ++ :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 },
|
| ++ :int{1}, :int{0}, "", "&C", :bin{""}
|
| ++ }
|
| ++ {
|
| ++ "UTF-8",
|
| ++ :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a },
|
| ++ "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z",
|
| ++ :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 },
|
| ++ :int{1}, :int{0}, "", "&C", :bin{""}
|
| ++ }
|
| ++ {
|
| ++ "ISO-2022-JP",
|
| ++ :bin{ 1b24424141af4142affe41431b2842 },
|
| ++ "\u758f\\xAF\u758e\\xAF\\xFE\u790e",
|
| ++ :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 },
|
| ++ :int{1}, :int{0}, "", "&C", :bin{""}
|
| ++ }
|
| ++ {
|
| ++ "ibm-25546",
|
| ++ :bin{ 411b242943420e4141af4142affe41430f5a },
|
| ++ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
|
| ++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
|
| ++ :int{1}, :int{0}, "", "&C", :bin{""}
|
| ++ }
|
| ++ {
|
| ++ "ISO-2022-KR",
|
| ++ :bin{ 411b242943420e4141af4142affe41430f5a },
|
| ++ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
|
| ++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
|
| ++ :int{1}, :int{0}, "", "&C", :bin{""}
|
| ++ }
|
| ++ {
|
| ++ "ISO-2022-CN",
|
| ++ :bin{ 411b242941420e4141af4142affe41430f5a },
|
| ++ "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
|
| ++ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
|
| ++ :int{1}, :int{0}, "", "&C", :bin{""}
|
| ++ }
|
| ++ {
|
| ++ "HZ",
|
| ++ :bin{ 417e7b4141af4142affe41437e7d5a },
|
| ++ "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
|
| ++ :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 },
|
| ++ :int{1}, :int{0}, "", "&C", :bin{""}
|
| ++ }
|
| ++ // Test ticket 5691: consistent illegal sequences
|
| ++ // The following test cases are for illegal escape/designator/shift sequences.
|
| ++ //
|
| ++ // ISO-2022-JP and -CN with illegal escape sequences.
|
| ++ {
|
| ++ "ISO-2022-JP",
|
| ++ :bin{ 611b24201b244241411b283f1b28427a },
|
| ++ "a\\x1B$ \u758f\\x1B\u2538z",
|
| ++ :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 },
|
| ++ :int{1}, :int{0}, "", "&C", :bin{""}
|
| ++ }
|
| ++ {
|
| ++ "ISO-2022-CN",
|
| ++ :bin{ 611b2429201b2429410e41410f7a },
|
| ++ "a\\x1B$) \u4eaez",
|
| ++ :intvector{ 0,1,1,1,1,2,3,4,10,13 },
|
| ++ :int{1}, :int{0}, "", "&C", :bin{""}
|
| ++ }
|
| ++ // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences.
|
| ++ // The first ESC N comes before its designator sequence, the last sequence is ESC+space.
|
| ++ {
|
| ++ "ISO-2022-JP-2",
|
| ++ :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e },
|
| ++ "N\\x1BNNN\xceN\\x1B N",
|
| ++ :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 },
|
| ++ :int{1}, :int{0}, "", "&C", :bin{""}
|
| ++ }
|
| ++ {
|
| ++ "ISO-2022-CN-EXT",
|
| ++ :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e },
|
| ++ "N\\x1BNNN\u8f0eN\\x1B N",
|
| ++ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
|
| ++ :int{1}, :int{0}, "", "&C", :bin{""}
|
| ++ }
|
| ++ {
|
| ++ "ISO-2022-CN-EXT",
|
| ++ :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f },
|
| ++ "O\\x1BOOO\u492bO\\x1B O",
|
| ++ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
|
| ++ :int{1}, :int{0}, "", "&C", :bin{""}
|
| ++ }
|
| ++ // Test ticket 5691: HZ with illegal tilde sequences.
|
| ++ {
|
| ++ "HZ",
|
| ++ :bin{ 417e20427e21437e80447e7b41417e207e41427e7f41437e7d5a },
|
| ++ "A\\x7E B\\x7E!C\\x7E\\x80D\u4eae\\x7E\\x20\\x7E\u8c05\\x7E\\x7F\u64a9Z",
|
| ++ :intvector{ 0,1,1,1,1,2,3,4,4,4,4,5,6,7,7,7,7,7,7,7,7,9, // SBCS
|
| ++ 12,14,14,14,14,14,14,14,14,16,16,16,16,17,19,19,19,19,19,19,19,19,21, // DBCS
|
| ++ 25 }, // SBCS
|
| ++ :int{1}, :int{0}, "", "&C", :bin{""}
|
| ++ }
|
| ++ // Test ticket 5691: Example from Peter Edberg.
|
| ++ {
|
| ++ "ISO-2022-JP",
|
| ++ :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 },
|
| ++ "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda",
|
| ++ :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 },
|
| ++ :int{1}, :int{0}, "", "?", :bin{""}
|
| ++ }
|
| ++ // Test bug 6071 (2:1 Unicode:charset SBCS mapping).
|
| ++ {
|
| ++ "*test1bmp",
|
| ++ :bin{ 050008 },
|
| ++ "e@uv",
|
| ++ :intvector{ 0,1,2,2 },
|
| ++ :int{1}, :int{1}, "", "?", :bin{""}
|
| ++ }
|
| ++ // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e
|
| ++ {
|
| ++ "HZ",
|
| ++ :bin{ 7e7b21212120217e217f772100007e217e7e7d207e7e807e0a2b },
|
| ++ "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd\u3013 ~\ufffd+",
|
| ++ :intvector{ 2,4,6,8,10,12,14,15,19,20,22,25 },
|
| ++ :int{1}, :int{1}, "", "?", :bin{""}
|
| ++ }
|
| + // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
|
| + // using the Shift-JIS table for JIS X 0208 (ticket #5797)
|
| + {
|
| + "ISO-2022-JP",
|
| + :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 },
|
| +- "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
|
| +- :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },
|
| ++ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
|
| ++ :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 },
|
| + :int{1}, :int{1}, "", "?", :bin{""}
|
| + }
|
| + // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets()
|
| +@@ -191,6 +339,21 @@
|
| + :intvector{ 0, 5, 7, 9, 9, 9, 9, 9, 9, 9, 9, 12 },
|
| + :int{1}, :int{1}, "", "&", :bin{""}
|
| + }
|
| ++ // empty segment (using substitution and stop)
|
| ++ {
|
| ++ "ISO-2022-KR",
|
| ++ :bin{ 1b242943610e0f620d0a },
|
| ++ "a\uFFFDb\u000D\u000A",
|
| ++ :intvector{ 4, 6, 7, 8, 9 },
|
| ++ :int{1}, :int{1}, "", "?", :bin{""}
|
| ++ }
|
| ++ {
|
| ++ "ISO-2022-KR",
|
| ++ :bin{ 1b242943610e0f620d0a },
|
| ++ "a",
|
| ++ :intvector{ 4 },
|
| ++ :int{1}, :int{1}, "illesc", ".", :bin{"0f"}
|
| ++ }
|
| +
|
| + // ISO-2022-JP
|
| +
|
| +@@ -241,6 +404,21 @@
|
| + :bin{ 41c15c1b284a5cc242 }, "A\uff81\\\xa5\uff82B", :intvector{ 0, 1, 2, 6, 7, 8 },
|
| + :int{1}, :int{1}, "", ".", :bin{""}
|
| + }
|
| ++ // empty segment (using substitution and stop)
|
| ++ {
|
| ++ "ISO-2022-JP",
|
| ++ :bin{ 61621b24421b284263640d0a },
|
| ++ "ab\uFFFDcd\u000D\u000A",
|
| ++ :intvector{ 0, 1, 5, 8, 9, 10, 11 },
|
| ++ :int{1}, :int{1}, "", "?", :bin{""}
|
| ++ }
|
| ++ {
|
| ++ "ISO-2022-JP",
|
| ++ :bin{ 61621b24421b284263640d0a },
|
| ++ "ab",
|
| ++ :intvector{ 0, 1 },
|
| ++ :int{1}, :int{1}, "illesc", ".", :bin{"1b2842"}
|
| ++ }
|
| +
|
| + // ISO-2022-CN
|
| +
|
| +@@ -303,7 +481,7 @@
|
| + {
|
| + "ISO-2022-CN-EXT",
|
| + :bin{ 411b4e2121 }, "\x41", :intvector{ 0 },
|
| +- :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e }
|
| ++ :int{1}, :int{1}, "illesc", ".", :bin{ 1b }
|
| + }
|
| + // G3 designator: recognized, but not supported for -CN (only for -CN-EXT)
|
| + {
|
| +@@ -311,6 +489,36 @@
|
| + :bin{ 411b242b491b4f2121 }, "\x41", :intvector{ 0 },
|
| + :int{1}, :int{1}, "unsuppesc", ".", :bin{ 1b242b49 }
|
| + }
|
| ++ // empty segment 1 (using substitution and stop)
|
| ++ {
|
| ++ "ISO-2022-CN",
|
| ++ :bin{ 611b242941620e0f1b242a481b4e6a65630d0a },
|
| ++ "ab\uFFFD\u994Cc\u000D\u000A",
|
| ++ :intvector{ 0, 5, 7, 14, 16, 17, 18 },
|
| ++ :int{1}, :int{1}, "", "?", :bin{""}
|
| ++ }
|
| ++ {
|
| ++ "ISO-2022-CN",
|
| ++ :bin{ 611b242941620e0f1b242a481b4e6a65630d0a },
|
| ++ "ab",
|
| ++ :intvector{ 0, 5 },
|
| ++ :int{1}, :int{1}, "illesc", ".", :bin{"0f"}
|
| ++ }
|
| ++ // empty segment 2 (using substitution and stop)
|
| ++ {
|
| ++ "ISO-2022-CN",
|
| ++ :bin{ 611b242941620e1b24294768640f630d0a },
|
| ++ "ab\uFFFD\u5F70c\u000D\u000A",
|
| ++ :intvector{ 0, 5, 7, 11, 14, 15, 16 },
|
| ++ :int{1}, :int{1}, "", "?", :bin{""}
|
| ++ }
|
| ++ {
|
| ++ "ISO-2022-CN",
|
| ++ :bin{ 611b242941620e1b24294768640f630d0a },
|
| ++ "ab",
|
| ++ :intvector{ 0, 5 },
|
| ++ :int{1}, :int{1}, "illesc", ".", :bin{"1b242947"}
|
| ++ }
|
| +
|
| + // ISO-2022 SBCS
|
| + // [U_ENABLE_GENERIC_ISO_2022]
|
| +@@ -325,6 +533,39 @@
|
| + // :int{1}, :int{1}, "", ".", :bin{""}
|
| + //}
|
| +
|
| ++ // HZ-GB-2312
|
| ++
|
| ++ // empty segment 1 (using substitution and stop)
|
| ++ {
|
| ++ "HZ-GB-2312",
|
| ++ :bin{ 61627e7b7e7d6364 },
|
| ++ "ab\uFFFDcd",
|
| ++ :intvector{ 0, 1, 4, 6, 7 },
|
| ++ :int{1}, :int{1}, "", "?", :bin{""}
|
| ++ }
|
| ++ {
|
| ++ "HZ-GB-2312",
|
| ++ :bin{ 61627e7b7e7d63640d0a },
|
| ++ "ab",
|
| ++ :intvector{ 0, 1 },
|
| ++ :int{1}, :int{1}, "illesc", ".", :bin{"7e7d"}
|
| ++ }
|
| ++ // empty segment 2 & legal redundant switches (using substitution and stop)
|
| ++ {
|
| ++ "HZ-GB-2312",
|
| ++ :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d },
|
| ++ "ab\u4E0D\u7A7A\uFFFD\u4E00cdef\uFFFD",
|
| ++ :intvector{ 0, 1, 4, 6, 10, 12, 16, 17, 20, 21, 24 },
|
| ++ :int{1}, :int{1}, "", "?", :bin{""}
|
| ++ }
|
| ++ {
|
| ++ "HZ-GB-2312",
|
| ++ :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d },
|
| ++ "ab\u4E0D\u7A7A",
|
| ++ :intvector{ 0, 1, 4, 6 },
|
| ++ :int{1}, :int{1}, "illesc", ".", :bin{"7e7b"}
|
| ++ }
|
| ++
|
| + // DBCS-only extensions
|
| + {
|
| + "ibm-970",
|
| +@@ -496,6 +737,14 @@
|
| + :intvector{ 0, 4, 8, 12 },
|
| + :int{1}, :int{0}, "", "?", :bin{""}
|
| + }
|
| ++ // Test iso-2022-jp-2 miscellaneous symbols
|
| ++ {
|
| ++ "iso-2022-jp-2",
|
| ++ :bin{ 1b242843224f224e1b2842 },
|
| ++ "\u260E\u260F",
|
| ++ :intvector{ 4, 6 },
|
| ++ :int{1}, :int{0}, "", ".", :bin{""}
|
| ++ }
|
| + }
|
| + }
|
| +
|
| +@@ -504,6 +753,14 @@
|
| + fromUnicode {
|
| + Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }
|
| + Cases {
|
| ++ // Test bug 6071 (1:2 Unicode:charset SBCS mapping).
|
| ++ {
|
| ++ "*test1bmp",
|
| ++ "e@t",
|
| ++ :bin{ 05000709 },
|
| ++ :intvector{ 0,1,2,2 },
|
| ++ :int{1}, :int{0}, "", "?", ""
|
| ++ }
|
| + // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
|
| + // using the Shift-JIS table for JIS X 0208 (ticket #5797)
|
| + {
|
| +@@ -1311,16 +1568,29 @@
|
| + // versions of ISO-2022-JP
|
| + {
|
| + "ISO-2022-JP",
|
| +- "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]",
|
| +- "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]",
|
| ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2015\u203e\u4e00\u4e01\uffe5]",
|
| ++ "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u2014\u301c\u4e02\u4e27-\u4e29\u4fe0\u663b\u9eb5\ufa0e-\ufa2d\uff61-\uff9f\uffe4\uffe6-\U0010ffff]",
|
| + :int{0}
|
| +- }
|
| ++ }
|
| + {
|
| + "ISO-2022-JP-2",
|
| +- "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]",
|
| +- "[\x0e\x0f\x1b\uffe7-\U0010ffff]",
|
| ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uffe6]",
|
| ++ "[\x0e\x0f\x1b\uff61-\uff9f\uffe4\uffe7-\U0010ffff]",
|
| ++ :int{0}
|
| ++ }
|
| ++ {
|
| ++ "JIS7",
|
| ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uff61-\uff9f\uffe6]",
|
| ++ "[\x0e\x0f\x1b\uffe4\uffe7-\U0010ffff]",
|
| + :int{0}
|
| + }
|
| ++ // with fallbacks
|
| ++ {
|
| ++ "ISO-2022-JP",
|
| ++ "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2014\u2015\u203e\u301c\u4e00\u4e01\u4fe0\u9eb5\uff61-\uff9f\uffe5]",
|
| ++ "[\x0e\x0f\x1b\xa6\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\u663b\ufa0e-\ufa2d\uffe4\uffe6-\U0010ffff]",
|
| ++ :int{1}
|
| ++ }
|
| +
|
| + // versions of ISO-2022-CN
|
| + {
|
| +@@ -1336,6 +1606,22 @@
|
| + :int{0}
|
| + }
|
| +
|
| ++ // HZ
|
| ++ {
|
| ++ "HZ",
|
| ++ "[\u0410-\u044f\u4e00\u4e01\u4e03]",
|
| ++ "[\u4e02\u4e04-\u4e06\uac00-\ud7ff]",
|
| ++ :int{0}
|
| ++ }
|
| ++
|
| ++ // LMBCS
|
| ++ {
|
| ++ "LMBCS",
|
| ++ "[\x00-\U0010ffff]",
|
| ++ "[]",
|
| ++ :int{0}
|
| ++ }
|
| ++
|
| + // DBCS-only
|
| + {
|
| + "ibm-971",
|
| +--- r22777/source/common/ucnv_ext.h 2007-08-22 22:46:49.525855000 -0700
|
| ++++ chrome.canonical/source/common/ucnv_ext.h 2009-03-23 12:30:09.644121000 -0700
|
| +@@ -382,10 +382,20 @@
|
| + UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
|
| + UErrorCode *pErrorCode);
|
| +
|
| ++/*
|
| ++ * Add code points and strings to the set according to the extension mappings.
|
| ++ * Limitation on the UConverterSetFilter:
|
| ++ * The filters currently assume that they are used with 1:1 mappings.
|
| ++ * They only apply to single input code points, and then they pass through
|
| ++ * only mappings with single-charset-code results.
|
| ++ * For example, the Shift-JIS filter only works for 2-byte results and tests
|
| ++ * that those 2 bytes are in the JIS X 0208 range of Shift-JIS.
|
| ++ */
|
| + U_CFUNC void
|
| + ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
|
| + const USetAdder *sa,
|
| + UConverterUnicodeSet which,
|
| ++ UConverterSetFilter filter,
|
| + UErrorCode *pErrorCode);
|
| +
|
| + /* toUnicode helpers -------------------------------------------------------- */
|
| +--- r22777/source/common/ucnvmbcs.c 2007-10-11 14:31:32.196532000 -0700
|
| ++++ chrome.canonical/source/common/ucnvmbcs.c 2009-03-23 12:42:01.150242000 -0700
|
| +@@ -1,7 +1,7 @@
|
| + /*
|
| + ******************************************************************************
|
| + *
|
| +-* Copyright (C) 2000-2007, International Business Machines
|
| ++* Copyright (C) 2000-2008, International Business Machines
|
| + * Corporation and others. All Rights Reserved.
|
| + *
|
| + ******************************************************************************
|
| +@@ -485,9 +485,23 @@
|
| +
|
| + if(mbcsTable->outputType==MBCS_OUTPUT_1) {
|
| + const uint16_t *stage2, *stage3, *results;
|
| ++ uint16_t minValue;
|
| +
|
| + results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
|
| +
|
| ++ /*
|
| ++ * Set a threshold variable for selecting which mappings to use.
|
| ++ * See ucnv_MBCSSingleFromBMPWithOffsets() and
|
| ++ * MBCS_SINGLE_RESULT_FROM_U() for details.
|
| ++ */
|
| ++ if(which==UCNV_ROUNDTRIP_SET) {
|
| ++ /* use only roundtrips */
|
| ++ minValue=0xf00;
|
| ++ } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
|
| ++ /* use all roundtrip and fallback results */
|
| ++ minValue=0x800;
|
| ++ }
|
| ++
|
| + for(st1=0; st1<maxStage1; ++st1) {
|
| + st2=table[st1];
|
| + if(st2>maxStage1) {
|
| +@@ -497,15 +511,8 @@
|
| + /* read the stage 3 block */
|
| + stage3=results+st3;
|
| +
|
| +- /*
|
| +- * Add code points for which the roundtrip flag is set.
|
| +- * Once we get a set for fallback mappings, we have to use
|
| +- * a threshold variable with a value of 0x800.
|
| +- * See ucnv_MBCSSingleFromBMPWithOffsets() and
|
| +- * MBCS_SINGLE_RESULT_FROM_U() for details.
|
| +- */
|
| + do {
|
| +- if(*stage3++>=0xf00) {
|
| ++ if(*stage3++>=minValue) {
|
| + sa->add(sa->set, c);
|
| + }
|
| + } while((++c&0xf)!=0);
|
| +@@ -522,9 +529,12 @@
|
| + const uint8_t *stage3, *bytes;
|
| + uint32_t st3Multiplier;
|
| + uint32_t value;
|
| ++ UBool useFallback;
|
| +
|
| + bytes=mbcsTable->fromUnicodeBytes;
|
| +
|
| ++ useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
|
| ++
|
| + switch(mbcsTable->outputType) {
|
| + case MBCS_OUTPUT_3:
|
| + case MBCS_OUTPUT_4_EUC:
|
| +@@ -551,9 +561,8 @@
|
| + st3>>=16;
|
| +
|
| + /*
|
| +- * Add code points for which the roundtrip flag is set.
|
| +- * Once we get a set for fallback mappings, we have to check
|
| +- * non-roundtrip stage 3 results for whether they are 0.
|
| ++ * Add code points for which the roundtrip flag is set,
|
| ++ * or which map to non-zero bytes if we use fallbacks.
|
| + * See ucnv_MBCSFromUnicodeWithOffsets() for details.
|
| + */
|
| + switch(filter) {
|
| +@@ -561,6 +570,23 @@
|
| + do {
|
| + if(st3&1) {
|
| + sa->add(sa->set, c);
|
| ++ stage3+=st3Multiplier;
|
| ++ } else if(useFallback) {
|
| ++ uint8_t b=0;
|
| ++ switch(st3Multiplier) {
|
| ++ case 4:
|
| ++ b|=*stage3++;
|
| ++ case 3:
|
| ++ b|=*stage3++;
|
| ++ case 2:
|
| ++ b|=stage3[0]|stage3[1];
|
| ++ stage3+=2;
|
| ++ default:
|
| ++ break;
|
| ++ }
|
| ++ if(b!=0) {
|
| ++ sa->add(sa->set, c);
|
| ++ }
|
| + }
|
| + st3>>=1;
|
| + } while((++c&0xf)!=0);
|
| +@@ -568,7 +594,7 @@
|
| + case UCNV_SET_FILTER_DBCS_ONLY:
|
| + /* Ignore single-byte results (<0x100). */
|
| + do {
|
| +- if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) {
|
| ++ if(((st3&1)!=0 || useFallback) && *((const uint16_t *)stage3)>=0x100) {
|
| + sa->add(sa->set, c);
|
| + }
|
| + st3>>=1;
|
| +@@ -578,7 +604,7 @@
|
| + case UCNV_SET_FILTER_2022_CN:
|
| + /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
|
| + do {
|
| +- if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) {
|
| ++ if(((st3&1)!=0 || useFallback) && ((value=*stage3)==0x81 || value==0x82)) {
|
| + sa->add(sa->set, c);
|
| + }
|
| + st3>>=1;
|
| +@@ -588,7 +614,33 @@
|
| + case UCNV_SET_FILTER_SJIS:
|
| + /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
|
| + do {
|
| +- if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
|
| ++ if(((st3&1)!=0 || useFallback) && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
|
| ++ sa->add(sa->set, c);
|
| ++ }
|
| ++ st3>>=1;
|
| ++ stage3+=2; /* +=st3Multiplier */
|
| ++ } while((++c&0xf)!=0);
|
| ++ break;
|
| ++ case UCNV_SET_FILTER_GR94DBCS:
|
| ++ /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
|
| ++ do {
|
| ++ if( ((st3&1)!=0 || useFallback) &&
|
| ++ (uint16_t)((value=*((const uint16_t *)stage3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
|
| ++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
|
| ++ ) {
|
| ++ sa->add(sa->set, c);
|
| ++ }
|
| ++ st3>>=1;
|
| ++ stage3+=2; /* +=st3Multiplier */
|
| ++ } while((++c&0xf)!=0);
|
| ++ break;
|
| ++ case UCNV_SET_FILTER_HZ:
|
| ++ /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
|
| ++ do {
|
| ++ if( ((st3&1)!=0 || useFallback) &&
|
| ++ (uint16_t)((value=*((const uint16_t *)stage3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
|
| ++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
|
| ++ ) {
|
| + sa->add(sa->set, c);
|
| + }
|
| + st3>>=1;
|
| +@@ -609,7 +661,7 @@
|
| + }
|
| + }
|
| +
|
| +- ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode);
|
| ++ ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
|
| + }
|
| +
|
| + U_CFUNC void
|
| +@@ -1694,7 +1746,7 @@
|
| + cnv->toUBytes[0]=*(source-1);
|
| + cnv->toULength=_extToU(cnv, cnv->sharedData,
|
| + 1, &source, sourceLimit,
|
| +- &target, target+targetCapacity,
|
| ++ &target, pArgs->targetLimit,
|
| + &offsets, sourceIndex,
|
| + pArgs->flush,
|
| + pErrorCode);
|
| +@@ -1739,6 +1791,65 @@
|
| + pArgs->offsets=offsets;
|
| + }
|
| +
|
| ++static UBool
|
| ++hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) { /* TRUE if some byte in this state, directly or via further transitions, reaches a final entry that is not MBCS_STATE_ILLEGAL */
|
| ++ const int32_t *row=stateTable[state]; /* the 256 entries (one per byte value) for this state */
|
| ++ int32_t b, entry;
|
| ++ /* Fast path: first test for final entries in this state for some commonly valid byte values (0xa1 and 0x41). */
|
| ++ entry=row[0xa1];
|
| ++ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
|
| ++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
|
| ++ ) {
|
| ++ return TRUE;
|
| ++ }
|
| ++ entry=row[0x41];
|
| ++ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
|
| ++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
|
| ++ ) {
|
| ++ return TRUE;
|
| ++ }
|
| ++ /* Then test all 256 byte values for a final entry in this state whose action is not illegal. */
|
| ++ for(b=0; b<=0xff; ++b) {
|
| ++ entry=row[b];
|
| ++ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
|
| ++ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
|
| ++ ) {
|
| ++ return TRUE;
|
| ++ }
|
| ++ }
|
| ++ /* Then recurse for transition entries. NOTE(review): there is no depth or visited-state guard here; presumably the state tables contain no transition cycles -- confirm. */
|
| ++ for(b=0; b<=0xff; ++b) {
|
| ++ entry=row[b];
|
| ++ if( MBCS_ENTRY_IS_TRANSITION(entry) &&
|
| ++ hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
|
| ++ ) {
|
| ++ return TRUE;
|
| ++ }
|
| ++ }
|
| ++ return FALSE; /* no final non-illegal entry is reachable from this state */
|
| ++}
|
| ++
|
| ++/*
|
| ++ * Is byte b a single/lead byte in this state?
|
| ++ * Recurse for transition states, because here we don't want to say that
|
| ++ * b is a lead byte if all byte sequences that start with b are illegal.
|
| ++ * When isDBCSOnly is set, state-change-only entries (SI/SO) are not counted either.
|
| ++ */
|
| ++static UBool
|
| ++isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
|
| ++ const int32_t *row=stateTable[state];
|
| ++ int32_t entry=row[b]; /* the entry for byte b in this state */
|
| ++ if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte: counts only if the target state has some non-illegal continuation */
|
| ++ return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
|
| ++ } else { /* final entry: decide by its action code */
|
| ++ uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
|
| ++ if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
|
| ++ return FALSE; /* SI/SO are illegal for DBCS-only conversion */
|
| ++ } else {
|
| ++ return action!=MBCS_STATE_ILLEGAL; /* any final action other than illegal counts as a single byte */
|
| ++ }
|
| ++ }
|
| ++}
|
| ++
|
| + U_CFUNC void
|
| + ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
| + UErrorCode *pErrorCode) {
|
| +@@ -2094,6 +2205,34 @@
|
| + sourceIndex=nextSourceIndex;
|
| + } else if(U_FAILURE(*pErrorCode)) {
|
| + /* callback(illegal) */
|
| ++ if(byteIndex>1) {
|
| ++ /*
|
| ++ * Ticket 5691: consistent illegal sequences:
|
| ++ * - We include at least the first byte in the illegal sequence.
|
| ++ * - If any of the non-initial bytes could be the start of a character,
|
| ++ * we stop the illegal sequence before the first one of those.
|
| ++ */
|
| ++ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
|
| ++ int8_t i;
|
| ++ for(i=1;
|
| ++ i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
|
| ++ ++i) {}
|
| ++ if(i<byteIndex) {
|
| ++ /* Back out some bytes. */
|
| ++ int8_t backOutDistance=byteIndex-i;
|
| ++ int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
|
| ++ byteIndex=i; /* length of reported illegal byte sequence */
|
| ++ if(backOutDistance<=bytesFromThisBuffer) {
|
| ++ source-=backOutDistance;
|
| ++ } else {
|
| ++ /* Back out bytes from the previous buffer: Need to replay them. */
|
| ++ cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
|
| ++ /* preToULength is negative! */
|
| ++ uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
|
| ++ source=(const uint8_t *)pArgs->source;
|
| ++ }
|
| ++ }
|
| ++ }
|
| + break;
|
| + } else /* unassigned sequences indicated with byteIndex>0 */ {
|
| + /* try an extension mapping */
|
| +@@ -2104,7 +2243,7 @@
|
| + &offsets, sourceIndex,
|
| + pArgs->flush,
|
| + pErrorCode);
|
| +- sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source);
|
| ++ sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
|
| +
|
| + if(U_FAILURE(*pErrorCode)) {
|
| + /* not mappable or buffer overflow */
|
| +@@ -2395,15 +2534,37 @@
|
| +
|
| + if(c<0) {
|
| + if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
|
| +- *pErrorCode=U_TRUNCATED_CHAR_FOUND;
|
| +- }
|
| +- if(U_FAILURE(*pErrorCode)) {
|
| + /* incomplete character byte sequence */
|
| + uint8_t *bytes=cnv->toUBytes;
|
| + cnv->toULength=(int8_t)(source-lastSource);
|
| + do {
|
| + *bytes++=*lastSource++;
|
| + } while(lastSource<source);
|
| ++ *pErrorCode=U_TRUNCATED_CHAR_FOUND;
|
| ++ } else if(U_FAILURE(*pErrorCode)) {
|
| ++ /* callback(illegal) */
|
| ++ /*
|
| ++ * Ticket 5691: consistent illegal sequences:
|
| ++ * - We include at least the first byte in the illegal sequence.
|
| ++ * - If any of the non-initial bytes could be the start of a character,
|
| ++ * we stop the illegal sequence before the first one of those.
|
| ++ */
|
| ++ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
|
| ++ uint8_t *bytes=cnv->toUBytes;
|
| ++ *bytes++=*lastSource++; /* first byte */
|
| ++ if(lastSource==source) {
|
| ++ cnv->toULength=1;
|
| ++ } else /* lastSource<source: multi-byte character */ {
|
| ++ int8_t i;
|
| ++ for(i=1;
|
| ++ lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
|
| ++ ++i
|
| ++ ) {
|
| ++ *bytes++=*lastSource++;
|
| ++ }
|
| ++ cnv->toULength=i;
|
| ++ source=lastSource;
|
| ++ }
|
| + } else {
|
| + /* no output because of empty input or only state changes */
|
| + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
| +@@ -3237,7 +3398,7 @@
|
| + lastSource=source;
|
| + c=_extFromU(cnv, cnv->sharedData,
|
| + c, &source, sourceLimit,
|
| +- &target, target+targetCapacity,
|
| ++ &target, (const uint8_t *)(pArgs->targetLimit),
|
| + &offsets, sourceIndex,
|
| + pArgs->flush,
|
| + pErrorCode);
|
| +--- r22777/source/common/ucnvmbcs.h 2007-10-11 14:31:32.196532000 -0700
|
| ++++ chrome.canonical/source/common/ucnvmbcs.h 2009-03-23 12:30:17.315007000 -0700
|
| +@@ -492,6 +492,8 @@
|
| + UCNV_SET_FILTER_DBCS_ONLY,
|
| + UCNV_SET_FILTER_2022_CN,
|
| + UCNV_SET_FILTER_SJIS,
|
| ++ UCNV_SET_FILTER_GR94DBCS,
|
| ++ UCNV_SET_FILTER_HZ,
|
| + UCNV_SET_FILTER_COUNT
|
| + } UConverterSetFilter;
|
| +
|
| +--- r22777/source/common/ucnv.c 2007-08-31 12:39:14.294200000 -0700
|
| ++++ chrome.canonical/source/common/ucnv.c 2009-03-23 12:40:10.566608000 -0700
|
| +@@ -1528,11 +1528,14 @@
|
| + cnv->toULength=0;
|
| +
|
| + /* call the callback function */
|
| ++ if(cnv->toUCallbackReason==UCNV_ILLEGAL && *err==U_INVALID_CHAR_FOUND) {
|
| ++ cnv->toUCallbackReason = UCNV_UNASSIGNED;
|
| ++ }
|
| + cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs,
|
| + cnv->invalidCharBuffer, errorInputLength,
|
| +- (*err==U_INVALID_CHAR_FOUND || *err==U_UNSUPPORTED_ESCAPE_SEQUENCE) ?
|
| +- UCNV_UNASSIGNED : UCNV_ILLEGAL,
|
| ++ cnv->toUCallbackReason,
|
| + err);
|
| ++ cnv->toUCallbackReason = UCNV_ILLEGAL; /* reset to default value */
|
| +
|
| + /*
|
| + * loop back to the offset handling
|
| +--- r22777/source/common/uset_imp.h 2007-07-24 19:51:25.692061000 -0700
|
| ++++ chrome.canonical/source/common/uset_imp.h 2009-03-23 12:30:09.893067000 -0700
|
| +@@ -36,6 +36,9 @@
|
| + typedef void U_CALLCONV
|
| + USetRemove(USet *set, UChar32 c);
|
| +
|
| ++typedef void U_CALLCONV
|
| ++USetRemoveRange(USet *set, UChar32 start, UChar32 end);
|
| ++
|
| + /**
|
| + * Interface for adding items to a USet, to keep low-level code from
|
| + * statically depending on the USet implementation.
|
| +@@ -47,6 +50,7 @@
|
| + USetAddRange *addRange;
|
| + USetAddString *addString;
|
| + USetRemove *remove;
|
| ++ USetRemoveRange *removeRange;
|
| + };
|
| + typedef struct USetAdder USetAdder;
|
| +
|
| +--- r22777/source/common/ucnv2022.c 2007-10-11 14:31:32.196532000 -0700
|
| ++++ chrome.canonical/source/common/ucnv2022.c 2009-03-23 12:57:38.398368000 -0700
|
| +@@ -201,6 +201,7 @@
|
| + #ifdef U_ENABLE_GENERIC_ISO_2022
|
| + UBool isFirstBuffer;
|
| + #endif
|
| ++ UBool isEmptySegment;
|
| + char name[30];
|
| + char locale[3];
|
| + }UConverterDataISO2022;
|
| +@@ -609,6 +610,7 @@
|
| + if(choice<=UCNV_RESET_TO_UNICODE) {
|
| + uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
|
| + myConverterData->key = 0;
|
| ++ myConverterData->isEmptySegment = FALSE;
|
| + }
|
| + if(choice!=UCNV_RESET_TO_UNICODE) {
|
| + uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
|
| +@@ -752,6 +754,7 @@
|
| UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
|
| uint32_t key = myData2022->key;
|
| int32_t offset = 0;
|
| @@ -9,7 +1198,7 @@
|
| char c;
|
|
|
| value = VALID_NON_TERMINAL_2022;
|
| -@@ -804,7 +805,6 @@
|
| +@@ -804,7 +807,6 @@
|
| return;
|
| } else if (value == INVALID_2022 ) {
|
| *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| @@ -17,7 +1206,15 @@
|
| } else /* value == VALID_TERMINAL_2022 */ {
|
| switch(var){
|
| #ifdef U_ENABLE_GENERIC_ISO_2022
|
| -@@ -935,6 +935,35 @@
|
| +@@ -814,6 +816,7 @@
|
| + if(chosenConverterName == NULL) {
|
| + /* SS2 or SS3 */
|
| + *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
|
| ++ _this->toUCallbackReason = UCNV_UNASSIGNED;
|
| + return;
|
| + }
|
| +
|
| +@@ -935,6 +938,37 @@
|
| }
|
| if(U_SUCCESS(*err)) {
|
| _this->toULength = 0;
|
| @@ -50,43 +1247,46 @@
|
| + }
|
| + _this->toULength=1;
|
| + }
|
| ++ } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
|
| ++ _this->toUCallbackReason = UCNV_UNASSIGNED;
|
| }
|
| }
|
|
|
| -@@ -1097,6 +1126,24 @@
|
| +@@ -1113,6 +1147,24 @@
|
| + }
|
| }
|
|
|
| - /*
|
| -+ * * Check that the result is a 2-byte value with each byte in the range A1..FE
|
| -+ * * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
|
| -+ * * to move it to the ISO 2022 range 21..7E.
|
| -+ * * Return 0 if out of range.
|
| -+ * */
|
| ++#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
|
| ++/*
|
| ++ * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
|
| ++ * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
|
| ++ * unchanged.
|
| ++ */
|
| +static U_INLINE uint32_t
|
| -+_2022FromGR94DBCS(uint32_t value) {
|
| -+ if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
|
| -+ (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
|
| -+ ) {
|
| -+ return value - 0x8080; /* shift down to 21..7e byte range */
|
| ++_2022ToGR94DBCS(uint32_t value) {
|
| ++ uint32_t returnValue = value + 0x8080;
|
| ++ if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
|
| ++ (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
|
| ++ return returnValue;
|
| + } else {
|
| -+ return 0; /* not valid for ISO 2022 */
|
| ++ return value;
|
| + }
|
| +}
|
| ++#endif
|
| +
|
| -+#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
|
| -+/*
|
| - * Check that the result is a 2-byte value with each byte in the range A1..FE
|
| - * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
|
| - * to move it to the ISO 2022 range 21..7E.
|
| -@@ -1112,6 +1159,7 @@
|
| - return 0; /* not valid for ISO 2022 */
|
| - }
|
| - }
|
| -+#endif
|
| -
|
| #ifdef U_ENABLE_GENERIC_ISO_2022
|
|
|
| -@@ -1953,6 +2001,7 @@
|
| + /**********************************************************************************
|
| +@@ -1436,7 +1488,7 @@
|
| + c2 = 0; /* invalid */
|
| + }
|
| + } else {
|
| +- if((uint8_t)(c2-0x21) <= (0x7e-0x21)) {
|
| ++ if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
|
| + c2 += 0x7e;
|
| + } else {
|
| + c2 = 0; /* invalid */
|
| +@@ -1953,6 +2005,7 @@
|
| const char *mySourceLimit = args->sourceLimit;
|
| uint32_t targetUniChar = 0x0000;
|
| uint32_t mySourceChar = 0x0000;
|
| @@ -94,7 +1294,7 @@
|
| UConverterDataISO2022* myData;
|
| ISO2022State *pToU2022State;
|
| StateEnum cs;
|
| -@@ -1968,6 +2017,7 @@
|
| +@@ -1968,6 +2021,7 @@
|
| mySourceChar = args->converter->toUBytes[0];
|
| args->converter->toULength = 0;
|
| cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
|
| @@ -102,7 +1302,65 @@
|
| goto getTrailByte;
|
| }
|
|
|
| -@@ -2077,17 +2127,44 @@
|
| +@@ -1986,6 +2040,7 @@
|
| + continue;
|
| + } else {
|
| + /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
|
| ++ myData->isEmptySegment = FALSE; /* reset this, we have a different error */
|
| + break;
|
| + }
|
| +
|
| +@@ -1997,21 +2052,39 @@
|
| + continue;
|
| + } else {
|
| + /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
|
| ++ myData->isEmptySegment = FALSE; /* reset this, we have a different error */
|
| + break;
|
| + }
|
| +
|
| + case ESC_2022:
|
| + mySource--;
|
| + escape:
|
| +- changeState_2022(args->converter,&(mySource),
|
| +- mySourceLimit, ISO_2022_JP,err);
|
| ++ {
|
| ++ const char * mySourceBefore = mySource;
|
| ++ int8_t toULengthBefore = args->converter->toULength;
|
| ++
|
| ++ changeState_2022(args->converter,&(mySource),
|
| ++ mySourceLimit, ISO_2022_JP,err);
|
| ++
|
| ++ /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
|
| ++ if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
|
| ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| ++ args->converter->toUCallbackReason = UCNV_IRREGULAR;
|
| ++ args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
|
| ++ }
|
| ++ }
|
| +
|
| + /* invalid or illegal escape sequence */
|
| + if(U_FAILURE(*err)){
|
| + args->target = myTarget;
|
| + args->source = mySource;
|
| ++ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
|
| + return;
|
| + }
|
| ++ /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
|
| ++ if(myData->key==0) {
|
| ++ myData->isEmptySegment = TRUE;
|
| ++ }
|
| + continue;
|
| +
|
| + /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
|
| +@@ -2028,6 +2101,7 @@
|
| + /* falls through */
|
| + default:
|
| + /* convert one or two bytes */
|
| ++ myData->isEmptySegment = FALSE;
|
| + cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
|
| + if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
|
| + !IS_JP_DBCS(cs)
|
| +@@ -2077,17 +2151,44 @@
|
| default:
|
| /* G0 DBCS */
|
| if(mySource < mySourceLimit) {
|
| @@ -156,7 +1414,7 @@
|
| } else {
|
| args->converter->toUBytes[0] = (uint8_t)mySourceChar;
|
| args->converter->toULength = 1;
|
| -@@ -2229,7 +2306,12 @@
|
| +@@ -2229,7 +2330,12 @@
|
| }
|
| /* only DBCS or SBCS characters are expected*/
|
| /* DB characters with high bit set to 1 are expected */
|
| @@ -170,8 +1428,39 @@
|
| targetByteUnit=missingCharMarker;
|
| }
|
| if (targetByteUnit != missingCharMarker){
|
| -@@ -2545,17 +2627,34 @@
|
| +@@ -2524,15 +2630,27 @@
|
|
|
| + if(mySourceChar==UCNV_SI){
|
| + myData->toU2022State.g = 0;
|
| ++ if (myData->isEmptySegment) {
|
| ++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
|
| ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| ++ args->converter->toUCallbackReason = UCNV_IRREGULAR;
|
| ++ args->converter->toUBytes[0] = (uint8_t)mySourceChar;
|
| ++ args->converter->toULength = 1;
|
| ++ args->target = myTarget;
|
| ++ args->source = mySource;
|
| ++ return;
|
| ++ }
|
| + /*consume the source */
|
| + continue;
|
| + }else if(mySourceChar==UCNV_SO){
|
| + myData->toU2022State.g = 1;
|
| ++ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
|
| + /*consume the source */
|
| + continue;
|
| + }else if(mySourceChar==ESC_2022){
|
| + mySource--;
|
| + escape:
|
| ++ myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
|
| + changeState_2022(args->converter,&(mySource),
|
| + mySourceLimit, ISO_2022_KR, err);
|
| + if(U_FAILURE(*err)){
|
| +@@ -2543,19 +2661,37 @@
|
| + continue;
|
| + }
|
| +
|
| ++ myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
|
| if(myData->toU2022State.g == 1) {
|
| if(mySource < mySourceLimit) {
|
| - char trailByte;
|
| @@ -214,7 +1503,7 @@
|
| }
|
| } else {
|
| args->converter->toUBytes[0] = (uint8_t)mySourceChar;
|
| -@@ -2563,8 +2662,10 @@
|
| +@@ -2563,8 +2699,10 @@
|
| break;
|
| }
|
| }
|
| @@ -226,7 +1515,7 @@
|
| }
|
| if(targetUniChar < 0xfffe){
|
| if(args->offsets) {
|
| -@@ -3061,6 +3162,7 @@
|
| +@@ -3061,6 +3199,7 @@
|
| /* continue with a partial double-byte character */
|
| mySourceChar = args->converter->toUBytes[0];
|
| args->converter->toULength = 0;
|
| @@ -234,7 +1523,68 @@
|
| goto getTrailByte;
|
| }
|
|
|
| -@@ -3114,29 +3216,50 @@
|
| +@@ -3075,27 +3214,52 @@
|
| + switch(mySourceChar){
|
| + case UCNV_SI:
|
| + pToU2022State->g=0;
|
| ++ if (myData->isEmptySegment) {
|
| ++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
|
| ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| ++ args->converter->toUCallbackReason = UCNV_IRREGULAR;
|
| ++ args->converter->toUBytes[0] = mySourceChar;
|
| ++ args->converter->toULength = 1;
|
| ++ args->target = myTarget;
|
| ++ args->source = mySource;
|
| ++ return;
|
| ++ }
|
| + continue;
|
| +
|
| + case UCNV_SO:
|
| + if(pToU2022State->cs[1] != 0) {
|
| + pToU2022State->g=1;
|
| ++ myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
|
| + continue;
|
| + } else {
|
| + /* illegal to have SO before a matching designator */
|
| ++ myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
|
| + break;
|
| + }
|
| +
|
| + case ESC_2022:
|
| + mySource--;
|
| + escape:
|
| +- changeState_2022(args->converter,&(mySource),
|
| +- mySourceLimit, ISO_2022_CN,err);
|
| ++ {
|
| ++ const char * mySourceBefore = mySource;
|
| ++ int8_t toULengthBefore = args->converter->toULength;
|
| ++
|
| ++ changeState_2022(args->converter,&(mySource),
|
| ++ mySourceLimit, ISO_2022_CN,err);
|
| ++
|
| ++ /* After SO there must be at least one character before a designator (designator error handled separately) */
|
| ++ if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
|
| ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| ++ args->converter->toUCallbackReason = UCNV_IRREGULAR;
|
| ++ args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
|
| ++ }
|
| ++ }
|
| +
|
| + /* invalid or illegal escape sequence */
|
| + if(U_FAILURE(*err)){
|
| + args->target = myTarget;
|
| + args->source = mySource;
|
| ++ myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
|
| + return;
|
| + }
|
| + continue;
|
| +@@ -3109,34 +3273,56 @@
|
| + /* falls through */
|
| + default:
|
| + /* convert one or two bytes */
|
| ++ myData->isEmptySegment = FALSE;
|
| + if(pToU2022State->g != 0) {
|
| + if(mySource < mySourceLimit) {
|
| UConverterSharedData *cnv;
|
| StateEnum tempState;
|
| int32_t tempBufLen;
|
| @@ -302,386 +1652,622 @@
|
| } else {
|
| args->converter->toUBytes[0] = (uint8_t)mySourceChar;
|
| args->converter->toULength = 1;
|
| -diff -ru trie.clean/source/common/ucnvmbcs.c chrome.canonical/source/common/ucnvmbcs.c
|
| ---- trie.clean/source/common/ucnvmbcs.c 2007-11-07 17:39:05.057870000 -0800
|
| -+++ chrome.canonical/source/common/ucnvmbcs.c 2008-10-29 11:34:34.648518000 -0700
|
| -@@ -1,7 +1,7 @@
|
| - /*
|
| - ******************************************************************************
|
| - *
|
| --* Copyright (C) 2000-2007, International Business Machines
|
| -+* Copyright (C) 2000-2008, International Business Machines
|
| +@@ -3399,11 +3585,19 @@
|
| + /* include ASCII for JP */
|
| + sa->addRange(sa->set, 0, 0x7f);
|
| + }
|
| +- if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
|
| ++ if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
|
| + /*
|
| +- * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks,
|
| +- * we need to include half-width Katakana for all JP variants because
|
| +- * JIS X 0208 has hardcoded fallbacks for them.
|
| ++ * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
|
| ++ * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
|
| ++ * use half-width Katakana.
|
| ++ * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
|
| ++ * half-width Katakana via the ESC ( I sequence.
|
| ++ * However, we only emit (fromUnicode) half-width Katakana according to the
|
| ++ * definition of each variant.
|
| ++ *
|
| ++ * When including fallbacks,
|
| ++ * we need to include half-width Katakana Unicode code points for all JP variants because
|
| ++ * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
|
| + */
|
| + /* include half-width Katakana for JP */
|
| + sa->addRange(sa->set, HWKANA_START, HWKANA_END);
|
| +@@ -3457,6 +3651,12 @@
|
| + * corresponding to JIS X 0208.
|
| + */
|
| + filter=UCNV_SET_FILTER_SJIS;
|
| ++ } else if(i==KSC5601) {
|
| ++ /*
|
| ++ * Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables)
|
| ++ * are broader than GR94.
|
| ++ */
|
| ++ filter=UCNV_SET_FILTER_GR94DBCS;
|
| + } else {
|
| + filter=UCNV_SET_FILTER_NONE;
|
| + }
|
| +@@ -3472,6 +3672,9 @@
|
| + sa->remove(sa->set, 0x0e);
|
| + sa->remove(sa->set, 0x0f);
|
| + sa->remove(sa->set, 0x1b);
|
| ++
|
| ++ /* ISO 2022 converters do not convert C1 controls either */
|
| ++ sa->removeRange(sa->set, 0x80, 0x9f);
|
| + }
|
| +
|
| + static const UConverterImpl _ISO2022Impl={
|
| +--- r22777/source/common/ucnv_lmb.c 2006-08-19 14:27:08.000000000 -0700
|
| ++++ chrome.canonical/source/common/ucnv_lmb.c 2009-03-23 12:30:26.043293000 -0700
|
| +@@ -1,6 +1,6 @@
|
| + /*
|
| + **********************************************************************
|
| +-* Copyright (C) 2000-2006, International Business Machines
|
| ++* Copyright (C) 2000-2007, International Business Machines
|
| * Corporation and others. All Rights Reserved.
|
| - *
|
| - ******************************************************************************
|
| -@@ -1739,6 +1739,65 @@
|
| - pArgs->offsets=offsets;
|
| + **********************************************************************
|
| + * file name: ucnv_lmb.cpp
|
| +@@ -536,7 +536,7 @@
|
| + NULL,\
|
| + NULL,\
|
| + _LMBCSSafeClone,\
|
| +- _LMBCSGetUnicodeSet\
|
| ++ ucnv_getCompleteUnicodeSet\
|
| + };\
|
| + static const UConverterStaticData _LMBCSStaticData##n={\
|
| + sizeof(UConverterStaticData),\
|
| +@@ -662,15 +662,14 @@
|
| + return &newLMBCS->cnv;
|
| }
|
|
|
| -+static UBool
|
| -+hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
|
| -+ const int32_t *row=stateTable[state];
|
| -+ int32_t b, entry;
|
| -+ /* First test for final entries in this state for some commonly valid byte values. */
|
| -+ entry=row[0xa1];
|
| -+ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
|
| -+ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
|
| -+ ) {
|
| -+ return TRUE;
|
| -+ }
|
| -+ entry=row[0x41];
|
| -+ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
|
| -+ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
|
| -+ ) {
|
| -+ return TRUE;
|
| -+ }
|
| -+ /* Then test for final entries in this state. */
|
| -+ for(b=0; b<=0xff; ++b) {
|
| -+ entry=row[b];
|
| -+ if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
|
| -+ MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
|
| -+ ) {
|
| -+ return TRUE;
|
| -+ }
|
| -+ }
|
| -+ /* Then recurse for transition entries. */
|
| -+ for(b=0; b<=0xff; ++b) {
|
| -+ entry=row[b];
|
| -+ if( MBCS_ENTRY_IS_TRANSITION(entry) &&
|
| -+ hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry))
|
| -+ ) {
|
| -+ return TRUE;
|
| -+ }
|
| -+ }
|
| -+ return FALSE;
|
| -+}
|
| -+
|
| +-static void
|
| +-_LMBCSGetUnicodeSet(const UConverter *cnv,
|
| +- const USetAdder *sa,
|
| +- UConverterUnicodeSet which,
|
| +- UErrorCode *pErrorCode) {
|
| +- /* all but U+F6xx, see LMBCS explanation above (search for F6xx) */
|
| +- sa->addRange(sa->set, 0, 0xf5ff);
|
| +- sa->addRange(sa->set, 0xf700, 0x10ffff);
|
| +-}
|
| +/*
|
| -+ * Is byte b a single/lead byte in this state?
|
| -+ * Recurse for transition states, because here we don't want to say that
|
| -+ * b is a lead byte if all byte sequences that start with b are illegal.
|
| ++ * There used to be a _LMBCSGetUnicodeSet() function here (up to svn revision 20117)
|
| ++ * which added all code points except for U+F6xx
|
| ++ * because those cannot be represented in the Unicode group.
|
| ++ * However, it turns out that windows-950 has roundtrips for all of U+F6xx
|
| ++ * which means that LMBCS can convert all Unicode code points after all.
|
| ++ * We now simply use ucnv_getCompleteUnicodeSet().
|
| + */
|
| -+static UBool
|
| -+isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnly, uint8_t b) {
|
| -+ const int32_t *row=stateTable[state];
|
| -+ int32_t entry=row[b];
|
| -+ if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */
|
| -+ return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry));
|
| -+ } else {
|
| -+ uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
|
| -+ if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
|
| -+ return FALSE; /* SI/SO are illegal for DBCS-only conversion */
|
| -+ } else {
|
| -+ return action!=MBCS_STATE_ILLEGAL;
|
| -+ }
|
| -+ }
|
| -+}
|
| -+
|
| - U_CFUNC void
|
| - ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
| - UErrorCode *pErrorCode) {
|
| -@@ -2094,6 +2153,34 @@
|
| - sourceIndex=nextSourceIndex;
|
| - } else if(U_FAILURE(*pErrorCode)) {
|
| - /* callback(illegal) */
|
| -+ if(byteIndex>1) {
|
| -+ /*
|
| -+ * Ticket 5691: consistent illegal sequences:
|
| -+ * - We include at least the first byte in the illegal sequence.
|
| -+ * - If any of the non-initial bytes could be the start of a character,
|
| -+ * we stop the illegal sequence before the first one of those.
|
| -+ */
|
| -+ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
|
| -+ int8_t i;
|
| -+ for(i=1;
|
| -+ i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, bytes[i]);
|
| -+ ++i) {}
|
| -+ if(i<byteIndex) {
|
| -+ /* Back out some bytes. */
|
| -+ int8_t backOutDistance=byteIndex-i;
|
| -+ int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
|
| -+ byteIndex=i; /* length of reported illegal byte sequence */
|
| -+ if(backOutDistance<=bytesFromThisBuffer) {
|
| -+ source-=backOutDistance;
|
| +
|
| + /*
|
| + Here's the basic helper function that we use when converting from
|
| +--- r22777/source/common/ucnvhz.c 2006-07-05 16:08:50.000000000 -0700
|
| ++++ chrome.canonical/source/common/ucnvhz.c 2009-03-23 12:42:01.208181000 -0700
|
| +@@ -1,6 +1,6 @@
|
| + /*
|
| + **********************************************************************
|
| +-* Copyright (C) 2000-2006, International Business Machines
|
| ++* Copyright (C) 2000-2007, International Business Machines
|
| + * Corporation and others. All Rights Reserved.
|
| + **********************************************************************
|
| + * file name: ucnvhz.c
|
| +@@ -59,6 +59,7 @@
|
| + UBool isEscapeAppended;
|
| + UBool isStateDBCS;
|
| + UBool isTargetUCharDBCS;
|
| ++ UBool isEmptySegment;
|
| + }UConverterDataHZ;
|
| +
|
| +
|
| +@@ -72,7 +73,7 @@
|
| + cnv->extraInfo = uprv_malloc(sizeof(UConverterDataHZ));
|
| + if(cnv->extraInfo != NULL){
|
| + uprv_memset(cnv->extraInfo, 0, sizeof(UConverterDataHZ));
|
| +- ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode);
|
| ++ ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("GBK",errorCode);
|
| + }
|
| + else {
|
| + *errorCode = U_MEMORY_ALLOCATION_ERROR;
|
| +@@ -98,6 +99,7 @@
|
| + cnv->mode=0;
|
| + if(cnv->extraInfo != NULL){
|
| + ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
|
| ++ ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE;
|
| + }
|
| + }
|
| + if(choice!=UCNV_RESET_TO_UNICODE) {
|
| +@@ -130,6 +132,10 @@
|
| + * from-GB code '~}' ($7E7D) is outside the defined GB range.)
|
| + *
|
| + * Source: RFC 1842
|
| ++*
|
| ++* Note that the formal syntax in RFC 1842 is invalid. I assume that the
|
| ++* intended definition of single-byte-segment is as follows (pedberg):
|
| ++* single-byte-segment = single-byte-seq 1*single-byte-char
|
| + */
|
| +
|
| +
|
| +@@ -141,7 +147,7 @@
|
| + UChar *myTarget = args->target;
|
| + const char *mySourceLimit = args->sourceLimit;
|
| + UChar32 targetUniChar = 0x0000;
|
| +- UChar mySourceChar = 0x0000;
|
| ++ int32_t mySourceChar = 0x0000;
|
| + UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
|
| + tempBuf[0]=0;
|
| + tempBuf[1]=0;
|
| +@@ -156,90 +162,123 @@
|
| +
|
| + mySourceChar= (unsigned char) *mySource++;
|
| +
|
| +- switch(mySourceChar){
|
| ++ if(args->converter->mode == UCNV_TILDE) {
|
| ++ /* second byte after ~ */
|
| ++ args->converter->mode=0;
|
| ++ switch(mySourceChar) {
|
| + case 0x0A:
|
| +- if(args->converter->mode ==UCNV_TILDE){
|
| +- args->converter->mode=0;
|
| +-
|
| +- }
|
| +- *(myTarget++)=(UChar)mySourceChar;
|
| ++ /* no output for ~\n (line-continuation marker) */
|
| + continue;
|
| +-
|
| + case UCNV_TILDE:
|
| +- if(args->converter->mode ==UCNV_TILDE){
|
| +- *(myTarget++)=(UChar)mySourceChar;
|
| +- args->converter->mode=0;
|
| +- continue;
|
| +-
|
| ++ if(args->offsets) {
|
| ++ args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2);
|
| + }
|
| +- else if(args->converter->toUnicodeStatus !=0){
|
| +- args->converter->mode=0;
|
| +- break;
|
| +- }
|
| +- else{
|
| +- args->converter->mode = UCNV_TILDE;
|
| +- continue;
|
| +- }
|
| +-
|
| +-
|
| ++ *(myTarget++)=(UChar)mySourceChar;
|
| ++ myData->isEmptySegment = FALSE;
|
| ++ continue;
|
| + case UCNV_OPEN_BRACE:
|
| +- if(args->converter->mode == UCNV_TILDE){
|
| +- args->converter->mode=0;
|
| +- myData->isStateDBCS = TRUE;
|
| +- continue;
|
| +- }
|
| +- else{
|
| +- break;
|
| +- }
|
| +-
|
| +-
|
| + case UCNV_CLOSE_BRACE:
|
| +- if(args->converter->mode == UCNV_TILDE){
|
| +- args->converter->mode=0;
|
| +- myData->isStateDBCS = FALSE;
|
| +- continue;
|
| +- }
|
| +- else{
|
| +- break;
|
| ++ myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE);
|
| ++ if (myData->isEmptySegment) {
|
| ++ myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
|
| ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| ++ args->converter->toUCallbackReason = UCNV_IRREGULAR;
|
| ++ args->converter->toUBytes[0] = UCNV_TILDE;
|
| ++ args->converter->toUBytes[1] = mySourceChar;
|
| ++ args->converter->toULength = 2;
|
| ++ args->target = myTarget;
|
| ++ args->source = mySource;
|
| ++ return;
|
| + }
|
| +-
|
| ++ myData->isEmptySegment = TRUE;
|
| ++ continue;
|
| + default:
|
| + /* if the first byte is equal to TILDE and the trail byte
|
| + * is not a valid byte then it is an error condition
|
| + */
|
| +- if(args->converter->mode == UCNV_TILDE){
|
| +- args->converter->mode=0;
|
| +- mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
|
| +- goto SAVE_STATE;
|
| +- }
|
| +-
|
| +- break;
|
| +-
|
| +- }
|
| +-
|
| +- if(myData->isStateDBCS){
|
| ++ /*
|
| ++ * Ticket 5691: consistent illegal sequences:
|
| ++ * - We include at least the first byte in the illegal sequence.
|
| ++ * - If any of the non-initial bytes could be the start of a character,
|
| ++ * we stop the illegal sequence before the first one of those.
|
| ++ */
|
| ++ myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
|
| ++ *err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
| ++ args->converter->toUBytes[0] = UCNV_TILDE;
|
| ++ if( myData->isStateDBCS ?
|
| ++ (0x21 <= mySourceChar && mySourceChar <= 0x7e) :
|
| ++ mySourceChar <= 0x7f
|
| ++ ) {
|
| ++ /* The current byte could be the start of a character: Back it out. */
|
| ++ args->converter->toULength = 1;
|
| ++ --mySource;
|
| + } else {
|
| -+ /* Back out bytes from the previous buffer: Need to replay them. */
|
| -+ cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
|
| -+ /* preToULength is negative! */
|
| -+ uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
|
| -+ source=(const uint8_t *)pArgs->source;
|
| ++ /* Include the current byte in the illegal sequence. */
|
| ++ args->converter->toUBytes[1] = mySourceChar;
|
| ++ args->converter->toULength = 2;
|
| + }
|
| ++ args->target = myTarget;
|
| ++ args->source = mySource;
|
| ++ return;
|
| + }
|
| -+ }
|
| - break;
|
| - } else /* unassigned sequences indicated with byteIndex>0 */ {
|
| - /* try an extension mapping */
|
| -@@ -2104,7 +2191,7 @@
|
| - &offsets, sourceIndex,
|
| - pArgs->flush,
|
| - pErrorCode);
|
| -- sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source);
|
| -+ sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArgs->source);
|
| ++ } else if(myData->isStateDBCS) {
|
| + if(args->converter->toUnicodeStatus == 0x00){
|
| +- args->converter->toUnicodeStatus = (UChar) mySourceChar;
|
| ++ /* lead byte */
|
| ++ if(mySourceChar == UCNV_TILDE) {
|
| ++ args->converter->mode = UCNV_TILDE;
|
| ++ } else {
|
| ++ /* add another bit to distinguish a 0 byte from not having seen a lead byte */
|
| ++ args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100);
|
| ++ myData->isEmptySegment = FALSE; /* the segment has something, either valid or will produce a different error, so reset this */
|
| ++ }
|
| + continue;
|
| + }
|
| + else{
|
| +- tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ;
|
| +- tempBuf[1] = (char) (mySourceChar+0x80);
|
| +- mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) | ((mySourceChar & 0x00ff)+0x80));
|
| ++ /* trail byte */
|
| ++ int leadIsOk, trailIsOk;
|
| ++ uint32_t leadByte = args->converter->toUnicodeStatus & 0xff;
|
| ++ targetUniChar = 0xffff;
|
| ++ /*
|
| ++ * Ticket 5691: consistent illegal sequences:
|
| ++ * - We include at least the first byte in the illegal sequence.
|
| ++ * - If any of the non-initial bytes could be the start of a character,
|
| ++ * we stop the illegal sequence before the first one of those.
|
| ++ *
|
| ++ * In HZ DBCS, if the second byte is in the 21..7e range,
|
| ++ * we report only the first byte as the illegal sequence.
|
| ++ * Otherwise we convert or report the pair of bytes.
|
| ++ */
|
| ++ leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21);
|
| ++ trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
|
| ++ if (leadIsOk && trailIsOk) {
|
| ++ tempBuf[0] = (char) (leadByte+0x80) ;
|
| ++ tempBuf[1] = (char) (mySourceChar+0x80);
|
| ++ targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
|
| ++ tempBuf, 2, args->converter->useFallback);
|
| ++ mySourceChar= (leadByte << 8) | mySourceChar;
|
| ++ } else if (trailIsOk) {
|
| ++ /* report a single illegal byte and continue with the following DBCS starter byte */
|
| ++ --mySource;
|
| ++ mySourceChar = (int32_t)leadByte;
|
| ++ } else {
|
| ++ /* report a pair of illegal bytes if the second byte is not a DBCS starter */
|
| ++ /* add another bit so that the code below writes 2 bytes in case of error */
|
| ++ mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
|
| ++ }
|
| + args->converter->toUnicodeStatus =0x00;
|
| +- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
|
| +- tempBuf, 2, args->converter->useFallback);
|
| + }
|
| + }
|
| + else{
|
| +- if(args->converter->fromUnicodeStatus == 0x00){
|
| +- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
|
| +- mySource - 1, 1, args->converter->useFallback);
|
| +- }
|
| +- else{
|
| +- goto SAVE_STATE;
|
| ++ if(mySourceChar == UCNV_TILDE) {
|
| ++ args->converter->mode = UCNV_TILDE;
|
| ++ continue;
|
| ++ } else if(mySourceChar <= 0x7f) {
|
| ++ targetUniChar = (UChar)mySourceChar; /* ASCII */
|
| ++ myData->isEmptySegment = FALSE; /* the segment has something valid */
|
| ++ } else {
|
| ++ targetUniChar = 0xffff;
|
| ++ myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
|
| + }
|
| +-
|
| + }
|
| + if(targetUniChar < 0xfffe){
|
| + if(args->offsets) {
|
| +@@ -248,26 +287,17 @@
|
|
|
| - if(U_FAILURE(*pErrorCode)) {
|
| - /* not mappable or buffer overflow */
|
| -@@ -2395,15 +2482,37 @@
|
| -
|
| - if(c<0) {
|
| - if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
|
| -- *pErrorCode=U_TRUNCATED_CHAR_FOUND;
|
| -- }
|
| -- if(U_FAILURE(*pErrorCode)) {
|
| - /* incomplete character byte sequence */
|
| - uint8_t *bytes=cnv->toUBytes;
|
| - cnv->toULength=(int8_t)(source-lastSource);
|
| - do {
|
| - *bytes++=*lastSource++;
|
| - } while(lastSource<source);
|
| -+ *pErrorCode=U_TRUNCATED_CHAR_FOUND;
|
| -+ } else if(U_FAILURE(*pErrorCode)) {
|
| -+ /* callback(illegal) */
|
| -+ /*
|
| -+ * Ticket 5691: consistent illegal sequences:
|
| -+ * - We include at least the first byte in the illegal sequence.
|
| -+ * - If any of the non-initial bytes could be the start of a character,
|
| -+ * we stop the illegal sequence before the first one of those.
|
| -+ */
|
| -+ UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
|
| -+ uint8_t *bytes=cnv->toUBytes;
|
| -+ *bytes++=*lastSource++; /* first byte */
|
| -+ if(lastSource==source) {
|
| -+ cnv->toULength=1;
|
| -+ } else /* lastSource<source: multi-byte character */ {
|
| -+ int8_t i;
|
| -+ for(i=1;
|
| -+ lastSource<source && !isSingleOrLead(stateTable, state, isDBCSOnly, *lastSource);
|
| -+ ++i
|
| + *(myTarget++)=(UChar)targetUniChar;
|
| + }
|
| +- else if(targetUniChar>=0xfffe){
|
| +-SAVE_STATE:
|
| ++ else /* targetUniChar>=0xfffe */ {
|
| + if(targetUniChar == 0xfffe){
|
| + *err = U_INVALID_CHAR_FOUND;
|
| + }
|
| + else{
|
| + *err = U_ILLEGAL_CHAR_FOUND;
|
| + }
|
| +- if(myData->isStateDBCS){
|
| +- /* this should never occur since isStateDBCS is set to true
|
| +- * only after tempBuf[0] and tempBuf[1]
|
| +- * are set to the input .. just to please BEAM
|
| +- */
|
| +- if(tempBuf[0]==0 || tempBuf[1]==0){
|
| +- *err = U_INTERNAL_PROGRAM_ERROR;
|
| +- }else{
|
| +- args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80);
|
| +- args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80);
|
| +- args->converter->toULength=2;
|
| +- }
|
| ++ if(mySourceChar > 0xff){
|
| ++ args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8);
|
| ++ args->converter->toUBytes[1] = (uint8_t)mySourceChar;
|
| ++ args->converter->toULength=2;
|
| + }
|
| + else{
|
| + args->converter->toUBytes[0] = (uint8_t)mySourceChar;
|
| +@@ -328,16 +358,21 @@
|
| + escSeq = TILDE_ESCAPE;
|
| + CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
|
| + continue;
|
| +- }
|
| +- else{
|
| ++ } else if(mySourceChar <= 0x7f) {
|
| ++ length = 1;
|
| ++ targetUniChar = mySourceChar;
|
| ++ } else {
|
| + length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData,
|
| + mySourceChar,&targetUniChar,args->converter->useFallback);
|
| +-
|
| +- }
|
| +- /* only DBCS or SBCS characters are expected*/
|
| +- /* DB haracters with high bit set to 1 are expected */
|
| +- if(length > 2 || length==0 ||(((targetUniChar & 0x8080) != 0x8080)&& length==2)){
|
| +- targetUniChar= missingCharMarker;
|
| ++ /* we can only use lead bytes 21..7D and trail bytes 21..7E */
|
| ++ if( length == 2 &&
|
| ++ (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) &&
|
| ++ (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1)
|
| + ) {
|
| -+ *bytes++=*lastSource++;
|
| ++ targetUniChar -= 0x8080;
|
| ++ } else {
|
| ++ targetUniChar = missingCharMarker;
|
| + }
|
| -+ cnv->toULength=i;
|
| -+ source=lastSource;
|
| -+ }
|
| - } else {
|
| - /* no output because of empty input or only state changes */
|
| - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
| -diff -ru trie.clean/source/test/cintltst/nccbtst.c chrome.canonical/source/test/cintltst/nccbtst.c
|
| ---- trie.clean/source/test/cintltst/nccbtst.c 2007-09-19 09:45:00.986804000 -0700
|
| -+++ chrome.canonical/source/test/cintltst/nccbtst.c 2008-10-29 11:08:51.102376000 -0700
|
| + }
|
| + if (targetUniChar != missingCharMarker){
|
| + myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF);
|
| +@@ -360,22 +395,22 @@
|
| +
|
| + if(isTargetUCharDBCS){
|
| + if( myTargetIndex <targetLength){
|
| +- myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80);
|
| ++ myTarget[myTargetIndex++] =(char) (targetUniChar >> 8);
|
| + if(offsets){
|
| + *(offsets++) = mySourceIndex-1;
|
| + }
|
| + if(myTargetIndex < targetLength){
|
| +- myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80);
|
| ++ myTarget[myTargetIndex++] =(char) targetUniChar;
|
| + if(offsets){
|
| + *(offsets++) = mySourceIndex-1;
|
| + }
|
| + }else{
|
| +- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
|
| ++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
|
| + *err = U_BUFFER_OVERFLOW_ERROR;
|
| + }
|
| + }else{
|
| +- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80);
|
| +- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
|
| ++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) (targetUniChar >> 8);
|
| ++ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
|
| + *err = U_BUFFER_OVERFLOW_ERROR;
|
| + }
|
| +
|
| +@@ -524,14 +559,14 @@
|
| + const USetAdder *sa,
|
| + UConverterUnicodeSet which,
|
| + UErrorCode *pErrorCode) {
|
| +- /* the tilde '~' is hardcoded in the converter */
|
| +- sa->add(sa->set, 0x7e);
|
| ++ /* HZ converts all of ASCII */
|
| ++ sa->addRange(sa->set, 0, 0x7f);
|
| +
|
| + /* add all of the code points that the sub-converter handles */
|
| +- ((UConverterDataHZ*)cnv->extraInfo)->
|
| +- gbConverter->sharedData->impl->
|
| +- getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,
|
| +- sa, which, pErrorCode);
|
| ++ ucnv_MBCSGetFilteredUnicodeSetForUnicode(
|
| ++ ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData,
|
| ++ sa, which, UCNV_SET_FILTER_HZ,
|
| ++ pErrorCode);
|
| + }
|
| +
|
| + static const UConverterImpl _HZImpl={
|
| +--- r22777/source/common/ucnv_set.c 2005-06-03 13:17:54.000000000 -0700
|
| ++++ chrome.canonical/source/common/ucnv_set.c 2009-03-23 12:30:09.917043000 -0700
|
| +@@ -1,7 +1,7 @@
|
| + /*
|
| + *******************************************************************************
|
| + *
|
| +-* Copyright (C) 2003-2005, International Business Machines
|
| ++* Copyright (C) 2003-2007, International Business Machines
|
| + * Corporation and others. All Rights Reserved.
|
| + *
|
| + *******************************************************************************
|
| +@@ -52,7 +52,8 @@
|
| + uset_add,
|
| + uset_addRange,
|
| + uset_addString,
|
| +- uset_remove
|
| ++ uset_remove,
|
| ++ uset_removeRange
|
| + };
|
| + sa.set=setFillIn;
|
| +
|
| +--- r22777/source/common/ucnv_bld.c 2007-08-24 02:44:10.880047000 -0700
|
| ++++ chrome.canonical/source/common/ucnv_bld.c 2009-03-23 12:40:10.653507000 -0700
|
| +@@ -932,6 +932,7 @@
|
| + myUConverter->subCharLen = mySharedConverterData->staticData->subCharLen;
|
| + myUConverter->subChars = (uint8_t *)myUConverter->subUChars;
|
| + uprv_memcpy(myUConverter->subChars, mySharedConverterData->staticData->subChar, myUConverter->subCharLen);
|
| ++ myUConverter->toUCallbackReason = UCNV_ILLEGAL; /* default reason to invoke (*fromCharErrorBehaviour) */
|
| +
|
| + if(mySharedConverterData->impl->open != NULL) {
|
| + mySharedConverterData->impl->open(myUConverter, realName, locale, options, err);
|
| +--- r22777/source/common/ucnv_bld.h 2006-07-05 16:08:50.000000000 -0700
|
| ++++ chrome.canonical/source/common/ucnv_bld.h 2009-03-23 12:40:10.680507000 -0700
|
| @@ -1,6 +1,6 @@
|
| - /********************************************************************
|
| - * COPYRIGHT:
|
| -- * Copyright (c) 1997-2007, International Business Machines Corporation and
|
| -+ * Copyright (c) 1997-2008, International Business Machines Corporation and
|
| - * others. All Rights Reserved.
|
| - ********************************************************************/
|
| /*
|
| -@@ -2530,13 +2530,13 @@
|
| + **********************************************************************
|
| +-* Copyright (C) 1999-2006, International Business Machines
|
| ++* Copyright (C) 1999-2006,2008 International Business Machines
|
| + * Corporation and others. All Rights Reserved.
|
| + **********************************************************************
|
| + *
|
| +@@ -226,6 +226,9 @@
|
| + char preToU[UCNV_EXT_MAX_BYTES];
|
| + int8_t preFromULength, preToULength; /* negative: replay */
|
| + int8_t preToUFirstLength; /* length of first character */
|
| ++
|
| ++ /* new fields for ICU 4.0 */
|
| ++ UConverterCallbackReason toUCallbackReason; /* (*fromCharErrorBehaviour) reason, set when error is detected */
|
| + };
|
|
|
| + U_CDECL_END /* end of UConverter */
|
| +--- r22777/source/common/ucnv_ext.c 2007-08-22 22:46:49.525855000 -0700
|
| ++++ chrome.canonical/source/common/ucnv_ext.c 2009-03-23 12:30:33.135573000 -0700
|
| +@@ -946,7 +946,7 @@
|
| + ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
|
| + const int32_t *cx,
|
| + const USetAdder *sa,
|
| +- UConverterUnicodeSet which,
|
| ++ UBool useFallback,
|
| + int32_t minLength,
|
| + UChar32 c,
|
| + UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
|
| +@@ -966,7 +966,7 @@
|
| + value=*fromUSectionValues++;
|
|
|
| - static const uint8_t text943[] = {
|
| -- 0x82, 0xa9, 0x82, 0x20, /*0xc8,*/ 0x61, 0x8a, 0xbf, 0x8e, 0x9a };
|
| -- static const UChar toUnicode943sub[] = { 0x304b, 0xfffd, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57};
|
| -- static const UChar toUnicode943skip[]= { 0x304b, /*0xff88,*/ 0x0061, 0x6f22, 0x5b57};
|
| -+ 0x82, 0xa9, 0x82, 0x20, 0x61, 0x8a, 0xbf, 0x8e, 0x9a };
|
| -+ static const UChar toUnicode943sub[] = { 0x304b, 0x1a, 0x20, 0x0061, 0x6f22, 0x5b57 };
|
| -+ static const UChar toUnicode943skip[]= { 0x304b, 0x20, 0x0061, 0x6f22, 0x5b57 };
|
| - static const UChar toUnicode943stop[]= { 0x304b};
|
| + if( value!=0 &&
|
| +- UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) &&
|
| ++ (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) &&
|
| + UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
|
| + ) {
|
| + if(c>=0) {
|
| +@@ -987,12 +987,14 @@
|
| + /* no mapping, do nothing */
|
| + } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
|
| + ucnv_extGetUnicodeSetString(
|
| +- sharedData, cx, sa, which, minLength,
|
| ++ sharedData, cx, sa, useFallback, minLength,
|
| + U_SENTINEL, s, length+1,
|
| + (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
|
| + pErrorCode);
|
| +- } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
|
| +- UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
|
| ++ } else if((useFallback ?
|
| ++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
|
| ++ ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
|
| ++ UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
|
| + UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
|
| + ) {
|
| + sa->addString(sa->set, s, length+1);
|
| +@@ -1004,6 +1006,7 @@
|
| + ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
|
| + const USetAdder *sa,
|
| + UConverterUnicodeSet which,
|
| ++ UConverterSetFilter filter,
|
| + UErrorCode *pErrorCode) {
|
| + const int32_t *cx;
|
| + const uint16_t *stage12, *stage3, *ps2, *ps3;
|
| +@@ -1011,6 +1014,7 @@
|
|
|
| -- static const int32_t fromIBM943Offssub[] = {0, 2, 4, 5, 7};
|
| -- static const int32_t fromIBM943Offsskip[] = { 0, 4, 5, 7};
|
| -+ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 7 };
|
| -+ static const int32_t fromIBM943Offsskip[] = { 0, 3, 4, 5, 7 };
|
| - static const int32_t fromIBM943Offsstop[] = { 0};
|
| + uint32_t value;
|
| + int32_t st1, stage1Length, st2, st3, minLength;
|
| ++ UBool useFallback;
|
|
|
| - gInBufferSize = inputsize;
|
| -@@ -2570,9 +2570,9 @@
|
| - {
|
| - static const uint8_t sampleText[] = {
|
| - 0x82, 0xa9, 0x61, 0x62, 0x63 , 0x82,
|
| -- 0xff, /*0x82, 0xa9,*/ 0x32, 0x33};
|
| -- static const UChar toUnicode943sub[] = {0x304b, 0x0061, 0x0062, 0x0063, 0xfffd,/*0x304b,*/ 0x0032, 0x0033};
|
| -- static const int32_t fromIBM943Offssub[] = {0, 2, 3, 4, 5, 7, 8};
|
| -+ 0xff, 0x32, 0x33};
|
| -+ static const UChar toUnicode943sub[] = { 0x304b, 0x0061, 0x0062, 0x0063, 0x1a, 0x1a, 0x0032, 0x0033 };
|
| -+ static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 6, 7, 8 };
|
| - /*checking illegal value for ibm-943 with substitute*/
|
| - gInBufferSize = inputsize;
|
| - gOutBufferSize = outputsize;
|
| -diff -ru trie.clean/source/test/cintltst/nucnvtst.c chrome.canonical/source/test/cintltst/nucnvtst.c
|
| ---- trie.clean/source/test/cintltst/nucnvtst.c 2007-10-11 14:52:29.172174000 -0700
|
| -+++ chrome.canonical/source/test/cintltst/nucnvtst.c 2008-10-29 11:08:51.194286000 -0700
|
| -@@ -2606,7 +2606,7 @@
|
| - TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceLimit <= source");
|
| - /*Test for the condition where there is an invalid character*/
|
| - {
|
| -- static const uint8_t source2[]={0xa1, 0x01};
|
| -+ static const uint8_t source2[]={0xa1, 0x80};
|
| - TestNextUCharError(cnv, (const char*)source2, (const char*)source2+sizeof(source2), U_ZERO_ERROR, "an invalid character");
|
| - }
|
| - /*Test for the condition where we have a truncated char*/
|
| -@@ -3899,11 +3899,11 @@
|
| - TestISO_2022_KR() {
|
| - /* test input */
|
| - static const uint16_t in[]={
|
| -- 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x000A,0x000D
|
| -- ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC02,0xAC04
|
| -+ 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x000D
|
| -+ ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xAC04
|
| - ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0028,0x0029
|
| - ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x53CA,0x53CB
|
| -- ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x53E1,0x53E2
|
| -+ ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x53E2
|
| - ,0x53E3,0x53E4,0x000A,0x000D};
|
| - const UChar* uSource;
|
| - const UChar* uSourceLimit;
|
| -diff -ru trie.clean/source/test/testdata/conversion.txt chrome.canonical/source/test/testdata/conversion.txt
|
| ---- trie.clean/source/test/testdata/conversion.txt 2007-10-11 14:31:32.196532000 -0700
|
| -+++ chrome.canonical/source/test/testdata/conversion.txt 2008-10-29 11:37:09.419716000 -0700
|
| -@@ -48,13 +48,135 @@
|
| - toUnicode {
|
| - Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
|
| - Cases {
|
| -+ // Test ticket 5691: consistent illegal sequences
|
| -+ // The following test cases are for illegal character byte sequences.
|
| -+ //
|
| -+ // Unfortunately, we cannot use the Shift-JIS examples from the ticket
|
| -+ // comments because our Shift-JIS table is Windows-compatible and
|
| -+ // therefore has no illegal single bytes. Same for GBK.
|
| -+ // Instead, we use the stricter GB 18030 also for 2-byte examples.
|
| -+ // The byte sequences are generally slightly different from the ticket
|
| -+ // comment, simply using assigned characters rather than just
|
| -+ // theoretically valid sequences.
|
| -+ {
|
| -+ "gb18030",
|
| -+ :bin{ 618140813c81ff7a },
|
| -+ "a\u4e02\\x81<\\x81\\xFFz",
|
| -+ :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 },
|
| -+ :int{1}, :int{0}, "", "&C", :bin{""}
|
| -+ }
|
| -+ {
|
| -+ "EUC-JP",
|
| -+ :bin{ 618fb0a98fb03c8f3cb0a97a },
|
| -+ "a\u4e28\\x8F\\xB0<\\x8F<\u9022z",
|
| -+ :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 },
|
| -+ :int{1}, :int{0}, "", "&C", :bin{""}
|
| -+ }
|
| -+ {
|
| -+ "gb18030",
|
| -+ :bin{ 618130fc318130fc8181303c3e813cfc817a },
|
| -+ "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z",
|
| -+ :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 },
|
| -+ :int{1}, :int{0}, "", "&C", :bin{""}
|
| -+ }
|
| -+ {
|
| -+ "UTF-8",
|
| -+ :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a },
|
| -+ "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z",
|
| -+ :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 },
|
| -+ :int{1}, :int{0}, "", "&C", :bin{""}
|
| -+ }
|
| -+ {
|
| -+ "ISO-2022-JP",
|
| -+ :bin{ 1b24424141af4142affe41431b2842 },
|
| -+ "\u758f\\xAF\u758e\\xAF\\xFE\u790e",
|
| -+ :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 },
|
| -+ :int{1}, :int{0}, "", "&C", :bin{""}
|
| -+ }
|
| -+ {
|
| -+ "ibm-25546",
|
| -+ :bin{ 411b242943420e4141af4142affe41430f5a },
|
| -+ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
|
| -+ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
|
| -+ :int{1}, :int{0}, "", "&C", :bin{""}
|
| -+ }
|
| -+ {
|
| -+ "ISO-2022-KR",
|
| -+ :bin{ 411b242943420e4141af4142affe41430f5a },
|
| -+ "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
|
| -+ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
|
| -+ :int{1}, :int{0}, "", "&C", :bin{""}
|
| -+ }
|
| -+ {
|
| -+ "ISO-2022-CN",
|
| -+ :bin{ 411b242941420e4141af4142affe41430f5a },
|
| -+ "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
|
| -+ :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
|
| -+ :int{1}, :int{0}, "", "&C", :bin{""}
|
| -+ }
|
| -+ {
|
| -+ "HZ",
|
| -+ :bin{ 417e7b4141af4142affe41437e7d5a },
|
| -+ "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
|
| -+ :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 },
|
| -+ :int{1}, :int{0}, "", "&C", :bin{""}
|
| -+ }
|
| -+ // Test ticket 5691: consistent illegal sequences
|
| -+ // The following test cases are for illegal escape/designator/shift sequences.
|
| -+ //
|
| -+ // ISO-2022-JP and -CN with illegal escape sequences.
|
| -+ {
|
| -+ "ISO-2022-JP",
|
| -+ :bin{ 611b24201b244241411b283f1b28427a },
|
| -+ "a\\x1B$ \u758f\\x1B\u2538z",
|
| -+ :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 },
|
| -+ :int{1}, :int{0}, "", "&C", :bin{""}
|
| -+ }
|
| -+ {
|
| -+ "ISO-2022-CN",
|
| -+ :bin{ 611b2429201b2429410e41410f7a },
|
| -+ "a\\x1B$) \u4eaez",
|
| -+ :intvector{ 0,1,1,1,1,2,3,4,10,13 },
|
| -+ :int{1}, :int{0}, "", "&C", :bin{""}
|
| -+ }
|
| -+ // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences.
|
| -+ // The first ESC N comes before its designator sequence, the last sequence is ESC+space.
|
| -+ {
|
| -+ "ISO-2022-JP-2",
|
| -+ :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e },
|
| -+ "N\\x1BNNN\xceN\\x1B N",
|
| -+ :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 },
|
| -+ :int{1}, :int{0}, "", "&C", :bin{""}
|
| -+ }
|
| -+ {
|
| -+ "ISO-2022-CN-EXT",
|
| -+ :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e },
|
| -+ "N\\x1BNNN\u8f0eN\\x1B N",
|
| -+ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
|
| -+ :int{1}, :int{0}, "", "&C", :bin{""}
|
| -+ }
|
| -+ {
|
| -+ "ISO-2022-CN-EXT",
|
| -+ :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f },
|
| -+ "O\\x1BOOO\u492bO\\x1B O",
|
| -+ :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
|
| -+ :int{1}, :int{0}, "", "&C", :bin{""}
|
| -+ }
|
| -+ // Test ticket 5691: Example from Peter Edberg.
|
| -+ {
|
| -+ "ISO-2022-JP",
|
| -+ :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 },
|
| -+ "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda",
|
| -+ :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 },
|
| -+ :int{1}, :int{0}, "", "?", :bin{""}
|
| -+ }
|
| - // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
|
| - // using the Shift-JIS table for JIS X 0208 (ticket #5797)
|
| - {
|
| - "ISO-2022-JP",
|
| - :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 },
|
| -- "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
|
| -- :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },
|
| -+ "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
|
| -+ :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 },
|
| - :int{1}, :int{1}, "", "?", :bin{""}
|
| - }
|
| - // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets()
|
| -@@ -303,7 +425,7 @@
|
| - {
|
| - "ISO-2022-CN-EXT",
|
| - :bin{ 411b4e2121 }, "\x41", :intvector{ 0 },
|
| -- :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e }
|
| -+ :int{1}, :int{1}, "illesc", ".", :bin{ 1b }
|
| - }
|
| - // G3 designator: recognized, but not supported for -CN (only for -CN-EXT)
|
| - {
|
| + UChar s[UCNV_EXT_MAX_UCHARS];
|
| + UChar32 c;
|
| +@@ -1027,10 +1031,16 @@
|
| +
|
| + stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
|
| +
|
| ++ useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
|
| ++
|
| + /* enumerate the from-Unicode trie table */
|
| + c=0; /* keep track of the current code point while enumerating */
|
| +
|
| +- if(sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) {
|
| ++ if(filter==UCNV_SET_FILTER_2022_CN) {
|
| ++ minLength=3;
|
| ++ } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
|
| ++ filter!=UCNV_SET_FILTER_NONE
|
| ++ ) {
|
| + /* DBCS-only, ignore single-byte results */
|
| + minLength=2;
|
| + } else {
|
| +@@ -1064,14 +1074,48 @@
|
| + length=0;
|
| + U16_APPEND_UNSAFE(s, length, c);
|
| + ucnv_extGetUnicodeSetString(
|
| +- sharedData, cx, sa, which, minLength,
|
| ++ sharedData, cx, sa, useFallback, minLength,
|
| + c, s, length,
|
| + (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
|
| + pErrorCode);
|
| +- } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
|
| +- UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
|
| ++ } else if((useFallback ?
|
| ++ (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
|
| ++ ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
|
| ++ UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
|
| + UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
|
| + ) {
|
| ++ switch(filter) {
|
| ++ case UCNV_SET_FILTER_2022_CN:
|
| ++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) {
|
| ++ continue;
|
| ++ }
|
| ++ break;
|
| ++ case UCNV_SET_FILTER_SJIS:
|
| ++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (value=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) {
|
| ++ continue;
|
| ++ }
|
| ++ break;
|
| ++ case UCNV_SET_FILTER_GR94DBCS:
|
| ++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
|
| ++ (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfefe - 0xa1a1) &&
|
| ++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
|
| ++ continue;
|
| ++ }
|
| ++ break;
|
| ++ case UCNV_SET_FILTER_HZ:
|
| ++ if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
|
| ++ (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA(value))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
|
| ++ (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
|
| ++ continue;
|
| ++ }
|
| ++ break;
|
| ++ default:
|
| ++ /*
|
| ++ * UCNV_SET_FILTER_NONE,
|
| ++ * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength
|
| ++ */
|
| ++ break;
|
| ++ }
|
| + sa->add(sa->set, c);
|
| + }
|
| + } while((++c&0xf)!=0);
|
|
|