Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(83)

Side by Side Diff: third_party/icu38/uconv.security.patch

Issue 52030: Apply ICU patches for ICU tickets 6175 (ISO-2022 and ... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 11 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 diff -ru trie.clean/source/common/ucnv2022.c chrome.canonical/source/common/ucnv 2022.c 1 --- r22777/source/test/cintltst/nucnvtst.c 2007-10-11 14:52:29.172174000 -0 700
2 --- trie.clean/source/common/ucnv2022.c 2007-11-07 17:39:05.057870000 -0800 2 +++ chrome.canonical/source/test/cintltst/nucnvtst.c 2009-03-23 12:42:01.1062 92000 -0700
3 +++ chrome.canonical/source/common/ucnv2022.c 2008-10-29 12:52:22.517453000 -0 700 3 @@ -17,6 +17,7 @@
4 @@ -752,6 +752,7 @@ 4 #include "unicode/uloc.h"
5 #include "unicode/ucnv.h"
6 #include "unicode/ucnv_err.h"
7 +#include "unicode/ucnv_cb.h"
8 #include "cintltst.h"
9 #include "unicode/utypes.h"
10 #include "unicode/ustring.h"
11 @@ -81,6 +82,7 @@
12 static void TestJitterbug2411(void);
13 static void TestJB5275(void);
14 static void TestJB5275_1(void);
15 +static void TestJitterbug6175(void);
16 #endif
17
18 static void TestRoundTrippingAllUTF(void);
19 @@ -297,6 +299,7 @@
20 #if !UCONFIG_NO_LEGACY_CONVERSION
21 addTest(root, &TestJitterbug2346, "tsconv/nucnvtst/TestJitterbug2346");
22 addTest(root, &TestJitterbug2411, "tsconv/nucnvtst/TestJitterbug2411");
23 + addTest(root, &TestJitterbug6175, "tsconv/nucnvtst/TestJitterbug6175");
24 #endif
25
26 }
27 @@ -2606,7 +2609,7 @@
28 TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceL imit <= source");
29 /*Test for the condition where there is an invalid character*/
30 {
31 - static const uint8_t source2[]={0xa1, 0x01};
32 + static const uint8_t source2[]={0xa1, 0x80};
33 TestNextUCharError(cnv, (const char*)source2, (const char*)source2+size of(source2), U_ZERO_ERROR, "an invalid character");
34 }
35 /*Test for the condition where we have a truncated char*/
36 @@ -3899,11 +3902,11 @@
37 TestISO_2022_KR() {
38 /* test input */
39 static const uint16_t in[]={
40 - 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x0 00A,0x000D
41 - ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xA C02,0xAC04
42 + 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x0 00D
43 + ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xA C04
44 ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0 028,0x0029
45 ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x5 3CA,0x53CB
46 - ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x5 3E1,0x53E2
47 + ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x5 3E2
48 ,0x53E3,0x53E4,0x000A,0x000D};
49 const UChar* uSource;
50 const UChar* uSourceLimit;
51 @@ -4456,6 +4459,70 @@
52 free(offsets);
53 }
54
55 +/* Tests for empty segments in ISO-2022-JP/KR/CN, HZ, check that UConverterCallbackReason is UCNV_IRREGULAR */
56 +typedef struct {
57 + const char * converterName;
58 + const char * inputText;
59 + int inputTextLength;
60 +} EmptySegmentTest;
61 +
62 +/* Callback for TestJitterbug6175, should only get called for empty segment errors */
63 +static void UCNV_TO_U_CALLBACK_EMPTYSEGMENT( const void *context, UConverterToU nicodeArgs *toArgs, const char* codeUnits,
64 + int32_t length, UConverterCallback Reason reason, UErrorCode * err ) {
65 + if (reason > UCNV_IRREGULAR) {
66 + return;
67 + }
68 + if (reason != UCNV_IRREGULAR) {
69 + log_err("toUnicode callback invoked for empty segment but reason is not UCNV_IRREGULAR\n");
70 + }
71 + /* Standard stuff below from UCNV_TO_U_CALLBACK_SUBSTITUTE */
72 + *err = U_ZERO_ERROR;
73 + ucnv_cbToUWriteSub(toArgs,0,err);
74 +}
75 +
76 +enum { kEmptySegmentToUCharsMax = 64 };
77 +static void TestJitterbug6175(void) {
78 + static const char iso2022jp_a[] = { 0x61, 0x62, 0x1B,0x24,0x42, 0x1B,0x28, 0x42, 0x63, 0x64, 0x0D, 0x0A };
79 + static const char iso2022kr_a[] = { 0x1B,0x24,0x29,0x43, 0x61, 0x0E, 0x0F, 0x62, 0x0D, 0x0A };
80 + static const char iso2022cn_a[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x0F, 0x1B,0x24,0x2A,0x48, 0x1B,0x4E, 0x6A,0x65, 0x63, 0x0D, 0x0A };
81 + static const char iso2022cn_b[] = { 0x61, 0x1B,0x24,0x29,0x41, 0x62, 0x0E, 0x1B,0x24,0x29,0x47, 0x68,0x64, 0x0F, 0x63, 0x0D, 0x0A };
82 + static const char hzGB2312_a[] = { 0x61, 0x62, 0x7E,0x7B, 0x7E,0x7D, 0x63 , 0x64 };
83 + static const EmptySegmentTest emptySegmentTests[] = {
84 + /* converterName inputText inputTextLength */
85 + { "ISO-2022-JP", iso2022jp_a, sizeof(iso2022jp_a) },
86 + { "ISO-2022-KR", iso2022kr_a, sizeof(iso2022kr_a) },
87 + { "ISO-2022-CN", iso2022cn_a, sizeof(iso2022cn_a) },
88 + { "ISO-2022-CN", iso2022cn_b, sizeof(iso2022cn_b) },
89 + { "HZ-GB-2312", hzGB2312_a, sizeof(hzGB2312_a) },
90 + /* terminator: */
91 + { NULL, NULL, 0, }
92 + };
93 + const EmptySegmentTest * testPtr;
94 + for (testPtr = emptySegmentTests; testPtr->converterName != NULL; ++testPtr ) {
95 + UErrorCode err = U_ZERO_ERROR;
96 + UConverter * cnv = ucnv_open(testPtr->converterName, &err);
97 + if (U_FAILURE(err)) {
98 + log_data_err("Unable to open %s converter: %s\n", testPtr->converte rName, u_errorName(err));
99 + return;
100 + }
101 + ucnv_setToUCallBack(cnv, UCNV_TO_U_CALLBACK_EMPTYSEGMENT, NULL, NULL, N ULL, &err);
102 + if (U_FAILURE(err)) {
103 + log_data_err("Unable to setToUCallBack for %s converter: %s\n", tes tPtr->converterName, u_errorName(err));
104 + ucnv_close(cnv);
105 + return;
106 + }
107 + {
108 + UChar toUChars[kEmptySegmentToUCharsMax];
109 + UChar * toUCharsPtr = toUChars;
110 + const UChar * toUCharsLimit = toUCharsPtr + kEmptySegmentToUCharsMa x;
111 + const char * inCharsPtr = testPtr->inputText;
112 + const char * inCharsLimit = inCharsPtr + testPtr->inputTextLength;
113 + ucnv_toUnicode(cnv, &toUCharsPtr, toUCharsLimit, &inCharsPtr, inCha rsLimit, NULL, TRUE, &err);
114 + }
115 + ucnv_close(cnv);
116 + }
117 +}
118 +
119 static void
120 TestEBCDIC_STATEFUL() {
121 /* test input */
122 --- r22777/source/test/cintltst/ncnvtst.c 2007-01-24 15:27:45.575224000 -0 800
123 +++ chrome.canonical/source/test/cintltst/ncnvtst.c 2009-03-23 12:30:17.2910 31000 -0700
124 @@ -1928,7 +1928,7 @@
125 #if !UCONFIG_NO_LEGACY_CONVERSION
126 { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff },
127 { "windows-1251", 0, 0x7f, 0x410, 0x44f, 0x3000, 0xd7ff },
128 - { "HZ", 0x410, 0x44f, 0x4e00, 0x4eff, 0xac00, 0xd7ff },
129 + /* HZ test case fixed and moved to intltest's conversion.txt, ticket #6002 */
130 { "shift-jis", 0x3041, 0x3093, 0x30a1, 0x30f3, 0x900, 0x1cff }
131 #else
132 { "UTF-8", 0, 0xd7ff, 0xe000, 0x10ffff, 0xd800, 0xdfff }
133 --- r22777/source/test/intltest/convtest.h 2007-07-26 20:12:12.288784000 -0 700
134 +++ chrome.canonical/source/test/intltest/convtest.h 2009-03-23 12:30:09.4451 94000 -0700
135 @@ -72,6 +72,7 @@
136 void TestToUnicode();
137 void TestFromUnicode();
138 void TestGetUnicodeSet();
139 + void TestGetUnicodeSet2();
140
141 private:
142 UBool
143 --- r22777/source/test/intltest/convtest.cpp 2007-03-08 16:28:01.852223000 -0 800
144 +++ chrome.canonical/source/test/intltest/convtest.cpp 2009-03-23 12:30:40.1618 68000 -0700
145 @@ -70,6 +70,7 @@
146 case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break;
147 case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break;
148 case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break;
149 + case 3: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); brea k;
150 default: name=""; break; //needed to end loop
151 }
152 }
153 @@ -465,6 +466,183 @@
154 }
155 }
156
157 +U_CDECL_BEGIN
158 +static void U_CALLCONV
159 +getUnicodeSetCallback(const void *context,
160 + UConverterFromUnicodeArgs *fromUArgs,
161 + const UChar* codeUnits,
162 + int32_t length,
163 + UChar32 codePoint,
164 + UConverterCallbackReason reason,
165 + UErrorCode *pErrorCode) {
166 + if(reason<=UCNV_IRREGULAR) {
167 + ((UnicodeSet *)context)->remove(codePoint); // the converter cannot co nvert this code point
168 + *pErrorCode=U_ZERO_ERROR; // skip
169 + } // else ignore the reset, close and clone calls.
170 +}
171 +U_CDECL_END
172 +
173 +// Compare ucnv_getUnicodeSet() with the set of characters that can be converted.
174 +void
175 +ConversionTest::TestGetUnicodeSet2() {
176 + // Build a string with all code points.
177 + UChar32 cpLimit;
178 + int32_t s0Length;
179 + if(quick) {
180 + cpLimit=s0Length=0x10000; // BMP only
181 + } else {
182 + cpLimit=0x110000;
183 + s0Length=0x10000+0x200000; // BMP + surrogate pairs
184 + }
185 + UChar *s0=new UChar[s0Length];
186 + if(s0==NULL) {
187 + return;
188 + }
189 + UChar *s=s0;
190 + UChar32 c;
191 + UChar c2;
192 + // low BMP
193 + for(c=0; c<=0xd7ff; ++c) {
194 + *s++=(UChar)c;
195 + }
196 + // trail surrogates
197 + for(c=0xdc00; c<=0xdfff; ++c) {
198 + *s++=(UChar)c;
199 + }
200 + // lead surrogates
201 + // (after trails so that there is not even one surrogate pair in between)
202 + for(c=0xd800; c<=0xdbff; ++c) {
203 + *s++=(UChar)c;
204 + }
205 + // high BMP
206 + for(c=0xe000; c<=0xffff; ++c) {
207 + *s++=(UChar)c;
208 + }
209 + // supplementary code points = surrogate pairs
210 + if(cpLimit==0x110000) {
211 + for(c=0xd800; c<=0xdbff; ++c) {
212 + for(c2=0xdc00; c2<=0xdfff; ++c2) {
213 + *s++=(UChar)c;
214 + *s++=c2;
215 + }
216 + }
217 + }
218 +
219 + static const char *const cnvNames[]={
220 + "UTF-8",
221 + "UTF-7",
222 + "UTF-16",
223 + "US-ASCII",
224 + "ISO-8859-1",
225 + "windows-1252",
226 + "Shift-JIS",
227 + "ibm-1390", // EBCDIC_STATEFUL table
228 + "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL tab le
229 + "HZ",
230 + "ISO-2022-JP",
231 + "JIS7",
232 + "ISO-2022-CN",
233 + "ISO-2022-CN-EXT",
234 + "LMBCS"
235 + };
236 + char buffer[1024];
237 + int32_t i;
238 + for(i=0; i<LENGTHOF(cnvNames); ++i) {
239 + UErrorCode errorCode=U_ZERO_ERROR;
240 + UConverter *cnv=cnv_open(cnvNames[i], errorCode);
241 + if(U_FAILURE(errorCode)) {
242 + errln("failed to open converter %s - %s", cnvNames[i], u_errorName( errorCode));
243 + continue;
244 + }
245 + UnicodeSet expected;
246 + ucnv_setFromUCallBack(cnv, getUnicodeSetCallback, &expected, NULL, NULL , &errorCode);
247 + if(U_FAILURE(errorCode)) {
248 + errln("failed to set the callback on converter %s - %s", cnvNames[i ], u_errorName(errorCode));
249 + ucnv_close(cnv);
250 + continue;
251 + }
252 + UConverterUnicodeSet which;
253 + for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUn icodeSet)((int)which+1)) {
254 + if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
255 + ucnv_setFallback(cnv, TRUE);
256 + }
257 + expected.add(0, cpLimit-1);
258 + s=s0;
259 + UBool flush;
260 + do {
261 + char *t=buffer;
262 + flush=(UBool)(s==s0+s0Length);
263 + ucnv_fromUnicode(cnv, &t, buffer+sizeof(buffer), (const UChar * *)&s, s0+s0Length, NULL, flush, &errorCode);
264 + if(U_FAILURE(errorCode)) {
265 + if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
266 + errorCode=U_ZERO_ERROR;
267 + continue;
268 + } else {
269 + break; // unexpected error, should not occur
270 + }
271 + }
272 + } while(!flush);
273 + UnicodeSet set;
274 + ucnv_getUnicodeSet(cnv, (USet *)&set, which, &errorCode);
275 + if(cpLimit<0x110000) {
276 + set.remove(cpLimit, 0x10ffff);
277 + }
278 + if(which==UCNV_ROUNDTRIP_SET) {
279 + // ignore PUA code points because they will be converted even i f they
280 + // are fallbacks and when other fallbacks are turned off,
281 + // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true rou ndtrips
282 + expected.remove(0xe000, 0xf8ff);
283 + expected.remove(0xf0000, 0xffffd);
284 + expected.remove(0x100000, 0x10fffd);
285 + set.remove(0xe000, 0xf8ff);
286 + set.remove(0xf0000, 0xffffd);
287 + set.remove(0x100000, 0x10fffd);
288 + }
289 + if(set!=expected) {
290 + // First try to see if we have different sets because ucnv_getU nicodeSet()
291 + // added strings: The above conversion method does not tell us what strings might be convertible.
292 + // Remove strings from the set and compare again.
293 + // Unfortunately, there are no good, direct set methods for fin ding out whether there are strings
294 + // in the set, nor for enumerating or removing just them.
295 + // Intersect all code points with the set. The intersection wil l not contain strings.
296 + UnicodeSet temp(0, 0x10ffff);
297 + temp.retainAll(set);
298 + set=temp;
299 + }
300 + if(set!=expected) {
301 + UnicodeSet diffSet;
302 + UnicodeString out;
303 +
304 + // are there items that must be in the set but are not?
305 + (diffSet=expected).removeAll(set);
306 + if(!diffSet.isEmpty()) {
307 + diffSet.toPattern(out, TRUE);
308 + if(out.length()>100) {
309 + out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsi s));
310 + }
311 + errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d",
312 + cnvNames[i], which);
313 + errln(out);
314 + }
315 +
316 + // are there items that must not be in the set but are?
317 + (diffSet=set).removeAll(expected);
318 + if(!diffSet.isEmpty()) {
319 + diffSet.toPattern(out, TRUE);
320 + if(out.length()>100) {
321 + out.replace(100, 0x7fffffff, ellipsis, LENGTHOF(ellipsi s));
322 + }
323 + errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpecte d items - which set: %d",
324 + cnvNames[i], which);
325 + errln(out);
326 + }
327 + }
328 + }
329 + }
330 +
331 + delete [] s0;
332 +}
333 +
334 // open testdata or ICU data converter ------------------------------------- ** *
335
336 UConverter *
337 --- r22777/source/test/testdata/testdata.mak 2007-07-26 20:12:12.288784000 -0 700
338 +++ chrome.canonical/source/test/testdata/testdata.mak 2009-03-23 12:31:04.4246 45000 -0700
339 @@ -28,7 +28,7 @@
340
341 TEST_RES_FILES = $(TEST_RES_SOURCE:.txt=.res)
342
343 -"$(TESTDATAOUT)\testdata.dat" : $(TEST_RES_FILES) "$(TESTDATABLD)\casing.res" " $(TESTDATABLD)\conversion.res" "$(TESTDATABLD)\icuio.res" "$(TESTDATABLD)\mc.res " "$(TESTDATABLD)\structLocale.res" "$(TESTDATABLD)\root.res" "$(TESTDATABLD)\sh .res" "$(TESTDATABLD)\sh_YU.res" "$(TESTDATABLD)\te.res" "$(TESTDATABLD)\te_IN. res" "$(TESTDATABLD)\te_IN_REVISED.res" "$(TESTDATABLD)\testaliases.res" "$(TEST DATABLD)\testtypes.res" "$(TESTDATABLD)\testempty.res" "$(TESTDATABLD)\iscii.res " "$(TESTDATABLD)\idna_rules.res" "$(TESTDATABLD)\DataDrivenCollationTest.res" " $(TESTDATABLD)\test.icu" "$(TESTDATABLD)\testtable32.res" "$(TESTDATABLD)\test1. cnv" "$(TESTDATABLD)\test3.cnv" "$(TESTDATABLD)\test4.cnv" "$(TESTDATABLD)\test4 x.cnv" "$(TESTDATABLD)\ibm9027.cnv" "$(TESTDATABLD)\nfscsi.spp" "$(TESTDATABLD)\ nfscss.spp" "$(TESTDATABLD)\nfscis.spp" "$(TESTDATABLD)\nfsmxs.spp" "$(TESTDATAB LD)\nfsmxp.spp"
344 +"$(TESTDATAOUT)\testdata.dat" : $(TEST_RES_FILES) "$(TESTDATABLD)\casing.res" " $(TESTDATABLD)\conversion.res" "$(TESTDATABLD)\icuio.res" "$(TESTDATABLD)\mc.res " "$(TESTDATABLD)\structLocale.res" "$(TESTDATABLD)\root.res" "$(TESTDATABLD)\sh .res" "$(TESTDATABLD)\sh_YU.res" "$(TESTDATABLD)\te.res" "$(TESTDATABLD)\te_IN. res" "$(TESTDATABLD)\te_IN_REVISED.res" "$(TESTDATABLD)\testaliases.res" "$(TEST DATABLD)\testtypes.res" "$(TESTDATABLD)\testempty.res" "$(TESTDATABLD)\iscii.res " "$(TESTDATABLD)\idna_rules.res" "$(TESTDATABLD)\DataDrivenCollationTest.res" " $(TESTDATABLD)\test.icu" "$(TESTDATABLD)\testtable32.res" "$(TESTDATABLD)\test1. cnv" "$(TESTDATABLD)\test1bmp.cnv" "$(TESTDATABLD)\test3.cnv" "$(TESTDATABLD)\te st4.cnv" "$(TESTDATABLD)\test4x.cnv" "$(TESTDATABLD)\ibm9027.cnv" "$(TESTDATABLD )\nfscsi.spp" "$(TESTDATABLD)\nfscss.spp" "$(TESTDATABLD)\nfscis.spp" "$(TESTDAT ABLD)\nfsmxs.spp" "$(TESTDATABLD)\nfsmxp.spp"
345 @echo Building test data
346 @copy "$(TESTDATABLD)\te.res" "$(TESTDATAOUT)\$(TESTDT)\nam.typ"
347 @copy "$(TESTDATA)\icu26_testtypes.res" "$(TESTDATABLD)"
348 @@ -54,6 +54,7 @@
349 iscii.res
350 test.icu
351 test1.cnv
352 +test1bmp.cnv
353 test3.cnv
354 test4.cnv
355 test4x.cnv
356 @@ -126,6 +127,10 @@
357 @echo Building $@
358 @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**
359
360 +"$(TESTDATABLD)\test1bmp.cnv": "$(TESTDATA)\test1bmp.ucm"
361 + @echo Building $@
362 + @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**
363 +
364 "$(TESTDATABLD)\test3.cnv": "$(TESTDATA)\test3.ucm"
365 @echo Building $@
366 @"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**
367 --- r22777/source/test/testdata/Makefile.in 2007-08-21 13:15:55.267002000 -0 700
368 +++ chrome.canonical/source/test/testdata/Makefile.in 2009-03-23 12:31:04.4356 35000 -0700
369 @@ -117,7 +117,7 @@
370 TEST_DAT_FILES=$(TESTBUILDDIR)/test.icu
371 TEST_SPP_FILES=$(TESTBUILDDIR)/nfscsi.spp $(TESTBUILDDIR)/nfscss.spp $(TESTBUIL DDIR)/nfscis.spp $(TESTBUILDDIR)/nfsmxs.spp $(TESTBUILDDIR)/nfsmxp.spp
372
373 -TEST_UCM_SOURCE= test1.ucm test3.ucm test4.ucm test4x.ucm ibm9027.ucm
374 +TEST_UCM_SOURCE= test1.ucm test1bmp.ucm test3.ucm test4.ucm test4x.ucm ibm9027. ucm
375 TEST_UCM_FILES=$(TEST_UCM_SOURCE:%=$(TESTSRCDATADIR)/data/%)
376 TEST_CNV_FILES=$(TEST_UCM_SOURCE:%.ucm=$(TESTBUILDDIR)/%.cnv)
377
378 --- r22777/source/test/testdata/conversion.txt 2007-10-11 14:31:32.196532000 -0 700
379 +++ chrome.canonical/source/test/testdata/conversion.txt 2009-03-23 12:42 :01.119267000 -0700
380 @@ -1,6 +1,6 @@
381 //***************************************************************************** **
382 //
383 -// Copyright (C) 2003-2007, International Business Machines
384 +// Copyright (C) 2003-2008, International Business Machines
385 // Corporation and others. All Rights Reserved.
386 //
387 // file name: conversion.txt
388 @@ -48,13 +48,161 @@
389 toUnicode {
390 Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
391 Cases {
392 + // Test ticket 5691: consistent illegal sequences
393 + // The following test cases are for illegal character byte sequences.
394 + //
395 + // Unfortunately, we cannot use the Shift-JIS examples from the ticket
396 + // comments because our Shift-JIS table is Windows-compatible and
397 + // therefore has no illegal single bytes. Same for GBK.
398 + // Instead, we use the stricter GB 18030 also for 2-byte examples.
399 + // The byte sequences are generally slightly different from the ticket
400 + // comment, simply using assigned characters rather than just
401 + // theoretically valid sequences.
402 + {
403 + "gb18030",
404 + :bin{ 618140813c81ff7a },
405 + "a\u4e02\\x81<\\x81\\xFFz",
406 + :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 },
407 + :int{1}, :int{0}, "", "&C", :bin{""}
408 + }
409 + {
410 + "EUC-JP",
411 + :bin{ 618fb0a98fb03c8f3cb0a97a },
412 + "a\u4e28\\x8F\\xB0<\\x8F<\u9022z",
413 + :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 },
414 + :int{1}, :int{0}, "", "&C", :bin{""}
415 + }
416 + {
417 + "gb18030",
418 + :bin{ 618130fc318130fc8181303c3e813cfc817a },
419 + "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z",
420 + :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 },
421 + :int{1}, :int{0}, "", "&C", :bin{""}
422 + }
423 + {
424 + "UTF-8",
425 + :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a },
426 + "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1 \\xFF<>z",
427 + :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,1 2,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20, 21 },
428 + :int{1}, :int{0}, "", "&C", :bin{""}
429 + }
430 + {
431 + "ISO-2022-JP",
432 + :bin{ 1b24424141af4142affe41431b2842 },
433 + "\u758f\\xAF\u758e\\xAF\\xFE\u790e",
434 + :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 },
435 + :int{1}, :int{0}, "", "&C", :bin{""}
436 + }
437 + {
438 + "ibm-25546",
439 + :bin{ 411b242943420e4141af4142affe41430f5a },
440 + "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
441 + :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
442 + :int{1}, :int{0}, "", "&C", :bin{""}
443 + }
444 + {
445 + "ISO-2022-KR",
446 + :bin{ 411b242943420e4141af4142affe41430f5a },
447 + "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
448 + :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
449 + :int{1}, :int{0}, "", "&C", :bin{""}
450 + }
451 + {
452 + "ISO-2022-CN",
453 + :bin{ 411b242941420e4141af4142affe41430f5a },
454 + "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
455 + :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
456 + :int{1}, :int{0}, "", "&C", :bin{""}
457 + }
458 + {
459 + "HZ",
460 + :bin{ 417e7b4141af4142affe41437e7d5a },
461 + "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
462 + :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 },
463 + :int{1}, :int{0}, "", "&C", :bin{""}
464 + }
465 + // Test ticket 5691: consistent illegal sequences
466 + // The following test cases are for illegal escape/designator/shift sequences.
467 + //
468 + // ISO-2022-JP and -CN with illegal escape sequences.
469 + {
470 + "ISO-2022-JP",
471 + :bin{ 611b24201b244241411b283f1b28427a },
472 + "a\\x1B$ \u758f\\x1B\u2538z",
473 + :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 },
474 + :int{1}, :int{0}, "", "&C", :bin{""}
475 + }
476 + {
477 + "ISO-2022-CN",
478 + :bin{ 611b2429201b2429410e41410f7a },
479 + "a\\x1B$) \u4eaez",
480 + :intvector{ 0,1,1,1,1,2,3,4,10,13 },
481 + :int{1}, :int{0}, "", "&C", :bin{""}
482 + }
483 + // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences.
484 + // The first ESC N comes before its designator sequence, the last sequence is ESC+space.
485 + {
486 + "ISO-2022-JP-2",
487 + :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e },
488 + "N\\x1BNNN\xceN\\x1B N",
489 + :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 },
490 + :int{1}, :int{0}, "", "&C", :bin{""}
491 + }
492 + {
493 + "ISO-2022-CN-EXT",
494 + :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e },
495 + "N\\x1BNNN\u8f0eN\\x1B N",
496 + :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
497 + :int{1}, :int{0}, "", "&C", :bin{""}
498 + }
499 + {
500 + "ISO-2022-CN-EXT",
501 + :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f },
502 + "O\\x1BOOO\u492bO\\x1B O",
503 + :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
504 + :int{1}, :int{0}, "", "&C", :bin{""}
505 + }
506 + // Test ticket 5691: HZ with illegal tilde sequences.
507 + {
508 + "HZ",
509 + :bin{ 417e20427e21437e80447e7b41417e207e41427e7f41437e7d5a },
510 + "A\\x7E B\\x7E!C\\x7E\\x80D\u4eae\\x7E\\x20\\x7E\u8c05\\x7E\\x7F\u64a 9Z",
511 + :intvector{ 0,1,1,1,1,2,3,4,4,4,4,5,6,7,7,7,7,7,7,7,7,9, // SBCS
512 + 12,14,14,14,14,14,14,14,14,16,16,16,16,17,19,19,19,19,19, 19,19,19,21, // DBCS
513 + 25 }, // SBCS
514 + :int{1}, :int{0}, "", "&C", :bin{""}
515 + }
516 + // Test ticket 5691: Example from Peter Edberg.
517 + {
518 + "ISO-2022-JP",
519 + :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 },
520 + "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda",
521 + :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 },
522 + :int{1}, :int{0}, "", "?", :bin{""}
523 + }
524 + // Test bug 6071 (2:1 Unicode:charset SBCS mapping).
525 + {
526 + "*test1bmp",
527 + :bin{ 050008 },
528 + "e@uv",
529 + :intvector{ 0,1,2,2 },
530 + :int{1}, :int{1}, "", "?", :bin{""}
531 + }
532 + // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e
533 + {
534 + "HZ",
535 + :bin{ 7e7b21212120217e217f772100007e217e7e7d207e7e807e0a2b },
536 + "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd\u3013 ~\ufffd+",
537 + :intvector{ 2,4,6,8,10,12,14,15,19,20,22,25 },
538 + :int{1}, :int{1}, "", "?", :bin{""}
539 + }
540 // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
541 // using the Shift-JIS table for JIS X 0208 (ticket #5797)
542 {
543 "ISO-2022-JP",
544 :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b284 2 },
545 - "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6 f3e",
546 - :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },
547 + "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\uf ffd\u6f3e",
548 + :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 },
549 :int{1}, :int{1}, "", "?", :bin{""}
550 }
551 // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets()
552 @@ -191,6 +339,21 @@
553 :intvector{ 0, 5, 7, 9, 9, 9, 9, 9, 9, 9, 9, 12 },
554 :int{1}, :int{1}, "", "&", :bin{""}
555 }
556 + // empty segment (using substitution and stop)
557 + {
558 + "ISO-2022-KR",
559 + :bin{ 1b242943610e0f620d0a },
560 + "a\uFFFDb\u000D\u000A",
561 + :intvector{ 4, 6, 7, 8, 9 },
562 + :int{1}, :int{1}, "", "?", :bin{""}
563 + }
564 + {
565 + "ISO-2022-KR",
566 + :bin{ 1b242943610e0f620d0a },
567 + "a",
568 + :intvector{ 4 },
569 + :int{1}, :int{1}, "illesc", ".", :bin{"0f"}
570 + }
571
572 // ISO-2022-JP
573
574 @@ -241,6 +404,21 @@
575 :bin{ 41c15c1b284a5cc242 }, "A\uff81\\\xa5\uff82B", :intvector{ 0, 1, 2, 6, 7, 8 },
576 :int{1}, :int{1}, "", ".", :bin{""}
577 }
578 + // empty segment (using substitution and stop)
579 + {
580 + "ISO-2022-JP",
581 + :bin{ 61621b24421b284263640d0a },
582 + "ab\uFFFDcd\u000D\u000A",
583 + :intvector{ 0, 1, 5, 8, 9, 10, 11 },
584 + :int{1}, :int{1}, "", "?", :bin{""}
585 + }
586 + {
587 + "ISO-2022-JP",
588 + :bin{ 61621b24421b284263640d0a },
589 + "ab",
590 + :intvector{ 0, 1 },
591 + :int{1}, :int{1}, "illesc", ".", :bin{"1b2842"}
592 + }
593
594 // ISO-2022-CN
595
596 @@ -303,7 +481,7 @@
597 {
598 "ISO-2022-CN-EXT",
599 :bin{ 411b4e2121 }, "\x41", :intvector{ 0 },
600 - :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e }
601 + :int{1}, :int{1}, "illesc", ".", :bin{ 1b }
602 }
603 // G3 designator: recognized, but not supported for -CN (only for -CN-EXT)
604 {
605 @@ -311,6 +489,36 @@
606 :bin{ 411b242b491b4f2121 }, "\x41", :intvector{ 0 },
607 :int{1}, :int{1}, "unsuppesc", ".", :bin{ 1b242b49 }
608 }
609 + // empty segment 1 (using substitution and stop)
610 + {
611 + "ISO-2022-CN",
612 + :bin{ 611b242941620e0f1b242a481b4e6a65630d0a },
613 + "ab\uFFFD\u994Cc\u000D\u000A",
614 + :intvector{ 0, 5, 7, 14, 16, 17, 18 },
615 + :int{1}, :int{1}, "", "?", :bin{""}
616 + }
617 + {
618 + "ISO-2022-CN",
619 + :bin{ 611b242941620e0f1b242a481b4e6a65630d0a },
620 + "ab",
621 + :intvector{ 0, 5 },
622 + :int{1}, :int{1}, "illesc", ".", :bin{"0f"}
623 + }
624 + // empty segment 2 (using substitution and stop)
625 + {
626 + "ISO-2022-CN",
627 + :bin{ 611b242941620e1b24294768640f630d0a },
628 + "ab\uFFFD\u5F70c\u000D\u000A",
629 + :intvector{ 0, 5, 7, 11, 14, 15, 16 },
630 + :int{1}, :int{1}, "", "?", :bin{""}
631 + }
632 + {
633 + "ISO-2022-CN",
634 + :bin{ 611b242941620e1b24294768640f630d0a },
635 + "ab",
636 + :intvector{ 0, 5 },
637 + :int{1}, :int{1}, "illesc", ".", :bin{"1b242947"}
638 + }
639
640 // ISO-2022 SBCS
641 // [U_ENABLE_GENERIC_ISO_2022]
642 @@ -325,6 +533,39 @@
643 // :int{1}, :int{1}, "", ".", :bin{""}
644 //}
645
646 + // HZ-GB-2312
647 +
648 + // empty segment 1 (using substitution and stop)
649 + {
650 + "HZ-GB-2312",
651 + :bin{ 61627e7b7e7d6364 },
652 + "ab\uFFFDcd",
653 + :intvector{ 0, 1, 4, 6, 7 },
654 + :int{1}, :int{1}, "", "?", :bin{""}
655 + }
656 + {
657 + "HZ-GB-2312",
658 + :bin{ 61627e7b7e7d63640d0a },
659 + "ab",
660 + :intvector{ 0, 1 },
661 + :int{1}, :int{1}, "illesc", ".", :bin{"7e7d"}
662 + }
663 + // empty segment 2 & legal redundant switches (using substitution and stop)
664 + {
665 + "HZ-GB-2312",
666 + :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d },
667 + "ab\u4E0D\u7A7A\uFFFD\u4E00cdef\uFFFD",
668 + :intvector{ 0, 1, 4, 6, 10, 12, 16, 17, 20, 21, 24 },
669 + :int{1}, :int{1}, "", "?", :bin{""}
670 + }
671 + {
672 + "HZ-GB-2312",
673 + :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d },
674 + "ab\u4E0D\u7A7A",
675 + :intvector{ 0, 1, 4, 6 },
676 + :int{1}, :int{1}, "illesc", ".", :bin{"7e7b"}
677 + }
678 +
679 // DBCS-only extensions
680 {
681 "ibm-970",
682 @@ -496,6 +737,14 @@
683 :intvector{ 0, 4, 8, 12 },
684 :int{1}, :int{0}, "", "?", :bin{""}
685 }
686 + // Test iso-2022-jp-2 miscellaneous symbols
687 + {
688 + "iso-2022-jp-2",
689 + :bin{ 1b242843224f224e1b2842 },
690 + "\u260E\u260F",
691 + :intvector{ 4, 6 },
692 + :int{1}, :int{0}, "", ".", :bin{""}
693 + }
694 }
695 }
696
697 @@ -504,6 +753,14 @@
698 fromUnicode {
699 Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }
700 Cases {
701 + // Test bug 6071 (1:2 Unicode:charset SBCS mapping).
702 + {
703 + "*test1bmp",
704 + "e@t",
705 + :bin{ 05000709 },
706 + :intvector{ 0,1,2,2 },
707 + :int{1}, :int{0}, "", "?", ""
708 + }
709 // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
710 // using the Shift-JIS table for JIS X 0208 (ticket #5797)
711 {
712 @@ -1311,16 +1568,29 @@
713 // versions of ISO-2022-JP
714 {
715 "ISO-2022-JP",
716 - "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e 00\u4e01\uffe5]",
717 - "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\uf a2d\uffe6-\U0010ffff]",
718 + "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2015\u203e\u4e00\u4e0 1\uffe5]",
719 + "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u2014\u301c\u4e02\u4e27-\u4e 29\u4fe0\u663b\u9eb5\ufa0e-\ufa2d\uff61-\uff9f\uffe4\uffe6-\U0010ffff]",
720 :int{0}
721 - }
722 + }
723 {
724 "ISO-2022-JP-2",
725 - "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a 1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]",
726 - "[\x0e\x0f\x1b\uffe7-\U0010ffff]",
727 + "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0 390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uffe6]",
728 + "[\x0e\x0f\x1b\uff61-\uff9f\uffe4\uffe7-\U0010ffff]",
729 + :int{0}
730 + }
731 + {
732 + "JIS7",
733 + "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0 390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uff61-\uff9f\uffe6]",
734 + "[\x0e\x0f\x1b\uffe4\uffe7-\U0010ffff]",
735 :int{0}
736 }
737 + // with fallbacks
738 + {
739 + "ISO-2022-JP",
740 + "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2014\u2015\u203e\u301 c\u4e00\u4e01\u4fe0\u9eb5\uff61-\uff9f\uffe5]",
741 + "[\x0e\x0f\x1b\xa6\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\u663b \ufa0e-\ufa2d\uffe4\uffe6-\U0010ffff]",
742 + :int{1}
743 + }
744
745 // versions of ISO-2022-CN
746 {
747 @@ -1336,6 +1606,22 @@
748 :int{0}
749 }
750
751 + // HZ
752 + {
753 + "HZ",
754 + "[\u0410-\u044f\u4e00\u4e01\u4e03]",
755 + "[\u4e02\u4e04-\u4e06\uac00-\ud7ff]",
756 + :int{0}
757 + }
758 +
759 + // LMBCS
760 + {
761 + "LMBCS",
762 + "[\x00-\U0010ffff]",
763 + "[]",
764 + :int{0}
765 + }
766 +
767 // DBCS-only
768 {
769 "ibm-971",
770 --- r22777/source/common/ucnv_ext.h 2007-08-22 22:46:49.525855000 -0700
771 +++ chrome.canonical/source/common/ucnv_ext.h 2009-03-23 12:30:09.644121000 -0 700
772 @@ -382,10 +382,20 @@
773 UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
774 UErrorCode *pErrorCode);
775
776 +/*
777 + * Add code points and strings to the set according to the extension mappings.
778 + * Limitation on the UConverterSetFilter:
779 + * The filters currently assume that they are used with 1:1 mappings.
780 + * They only apply to single input code points, and then they pass through
781 + * only mappings with single-charset-code results.
782 + * For example, the Shift-JIS filter only works for 2-byte results and tests
783 + * that those 2 bytes are in the JIS X 0208 range of Shift-JIS.
784 + */
785 U_CFUNC void
786 ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
787 const USetAdder *sa,
788 UConverterUnicodeSet which,
789 + UConverterSetFilter filter,
790 UErrorCode *pErrorCode);
791
792 /* toUnicode helpers -------------------------------------------------------- * /
793 --- r22777/source/common/ucnvmbcs.c 2007-10-11 14:31:32.196532000 -0700
794 +++ chrome.canonical/source/common/ucnvmbcs.c 2009-03-23 12:42:01.150242000 -0 700
795 @@ -1,7 +1,7 @@
796 /*
797 ******************************************************************************
798 *
799 -* Copyright (C) 2000-2007, International Business Machines
800 +* Copyright (C) 2000-2008, International Business Machines
801 * Corporation and others. All Rights Reserved.
802 *
803 ******************************************************************************
804 @@ -485,9 +485,23 @@
805
806 if(mbcsTable->outputType==MBCS_OUTPUT_1) {
807 const uint16_t *stage2, *stage3, *results;
808 + uint16_t minValue;
809
810 results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
811
812 + /*
813 + * Set a threshold variable for selecting which mappings to use.
814 + * See ucnv_MBCSSingleFromBMPWithOffsets() and
815 + * MBCS_SINGLE_RESULT_FROM_U() for details.
816 + */
817 + if(which==UCNV_ROUNDTRIP_SET) {
818 + /* use only roundtrips */
819 + minValue=0xf00;
820 + } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
821 + /* use all roundtrip and fallback results */
822 + minValue=0x800;
823 + }
824 +
825 for(st1=0; st1<maxStage1; ++st1) {
826 st2=table[st1];
827 if(st2>maxStage1) {
828 @@ -497,15 +511,8 @@
829 /* read the stage 3 block */
830 stage3=results+st3;
831
832 - /*
833 - * Add code points for which the roundtrip flag is set.
834 - * Once we get a set for fallback mappings, we have to use
835 - * a threshold variable with a value of 0x800.
836 - * See ucnv_MBCSSingleFromBMPWithOffsets() and
837 - * MBCS_SINGLE_RESULT_FROM_U() for details.
838 - */
839 do {
840 - if(*stage3++>=0xf00) {
841 + if(*stage3++>=minValue) {
842 sa->add(sa->set, c);
843 }
844 } while((++c&0xf)!=0);
845 @@ -522,9 +529,12 @@
846 const uint8_t *stage3, *bytes;
847 uint32_t st3Multiplier;
848 uint32_t value;
849 + UBool useFallback;
850
851 bytes=mbcsTable->fromUnicodeBytes;
852
853 + useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
854 +
855 switch(mbcsTable->outputType) {
856 case MBCS_OUTPUT_3:
857 case MBCS_OUTPUT_4_EUC:
858 @@ -551,9 +561,8 @@
859 st3>>=16;
860
861 /*
862 - * Add code points for which the roundtrip flag is set.
863 - * Once we get a set for fallback mappings, we have to check
864 - * non-roundtrip stage 3 results for whether they are 0.
865 + * Add code points for which the roundtrip flag is set,
866 + * or which map to non-zero bytes if we use fallbacks.
867 * See ucnv_MBCSFromUnicodeWithOffsets() for details.
868 */
869 switch(filter) {
870 @@ -561,6 +570,23 @@
871 do {
872 if(st3&1) {
873 sa->add(sa->set, c);
874 + stage3+=st3Multiplier;
875 + } else if(useFallback) {
876 + uint8_t b=0;
877 + switch(st3Multiplier) {
878 + case 4:
879 + b|=*stage3++;
880 + case 3:
881 + b|=*stage3++;
882 + case 2:
883 + b|=stage3[0]|stage3[1];
884 + stage3+=2;
885 + default:
886 + break;
887 + }
888 + if(b!=0) {
889 + sa->add(sa->set, c);
890 + }
891 }
892 st3>>=1;
893 } while((++c&0xf)!=0);
894 @@ -568,7 +594,7 @@
895 case UCNV_SET_FILTER_DBCS_ONLY:
896 /* Ignore single-byte results (<0x100). */
897 do {
898 - if((st3&1)!=0 && *((const uint16_t *)stage3)>=0 x100) {
899 + if(((st3&1)!=0 || useFallback) && *((const uint 16_t *)stage3)>=0x100) {
900 sa->add(sa->set, c);
901 }
902 st3>>=1;
903 @@ -578,7 +604,7 @@
904 case UCNV_SET_FILTER_2022_CN:
905 /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
906 do {
907 - if((st3&1)!=0 && ((value=*stage3)==0x81 || valu e==0x82)) {
908 + if(((st3&1)!=0 || useFallback) && ((value=*stag e3)==0x81 || value==0x82)) {
909 sa->add(sa->set, c);
910 }
911 st3>>=1;
912 @@ -588,7 +614,33 @@
913 case UCNV_SET_FILTER_SJIS:
914 /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
915 do {
916 - if((st3&1)!=0 && (value=*((const uint16_t *)sta ge3))>=0x8140 && value<=0xeffc) {
917 + if(((st3&1)!=0 || useFallback) && (value=*((con st uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
918 + sa->add(sa->set, c);
919 + }
920 + st3>>=1;
921 + stage3+=2; /* +=st3Multiplier */
922 + } while((++c&0xf)!=0);
923 + break;
924 + case UCNV_SET_FILTER_GR94DBCS:
925 + /* Only add code points that map to ISO 2022 GR 94 DBCS codes (each byte A1..FE). */
926 + do {
927 + if( ((st3&1)!=0 || useFallback) &&
928 + (uint16_t)((value=*((const uint16_t *)stage 3)) - 0xa1a1)<=(0xfefe - 0xa1a1) &&
929 + (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
930 + ) {
931 + sa->add(sa->set, c);
932 + }
933 + st3>>=1;
934 + stage3+=2; /* +=st3Multiplier */
935 + } while((++c&0xf)!=0);
936 + break;
937 + case UCNV_SET_FILTER_HZ:
938 + /* Only add code points that are suitable for HZ DBCS (lead byte A1..FD). */
939 + do {
940 + if( ((st3&1)!=0 || useFallback) &&
941 + (uint16_t)((value=*((const uint16_t *)stage 3))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
942 + (uint8_t)(value-0xa1)<=(0xfe - 0xa1)
943 + ) {
944 sa->add(sa->set, c);
945 }
946 st3>>=1;
947 @@ -609,7 +661,7 @@
948 }
949 }
950
951 - ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode);
952 + ucnv_extGetUnicodeSet(sharedData, sa, which, filter, pErrorCode);
953 }
954
955 U_CFUNC void
956 @@ -1694,7 +1746,7 @@
957 cnv->toUBytes[0]=*(source-1);
958 cnv->toULength=_extToU(cnv, cnv->sharedData,
959 1, &source, sourceLimit,
960 - &target, target+targetCapacity,
961 + &target, pArgs->targetLimit,
962 &offsets, sourceIndex,
963 pArgs->flush,
964 pErrorCode);
965 @@ -1739,6 +1791,65 @@
966 pArgs->offsets=offsets;
967 }
968
969 +static UBool
970 +hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) {
971 + const int32_t *row=stateTable[state];
972 + int32_t b, entry;
973 + /* First test for final entries in this state for some commonly valid byte values. */
974 + entry=row[0xa1];
975 + if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
976 + MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
977 + ) {
978 + return TRUE;
979 + }
980 + entry=row[0x41];
981 + if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
982 + MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
983 + ) {
984 + return TRUE;
985 + }
986 + /* Then test for final entries in this state. */
987 + for(b=0; b<=0xff; ++b) {
988 + entry=row[b];
989 + if( !MBCS_ENTRY_IS_TRANSITION(entry) &&
990 + MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL
991 + ) {
992 + return TRUE;
993 + }
994 + }
995 + /* Then recurse for transition entries. */
996 + for(b=0; b<=0xff; ++b) {
997 + entry=row[b];
998 + if( MBCS_ENTRY_IS_TRANSITION(entry) &&
999 + hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE (entry))
1000 + ) {
1001 + return TRUE;
1002 + }
1003 + }
1004 + return FALSE;
1005 +}
1006 +
1007 +/*
1008 + * Is byte b a single/lead byte in this state?
1009 + * Recurse for transition states, because here we don't want to say that
1010 + * b is a lead byte if all byte sequences that start with b are illegal.
1011 + */
1012 +static UBool
1013 +isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnl y, uint8_t b) {
1014 + const int32_t *row=stateTable[state];
1015 + int32_t entry=row[b];
1016 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */
1017 + return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_ST ATE(entry));
1018 + } else {
1019 + uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
1020 + if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) {
1021 + return FALSE; /* SI/SO are illegal for DBCS-only conversion */
1022 + } else {
1023 + return action!=MBCS_STATE_ILLEGAL;
1024 + }
1025 + }
1026 +}
1027 +
1028 U_CFUNC void
1029 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1030 UErrorCode *pErrorCode) {
1031 @@ -2094,6 +2205,34 @@
1032 sourceIndex=nextSourceIndex;
1033 } else if(U_FAILURE(*pErrorCode)) {
1034 /* callback(illegal) */
1035 + if(byteIndex>1) {
1036 + /*
1037 + * Ticket 5691: consistent illegal sequences:
1038 + * - We include at least the first byte in the illegal sequence.
1039 + * - If any of the non-initial bytes could be the start of a character,
1040 + * we stop the illegal sequence before the first one of those.
1041 + */
1042 + UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0 );
1043 + int8_t i;
1044 + for(i=1;
1045 + i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnl y, bytes[i]);
1046 + ++i) {}
1047 + if(i<byteIndex) {
1048 + /* Back out some bytes. */
1049 + int8_t backOutDistance=byteIndex-i;
1050 + int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
1051 + byteIndex=i; /* length of reported illegal byte sequence * /
1052 + if(backOutDistance<=bytesFromThisBuffer) {
1053 + source-=backOutDistance;
1054 + } else {
1055 + /* Back out bytes from the previous buffer: Need to replay them. */
1056 + cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutD istance);
1057 + /* preToULength is negative! */
1058 + uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
1059 + source=(const uint8_t *)pArgs->source;
1060 + }
1061 + }
1062 + }
1063 break;
1064 } else /* unassigned sequences indicated with byteIndex>0 */ {
1065 /* try an extension mapping */
1066 @@ -2104,7 +2243,7 @@
1067 &offsets, sourceIndex,
1068 pArgs->flush,
1069 pErrorCode);
1070 - sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs ->source);
1071 + sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArg s->source);
1072
1073 if(U_FAILURE(*pErrorCode)) {
1074 /* not mappable or buffer overflow */
1075 @@ -2395,15 +2534,37 @@
1076
1077 if(c<0) {
1078 if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
1079 - *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1080 - }
1081 - if(U_FAILURE(*pErrorCode)) {
1082 /* incomplete character byte sequence */
1083 uint8_t *bytes=cnv->toUBytes;
1084 cnv->toULength=(int8_t)(source-lastSource);
1085 do {
1086 *bytes++=*lastSource++;
1087 } while(lastSource<source);
1088 + *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1089 + } else if(U_FAILURE(*pErrorCode)) {
1090 + /* callback(illegal) */
1091 + /*
1092 + * Ticket 5691: consistent illegal sequences:
1093 + * - We include at least the first byte in the illegal sequence.
1094 + * - If any of the non-initial bytes could be the start of a character,
1095 + * we stop the illegal sequence before the first one of those.
1096 + */
1097 + UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
1098 + uint8_t *bytes=cnv->toUBytes;
1099 + *bytes++=*lastSource++; /* first byte */
1100 + if(lastSource==source) {
1101 + cnv->toULength=1;
1102 + } else /* lastSource<source: multi-byte character */ {
1103 + int8_t i;
1104 + for(i=1;
1105 + lastSource<source && !isSingleOrLead(stateTable, state, isD BCSOnly, *lastSource);
1106 + ++i
1107 + ) {
1108 + *bytes++=*lastSource++;
1109 + }
1110 + cnv->toULength=i;
1111 + source=lastSource;
1112 + }
1113 } else {
1114 /* no output because of empty input or only state changes */
1115 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1116 @@ -3237,7 +3398,7 @@
1117 lastSource=source;
1118 c=_extFromU(cnv, cnv->sharedData,
1119 c, &source, sourceLimit,
1120 - &target, target+targetCapacity,
1121 + &target, (const uint8_t *)(pArgs->targetLimit),
1122 &offsets, sourceIndex,
1123 pArgs->flush,
1124 pErrorCode);
1125 --- r22777/source/common/ucnvmbcs.h 2007-10-11 14:31:32.196532000 -0700
1126 +++ chrome.canonical/source/common/ucnvmbcs.h 2009-03-23 12:30:17.315007000 -0 700
1127 @@ -492,6 +492,8 @@
1128 UCNV_SET_FILTER_DBCS_ONLY,
1129 UCNV_SET_FILTER_2022_CN,
1130 UCNV_SET_FILTER_SJIS,
1131 + UCNV_SET_FILTER_GR94DBCS,
1132 + UCNV_SET_FILTER_HZ,
1133 UCNV_SET_FILTER_COUNT
1134 } UConverterSetFilter;
1135
1136 --- r22777/source/common/ucnv.c 2007-08-31 12:39:14.294200000 -0700
1137 +++ chrome.canonical/source/common/ucnv.c 2009-03-23 12:40:10.566608000 -0 700
1138 @@ -1528,11 +1528,14 @@
1139 cnv->toULength=0;
1140
1141 /* call the callback function */
1142 + if(cnv->toUCallbackReason==UCNV_ILLEGAL && *err==U_INVALID_CHAR_FOU ND) {
1143 + cnv->toUCallbackReason = UCNV_UNASSIGNED;
1144 + }
1145 cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs,
1146 cnv->invalidCharBuffer, errorInputLength,
1147 - (*err==U_INVALID_CHAR_FOUND || *err==U_UNSUPPORTED_ESCAPE_SEQUE NCE) ?
1148 - UCNV_UNASSIGNED : UCNV_ILLEGAL,
1149 + cnv->toUCallbackReason,
1150 err);
1151 + cnv->toUCallbackReason = UCNV_ILLEGAL; /* reset to default value */
1152
1153 /*
1154 * loop back to the offset handling
1155 --- r22777/source/common/uset_imp.h 2007-07-24 19:51:25.692061000 -0700
1156 +++ chrome.canonical/source/common/uset_imp.h 2009-03-23 12:30:09.893067000 -0 700
1157 @@ -36,6 +36,9 @@
1158 typedef void U_CALLCONV
1159 USetRemove(USet *set, UChar32 c);
1160
1161 +typedef void U_CALLCONV
1162 +USetRemoveRange(USet *set, UChar32 start, UChar32 end);
1163 +
1164 /**
1165 * Interface for adding items to a USet, to keep low-level code from
1166 * statically depending on the USet implementation.
1167 @@ -47,6 +50,7 @@
1168 USetAddRange *addRange;
1169 USetAddString *addString;
1170 USetRemove *remove;
1171 + USetRemoveRange *removeRange;
1172 };
1173 typedef struct USetAdder USetAdder;
1174
1175 --- r22777/source/common/ucnv2022.c 2007-10-11 14:31:32.196532000 -0700
1176 +++ chrome.canonical/source/common/ucnv2022.c 2009-03-23 12:57:38.398368000 -0 700
1177 @@ -201,6 +201,7 @@
1178 #ifdef U_ENABLE_GENERIC_ISO_2022
1179 UBool isFirstBuffer;
1180 #endif
1181 + UBool isEmptySegment;
1182 char name[30];
1183 char locale[3];
1184 }UConverterDataISO2022;
1185 @@ -609,6 +610,7 @@
1186 if(choice<=UCNV_RESET_TO_UNICODE) {
1187 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
1188 myConverterData->key = 0;
1189 + myConverterData->isEmptySegment = FALSE;
1190 }
1191 if(choice!=UCNV_RESET_TO_UNICODE) {
1192 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
1193 @@ -752,6 +754,7 @@
5 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraIn fo); 1194 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraIn fo);
6 uint32_t key = myData2022->key; 1195 uint32_t key = myData2022->key;
7 int32_t offset = 0; 1196 int32_t offset = 0;
8 + int8_t initialToULength = _this->toULength; 1197 + int8_t initialToULength = _this->toULength;
9 char c; 1198 char c;
10 1199
11 value = VALID_NON_TERMINAL_2022; 1200 value = VALID_NON_TERMINAL_2022;
12 @@ -804,7 +805,6 @@ 1201 @@ -804,7 +807,6 @@
13 return; 1202 return;
14 } else if (value == INVALID_2022 ) { 1203 } else if (value == INVALID_2022 ) {
15 *err = U_ILLEGAL_ESCAPE_SEQUENCE; 1204 *err = U_ILLEGAL_ESCAPE_SEQUENCE;
16 - return; 1205 - return;
17 } else /* value == VALID_TERMINAL_2022 */ { 1206 } else /* value == VALID_TERMINAL_2022 */ {
18 switch(var){ 1207 switch(var){
19 #ifdef U_ENABLE_GENERIC_ISO_2022 1208 #ifdef U_ENABLE_GENERIC_ISO_2022
20 @@ -935,6 +935,35 @@ 1209 @@ -814,6 +816,7 @@
1210 if(chosenConverterName == NULL) {
1211 /* SS2 or SS3 */
1212 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
1213 + _this->toUCallbackReason = UCNV_UNASSIGNED;
1214 return;
1215 }
1216
1217 @@ -935,6 +938,37 @@
21 } 1218 }
22 if(U_SUCCESS(*err)) { 1219 if(U_SUCCESS(*err)) {
23 _this->toULength = 0; 1220 _this->toULength = 0;
24 + } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { 1221 + } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
25 + if(_this->toULength>1) { 1222 + if(_this->toULength>1) {
26 + /* 1223 + /*
27 + * Ticket 5691: consistent illegal sequences: 1224 + * Ticket 5691: consistent illegal sequences:
28 + * - We include at least the first byte (ESC) in the illegal sequen ce. 1225 + * - We include at least the first byte (ESC) in the illegal sequen ce.
29 + * - If any of the non-initial bytes could be the start of a charac ter, 1226 + * - If any of the non-initial bytes could be the start of a charac ter,
30 + * we stop the illegal sequence before the first one of those. 1227 + * we stop the illegal sequence before the first one of those.
(...skipping 12 matching lines...) Expand all
43 + } else { 1240 + } else {
44 + /* Back out bytes from the previous buffer: Need to replay them . */ 1241 + /* Back out bytes from the previous buffer: Need to replay them . */
45 + _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistanc e); 1242 + _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistanc e);
46 + /* same as -(initialToULength-1) */ 1243 + /* same as -(initialToULength-1) */
47 + /* preToULength is negative! */ 1244 + /* preToULength is negative! */
48 + uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULen gth); 1245 + uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULen gth);
49 + *source-=bytesFromThisBuffer; 1246 + *source-=bytesFromThisBuffer;
50 + } 1247 + }
51 + _this->toULength=1; 1248 + _this->toULength=1;
52 + } 1249 + }
1250 + } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
1251 + _this->toUCallbackReason = UCNV_UNASSIGNED;
53 } 1252 }
54 } 1253 }
55 1254
56 @@ -1097,6 +1126,24 @@ 1255 @@ -1113,6 +1147,24 @@
1256 }
57 } 1257 }
58 1258
59 /* 1259 +#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
60 + * * Check that the result is a 2-byte value with each byte in the range A1..F E 1260 +/*
61 + * * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byt e 1261 + * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code poi nt, it returns the
62 + * * to move it to the ISO 2022 range 21..7E. 1262 + * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
63 + * * Return 0 if out of range. 1263 + * unchanged.
64 + * */ 1264 + */
65 +static U_INLINE uint32_t 1265 +static U_INLINE uint32_t
66 +_2022FromGR94DBCS(uint32_t value) { 1266 +_2022ToGR94DBCS(uint32_t value) {
67 + if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && 1267 + uint32_t returnValue = value + 0x8080;
68 + (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) 1268 + if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
69 + ) { 1269 + (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
70 + return value - 0x8080; /* shift down to 21..7e byte range */ 1270 + return returnValue;
71 + } else { 1271 + } else {
72 + return 0; /* not valid for ISO 2022 */ 1272 + return value;
73 + } 1273 + }
74 +} 1274 +}
1275 +#endif
75 + 1276 +
76 +#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
77 +/*
78 * Check that the result is a 2-byte value with each byte in the range A1..FE
79 * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
80 * to move it to the ISO 2022 range 21..7E.
81 @@ -1112,6 +1159,7 @@
82 return 0; /* not valid for ISO 2022 */
83 }
84 }
85 +#endif
86
87 #ifdef U_ENABLE_GENERIC_ISO_2022 1277 #ifdef U_ENABLE_GENERIC_ISO_2022
88 1278
89 @@ -1953,6 +2001,7 @@ 1279 /****************************************************************************** ****
1280 @@ -1436,7 +1488,7 @@
1281 c2 = 0; /* invalid */
1282 }
1283 } else {
1284 - if((uint8_t)(c2-0x21) <= (0x7e-0x21)) {
1285 + if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
1286 c2 += 0x7e;
1287 } else {
1288 c2 = 0; /* invalid */
1289 @@ -1953,6 +2005,7 @@
90 const char *mySourceLimit = args->sourceLimit; 1290 const char *mySourceLimit = args->sourceLimit;
91 uint32_t targetUniChar = 0x0000; 1291 uint32_t targetUniChar = 0x0000;
92 uint32_t mySourceChar = 0x0000; 1292 uint32_t mySourceChar = 0x0000;
93 + uint32_t tmpSourceChar = 0x0000; 1293 + uint32_t tmpSourceChar = 0x0000;
94 UConverterDataISO2022* myData; 1294 UConverterDataISO2022* myData;
95 ISO2022State *pToU2022State; 1295 ISO2022State *pToU2022State;
96 StateEnum cs; 1296 StateEnum cs;
97 @@ -1968,6 +2017,7 @@ 1297 @@ -1968,6 +2021,7 @@
98 mySourceChar = args->converter->toUBytes[0]; 1298 mySourceChar = args->converter->toUBytes[0];
99 args->converter->toULength = 0; 1299 args->converter->toULength = 0;
100 cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; 1300 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
101 + targetUniChar = missingCharMarker; 1301 + targetUniChar = missingCharMarker;
102 goto getTrailByte; 1302 goto getTrailByte;
103 } 1303 }
104 1304
105 @@ -2077,17 +2127,44 @@ 1305 @@ -1986,6 +2040,7 @@
1306 continue;
1307 } else {
1308 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
1309 + myData->isEmptySegment = FALSE;» /* reset this, we have a different error */
1310 break;
1311 }
1312
1313 @@ -1997,21 +2052,39 @@
1314 continue;
1315 } else {
1316 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
1317 + myData->isEmptySegment = FALSE;» /* reset this, we have a different error */
1318 break;
1319 }
1320
1321 case ESC_2022:
1322 mySource--;
1323 escape:
1324 - changeState_2022(args->converter,&(mySource),
1325 - mySourceLimit, ISO_2022_JP,err);
1326 + {
1327 + const char * mySourceBefore = mySource;
1328 + int8_t toULengthBefore = args->converter->toULength;
1329 +
1330 + changeState_2022(args->converter,&(mySource),
1331 + mySourceLimit, ISO_2022_JP,err);
1332 +
1333 + /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
1334 + if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
1335 + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1336 + args->converter->toUCallbackReason = UCNV_IRREGULAR;
1337 + args->converter->toULength = toULengthBefore + (mySourc e - mySourceBefore);
1338 + }
1339 + }
1340
1341 /* invalid or illegal escape sequence */
1342 if(U_FAILURE(*err)){
1343 args->target = myTarget;
1344 args->source = mySource;
1345 + myData->isEmptySegment = FALSE;» /* Reset to avoid future spurious errors */
1346 return;
1347 }
1348 + /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
1349 + if(myData->key==0) {
1350 + myData->isEmptySegment = TRUE;
1351 + }
1352 continue;
1353
1354 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
1355 @@ -2028,6 +2101,7 @@
1356 /* falls through */
1357 default:
1358 /* convert one or two bytes */
1359 + myData->isEmptySegment = FALSE;
1360 cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
1361 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData-> version==4 &&
1362 !IS_JP_DBCS(cs)
1363 @@ -2077,17 +2151,44 @@
106 default: 1364 default:
107 /* G0 DBCS */ 1365 /* G0 DBCS */
108 if(mySource < mySourceLimit) { 1366 if(mySource < mySourceLimit) {
109 - char trailByte; 1367 - char trailByte;
110 + int leadIsOk, trailIsOk; 1368 + int leadIsOk, trailIsOk;
111 + uint8_t trailByte; 1369 + uint8_t trailByte;
112 getTrailByte: 1370 getTrailByte:
113 - trailByte = *mySource++; 1371 - trailByte = *mySource++;
114 - if(cs == JISX208) { 1372 - if(cs == JISX208) {
115 - _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailBy te, tempBuf); 1373 - _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailBy te, tempBuf);
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
149 + /* report a pair of illegal bytes if the second byt e is not a DBCS starter */ 1407 + /* report a pair of illegal bytes if the second byt e is not a DBCS starter */
150 + ++mySource; 1408 + ++mySource;
151 + /* add another bit so that the code below writes 2 bytes in case of error */ 1409 + /* add another bit so that the code below writes 2 bytes in case of error */
152 + mySourceChar = 0x10000 | (mySourceChar << 8) | trai lByte; 1410 + mySourceChar = 0x10000 | (mySourceChar << 8) | trai lByte;
153 } 1411 }
154 - mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByt e); 1412 - mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByt e);
155 - targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myC onverterArray[cs], tempBuf, 2, FALSE); 1413 - targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myC onverterArray[cs], tempBuf, 2, FALSE);
156 } else { 1414 } else {
157 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 1415 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
158 args->converter->toULength = 1; 1416 args->converter->toULength = 1;
159 @@ -2229,7 +2306,12 @@ 1417 @@ -2229,7 +2330,12 @@
160 } 1418 }
161 /* only DBCS or SBCS characters are expected*/ 1419 /* only DBCS or SBCS characters are expected*/
162 /* DB characters with high bit set to 1 are expected */ 1420 /* DB characters with high bit set to 1 are expected */
163 - if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080) && length==2)){ 1421 - if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080) && length==2)){
164 + if( length > 2 || length==0 || 1422 + if( length > 2 || length==0 ||
165 + (length == 1 && targetByteUnit > 0x7f) || 1423 + (length == 1 && targetByteUnit > 0x7f) ||
166 + (length == 2 && 1424 + (length == 2 &&
167 + ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || 1425 + ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
168 + (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) 1426 + (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
169 + ) { 1427 + ) {
170 targetByteUnit=missingCharMarker; 1428 targetByteUnit=missingCharMarker;
171 } 1429 }
172 if (targetByteUnit != missingCharMarker){ 1430 if (targetByteUnit != missingCharMarker){
173 @@ -2545,17 +2627,34 @@ 1431 @@ -2524,15 +2630,27 @@
174 1432
1433 if(mySourceChar==UCNV_SI){
1434 myData->toU2022State.g = 0;
1435 + if (myData->isEmptySegment) {
1436 + myData->isEmptySegment = FALSE; /* we are handling it, r eset to avoid future spurious errors */
1437 + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1438 + args->converter->toUCallbackReason = UCNV_IRREGULAR;
1439 + args->converter->toUBytes[0] = (uint8_t)mySourceChar;
1440 + args->converter->toULength = 1;
1441 + args->target = myTarget;
1442 + args->source = mySource;
1443 + return;
1444 + }
1445 /*consume the source */
1446 continue;
1447 }else if(mySourceChar==UCNV_SO){
1448 myData->toU2022State.g = 1;
1449 + myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
1450 /*consume the source */
1451 continue;
1452 }else if(mySourceChar==ESC_2022){
1453 mySource--;
1454 escape:
1455 + myData->isEmptySegment = FALSE; /* Any invalid ESC seque nces will be detected separately, so just reset this */
1456 changeState_2022(args->converter,&(mySource),
1457 mySourceLimit, ISO_2022_KR, err);
1458 if(U_FAILURE(*err)){
1459 @@ -2543,19 +2661,37 @@
1460 continue;
1461 }
1462
1463 + myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
175 if(myData->toU2022State.g == 1) { 1464 if(myData->toU2022State.g == 1) {
176 if(mySource < mySourceLimit) { 1465 if(mySource < mySourceLimit) {
177 - char trailByte; 1466 - char trailByte;
178 + int leadIsOk, trailIsOk; 1467 + int leadIsOk, trailIsOk;
179 + uint8_t trailByte; 1468 + uint8_t trailByte;
180 getTrailByte: 1469 getTrailByte:
181 - trailByte = *mySource++; 1470 - trailByte = *mySource++;
182 - tempBuf[0] = (char)(mySourceChar + 0x80); 1471 - tempBuf[0] = (char)(mySourceChar + 0x80);
183 - tempBuf[1] = (char)(trailByte + 0x80); 1472 - tempBuf[1] = (char)(trailByte + 0x80);
184 - mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); 1473 - mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
(...skipping 22 matching lines...) Expand all
207 - targetUniChar = missingCharMarker; 1496 - targetUniChar = missingCharMarker;
208 + mySourceChar = (mySourceChar << 8) | trailByte; 1497 + mySourceChar = (mySourceChar << 8) | trailByte;
209 + } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { 1498 + } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
210 + /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 1499 + /* report a pair of illegal bytes if the second byte is not a DBCS starter */
211 + ++mySource; 1500 + ++mySource;
212 + /* add another bit so that the code below writes 2 byte s in case of error */ 1501 + /* add another bit so that the code below writes 2 byte s in case of error */
213 + mySourceChar = 0x10000 | (mySourceChar << 8) | trailByt e; 1502 + mySourceChar = 0x10000 | (mySourceChar << 8) | trailByt e;
214 } 1503 }
215 } else { 1504 } else {
216 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 1505 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
217 @@ -2563,8 +2662,10 @@ 1506 @@ -2563,8 +2699,10 @@
218 break; 1507 break;
219 } 1508 }
220 } 1509 }
221 - else{ 1510 - else{
222 + else if(mySourceChar <= 0x7f) { 1511 + else if(mySourceChar <= 0x7f) {
223 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySourc e - 1, 1, useFallback); 1512 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySourc e - 1, 1, useFallback);
224 + } else { 1513 + } else {
225 + targetUniChar = 0xffff; 1514 + targetUniChar = 0xffff;
226 } 1515 }
227 if(targetUniChar < 0xfffe){ 1516 if(targetUniChar < 0xfffe){
228 if(args->offsets) { 1517 if(args->offsets) {
229 @@ -3061,6 +3162,7 @@ 1518 @@ -3061,6 +3199,7 @@
230 /* continue with a partial double-byte character */ 1519 /* continue with a partial double-byte character */
231 mySourceChar = args->converter->toUBytes[0]; 1520 mySourceChar = args->converter->toUBytes[0];
232 args->converter->toULength = 0; 1521 args->converter->toULength = 0;
233 + targetUniChar = missingCharMarker; 1522 + targetUniChar = missingCharMarker;
234 goto getTrailByte; 1523 goto getTrailByte;
235 } 1524 }
236 1525
237 @@ -3114,29 +3216,50 @@ 1526 @@ -3075,27 +3214,52 @@
1527 switch(mySourceChar){
1528 case UCNV_SI:
1529 pToU2022State->g=0;
1530 + if (myData->isEmptySegment) {
1531 + myData->isEmptySegment = FALSE;» /* we are handling it, r eset to avoid future spurious errors */
1532 + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1533 + args->converter->toUCallbackReason = UCNV_IRREGULAR;
1534 + args->converter->toUBytes[0] = mySourceChar;
1535 + args->converter->toULength = 1;
1536 + args->target = myTarget;
1537 + args->source = mySource;
1538 + return;
1539 + }
1540 continue;
1541
1542 case UCNV_SO:
1543 if(pToU2022State->cs[1] != 0) {
1544 pToU2022State->g=1;
1545 + myData->isEmptySegment = TRUE;» /* Begin a new segment, empty so far */
1546 continue;
1547 } else {
1548 /* illegal to have SO before a matching designator */
1549 + myData->isEmptySegment = FALSE;» /* Handling a different error, reset this to avoid future spurious errs */
1550 break;
1551 }
1552
1553 case ESC_2022:
1554 mySource--;
1555 escape:
1556 - changeState_2022(args->converter,&(mySource),
1557 - mySourceLimit, ISO_2022_CN,err);
1558 + {
1559 + const char * mySourceBefore = mySource;
1560 + int8_t toULengthBefore = args->converter->toULength;
1561 +
1562 + changeState_2022(args->converter,&(mySource),
1563 + mySourceLimit, ISO_2022_CN,err);
1564 +
1565 + /* After SO there must be at least one character before a d esignator (designator error handled separately) */
1566 + if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegm ent) {
1567 + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1568 + args->converter->toUCallbackReason = UCNV_IRREGULAR;
1569 + args->converter->toULength = toULengthBefore + (mySourc e - mySourceBefore);
1570 + }
1571 + }
1572
1573 /* invalid or illegal escape sequence */
1574 if(U_FAILURE(*err)){
1575 args->target = myTarget;
1576 args->source = mySource;
1577 + myData->isEmptySegment = FALSE;» /* Reset to avoid future spurious errors */
1578 return;
1579 }
1580 continue;
1581 @@ -3109,34 +3273,56 @@
1582 /* falls through */
1583 default:
1584 /* convert one or two bytes */
1585 + myData->isEmptySegment = FALSE;
1586 if(pToU2022State->g != 0) {
1587 if(mySource < mySourceLimit) {
238 UConverterSharedData *cnv; 1588 UConverterSharedData *cnv;
239 StateEnum tempState; 1589 StateEnum tempState;
240 int32_t tempBufLen; 1590 int32_t tempBufLen;
241 - char trailByte; 1591 - char trailByte;
242 + int leadIsOk, trailIsOk; 1592 + int leadIsOk, trailIsOk;
243 + uint8_t trailByte; 1593 + uint8_t trailByte;
244 getTrailByte: 1594 getTrailByte:
245 - trailByte = *mySource++; 1595 - trailByte = *mySource++;
246 - tempState = (StateEnum)pToU2022State->cs[pToU2022State- >g]; 1596 - tempState = (StateEnum)pToU2022State->cs[pToU2022State- >g];
247 - if(tempState > CNS_11643_0) { 1597 - if(tempState > CNS_11643_0) {
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after
295 } 1645 }
296 - mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByt e); 1646 - mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByt e);
297 if(pToU2022State->g>=2) { 1647 if(pToU2022State->g>=2) {
298 /* return from a single-shift state to the previous one */ 1648 /* return from a single-shift state to the previous one */
299 pToU2022State->g=pToU2022State->prevG; 1649 pToU2022State->g=pToU2022State->prevG;
300 } 1650 }
301 - targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBu f, tempBufLen, FALSE); 1651 - targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBu f, tempBufLen, FALSE);
302 } else { 1652 } else {
303 args->converter->toUBytes[0] = (uint8_t)mySourceChar; 1653 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
304 args->converter->toULength = 1; 1654 args->converter->toULength = 1;
305 diff -ru trie.clean/source/common/ucnvmbcs.c chrome.canonical/source/common/ucnv mbcs.c 1655 @@ -3399,11 +3585,19 @@
306 --- trie.clean/source/common/ucnvmbcs.c 2007-11-07 17:39:05.057870000 -0800 1656 /* include ASCII for JP */
307 +++ chrome.canonical/source/common/ucnvmbcs.c 2008-10-29 11:34:34.648518000 -0 700 1657 sa->addRange(sa->set, 0, 0x7f);
1658 }
1659 - if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
1660 + if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_ AND_FALLBACK_SET) {
1661 /*
1662 - * TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks,
1663 - * we need to include half-width Katakana for all JP variants because
1664 - * JIS X 0208 has hardcoded fallbacks for them.
1665 + * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))! =0
1666 + * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
1667 + * use half-width Katakana.
1668 + * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
1669 + * half-width Katakana via the ESC ( I sequence.
1670 + * However, we only emit (fromUnicode) half-width Katakana according to the
1671 + * definition of each variant.
1672 + *
1673 + * When including fallbacks,
1674 + * we need to include half-width Katakana Unicode code points for all JP variants because
1675 + * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
1676 */
1677 /* include half-width Katakana for JP */
1678 sa->addRange(sa->set, HWKANA_START, HWKANA_END);
1679 @@ -3457,6 +3651,12 @@
1680 * corresponding to JIS X 0208.
1681 */
1682 filter=UCNV_SET_FILTER_SJIS;
1683 + } else if(i==KSC5601) {
1684 + /*
1685 + * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
1686 + * are broader than GR94.
1687 + */
1688 + filter=UCNV_SET_FILTER_GR94DBCS;
1689 } else {
1690 filter=UCNV_SET_FILTER_NONE;
1691 }
1692 @@ -3472,6 +3672,9 @@
1693 sa->remove(sa->set, 0x0e);
1694 sa->remove(sa->set, 0x0f);
1695 sa->remove(sa->set, 0x1b);
1696 +
1697 + /* ISO 2022 converters do not convert C1 controls either */
1698 + sa->removeRange(sa->set, 0x80, 0x9f);
1699 }
1700
1701 static const UConverterImpl _ISO2022Impl={
1702 --- r22777/source/common/ucnv_lmb.c 2006-08-19 14:27:08.000000000 -0700
1703 +++ chrome.canonical/source/common/ucnv_lmb.c 2009-03-23 12:30:26.043293000 -0 700
1704 @@ -1,6 +1,6 @@
1705 /*
1706 **********************************************************************
1707 -* Copyright (C) 2000-2006, International Business Machines
1708 +* Copyright (C) 2000-2007, International Business Machines
1709 * Corporation and others. All Rights Reserved.
1710 **********************************************************************
1711 * file name: ucnv_lmb.cpp
1712 @@ -536,7 +536,7 @@
1713 NULL,\
1714 NULL,\
1715 _LMBCSSafeClone,\
1716 - _LMBCSGetUnicodeSet\
1717 + ucnv_getCompleteUnicodeSet\
1718 };\
1719 static const UConverterStaticData _LMBCSStaticData##n={\
1720 sizeof(UConverterStaticData),\
1721 @@ -662,15 +662,14 @@
1722 return &newLMBCS->cnv;
1723 }
1724
1725 -static void
1726 -_LMBCSGetUnicodeSet(const UConverter *cnv,
1727 - const USetAdder *sa,
1728 - UConverterUnicodeSet which,
1729 - UErrorCode *pErrorCode) {
1730 - /* all but U+F6xx, see LMBCS explanation above (search for F6xx) */
1731 - sa->addRange(sa->set, 0, 0xf5ff);
1732 - sa->addRange(sa->set, 0xf700, 0x10ffff);
1733 -}
1734 +/*
1735 + * There used to be a _LMBCSGetUnicodeSet() function here (up to svn revision 20117)
1736 + * which added all code points except for U+F6xx
1737 + * because those cannot be represented in the Unicode group.
1738 + * However, it turns out that windows-950 has roundtrips for all of U+F6xx
1739 + * which means that LMBCS can convert all Unicode code points after all.
1740 + * We now simply use ucnv_getCompleteUnicodeSet().
1741 + */
1742
1743 /*
1744 Here's the basic helper function that we use when converting from
1745 --- r22777/source/common/ucnvhz.c 2006-07-05 16:08:50.000000000 -0700
1746 +++ chrome.canonical/source/common/ucnvhz.c 2009-03-23 12:42:01.208181000 -0 700
1747 @@ -1,6 +1,6 @@
1748 /*
1749 **********************************************************************
1750 -* Copyright (C) 2000-2006, International Business Machines
1751 +* Copyright (C) 2000-2007, International Business Machines
1752 * Corporation and others. All Rights Reserved.
1753 **********************************************************************
1754 * file name: ucnvhz.c
1755 @@ -59,6 +59,7 @@
1756 UBool isEscapeAppended;
1757 UBool isStateDBCS;
1758 UBool isTargetUCharDBCS;
1759 + UBool isEmptySegment;
1760 }UConverterDataHZ;
1761
1762
1763 @@ -72,7 +73,7 @@
1764 cnv->extraInfo = uprv_malloc(sizeof(UConverterDataHZ));
1765 if(cnv->extraInfo != NULL){
1766 uprv_memset(cnv->extraInfo, 0, sizeof(UConverterDataHZ));
1767 - ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386" ,errorCode);
1768 + ((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("GBK",erro rCode);
1769 }
1770 else {
1771 *errorCode = U_MEMORY_ALLOCATION_ERROR;
1772 @@ -98,6 +99,7 @@
1773 cnv->mode=0;
1774 if(cnv->extraInfo != NULL){
1775 ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
1776 + ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE;
1777 }
1778 }
1779 if(choice!=UCNV_RESET_TO_UNICODE) {
1780 @@ -130,6 +132,10 @@
1781 * from-GB code '~}' ($7E7D) is outside the defined GB range.)
1782 *
1783 * Source: RFC 1842
1784 +*
1785 +* Note that the formal syntax in RFC 1842 is invalid. I assume that the
1786 +* intended definition of single-byte-segment is as follows (pedberg):
1787 +* single-byte-segment = single-byte-seq 1*single-byte-char
1788 */
1789
1790
1791 @@ -141,7 +147,7 @@
1792 UChar *myTarget = args->target;
1793 const char *mySourceLimit = args->sourceLimit;
1794 UChar32 targetUniChar = 0x0000;
1795 - UChar mySourceChar = 0x0000;
1796 + int32_t mySourceChar = 0x0000;
1797 UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
1798 tempBuf[0]=0;
1799 tempBuf[1]=0;
1800 @@ -156,90 +162,123 @@
1801
1802 mySourceChar= (unsigned char) *mySource++;
1803
1804 - switch(mySourceChar){
1805 + if(args->converter->mode == UCNV_TILDE) {
1806 + /* second byte after ~ */
1807 + args->converter->mode=0;
1808 + switch(mySourceChar) {
1809 case 0x0A:
1810 - if(args->converter->mode ==UCNV_TILDE){
1811 - args->converter->mode=0;
1812 -
1813 - }
1814 - *(myTarget++)=(UChar)mySourceChar;
1815 + /* no output for ~\n (line-continuation marker) */
1816 continue;
1817 -
1818 case UCNV_TILDE:
1819 - if(args->converter->mode ==UCNV_TILDE){
1820 - *(myTarget++)=(UChar)mySourceChar;
1821 - args->converter->mode=0;
1822 - continue;
1823 -
1824 + if(args->offsets) {
1825 + args->offsets[myTarget - args->target]=(int32_t)(mySour ce - args->source - 2);
1826 }
1827 - else if(args->converter->toUnicodeStatus !=0){
1828 - args->converter->mode=0;
1829 - break;
1830 - }
1831 - else{
1832 - args->converter->mode = UCNV_TILDE;
1833 - continue;
1834 - }
1835 -
1836 -
1837 + *(myTarget++)=(UChar)mySourceChar;
1838 + myData->isEmptySegment = FALSE;
1839 + continue;
1840 case UCNV_OPEN_BRACE:
1841 - if(args->converter->mode == UCNV_TILDE){
1842 - args->converter->mode=0;
1843 - myData->isStateDBCS = TRUE;
1844 - continue;
1845 - }
1846 - else{
1847 - break;
1848 - }
1849 -
1850 -
1851 case UCNV_CLOSE_BRACE:
1852 - if(args->converter->mode == UCNV_TILDE){
1853 - args->converter->mode=0;
1854 - myData->isStateDBCS = FALSE;
1855 - continue;
1856 - }
1857 - else{
1858 - break;
1859 + myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE);
1860 + if (myData->isEmptySegment) {
1861 + myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
1862 + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1863 + args->converter->toUCallbackReason = UCNV_IRREGULAR;
1864 + args->converter->toUBytes[0] = UCNV_TILDE;
1865 + args->converter->toUBytes[1] = mySourceChar;
1866 + args->converter->toULength = 2;
1867 + args->target = myTarget;
1868 + args->source = mySource;
1869 + return;
1870 }
1871 -
1872 + myData->isEmptySegment = TRUE;
1873 + continue;
1874 default:
1875 /* if the first byte is equal to TILDE and the trail byte
1876 * is not a valid byte then it is an error condition
1877 */
1878 - if(args->converter->mode == UCNV_TILDE){
1879 - args->converter->mode=0;
1880 - mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) | ((mySo urceChar & 0x00ff)+0x80));
1881 - goto SAVE_STATE;
1882 - }
1883 -
1884 - break;
1885 -
1886 - }
1887 -
1888 - if(myData->isStateDBCS){
1889 + /*
1890 + * Ticket 5691: consistent illegal sequences:
1891 + * - We include at least the first byte in the illegal sequence.
1892 + * - If any of the non-initial bytes could be the start of a character,
1893 + * we stop the illegal sequence before the first one of those.
1894 + */
1895 + myData->isEmptySegment = FALSE; /* different error here, re set this to avoid spurious future error */
1896 + *err = U_ILLEGAL_ESCAPE_SEQUENCE;
1897 + args->converter->toUBytes[0] = UCNV_TILDE;
1898 + if( myData->isStateDBCS ?
1899 + (0x21 <= mySourceChar && mySourceChar <= 0x7e) :
1900 + mySourceChar <= 0x7f
1901 + ) {
1902 + /* The current byte could be the start of a character: Back it out. */
1903 + args->converter->toULength = 1;
1904 + --mySource;
1905 + } else {
1906 + /* Include the current byte in the illegal sequence. */
1907 + args->converter->toUBytes[1] = mySourceChar;
1908 + args->converter->toULength = 2;
1909 + }
1910 + args->target = myTarget;
1911 + args->source = mySource;
1912 + return;
1913 + }
1914 + } else if(myData->isStateDBCS) {
1915 if(args->converter->toUnicodeStatus == 0x00){
1916 - args->converter->toUnicodeStatus = (UChar) mySourceChar;
1917 + /* lead byte */
1918 + if(mySourceChar == UCNV_TILDE) {
1919 + args->converter->mode = UCNV_TILDE;
1920 + } else {
1921 + /* add another bit to distinguish a 0 byte from not having seen a lead byte */
1922 + args->converter->toUnicodeStatus = (uint32_t) (mySource Char | 0x100);
1923 + myData->isEmptySegment = FALSE; /* the segment has some thing, either valid or will produce a different error, so reset this */
1924 + }
1925 continue;
1926 }
1927 else{
1928 - tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ;
1929 - tempBuf[1] = (char) (mySourceChar+0x80);
1930 - mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x 80) << 8) | ((mySourceChar & 0x00ff)+0x80));
1931 + /* trail byte */
1932 + int leadIsOk, trailIsOk;
1933 + uint32_t leadByte = args->converter->toUnicodeStatus & 0xff ;
1934 + targetUniChar = 0xffff;
1935 + /*
1936 + * Ticket 5691: consistent illegal sequences:
1937 + * - We include at least the first byte in the illegal sequence.
1938 + * - If any of the non-initial bytes could be the start of a character,
1939 + * we stop the illegal sequence before the first one of those.
1940 + *
1941 + * In HZ DBCS, if the second byte is in the 21..7e range,
1942 + * we report only the first byte as the illegal sequence.
1943 + * Otherwise we convert or report the pair of bytes.
1944 + */
1945 + leadIsOk = (uint8_t)(leadByte - 0x21) <= (0x7d - 0x21);
1946 + trailIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21) ;
1947 + if (leadIsOk && trailIsOk) {
1948 + tempBuf[0] = (char) (leadByte+0x80) ;
1949 + tempBuf[1] = (char) (mySourceChar+0x80);
1950 + targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbC onverter->sharedData,
1951 + tempBuf, 2, args->converter->useFallback);
1952 + mySourceChar= (leadByte << 8) | mySourceChar;
1953 + } else if (trailIsOk) {
1954 + /* report a single illegal byte and continue with the following DBCS starter byte */
1955 + --mySource;
1956 + mySourceChar = (int32_t)leadByte;
1957 + } else {
1958 + /* report a pair of illegal bytes if the second byte is not a DBCS starter */
1959 + /* add another bit so that the code below writes 2 bytes in case of error */
1960 + mySourceChar= 0x10000 | (leadByte << 8) | mySourceChar;
1961 + }
1962 args->converter->toUnicodeStatus =0x00;
1963 - targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConve rter->sharedData,
1964 - tempBuf, 2, args->converter->useFallback);
1965 }
1966 }
1967 else{
1968 - if(args->converter->fromUnicodeStatus == 0x00){
1969 - targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConve rter->sharedData,
1970 - mySource - 1, 1, args->converter->useFallback);
1971 - }
1972 - else{
1973 - goto SAVE_STATE;
1974 + if(mySourceChar == UCNV_TILDE) {
1975 + args->converter->mode = UCNV_TILDE;
1976 + continue;
1977 + } else if(mySourceChar <= 0x7f) {
1978 + targetUniChar = (UChar)mySourceChar; /* ASCII */
1979 + myData->isEmptySegment = FALSE; /* the segment has somethin g valid */
1980 + } else {
1981 + targetUniChar = 0xffff;
1982 + myData->isEmptySegment = FALSE; /* different error here, re set this to avoid spurious future error */
1983 }
1984 -
1985 }
1986 if(targetUniChar < 0xfffe){
1987 if(args->offsets) {
1988 @@ -248,26 +287,17 @@
1989
1990 *(myTarget++)=(UChar)targetUniChar;
1991 }
1992 - else if(targetUniChar>=0xfffe){
1993 -SAVE_STATE:
1994 + else /* targetUniChar>=0xfffe */ {
1995 if(targetUniChar == 0xfffe){
1996 *err = U_INVALID_CHAR_FOUND;
1997 }
1998 else{
1999 *err = U_ILLEGAL_CHAR_FOUND;
2000 }
2001 - if(myData->isStateDBCS){
2002 - /* this should never occur since isStateDBCS is set to true
2003 - * only after tempBuf[0] and tempBuf[1]
2004 - * are set to the input .. just to please BEAM
2005 - */
2006 - if(tempBuf[0]==0 || tempBuf[1]==0){
2007 - *err = U_INTERNAL_PROGRAM_ERROR;
2008 - }else{
2009 - args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x8 0);
2010 - args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x8 0);
2011 - args->converter->toULength=2;
2012 - }
2013 + if(mySourceChar > 0xff){
2014 + args->converter->toUBytes[0] = (uint8_t)(mySourceChar >> 8) ;
2015 + args->converter->toUBytes[1] = (uint8_t)mySourceChar;
2016 + args->converter->toULength=2;
2017 }
2018 else{
2019 args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2020 @@ -328,16 +358,21 @@
2021 escSeq = TILDE_ESCAPE;
2022 CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,e rr,len,mySourceIndex);
2023 continue;
2024 - }
2025 - else{
2026 + } else if(mySourceChar <= 0x7f) {
2027 + length = 1;
2028 + targetUniChar = mySourceChar;
2029 + } else {
2030 length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->shar edData,
2031 mySourceChar,&targetUniChar,args->converter->useFallback);
2032 -
2033 - }
2034 - /* only DBCS or SBCS characters are expected*/
2035 - /* DB characters with high bit set to 1 are expected */
2036 - if(length > 2 || length==0 ||(((targetUniChar & 0x8080) != 0x8080)& & length==2)){
2037 - targetUniChar= missingCharMarker;
2038 + /* we can only use lead bytes 21..7D and trail bytes 21..7E */
2039 + if( length == 2 &&
2040 + (uint16_t)(targetUniChar - 0xa1a1) <= (0xfdfe - 0xa1a1) &&
2041 + (uint8_t)(targetUniChar - 0xa1) <= (0xfe - 0xa1)
2042 + ) {
2043 + targetUniChar -= 0x8080;
2044 + } else {
2045 + targetUniChar = missingCharMarker;
2046 + }
2047 }
2048 if (targetUniChar != missingCharMarker){
2049 myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool) (targetUniChar>0x00FF);
2050 @@ -360,22 +395,22 @@
2051
2052 if(isTargetUCharDBCS){
2053 if( myTargetIndex <targetLength){
2054 - myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80);
2055 + myTarget[myTargetIndex++] =(char) (targetUniChar >> 8);
2056 if(offsets){
2057 *(offsets++) = mySourceIndex-1;
2058 }
2059 if(myTargetIndex < targetLength){
2060 - myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80);
2061 + myTarget[myTargetIndex++] =(char) targetUniChar;
2062 if(offsets){
2063 *(offsets++) = mySourceIndex-1;
2064 }
2065 }else{
2066 - args->converter->charErrorBuffer[args->converter->c harErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
2067 + args->converter->charErrorBuffer[args->converter->c harErrorBufferLength++] = (char) targetUniChar;
2068 *err = U_BUFFER_OVERFLOW_ERROR;
2069 }
2070 }else{
2071 - args->converter->charErrorBuffer[args->converter->charE rrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80);
2072 - args->converter->charErrorBuffer[args->converter->charE rrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
2073 + args->converter->charErrorBuffer[args->converter->charE rrorBufferLength++] =(char) (targetUniChar >> 8);
2074 + args->converter->charErrorBuffer[args->converter->charE rrorBufferLength++] = (char) targetUniChar;
2075 *err = U_BUFFER_OVERFLOW_ERROR;
2076 }
2077
2078 @@ -524,14 +559,14 @@
2079 const USetAdder *sa,
2080 UConverterUnicodeSet which,
2081 UErrorCode *pErrorCode) {
2082 - /* the tilde '~' is hardcoded in the converter */
2083 - sa->add(sa->set, 0x7e);
2084 + /* HZ converts all of ASCII */
2085 + sa->addRange(sa->set, 0, 0x7f);
2086
2087 /* add all of the code points that the sub-converter handles */
2088 - ((UConverterDataHZ*)cnv->extraInfo)->
2089 - gbConverter->sharedData->impl->
2090 - getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,
2091 - sa, which, pErrorCode);
2092 + ucnv_MBCSGetFilteredUnicodeSetForUnicode(
2093 + ((UConverterDataHZ*)cnv->extraInfo)->gbConverter->sharedData,
2094 + sa, which, UCNV_SET_FILTER_HZ,
2095 + pErrorCode);
2096 }
2097
2098 static const UConverterImpl _HZImpl={
2099 --- r22777/source/common/ucnv_set.c 2005-06-03 13:17:54.000000000 -0700
2100 +++ chrome.canonical/source/common/ucnv_set.c 2009-03-23 12:30:09.917043000 -0 700
308 @@ -1,7 +1,7 @@ 2101 @@ -1,7 +1,7 @@
309 /* 2102 /*
310 ****************************************************************************** 2103 *******************************************************************************
311 * 2104 *
312 -* Copyright (C) 2000-2007, International Business Machines 2105 -* Copyright (C) 2003-2005, International Business Machines
313 +* Copyright (C) 2000-2008, International Business Machines 2106 +* Copyright (C) 2003-2007, International Business Machines
314 * Corporation and others. All Rights Reserved. 2107 * Corporation and others. All Rights Reserved.
315 * 2108 *
316 ****************************************************************************** 2109 *******************************************************************************
317 @@ -1739,6 +1739,65 @@ 2110 @@ -52,7 +52,8 @@
318 pArgs->offsets=offsets; 2111 uset_add,
319 } 2112 uset_addRange,
320 2113 uset_addString,
321 +static UBool 2114 - uset_remove
322 +hasValidTrailBytes(const int32_t (*stateTable)[256], uint8_t state) { 2115 + uset_remove,
323 + const int32_t *row=stateTable[state]; 2116 + uset_removeRange
324 + int32_t b, entry; 2117 };
325 + /* First test for final entries in this state for some commonly valid byte values. */ 2118 sa.set=setFillIn;
326 + entry=row[0xa1]; 2119
327 + if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2120 --- r22777/source/common/ucnv_bld.c» 2007-08-24 02:44:10.880047000 -0700
328 + MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2121 +++ chrome.canonical/source/common/ucnv_bld.c» 2009-03-23 12:40:10.653507000 -0 700
2122 @@ -932,6 +932,7 @@
2123 myUConverter->subCharLen = mySharedConverterData->staticData->subCharLen;
2124 myUConverter->subChars = (uint8_t *)myUConverter->subUChars;
2125 uprv_memcpy(myUConverter->subChars, mySharedConverterData->staticData->subC har, myUConverter->subCharLen);
2126 + myUConverter->toUCallbackReason = UCNV_ILLEGAL; /* default reason to invoke (*fromCharErrorBehaviour) */
2127
2128 if(mySharedConverterData->impl->open != NULL) {
2129 mySharedConverterData->impl->open(myUConverter, realName, locale, optio ns, err);
2130 --- r22777/source/common/ucnv_bld.h» 2006-07-05 16:08:50.000000000 -0700
2131 +++ chrome.canonical/source/common/ucnv_bld.h» 2009-03-23 12:40:10.680507000 -0 700
2132 @@ -1,6 +1,6 @@
2133 /*
2134 **********************************************************************
2135 -* Copyright (C) 1999-2006, International Business Machines
2136 +* Copyright (C) 1999-2006,2008 International Business Machines
2137 * Corporation and others. All Rights Reserved.
2138 **********************************************************************
2139 *
2140 @@ -226,6 +226,9 @@
2141 char preToU[UCNV_EXT_MAX_BYTES];
2142 int8_t preFromULength, preToULength; /* negative: replay */
2143 int8_t preToUFirstLength; /* length of first character */
2144 +
2145 + /* new fields for ICU 4.0 */
2146 + UConverterCallbackReason toUCallbackReason; /* (*fromCharErrorBehaviour) re ason, set when error is detected */
2147 };
2148
2149 U_CDECL_END /* end of UConverter */
2150 --- r22777/source/common/ucnv_ext.c» 2007-08-22 22:46:49.525855000 -0700
2151 +++ chrome.canonical/source/common/ucnv_ext.c» 2009-03-23 12:30:33.135573000 -0 700
2152 @@ -946,7 +946,7 @@
2153 ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
2154 const int32_t *cx,
2155 const USetAdder *sa,
2156 - UConverterUnicodeSet which,
2157 + UBool useFallback,
2158 int32_t minLength,
2159 UChar32 c,
2160 UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
2161 @@ -966,7 +966,7 @@
2162 value=*fromUSectionValues++;
2163
2164 if( value!=0 &&
2165 - UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) &&
2166 + (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) &&
2167 UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
2168 ) {
2169 if(c>=0) {
2170 @@ -987,12 +987,14 @@
2171 /* no mapping, do nothing */
2172 } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
2173 ucnv_extGetUnicodeSetString(
2174 - sharedData, cx, sa, which, minLength,
2175 + sharedData, cx, sa, useFallback, minLength,
2176 U_SENTINEL, s, length+1,
2177 (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
2178 pErrorCode);
2179 - } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESER VED_MASK))==
2180 - UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
2181 + } else if((useFallback ?
2182 + (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
2183 + ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_R ESERVED_MASK))==
2184 + UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
2185 UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
2186 ) {
2187 sa->addString(sa->set, s, length+1);
2188 @@ -1004,6 +1006,7 @@
2189 ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
2190 const USetAdder *sa,
2191 UConverterUnicodeSet which,
2192 + UConverterSetFilter filter,
2193 UErrorCode *pErrorCode) {
2194 const int32_t *cx;
2195 const uint16_t *stage12, *stage3, *ps2, *ps3;
2196 @@ -1011,6 +1014,7 @@
2197
2198 uint32_t value;
2199 int32_t st1, stage1Length, st2, st3, minLength;
2200 + UBool useFallback;
2201
2202 UChar s[UCNV_EXT_MAX_UCHARS];
2203 UChar32 c;
2204 @@ -1027,10 +1031,16 @@
2205
2206 stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
2207
2208 + useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
2209 +
2210 /* enumerate the from-Unicode trie table */
2211 c=0; /* keep track of the current code point while enumerating */
2212
2213 - if(sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) {
2214 + if(filter==UCNV_SET_FILTER_2022_CN) {
2215 + minLength=3;
2216 + } else if( sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ||
2217 + filter!=UCNV_SET_FILTER_NONE
329 + ) { 2218 + ) {
330 + return TRUE; 2219 /* DBCS-only, ignore single-byte results */
331 + } 2220 minLength=2;
332 + entry=row[0x41]; 2221 } else {
333 + if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2222 @@ -1064,14 +1074,48 @@
334 + MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2223 length=0;
335 + ) { 2224 U16_APPEND_UNSAFE(s, length, c);
336 + return TRUE; 2225 ucnv_extGetUnicodeSetString(
337 + } 2226 - sharedData, cx, sa, which, minLength,
338 + /* Then test for final entries in this state. */ 2227 + sharedData, cx, sa, useFallback, minLength,
339 + for(b=0; b<=0xff; ++b) { 2228 c, s, length,
340 + entry=row[b]; 2229 (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(valu e),
341 + if( !MBCS_ENTRY_IS_TRANSITION(entry) && 2230 pErrorCode);
342 + MBCS_ENTRY_FINAL_ACTION(entry)!=MBCS_STATE_ILLEGAL 2231 - } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_ EXT_FROM_U_RESERVED_MASK))==
343 + ) { 2232 - UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
344 + return TRUE; 2233 + } else if((useFallback ?
345 + } 2234 + (value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
346 + } 2235 + ((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|U CNV_EXT_FROM_U_RESERVED_MASK))==
347 + /* Then recurse for transition entries. */ 2236 + UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
348 + for(b=0; b<=0xff; ++b) { 2237 UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
349 + entry=row[b]; 2238 ) {
350 + if( MBCS_ENTRY_IS_TRANSITION(entry) && 2239 + switch(filter) {
351 + hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_STATE (entry)) 2240 + case UCNV_SET_FILTER_2022_CN:
352 + ) { 2241 + if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UC NV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) {
353 + return TRUE; 2242 + continue;
354 + } 2243 + }
355 + } 2244 + break;
356 + return FALSE; 2245 + case UCNV_SET_FILTER_SJIS:
357 +} 2246 + if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 && (v alue=UCNV_EXT_FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)) {
358 + 2247 + continue;
359 +/* 2248 + }
360 + * Is byte b a single/lead byte in this state? 2249 + break;
361 + * Recurse for transition states, because here we don't want to say that 2250 + case UCNV_SET_FILTER_GR94DBCS:
362 + * b is a lead byte if all byte sequences that start with b are illegal. 2251 + if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
363 + */ 2252 + (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA (value))-0xa1a1)<=(0xfefe - 0xa1a1) &&
364 +static UBool 2253 + (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
365 +isSingleOrLead(const int32_t (*stateTable)[256], uint8_t state, UBool isDBCSOnl y, uint8_t b) { 2254 + continue;
366 + const int32_t *row=stateTable[state]; 2255 + }
367 + int32_t entry=row[b]; 2256 + break;
368 + if(MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */ 2257 + case UCNV_SET_FILTER_HZ:
369 + return hasValidTrailBytes(stateTable, (uint8_t)MBCS_ENTRY_TRANSITION_ST ATE(entry)); 2258 + if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==2 &&
370 + } else { 2259 + (uint16_t)((value=UCNV_EXT_FROM_U_GET_DATA (value))-0xa1a1)<=(0xfdfe - 0xa1a1) &&
371 + uint8_t action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); 2260 + (uint8_t)(value-0xa1)<=(0xfe - 0xa1))) {
372 + if(action==MBCS_STATE_CHANGE_ONLY && isDBCSOnly) { 2261 + continue;
373 + return FALSE; /* SI/SO are illegal for DBCS-only conversion */ 2262 + }
374 + } else { 2263 + break;
375 + return action!=MBCS_STATE_ILLEGAL; 2264 + default:
376 + } 2265 + /*
377 + } 2266 + * UCNV_SET_FILTER_NONE,
378 +} 2267 + * or UCNV_SET_FILTER_DBCS_ONLY which is handle d via minLength
379 + 2268 + */
380 U_CFUNC void 2269 + break;
381 ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 2270 + }
382 UErrorCode *pErrorCode) { 2271 sa->add(sa->set, c);
383 @@ -2094,6 +2153,34 @@ 2272 }
384 sourceIndex=nextSourceIndex; 2273 } while((++c&0xf)!=0);
385 } else if(U_FAILURE(*pErrorCode)) {
386 /* callback(illegal) */
387 + if(byteIndex>1) {
388 + /*
389 + * Ticket 5691: consistent illegal sequences:
390 + * - We include at least the first byte in the illegal sequence.
391 + * - If any of the non-initial bytes could be the start of a character,
392 + * we stop the illegal sequence before the first one of those.
393 + */
394 + UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0 );
395 + int8_t i;
396 + for(i=1;
397 + i<byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnl y, bytes[i]);
398 + ++i) {}
399 + if(i<byteIndex) {
400 + /* Back out some bytes. */
401 + int8_t backOutDistance=byteIndex-i;
402 + int32_t bytesFromThisBuffer=(int32_t)(source-(const uint8_t *)pArgs->source);
403 + byteIndex=i; /* length of reported illegal byte sequence * /
404 + if(backOutDistance<=bytesFromThisBuffer) {
405 + source-=backOutDistance;
406 + } else {
407 + /* Back out bytes from the previous buffer: Need to replay them. */
408 + cnv->preToULength=(int8_t)(bytesFromThisBuffer-backOutD istance);
409 + /* preToULength is negative! */
410 + uprv_memcpy(cnv->preToU, bytes+i, -cnv->preToULength);
411 + source=(const uint8_t *)pArgs->source;
412 + }
413 + }
414 + }
415 break;
416 } else /* unassigned sequences indicated with byteIndex>0 */ {
417 /* try an extension mapping */
418 @@ -2104,7 +2191,7 @@
419 &offsets, sourceIndex,
420 pArgs->flush,
421 pErrorCode);
422 - sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs ->source);
423 + sourceIndex=nextSourceIndex+=(int32_t)(source-(const uint8_t *)pArg s->source);
424
425 if(U_FAILURE(*pErrorCode)) {
426 /* not mappable or buffer overflow */
427 @@ -2395,15 +2482,37 @@
428
429 if(c<0) {
430 if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
431 - *pErrorCode=U_TRUNCATED_CHAR_FOUND;
432 - }
433 - if(U_FAILURE(*pErrorCode)) {
434 /* incomplete character byte sequence */
435 uint8_t *bytes=cnv->toUBytes;
436 cnv->toULength=(int8_t)(source-lastSource);
437 do {
438 *bytes++=*lastSource++;
439 } while(lastSource<source);
440 + *pErrorCode=U_TRUNCATED_CHAR_FOUND;
441 + } else if(U_FAILURE(*pErrorCode)) {
442 + /* callback(illegal) */
443 + /*
444 + * Ticket 5691: consistent illegal sequences:
445 + * - We include at least the first byte in the illegal sequence.
446 + * - If any of the non-initial bytes could be the start of a character,
447 + * we stop the illegal sequence before the first one of those.
448 + */
449 + UBool isDBCSOnly=(UBool)(cnv->sharedData->mbcs.dbcsOnlyState!=0);
450 + uint8_t *bytes=cnv->toUBytes;
451 + *bytes++=*lastSource++; /* first byte */
452 + if(lastSource==source) {
453 + cnv->toULength=1;
454 + } else /* lastSource<source: multi-byte character */ {
455 + int8_t i;
456 + for(i=1;
457 + lastSource<source && !isSingleOrLead(stateTable, state, isD BCSOnly, *lastSource);
458 + ++i
459 + ) {
460 + *bytes++=*lastSource++;
461 + }
462 + cnv->toULength=i;
463 + source=lastSource;
464 + }
465 } else {
466 /* no output because of empty input or only state changes */
467 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
468 diff -ru trie.clean/source/test/cintltst/nccbtst.c chrome.canonical/source/test/ cintltst/nccbtst.c
469 --- trie.clean/source/test/cintltst/nccbtst.c 2007-09-19 09:45:00.986804000 -0 700
470 +++ chrome.canonical/source/test/cintltst/nccbtst.c 2008-10-29 11:08:51.1023 76000 -0700
471 @@ -1,6 +1,6 @@
472 /********************************************************************
473 * COPYRIGHT:
474 - * Copyright (c) 1997-2007, International Business Machines Corporation and
475 + * Copyright (c) 1997-2008, International Business Machines Corporation and
476 * others. All Rights Reserved.
477 ********************************************************************/
478 /*
479 @@ -2530,13 +2530,13 @@
480
481
482 static const uint8_t text943[] = {
483 - 0x82, 0xa9, 0x82, 0x20, /*0xc8,*/ 0x61, 0x8a, 0xbf, 0x8e, 0x9a };
484 - static const UChar toUnicode943sub[] = { 0x304b, 0xfffd, /*0xff88,*/ 0x0061 , 0x6f22, 0x5b57};
485 - static const UChar toUnicode943skip[]= { 0x304b, /*0xff88,*/ 0x0061, 0x6f22 , 0x5b57};
486 + 0x82, 0xa9, 0x82, 0x20, 0x61, 0x8a, 0xbf, 0x8e, 0x9a };
487 + static const UChar toUnicode943sub[] = { 0x304b, 0x1a, 0x20, 0x0061, 0x6f22 , 0x5b57 };
488 + static const UChar toUnicode943skip[]= { 0x304b, 0x20, 0x0061, 0x6f22, 0x5 b57 };
489 static const UChar toUnicode943stop[]= { 0x304b};
490
491 - static const int32_t fromIBM943Offssub[] = {0, 2, 4, 5, 7};
492 - static const int32_t fromIBM943Offsskip[] = { 0, 4, 5, 7};
493 + static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 7 };
494 + static const int32_t fromIBM943Offsskip[] = { 0, 3, 4, 5, 7 };
495 static const int32_t fromIBM943Offsstop[] = { 0};
496
497 gInBufferSize = inputsize;
498 @@ -2570,9 +2570,9 @@
499 {
500 static const uint8_t sampleText[] = {
501 0x82, 0xa9, 0x61, 0x62, 0x63 , 0x82,
502 - 0xff, /*0x82, 0xa9,*/ 0x32, 0x33};
503 - static const UChar toUnicode943sub[] = {0x304b, 0x0061, 0x0062, 0x0063, 0x fffd,/*0x304b,*/ 0x0032, 0x0033};
504 - static const int32_t fromIBM943Offssub[] = {0, 2, 3, 4, 5, 7, 8};
505 + 0xff, 0x32, 0x33};
506 + static const UChar toUnicode943sub[] = { 0x304b, 0x0061, 0x0062, 0x0063, 0x 1a, 0x1a, 0x0032, 0x0033 };
507 + static const int32_t fromIBM943Offssub[] = { 0, 2, 3, 4, 5, 6, 7, 8 };
508 /*checking illegal value for ibm-943 with substitute*/
509 gInBufferSize = inputsize;
510 gOutBufferSize = outputsize;
511 diff -ru trie.clean/source/test/cintltst/nucnvtst.c chrome.canonical/source/test /cintltst/nucnvtst.c
512 --- trie.clean/source/test/cintltst/nucnvtst.c 2007-10-11 14:52:29.172174000 -0 700
513 +++ chrome.canonical/source/test/cintltst/nucnvtst.c 2008-10-29 11:08:51.1942 86000 -0700
514 @@ -2606,7 +2606,7 @@
515 TestNextUCharError(cnv, source, source, U_INDEX_OUTOFBOUNDS_ERROR, "sourceL imit <= source");
516 /*Test for the condition where there is an invalid character*/
517 {
518 - static const uint8_t source2[]={0xa1, 0x01};
519 + static const uint8_t source2[]={0xa1, 0x80};
520 TestNextUCharError(cnv, (const char*)source2, (const char*)source2+size of(source2), U_ZERO_ERROR, "an invalid character");
521 }
522 /*Test for the condition where we have a truncated char*/
523 @@ -3899,11 +3899,11 @@
524 TestISO_2022_KR() {
525 /* test input */
526 static const uint16_t in[]={
527 - 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F66,0x9F67,0x9F6A,0x0 00A,0x000D
528 - ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xA C02,0xAC04
529 + 0x9F4B,0x9F4E,0x9F52,0x9F5F,0x9F61,0x9F67,0x9F6A,0x000A,0x0 00D
530 + ,0x9F6C,0x9F77,0x9F8D,0x9F90,0x9F95,0x9F9C,0xAC00,0xAC01,0xA C04
531 ,0xAC07,0xAC08,0xAC09,0x0025,0x0026,0x0027,0x000A,0x000D,0x0 028,0x0029
532 ,0x002A,0x002B,0x002C,0x002D,0x002E,0x53C3,0x53C8,0x53C9,0x5 3CA,0x53CB
533 - ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53DF,0x5 3E1,0x53E2
534 + ,0x53CD,0x53D4,0x53D6,0x53D7,0x53DB,0x000A,0x000D,0x53E1,0x5 3E2
535 ,0x53E3,0x53E4,0x000A,0x000D};
536 const UChar* uSource;
537 const UChar* uSourceLimit;
538 diff -ru trie.clean/source/test/testdata/conversion.txt chrome.canonical/source/ test/testdata/conversion.txt
539 --- trie.clean/source/test/testdata/conversion.txt 2007-10-11 14:31:32.1965 32000 -0700
540 +++ chrome.canonical/source/test/testdata/conversion.txt 2008-10-29 11:37 :09.419716000 -0700
541 @@ -48,13 +48,135 @@
542 toUnicode {
543 Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
544 Cases {
545 + // Test ticket 5691: consistent illegal sequences
546 + // The following test cases are for illegal character byte sequences.
547 + //
548 + // Unfortunately, we cannot use the Shift-JIS examples from the ticket
549 + // comments because our Shift-JIS table is Windows-compatible and
550 + // therefore has no illegal single bytes. Same for GBK.
551 + // Instead, we use the stricter GB 18030 also for 2-byte examples.
552 + // The byte sequences are generally slightly different from the ticket
553 + // comment, simply using assigned characters rather than just
554 + // theoretically valid sequences.
555 + {
556 + "gb18030",
557 + :bin{ 618140813c81ff7a },
558 + "a\u4e02\\x81<\\x81\\xFFz",
559 + :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 },
560 + :int{1}, :int{0}, "", "&C", :bin{""}
561 + }
562 + {
563 + "EUC-JP",
564 + :bin{ 618fb0a98fb03c8f3cb0a97a },
565 + "a\u4e28\\x8F\\xB0<\\x8F<\u9022z",
566 + :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 },
567 + :int{1}, :int{0}, "", "&C", :bin{""}
568 + }
569 + {
570 + "gb18030",
571 + :bin{ 618130fc318130fc8181303c3e813cfc817a },
572 + "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z",
573 + :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17 },
574 + :int{1}, :int{0}, "", "&C", :bin{""}
575 + }
576 + {
577 + "UTF-8",
578 + :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a },
579 + "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1 \\xFF<>z",
580 + :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,1 2,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20, 21 },
581 + :int{1}, :int{0}, "", "&C", :bin{""}
582 + }
583 + {
584 + "ISO-2022-JP",
585 + :bin{ 1b24424141af4142affe41431b2842 },
586 + "\u758f\\xAF\u758e\\xAF\\xFE\u790e",
587 + :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 },
588 + :int{1}, :int{0}, "", "&C", :bin{""}
589 + }
590 + {
591 + "ibm-25546",
592 + :bin{ 411b242943420e4141af4142affe41430f5a },
593 + "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
594 + :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
595 + :int{1}, :int{0}, "", "&C", :bin{""}
596 + }
597 + {
598 + "ISO-2022-KR",
599 + :bin{ 411b242943420e4141af4142affe41430f5a },
600 + "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ",
601 + :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
602 + :int{1}, :int{0}, "", "&C", :bin{""}
603 + }
604 + {
605 + "ISO-2022-CN",
606 + :bin{ 411b242941420e4141af4142affe41430f5a },
607 + "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
608 + :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 },
609 + :int{1}, :int{0}, "", "&C", :bin{""}
610 + }
611 + {
612 + "HZ",
613 + :bin{ 417e7b4141af4142affe41437e7d5a },
614 + "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z",
615 + :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 },
616 + :int{1}, :int{0}, "", "&C", :bin{""}
617 + }
618 + // Test ticket 5691: consistent illegal sequences
619 + // The following test cases are for illegal escape/designator/shift sequences.
620 + //
621 + // ISO-2022-JP and -CN with illegal escape sequences.
622 + {
623 + "ISO-2022-JP",
624 + :bin{ 611b24201b244241411b283f1b28427a },
625 + "a\\x1B$ \u758f\\x1B\u2538z",
626 + :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 },
627 + :int{1}, :int{0}, "", "&C", :bin{""}
628 + }
629 + {
630 + "ISO-2022-CN",
631 + :bin{ 611b2429201b2429410e41410f7a },
632 + "a\\x1B$) \u4eaez",
633 + :intvector{ 0,1,1,1,1,2,3,4,10,13 },
634 + :int{1}, :int{0}, "", "&C", :bin{""}
635 + }
636 + // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences.
637 + // The first ESC N comes before its designator sequence, the last sequence is ESC+space.
638 + {
639 + "ISO-2022-JP-2",
640 + :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e },
641 + "N\\x1BNNN\xceN\\x1B N",
642 + :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 },
643 + :int{1}, :int{0}, "", "&C", :bin{""}
644 + }
645 + {
646 + "ISO-2022-CN-EXT",
647 + :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e },
648 + "N\\x1BNNN\u8f0eN\\x1B N",
649 + :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
650 + :int{1}, :int{0}, "", "&C", :bin{""}
651 + }
652 + {
653 + "ISO-2022-CN-EXT",
654 + :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f },
655 + "O\\x1BOOO\u492bO\\x1B O",
656 + :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 },
657 + :int{1}, :int{0}, "", "&C", :bin{""}
658 + }
659 + // Test ticket 5691: Example from Peter Edberg.
660 + {
661 + "ISO-2022-JP",
662 + :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 },
663 + "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda",
664 + :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 },
665 + :int{1}, :int{0}, "", "?", :bin{""}
666 + }
667 // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
668 // using the Shift-JIS table for JIS X 0208 (ticket #5797)
669 {
670 "ISO-2022-JP",
671 :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b284 2 },
672 - "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6 f3e",
673 - :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },
674 + "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\uf ffd\u6f3e",
675 + :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 },
676 :int{1}, :int{1}, "", "?", :bin{""}
677 }
678 // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets()
679 @@ -303,7 +425,7 @@
680 {
681 "ISO-2022-CN-EXT",
682 :bin{ 411b4e2121 }, "\x41", :intvector{ 0 },
683 - :int{1}, :int{1}, "illesc", ".", :bin{ 1b4e }
684 + :int{1}, :int{1}, "illesc", ".", :bin{ 1b }
685 }
686 // G3 designator: recognized, but not supported for -CN (only for -CN-E XT)
687 {
OLDNEW
« no previous file with comments | « third_party/icu38/source/test/testdata/testdata.mak ('k') | third_party/icu38/uconv.security.header.patch » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698