OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ********************************************************************** |
| 3 * Copyright (C) 2000-2010, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ********************************************************************** |
| 6 * file name: ucnv2022.c |
| 7 * encoding: US-ASCII |
| 8 * tab size: 8 (not used) |
| 9 * indentation:4 |
| 10 * |
| 11 * created on: 2000feb03 |
| 12 * created by: Markus W. Scherer |
| 13 * |
| 14 * Change history: |
| 15 * |
| 16 * 06/29/2000 helena Major rewrite of the callback APIs. |
| 17 * 08/08/2000 Ram Included support for ISO-2022-JP-2 |
| 18 * Changed implementation of toUnicode |
| 19 * function |
| 20 * 08/21/2000 Ram Added support for ISO-2022-KR |
| 21 * 08/29/2000 Ram Separated implementation of EBCDIC to |
| 22 * ucnvebdc.c |
| 23 * 09/20/2000 Ram Added support for ISO-2022-CN |
| 24 * Added implementations for getNextUChar() |
| 25 * for specific 2022 country variants. |
| 26 * 10/31/2000 Ram Implemented offsets logic functions |
| 27 */ |
| 28 |
| 29 #include "unicode/utypes.h" |
| 30 |
| 31 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION |
| 32 |
| 33 #include "unicode/ucnv.h" |
| 34 #include "unicode/uset.h" |
| 35 #include "unicode/ucnv_err.h" |
| 36 #include "unicode/ucnv_cb.h" |
| 37 #include "ucnv_imp.h" |
| 38 #include "ucnv_bld.h" |
| 39 #include "ucnv_cnv.h" |
| 40 #include "ucnvmbcs.h" |
| 41 #include "cstring.h" |
| 42 #include "cmemory.h" |
| 43 |
/* Number of elements in a true array (not a pointer), as a signed 32-bit count. */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof(*(array))))
| 45 |
| 46 #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 47 /* |
| 48 * I am disabling the generic ISO-2022 converter after proposing to do so on |
| 49 * the icu mailing list two days ago. |
| 50 * |
| 51 * Reasons: |
| 52 * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of |
| 53 * its designation sequences, single shifts with return to the previous state
, |
| 54 * switch-with-no-return to UTF-16BE or similar, etc. |
| 55 * This is unlike the language-specific variants like ISO-2022-JP which |
| 56 * require a much smaller repertoire of ISO-2022 features. |
| 57 * These variants continue to be supported. |
| 58 * 2. I believe that no one is really using the generic ISO-2022 converter |
| 59 * but rather always one of the language-specific variants. |
| 60 * Note that ICU's generic ISO-2022 converter has always output one escape |
| 61 * sequence followed by UTF-8 for the whole stream. |
| 62 * 3. Switching between subcharsets is extremely slow, because each time |
| 63 * the previous converter is closed and a new one opened, |
| 64 * without any kind of caching, least-recently-used list, etc. |
| 65 * 4. The code is currently buggy, and given the above it does not seem |
| 66 * reasonable to spend the time on maintenance. |
| 67 * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings. |
| 68 * This means, for example, that when ISO-8859-7 is designated, the following |
| 69 * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff. |
| 70 * The ICU ISO-2022 converter does not handle this - and has no information |
| 71 * about which subconverter would have to be shifted vs. which is designed |
| 72 * for 7-bit ISO-2022. |
| 73 * |
| 74 * Markus Scherer 2003-dec-03 |
| 75 */ |
| 76 #endif |
| 77 |
/* One-byte, NUL-terminated strings holding the SI (0x0F) and SO (0x0E) controls. */
static const char SHIFT_IN_STR[]  = "\x0F";
static const char SHIFT_OUT_STR[] = "\x0E";

/* Byte values of the whitespace/control characters named below. */
#define H_TAB   0x09
#define LF      0x0A
#define V_TAB   0x0B
#define CR      0x0D
#define SPACE   0x20

/* First and last code points of the half-width Katakana range. */
enum {
    HWKANA_START = 0xFF61,
    HWKANA_END   = 0xFF9F
};
| 91 |
| 92 /* |
| 93 * 94-character sets with native byte values A1..FE are encoded in ISO 2022 |
| 94 * as bytes 21..7E. (Subtract 0x80.) |
| 95 * 96-character sets with native byte values A0..FF are encoded in ISO 2022 |
| 96 * as bytes 20..7F. (Subtract 0x80.) |
| 97 * Do not encode C1 control codes with native bytes 80..9F |
| 98 * as bytes 00..1F (C0 control codes). |
| 99 */ |
/* Native (8-bit) byte ranges of 94- and 96-character sets. */
enum {
    GR94_START = 0xA1,
    GR94_END   = 0xFE,
    GR96_START = 0xA0,
    GR96_END   = 0xFF
};
| 106 |
| 107 /* |
| 108 * ISO 2022 control codes must not be converted from Unicode |
| 109 * because they would mess up the byte stream. |
| 110 * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b |
| 111 * corresponding to SO, SI, and ESC. |
| 112 */ |
/*
 * TRUE if c is SO (0x0e), SI (0x0f) or ESC (0x1b).
 * The range test is done on the unsigned value so that a negative c is
 * rejected before it can be used as a shift count.
 * Note: c is evaluated twice; do not pass an expression with side effects.
 */
#define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
| 114 |
| 115 /* for ISO-2022-JP and -CN implementations */ |
typedef enum {
    /* ---- values shared by the JP and CN implementations ---- */
    INVALID_STATE = -1,
    ASCII         = 0,

    SS2_STATE     = 0x10,
    SS3_STATE     = 0x11,

    /* ---- ISO-2022-JP charsets ---- */
    ISO8859_1     = 1,
    ISO8859_7     = 2,
    JISX201       = 3,
    JISX208       = 4,
    JISX212       = 5,
    GB2312        = 6,
    KSC5601       = 7,
    HWKANA_7BIT   = 8,      /* Halfwidth Katakana 7 bit */

    /* ---- ISO-2022-CN charsets ---- */
    /*
     * These first constants double as indexes into myConverterArray[],
     * so their numeric values must not change.
     */
    GB2312_1      = 1,
    ISO_IR_165    = 2,
    CNS_11643     = 3,

    /*
     * Per-plane values stored in StateEnum and ISO2022State variables;
     * CNS_11643 (above) is the one to use when indexing myConverterArray[].
     */
    CNS_11643_0   = 0x20,
    CNS_11643_1   = 0x21,
    CNS_11643_2   = 0x22,
    CNS_11643_3   = 0x23,
    CNS_11643_4   = 0x24,
    CNS_11643_5   = 0x25,
    CNS_11643_6   = 0x26,
    CNS_11643_7   = 0x27
} StateEnum;
| 153 |
/* TRUE if the StateEnum charset value cs lies in the range JISX208..KSC5601 (the JP DBCS charsets). */
#define IS_JP_DBCS(cs) ((cs)>=JISX208 && KSC5601>=(cs))

/* Charset mask: a single bit at position cs. */
#define CSM(cs) ((uint16_t)1<<(cs))
| 158 |
| 159 /* |
| 160 * Each of these charset masks (with index x) contains a bit for a charset in ex
act correspondence |
| 161 * to whether that charset is used in the corresponding version x of ISO_2022,lo
cale=ja,version=x |
| 162 * |
| 163 * Note: The converter uses some leniency: |
| 164 * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in |
| 165 * all versions, not just JIS7 and JIS8. |
| 166 * - ICU does not distinguish between different versions of JIS X 0208. |
| 167 */ |
| 168 enum { MAX_JA_VERSION=4 }; |
| 169 static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={ |
| 170 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT), |
| 171 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212), |
| 172 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB231
2)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), |
| 173 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB231
2)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7), |
| 174 CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB231
2)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7) |
| 175 }; |
| 176 |
/* Converter type tag stored in UConverterDataISO2022.currentType. */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
| 185 |
/* Designation/shift state for one conversion direction. */
typedef struct ISO2022State {
    /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* g before single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
| 191 |
| 192 #define UCNV_OPTIONS_VERSION_MASK 0xf |
| 193 #define UCNV_2022_MAX_CONVERTERS 10 |
| 194 |
| 195 typedef struct{ |
| 196 UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS]; |
| 197 UConverter *currentConverter; |
| 198 Cnv2022Type currentType; |
| 199 ISO2022State toU2022State, fromU2022State; |
| 200 uint32_t key; |
| 201 uint32_t version; |
| 202 #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 203 UBool isFirstBuffer; |
| 204 #endif |
| 205 UBool isEmptySegment; |
| 206 char name[30]; |
| 207 char locale[3]; |
| 208 }UConverterDataISO2022; |
| 209 |
| 210 /* Protos */ |
| 211 /* ISO-2022 ----------------------------------------------------------------- */ |
| 212 |
| 213 /*Forward declaration */ |
| 214 U_CFUNC void |
| 215 ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args, |
| 216 UErrorCode * err); |
| 217 U_CFUNC void |
| 218 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args, |
| 219 UErrorCode * err); |
| 220 |
#define ESC_2022 0x1B /*ESC*/

/* Classification of the bytes consumed so far while parsing an escape sequence. */
typedef enum
{
    /* does not correspond to a valid ISO 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid ISO 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a complete, valid ISO 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /*
     * so far matches one ISO 2022 escape sequence, but adding more
     * characters might match another escape sequence
     */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
| 230 |
| 231 /* |
| 232 * The way these state transition arrays work is: |
| 233 * ex : ESC$B is the sequence for JISX208 |
| 234 * a) First Iteration: char is ESC |
| 235 * i) Get the value of ESC from normalize_esq_chars_2022[] with int valu
e of ESC as index |
| 236 * int x = normalize_esq_chars_2022[27] which is equal to 1 |
| 237 * ii) Search for this value in escSeqStateTable_Key_2022[] |
| 238 * value of x is stored at escSeqStateTable_Key_2022[0] |
| 239 * iii) Save this index as offset |
| 240 * iv) Get state of this sequence from escSeqStateTable_Value_2022[] |
| 241 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2
022 |
| 242 * b) Switch on this state and continue to next char |
| 243 * i) Get the value of $ from normalize_esq_chars_2022[] with int value
of $ as index |
| 244 * which is normalize_esq_chars_2022[36] == 4 |
| 245 * ii) x is currently 1(from above) |
| 246 * x<<=5 -- x is now 32 |
| 247 * x+=normalize_esq_chars_2022[36] |
| 248 * now x is 36 |
| 249 * iii) Search for this value in escSeqStateTable_Key_2022[] |
| 250 * value of x is stored at escSeqStateTable_Key_2022[2], so offset is
2 |
| 251 * iv) Get state of this sequence from escSeqStateTable_Value_2022[] |
| 252 * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2
022 |
| 253 * c) Switch on this state and continue to next char |
| 254 * i) Get the value of B from normalize_esq_chars_2022[] with int value o
f B as index |
| 255 * ii) x is currently 36 (from above) |
| 256 * x<<=5 -- x is now 1152 |
| 257 * x+=normalize_esq_chars_2022[66] |
| 258 * now x is 1161 |
| 259 * iii) Search for this value in escSeqStateTable_Key_2022[] |
| 260 * value of x is stored at escSeqStateTable_Key_2022[24], so offset is
24 |
| 261 * iv) Get state of this sequence from escSeqStateTable_Value_2022[24] |
| 262 * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022 |
| 263 * v) Get the converter name from escSeqStateTable_Result_2022[24] which
is JISX208 |
| 264 */ |
| 265 |
| 266 |
| 267 /*Below are the 3 arrays depicting a state transition table*/ |
/*
 * Maps a byte to its 5-bit code within an escape sequence, or 0 if the byte
 * cannot appear in any recognized escape sequence. Laid out 16 entries per
 * row; the row comment gives the byte value of the first entry.
 */
static const int8_t normalize_esq_chars_2022[256] = {
    /* 0x00 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x10 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
    /* 0x20 */  0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,
    /* 0x30 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x40 */  5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,
    /* 0x50 */  0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0,
    /* 0x60 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x70 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x80 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x90 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xA0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xB0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xC0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xD0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xE0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0xF0 */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
| 298 |
| 299 #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 300 /* |
| 301 * When the generic ISO-2022 converter is completely removed, not just disabled |
| 302 * per #ifdef, then the following state table and the associated tables that are |
| 303 * dimensioned with MAX_STATES_2022 should be trimmed. |
| 304 * |
| 305 * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of |
| 306 * the associated escape sequences starting with ESC ( B should be removed. |
| 307 * This includes the ones with key values 1097 and all of the ones above 1000000
. |
| 308 * |
| 309 * For the latter, the tables can simply be truncated. |
| 310 * For the former, since the tables must be kept parallel, it is probably best |
| 311 * to simply duplicate an adjacent table cell, parallel in all tables. |
| 312 * |
| 313 * It may make sense to restructure the tables, especially by using small search |
| 314 * tables for the variants instead of indexing them parallel to the table here. |
| 315 */ |
| 316 #endif |
| 317 |
#define MAX_STATES_2022 74

/*
 * Sorted (ascending) list of the keys of all recognized escape-sequence
 * prefixes; getKey_2022() binary-searches it. The tables dimensioned with
 * MAX_STATES_2022 are indexed in parallel with this one.
 */
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
    /*  0.. 9 */
    1, 34, 36, 39, 55, 57, 60, 61, 1093, 1096,
    /* 10..19 */
    1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106,
    /* 20..29 */
    1109, 1154, 1157, 1160, 1161, 1176, 1178, 1179, 1254, 1257,
    /* 30..39 */
    1768, 1773, 1957, 35105, 36933, 36936, 36937, 36938, 36939, 36940,
    /* 40..49 */
    36942, 36943, 36944, 36945, 36946, 36947, 36948, 37640, 37642, 37644,
    /* 50..59 */
    37646, 37711, 37744, 37745, 37746, 37747, 37748, 40133, 40136, 40138,
    /* 60..69 */
    40139, 40140, 40141, 1123363, 35947624, 35947625, 35947626, 35947627, 35947629, 35947630,
    /* 70..73 */
    35947631, 35947635, 35947636, 35947638
};
| 331 |
#ifdef U_ENABLE_GENERIC_ISO_2022

/*
 * Converter name for each escape sequence, parallel to
 * escSeqStateTable_Key_2022[]; NULL where no name is associated.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    /*  0.. 9 */
    NULL, NULL, NULL, NULL, NULL,
    NULL, NULL, NULL, "latin1", "latin1",
    /* 10..19 */
    "latin1", "ibm-865", "ibm-865", "ibm-865", "ibm-865",
    "ibm-865", "ibm-865", "JISX0201", "JISX0201", "latin1",
    /* 20..29 */
    "latin1", NULL, "JISX-208", "ibm-5478", "JISX-208",
    NULL, NULL, NULL, NULL, "UTF8",
    /* 30..39 */
    "ISO-8859-1", "ISO-8859-7", "JIS-X-208", NULL, "ibm-955",
    "ibm-367", "ibm-952", "ibm-949", "JISX-212", "ibm-1383",
    /* 40..49 */
    "ibm-952", "ibm-964", "ibm-964", "ibm-964", "ibm-964",
    "ibm-964", "ibm-964", "ibm-5478", "ibm-949", "ISO-IR-165",
    /* 50..59 */
    "CNS-11643-1992,1", "CNS-11643-1992,2", "CNS-11643-1992,3", "CNS-11643-1992,4", "CNS-11643-1992,5",
    "CNS-11643-1992,6", "CNS-11643-1992,7", "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian",
    /* 60..69 */
    "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian", NULL, "latin1",
    "ibm-912", "ibm-913", "ibm-914", "ibm-813", "ibm-1089",
    /* 70..73 */
    "ibm-920", "ibm-915", "ibm-915", "latin1"
};

#endif
| 348 |
| 349 static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = { |
| 350 /* 0 1 2
3 4 5
6 7 8
9 */ |
| 351 VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_
2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_
2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMI
NAL_2022 ,VALID_TERMINAL_2022 |
| 352 ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_
2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMI
NAL_2022 ,VALID_TERMINAL_2022 |
| 353 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMI
NAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_T
ERMINAL_2022 ,VALID_TERMINAL_2022 |
| 354 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_
2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMI
NAL_2022 ,VALID_TERMINAL_2022 |
| 355 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_
2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMI
NAL_2022 ,VALID_TERMINAL_2022 |
| 356 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_
2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMI
NAL_2022 ,VALID_TERMINAL_2022 |
| 357 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_
2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMI
NAL_2022 ,VALID_TERMINAL_2022 |
| 358 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
,VALID_TERMINAL_2022 |
| 359 }; |
| 360 |
| 361 |
| 362 /* Type def for refactoring changeState_2022 code*/ |
/* Selects the ISO-2022 flavour passed to changeState_2022(). */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
} Variant2022;
| 371 |
| 372 /*********** ISO 2022 Converter Protos ***********/ |
| 373 static void |
| 374 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode); |
| 375 |
| 376 static void |
| 377 _ISO2022Close(UConverter *converter); |
| 378 |
| 379 static void |
| 380 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice); |
| 381 |
| 382 static const char* |
| 383 _ISO2022getName(const UConverter* cnv); |
| 384 |
| 385 static void |
| 386 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorC
ode *err); |
| 387 |
| 388 static UConverter * |
| 389 _ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSi
ze, UErrorCode *status); |
| 390 |
| 391 #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 392 static void |
| 393 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UEr
rorCode* err); |
| 394 #endif |
| 395 |
| 396 /*const UConverterSharedData _ISO2022Data;*/ |
| 397 static const UConverterSharedData _ISO2022JPData; |
| 398 static const UConverterSharedData _ISO2022KRData; |
| 399 static const UConverterSharedData _ISO2022CNData; |
| 400 |
| 401 /*************** Converter implementations ******************/ |
| 402 |
| 403 /* The purpose of this function is to get around gcc compiler warnings. */ |
| 404 static U_INLINE void |
| 405 fromUWriteUInt8(UConverter *cnv, |
| 406 const char *bytes, int32_t length, |
| 407 uint8_t **target, const char *targetLimit, |
| 408 int32_t **offsets, |
| 409 int32_t sourceIndex, |
| 410 UErrorCode *pErrorCode) |
| 411 { |
| 412 char *targetChars = (char *)*target; |
| 413 ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit, |
| 414 offsets, sourceIndex, pErrorCode); |
| 415 *target = (uint8_t*)targetChars; |
| 416 |
| 417 } |
| 418 |
| 419 static U_INLINE void |
| 420 setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConve
rterData){ |
| 421 if(myConverterData->version == 1) { |
| 422 UConverter *cnv = myConverterData->currentConverter; |
| 423 |
| 424 cnv->toUnicodeStatus=0; /* offset */ |
| 425 cnv->mode=0; /* state */ |
| 426 cnv->toULength=0; /* byteIndex */ |
| 427 } |
| 428 } |
| 429 |
| 430 static U_INLINE void |
| 431 setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConv
erterData){ |
| 432 /* in ISO-2022-KR the designator sequence appears only once |
| 433 * in a file so we append it only once |
| 434 */ |
| 435 if( converter->charErrorBufferLength==0){ |
| 436 |
| 437 converter->charErrorBufferLength = 4; |
| 438 converter->charErrorBuffer[0] = 0x1b; |
| 439 converter->charErrorBuffer[1] = 0x24; |
| 440 converter->charErrorBuffer[2] = 0x29; |
| 441 converter->charErrorBuffer[3] = 0x43; |
| 442 } |
| 443 if(myConverterData->version == 1) { |
| 444 UConverter *cnv = myConverterData->currentConverter; |
| 445 |
| 446 cnv->fromUChar32=0; |
| 447 cnv->fromUnicodeStatus=1; /* prevLength */ |
| 448 } |
| 449 } |
| 450 |
| 451 static void |
| 452 _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ |
| 453 |
| 454 char myLocale[6]={' ',' ',' ',' ',' ',' '}; |
| 455 |
| 456 cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); |
| 457 if(cnv->extraInfo != NULL) { |
| 458 UConverterNamePieces stackPieces; |
| 459 UConverterLoadArgs stackArgs={ (int32_t)sizeof(UConverterLoadArgs) }; |
| 460 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->ex
traInfo; |
| 461 uint32_t version; |
| 462 |
| 463 stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable; |
| 464 |
| 465 uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); |
| 466 myConverterData->currentType = ASCII1; |
| 467 cnv->fromUnicodeStatus =FALSE; |
| 468 if(pArgs->locale){ |
| 469 uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)); |
| 470 } |
| 471 version = pArgs->options & UCNV_OPTIONS_VERSION_MASK; |
| 472 myConverterData->version = version; |
| 473 if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') && |
| 474 (myLocale[2]=='_' || myLocale[2]=='\0')) |
| 475 { |
| 476 size_t len=0; |
| 477 /* open the required converters and cache them */ |
| 478 if(version>MAX_JA_VERSION) { |
| 479 /* prevent indexing beyond jpCharsetMasks[] */ |
| 480 myConverterData->version = version = 0; |
| 481 } |
| 482 if(jpCharsetMasks[version]&CSM(ISO8859_7)) { |
| 483 myConverterData->myConverterArray[ISO8859_7] = |
| 484 ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, e
rrorCode); |
| 485 } |
| 486 myConverterData->myConverterArray[JISX208] = |
| 487 ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, error
Code); |
| 488 if(jpCharsetMasks[version]&CSM(JISX212)) { |
| 489 myConverterData->myConverterArray[JISX212] = |
| 490 ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, er
rorCode); |
| 491 } |
| 492 if(jpCharsetMasks[version]&CSM(GB2312)) { |
| 493 myConverterData->myConverterArray[GB2312] = |
| 494 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, er
rorCode); /* gb_2312_80-1 */ |
| 495 } |
| 496 if(jpCharsetMasks[version]&CSM(KSC5601)) { |
| 497 myConverterData->myConverterArray[KSC5601] = |
| 498 ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, er
rorCode); |
| 499 } |
| 500 |
| 501 /* set the function pointers to appropriate funtions */ |
| 502 cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); |
| 503 uprv_strcpy(myConverterData->locale,"ja"); |
| 504 |
| 505 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=
"); |
| 506 len = uprv_strlen(myConverterData->name); |
| 507 myConverterData->name[len]=(char)(myConverterData->version+(int)'0')
; |
| 508 myConverterData->name[len+1]='\0'; |
| 509 } |
| 510 else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') && |
| 511 (myLocale[2]=='_' || myLocale[2]=='\0')) |
| 512 { |
| 513 const char *cnvName; |
| 514 if(version==1) { |
| 515 cnvName="icu-internal-25546"; |
| 516 } else { |
| 517 cnvName="ibm-949"; |
| 518 myConverterData->version=version=0; |
| 519 } |
| 520 if(pArgs->onlyTestIsLoadable) { |
| 521 ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carri
es result */ |
| 522 uprv_free(cnv->extraInfo); |
| 523 cnv->extraInfo=NULL; |
| 524 return; |
| 525 } else { |
| 526 myConverterData->currentConverter=ucnv_open(cnvName, errorCode); |
| 527 if (U_FAILURE(*errorCode)) { |
| 528 _ISO2022Close(cnv); |
| 529 return; |
| 530 } |
| 531 |
| 532 if(version==1) { |
| 533 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,
version=1"); |
| 534 uprv_memcpy(cnv->subChars, myConverterData->currentConverter
->subChars, 4); |
| 535 cnv->subCharLen = myConverterData->currentConverter->subChar
Len; |
| 536 }else{ |
| 537 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,
version=0"); |
| 538 } |
| 539 |
| 540 /* initialize the state variables */ |
| 541 setInitialStateToUnicodeKR(cnv, myConverterData); |
| 542 setInitialStateFromUnicodeKR(cnv, myConverterData); |
| 543 |
| 544 /* set the function pointers to appropriate funtions */ |
| 545 cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData; |
| 546 uprv_strcpy(myConverterData->locale,"ko"); |
| 547 } |
| 548 } |
| 549 else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& m
yLocale[1]=='n'))&& |
| 550 (myLocale[2]=='_' || myLocale[2]=='\0')) |
| 551 { |
| 552 |
| 553 /* open the required converters and cache them */ |
| 554 myConverterData->myConverterArray[GB2312_1] = |
| 555 ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorC
ode); |
| 556 if(version==1) { |
| 557 myConverterData->myConverterArray[ISO_IR_165] = |
| 558 ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs,
errorCode); |
| 559 } |
| 560 myConverterData->myConverterArray[CNS_11643] = |
| 561 ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs,
errorCode); |
| 562 |
| 563 |
| 564 /* set the function pointers to appropriate funtions */ |
| 565 cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData; |
| 566 uprv_strcpy(myConverterData->locale,"cn"); |
| 567 |
| 568 if (version==0){ |
| 569 myConverterData->version = 0; |
| 570 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,vers
ion=0"); |
| 571 }else if (version==1){ |
| 572 myConverterData->version = 1; |
| 573 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,vers
ion=1"); |
| 574 }else { |
| 575 myConverterData->version = 2; |
| 576 (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,vers
ion=2"); |
| 577 } |
| 578 } |
| 579 else{ |
| 580 #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 581 myConverterData->isFirstBuffer = TRUE; |
| 582 |
| 583 /* append the UTF-8 escape sequence */ |
| 584 cnv->charErrorBufferLength = 3; |
| 585 cnv->charErrorBuffer[0] = 0x1b; |
| 586 cnv->charErrorBuffer[1] = 0x25; |
| 587 cnv->charErrorBuffer[2] = 0x42; |
| 588 |
| 589 cnv->sharedData=(UConverterSharedData*)&_ISO2022Data; |
| 590 /* initialize the state variables */ |
| 591 uprv_strcpy(myConverterData->name,"ISO_2022"); |
| 592 #else |
| 593 *errorCode = U_UNSUPPORTED_ERROR; |
| 594 return; |
| 595 #endif |
| 596 } |
| 597 |
| 598 cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar; |
| 599 |
| 600 if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { |
| 601 _ISO2022Close(cnv); |
| 602 } |
| 603 } else { |
| 604 *errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 605 } |
| 606 } |
| 607 |
| 608 |
| 609 static void |
| 610 _ISO2022Close(UConverter *converter) { |
| 611 UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraIn
fo); |
| 612 UConverterSharedData **array = myData->myConverterArray; |
| 613 int32_t i; |
| 614 |
| 615 if (converter->extraInfo != NULL) { |
| 616 /*close the array of converter pointers and free the memory*/ |
| 617 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { |
| 618 if(array[i]!=NULL) { |
| 619 ucnv_unloadSharedDataIfReady(array[i]); |
| 620 } |
| 621 } |
| 622 |
| 623 ucnv_close(myData->currentConverter); |
| 624 |
| 625 if(!converter->isExtraLocal){ |
| 626 uprv_free (converter->extraInfo); |
| 627 converter->extraInfo = NULL; |
| 628 } |
| 629 } |
| 630 } |
| 631 |
| 632 static void |
| 633 _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) { |
| 634 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter-
>extraInfo); |
| 635 if(choice<=UCNV_RESET_TO_UNICODE) { |
| 636 uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State)); |
| 637 myConverterData->key = 0; |
| 638 myConverterData->isEmptySegment = FALSE; |
| 639 } |
| 640 if(choice!=UCNV_RESET_TO_UNICODE) { |
| 641 uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State)); |
| 642 } |
| 643 #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 644 if(myConverterData->locale[0] == 0){ |
| 645 if(choice<=UCNV_RESET_TO_UNICODE) { |
| 646 myConverterData->isFirstBuffer = TRUE; |
| 647 myConverterData->key = 0; |
| 648 if (converter->mode == UCNV_SO){ |
| 649 ucnv_close (myConverterData->currentConverter); |
| 650 myConverterData->currentConverter=NULL; |
| 651 } |
| 652 converter->mode = UCNV_SI; |
| 653 } |
| 654 if(choice!=UCNV_RESET_TO_UNICODE) { |
| 655 /* re-append UTF-8 escape sequence */ |
| 656 converter->charErrorBufferLength = 3; |
| 657 converter->charErrorBuffer[0] = 0x1b; |
| 658 converter->charErrorBuffer[1] = 0x28; |
| 659 converter->charErrorBuffer[2] = 0x42; |
| 660 } |
| 661 } |
| 662 else |
| 663 #endif |
| 664 { |
| 665 /* reset the state variables */ |
| 666 if(myConverterData->locale[0] == 'k'){ |
| 667 if(choice<=UCNV_RESET_TO_UNICODE) { |
| 668 setInitialStateToUnicodeKR(converter, myConverterData); |
| 669 } |
| 670 if(choice!=UCNV_RESET_TO_UNICODE) { |
| 671 setInitialStateFromUnicodeKR(converter, myConverterData); |
| 672 } |
| 673 } |
| 674 } |
| 675 } |
| 676 |
| 677 static const char* |
| 678 _ISO2022getName(const UConverter* cnv){ |
| 679 if(cnv->extraInfo){ |
| 680 UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo; |
| 681 return myData->name; |
| 682 } |
| 683 return NULL; |
| 684 } |
| 685 |
| 686 |
| 687 /*************** to unicode *******************/ |
| 688 /**************************************************************************** |
| 689 * Recognized escape sequences are |
| 690 * <ESC>(B ASCII |
| 691 * <ESC>.A ISO-8859-1 |
| 692 * <ESC>.F ISO-8859-7 |
| 693 * <ESC>(J JISX-201 |
| 694 * <ESC>(I JISX-201 |
| 695 * <ESC>$B JISX-208 |
| 696 * <ESC>$@ JISX-208 |
| 697 * <ESC>$(D JISX-212 |
| 698 * <ESC>$A GB2312 |
| 699 * <ESC>$(C KSC5601 |
| 700 */ |
| 701 static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= { |
| 702 /* 0 1 2 3 4
5 6 7 8 9 */ |
| 703 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE |
| 704 ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STA
TE |
| 705 ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE |
| 706 ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STA
TE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STA
TE |
| 707 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE |
| 708 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE |
| 709 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE |
| 710 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| 711 }; |
| 712 |
| 713 /*************** to unicode *******************/ |
| 714 static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= { |
| 715 /* 0 1 2 3 4
5 6 7 8 9 */ |
| 716 INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE |
| 717 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE |
| 718 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE |
| 719 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE |
| 720 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165 |
| 721 ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5
,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE |
| 722 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STA
TE |
| 723 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE |
| 724 }; |
| 725 |
| 726 |
| 727 static UCNV_TableStates_2022 |
| 728 getKey_2022(char c,int32_t* key,int32_t* offset){ |
| 729 int32_t togo; |
| 730 int32_t low = 0; |
| 731 int32_t hi = MAX_STATES_2022; |
| 732 int32_t oldmid=0; |
| 733 |
| 734 togo = normalize_esq_chars_2022[(uint8_t)c]; |
| 735 if(togo == 0) { |
| 736 /* not a valid character anywhere in an escape sequence */ |
| 737 *key = 0; |
| 738 *offset = 0; |
| 739 return INVALID_2022; |
| 740 } |
| 741 togo = (*key << 5) + togo; |
| 742 |
| 743 while (hi != low) /*binary search*/{ |
| 744 |
| 745 register int32_t mid = (hi+low) >> 1; /*Finds median*/ |
| 746 |
| 747 if (mid == oldmid) |
| 748 break; |
| 749 |
| 750 if (escSeqStateTable_Key_2022[mid] > togo){ |
| 751 hi = mid; |
| 752 } |
| 753 else if (escSeqStateTable_Key_2022[mid] < togo){ |
| 754 low = mid; |
| 755 } |
| 756 else /*we found it*/{ |
| 757 *key = togo; |
| 758 *offset = mid; |
| 759 return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid]; |
| 760 } |
| 761 oldmid = mid; |
| 762 |
| 763 } |
| 764 |
| 765 *key = 0; |
| 766 *offset = 0; |
| 767 return INVALID_2022; |
| 768 } |
| 769 |
| 770 /*runs through a state machine to determine the escape sequence - codepage corre
spondance |
| 771 */ |
| 772 static void |
| 773 changeState_2022(UConverter* _this, |
| 774 const char** source, |
| 775 const char* sourceLimit, |
| 776 Variant2022 var, |
| 777 UErrorCode* err){ |
| 778 UCNV_TableStates_2022 value; |
| 779 UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInf
o); |
| 780 uint32_t key = myData2022->key; |
| 781 int32_t offset = 0; |
| 782 int8_t initialToULength = _this->toULength; |
| 783 char c; |
| 784 |
| 785 value = VALID_NON_TERMINAL_2022; |
| 786 while (*source < sourceLimit) { |
| 787 c = *(*source)++; |
| 788 _this->toUBytes[_this->toULength++]=(uint8_t)c; |
| 789 value = getKey_2022(c,(int32_t *) &key, &offset); |
| 790 |
| 791 switch (value){ |
| 792 |
| 793 case VALID_NON_TERMINAL_2022 : |
| 794 /* continue with the loop */ |
| 795 break; |
| 796 |
| 797 case VALID_TERMINAL_2022: |
| 798 key = 0; |
| 799 goto DONE; |
| 800 |
| 801 case INVALID_2022: |
| 802 goto DONE; |
| 803 |
| 804 case VALID_MAYBE_TERMINAL_2022: |
| 805 #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 806 /* ESC ( B is ambiguous only for ISO_2022 itself */ |
| 807 if(var == ISO_2022) { |
| 808 /* discard toUBytes[] for ESC ( B because this sequence is corre
ct and complete */ |
| 809 _this->toULength = 0; |
| 810 |
| 811 /* TODO need to indicate that ESC ( B was seen; if failure, then
need to replay from source or from MBCS-style replay */ |
| 812 |
| 813 /* continue with the loop */ |
| 814 value = VALID_NON_TERMINAL_2022; |
| 815 break; |
| 816 } else |
| 817 #endif |
| 818 { |
| 819 /* not ISO_2022 itself, finish here */ |
| 820 value = VALID_TERMINAL_2022; |
| 821 key = 0; |
| 822 goto DONE; |
| 823 } |
| 824 } |
| 825 } |
| 826 |
| 827 DONE: |
| 828 myData2022->key = key; |
| 829 |
| 830 if (value == VALID_NON_TERMINAL_2022) { |
| 831 /* indicate that the escape sequence is incomplete: key!=0 */ |
| 832 return; |
| 833 } else if (value == INVALID_2022 ) { |
| 834 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 835 } else /* value == VALID_TERMINAL_2022 */ { |
| 836 switch(var){ |
| 837 #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 838 case ISO_2022: |
| 839 { |
| 840 const char *chosenConverterName = escSeqStateTable_Result_2022[offse
t]; |
| 841 if(chosenConverterName == NULL) { |
| 842 /* SS2 or SS3 */ |
| 843 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| 844 _this->toUCallbackReason = UCNV_UNASSIGNED; |
| 845 return; |
| 846 } |
| 847 |
| 848 _this->mode = UCNV_SI; |
| 849 ucnv_close(myData2022->currentConverter); |
| 850 myData2022->currentConverter = myUConverter = ucnv_open(chosenConver
terName, err); |
| 851 if(U_SUCCESS(*err)) { |
| 852 myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP; |
| 853 _this->mode = UCNV_SO; |
| 854 } |
| 855 break; |
| 856 } |
| 857 #endif |
| 858 case ISO_2022_JP: |
| 859 { |
| 860 StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset]; |
| 861 switch(tempState) { |
| 862 case INVALID_STATE: |
| 863 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| 864 break; |
| 865 case SS2_STATE: |
| 866 if(myData2022->toU2022State.cs[2]!=0) { |
| 867 if(myData2022->toU2022State.g<2) { |
| 868 myData2022->toU2022State.prevG=myData2022->toU2022St
ate.g; |
| 869 } |
| 870 myData2022->toU2022State.g=2; |
| 871 } else { |
| 872 /* illegal to have SS2 before a matching designator */ |
| 873 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 874 } |
| 875 break; |
| 876 /* case SS3_STATE: not used in ISO-2022-JP-x */ |
| 877 case ISO8859_1: |
| 878 case ISO8859_7: |
| 879 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) ==
0) { |
| 880 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| 881 } else { |
| 882 /* G2 charset for SS2 */ |
| 883 myData2022->toU2022State.cs[2]=(int8_t)tempState; |
| 884 } |
| 885 break; |
| 886 default: |
| 887 if((jpCharsetMasks[myData2022->version] & CSM(tempState)) ==
0) { |
| 888 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| 889 } else { |
| 890 /* G0 charset */ |
| 891 myData2022->toU2022State.cs[0]=(int8_t)tempState; |
| 892 } |
| 893 break; |
| 894 } |
| 895 } |
| 896 break; |
| 897 case ISO_2022_CN: |
| 898 { |
| 899 StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset]; |
| 900 switch(tempState) { |
| 901 case INVALID_STATE: |
| 902 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| 903 break; |
| 904 case SS2_STATE: |
| 905 if(myData2022->toU2022State.cs[2]!=0) { |
| 906 if(myData2022->toU2022State.g<2) { |
| 907 myData2022->toU2022State.prevG=myData2022->toU2022St
ate.g; |
| 908 } |
| 909 myData2022->toU2022State.g=2; |
| 910 } else { |
| 911 /* illegal to have SS2 before a matching designator */ |
| 912 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 913 } |
| 914 break; |
| 915 case SS3_STATE: |
| 916 if(myData2022->toU2022State.cs[3]!=0) { |
| 917 if(myData2022->toU2022State.g<2) { |
| 918 myData2022->toU2022State.prevG=myData2022->toU2022St
ate.g; |
| 919 } |
| 920 myData2022->toU2022State.g=3; |
| 921 } else { |
| 922 /* illegal to have SS3 before a matching designator */ |
| 923 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 924 } |
| 925 break; |
| 926 case ISO_IR_165: |
| 927 if(myData2022->version==0) { |
| 928 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| 929 break; |
| 930 } |
| 931 /*fall through*/ |
| 932 case GB2312_1: |
| 933 /*fall through*/ |
| 934 case CNS_11643_1: |
| 935 myData2022->toU2022State.cs[1]=(int8_t)tempState; |
| 936 break; |
| 937 case CNS_11643_2: |
| 938 myData2022->toU2022State.cs[2]=(int8_t)tempState; |
| 939 break; |
| 940 default: |
| 941 /* other CNS 11643 planes */ |
| 942 if(myData2022->version==0) { |
| 943 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| 944 } else { |
| 945 myData2022->toU2022State.cs[3]=(int8_t)tempState; |
| 946 } |
| 947 break; |
| 948 } |
| 949 } |
| 950 break; |
| 951 case ISO_2022_KR: |
| 952 if(offset==0x30){ |
| 953 /* nothing to be done, just accept this one escape sequence */ |
| 954 } else { |
| 955 *err = U_UNSUPPORTED_ESCAPE_SEQUENCE; |
| 956 } |
| 957 break; |
| 958 |
| 959 default: |
| 960 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 961 break; |
| 962 } |
| 963 } |
| 964 if(U_SUCCESS(*err)) { |
| 965 _this->toULength = 0; |
| 966 } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) { |
| 967 if(_this->toULength>1) { |
| 968 /* |
| 969 * Ticket 5691: consistent illegal sequences: |
| 970 * - We include at least the first byte (ESC) in the illegal sequenc
e. |
| 971 * - If any of the non-initial bytes could be the start of a charact
er, |
| 972 * we stop the illegal sequence before the first one of those. |
| 973 * In escape sequences, all following bytes are "printable", that
is, |
| 974 * unless they are completely illegal (>7f in SBCS, outside 21..7e
in DBCS), |
| 975 * they are valid single/lead bytes. |
| 976 * For simplicity, we always only report the initial ESC byte as t
he |
| 977 * illegal sequence and back out all other bytes we looked at. |
| 978 */ |
| 979 /* Back out some bytes. */ |
| 980 int8_t backOutDistance=_this->toULength-1; |
| 981 int8_t bytesFromThisBuffer=_this->toULength-initialToULength; |
| 982 if(backOutDistance<=bytesFromThisBuffer) { |
| 983 /* same as initialToULength<=1 */ |
| 984 *source-=backOutDistance; |
| 985 } else { |
| 986 /* Back out bytes from the previous buffer: Need to replay them.
*/ |
| 987 _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance
); |
| 988 /* same as -(initialToULength-1) */ |
| 989 /* preToULength is negative! */ |
| 990 uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULeng
th); |
| 991 *source-=bytesFromThisBuffer; |
| 992 } |
| 993 _this->toULength=1; |
| 994 } |
| 995 } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) { |
| 996 _this->toUCallbackReason = UCNV_UNASSIGNED; |
| 997 } |
| 998 } |
| 999 |
| 1000 /*Checks the characters of the buffer against valid 2022 escape sequences |
| 1001 *if the match we return a pointer to the initial start of the sequence otherwise |
| 1002 *we return sourceLimit |
| 1003 */ |
| 1004 /*for 2022 looks ahead in the stream |
| 1005 *to determine the longest possible convertible |
| 1006 *data stream |
| 1007 */ |
| 1008 static U_INLINE const char* |
| 1009 getEndOfBuffer_2022(const char** source, |
| 1010 const char* sourceLimit, |
| 1011 UBool flush){ |
| 1012 |
| 1013 const char* mySource = *source; |
| 1014 |
| 1015 #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 1016 if (*source >= sourceLimit) |
| 1017 return sourceLimit; |
| 1018 |
| 1019 do{ |
| 1020 |
| 1021 if (*mySource == ESC_2022){ |
| 1022 int8_t i; |
| 1023 int32_t key = 0; |
| 1024 int32_t offset; |
| 1025 UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022; |
| 1026 |
| 1027 /* Kludge: I could not |
| 1028 * figure out the reason for validating an escape sequence |
| 1029 * twice - once here and once in changeState_2022(). |
| 1030 * is it possible to have an ESC character in a ISO2022 |
| 1031 * byte stream which is valid in a code page? Is it legal? |
| 1032 */ |
| 1033 for (i=0; |
| 1034 (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022); |
| 1035 i++) { |
| 1036 value = getKey_2022(*(mySource+i), &key, &offset); |
| 1037 } |
| 1038 if (value > 0 || *mySource==ESC_2022) |
| 1039 return mySource; |
| 1040 |
| 1041 if ((value == VALID_NON_TERMINAL_2022)&&(!flush) ) |
| 1042 return sourceLimit; |
| 1043 } |
| 1044 }while (++mySource < sourceLimit); |
| 1045 |
| 1046 return sourceLimit; |
| 1047 #else |
| 1048 while(mySource < sourceLimit && *mySource != ESC_2022) { |
| 1049 ++mySource; |
| 1050 } |
| 1051 return mySource; |
| 1052 #endif |
| 1053 } |
| 1054 |
| 1055 |
| 1056 /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmb
cs.c |
| 1057 * any future change in _MBCSFromUChar32() function should be reflected here. |
| 1058 * @return number of bytes in *value; negative number if fallback; 0 if no mappi
ng |
| 1059 */ |
| 1060 static U_INLINE int32_t |
| 1061 MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, |
| 1062 UChar32 c, |
| 1063 uint32_t* value, |
| 1064 UBool useFallback, |
| 1065 int outputType) |
| 1066 { |
| 1067 const int32_t *cx; |
| 1068 const uint16_t *table; |
| 1069 uint32_t stage2Entry; |
| 1070 uint32_t myValue; |
| 1071 int32_t length; |
| 1072 const uint8_t *p; |
| 1073 /* |
| 1074 * TODO(markus): Use and require new, faster MBCS conversion table structure
s. |
| 1075 * Use internal version of ucnv_open() that verifies that the new structures
are available, |
| 1076 * else U_INTERNAL_PROGRAM_ERROR. |
| 1077 */ |
| 1078 /* BMP-only codepages are stored without stage 1 entries for supplementary c
ode points */ |
| 1079 if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { |
| 1080 table=sharedData->mbcs.fromUnicodeTable; |
| 1081 stage2Entry=MBCS_STAGE_2_FROM_U(table, c); |
| 1082 /* get the bytes and the length for the output */ |
| 1083 if(outputType==MBCS_OUTPUT_2){ |
| 1084 myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes,
stage2Entry, c); |
| 1085 if(myValue<=0xff) { |
| 1086 length=1; |
| 1087 } else { |
| 1088 length=2; |
| 1089 } |
| 1090 } else /* outputType==MBCS_OUTPUT_3 */ { |
| 1091 p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, sta
ge2Entry, c); |
| 1092 myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; |
| 1093 if(myValue<=0xff) { |
| 1094 length=1; |
| 1095 } else if(myValue<=0xffff) { |
| 1096 length=2; |
| 1097 } else { |
| 1098 length=3; |
| 1099 } |
| 1100 } |
| 1101 /* is this code point assigned, or do we use fallbacks? */ |
| 1102 if((stage2Entry&(1<<(16+(c&0xf))))!=0) { |
| 1103 /* assigned */ |
| 1104 *value=myValue; |
| 1105 return length; |
| 1106 } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) { |
| 1107 /* |
| 1108 * We allow a 0 byte output if the "assigned" bit is set for this en
try. |
| 1109 * There is no way with this data structure for fallback output |
| 1110 * to be a zero byte. |
| 1111 */ |
| 1112 *value=myValue; |
| 1113 return -length; |
| 1114 } |
| 1115 } |
| 1116 |
| 1117 cx=sharedData->mbcs.extIndexes; |
| 1118 if(cx!=NULL) { |
| 1119 return ucnv_extSimpleMatchFromU(cx, c, value, useFallback); |
| 1120 } |
| 1121 |
| 1122 /* unassigned */ |
| 1123 return 0; |
| 1124 } |
| 1125 |
| 1126 /* This inline function replicates code in _MBCSSingleFromUChar32() function in
ucnvmbcs.c |
| 1127 * any future change in _MBCSSingleFromUChar32() function should be reflected he
re. |
| 1128 * @param retval pointer to output byte |
| 1129 * @return 1 roundtrip byte 0 no mapping -1 fallback byte |
| 1130 */ |
| 1131 static U_INLINE int32_t |
| 1132 MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, |
| 1133 UChar32 c, |
| 1134 uint32_t* retval, |
| 1135 UBool useFallback) |
| 1136 { |
| 1137 const uint16_t *table; |
| 1138 int32_t value; |
| 1139 /* BMP-only codepages are stored without stage 1 entries for supplementary c
ode points */ |
| 1140 if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { |
| 1141 return 0; |
| 1142 } |
| 1143 /* convert the Unicode code point in c into codepage bytes (same as in _MBCS
FromUnicodeWithOffsets) */ |
| 1144 table=sharedData->mbcs.fromUnicodeTable; |
| 1145 /* get the byte for the output */ |
| 1146 value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnic
odeBytes, c); |
| 1147 /* is this code point assigned, or do we use fallbacks? */ |
| 1148 *retval=(uint32_t)(value&0xff); |
| 1149 if(value>=0xf00) { |
| 1150 return 1; /* roundtrip */ |
| 1151 } else if(useFallback ? value>=0x800 : value>=0xc00) { |
| 1152 return -1; /* fallback taken */ |
| 1153 } else { |
| 1154 return 0; /* no mapping */ |
| 1155 } |
| 1156 } |
| 1157 |
/*
 * Accepts a 2-byte value only if each byte is in the range A1..FE
 * (strict EUC DBCS), and then moves it to the ISO 2022 range 21..7E
 * by subtracting 0x80 from each byte.
 * Returns 0 if out of range.
 * The range checks look at the low 16 bits (pair) and low 8 bits (trail)
 * of the input.
 */
static U_INLINE uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint16_t pairDelta = (uint16_t)(value - 0xa1a1);
    const uint8_t trailDelta = (uint8_t)(value - 0xa1);

    if(pairDelta > (0xfefe - 0xa1a1) || trailDelta > (0xfe - 0xa1)) {
        return 0;  /* not valid for ISO 2022 */
    }
    return value - 0x8080;  /* shift down to 21..7e byte range */
}
| 1174 |
| 1175 #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after
that. */ |
/*
 * Reverse of _2022FromGR94DBCS(): adds 0x80 to each byte of the 2022 code
 * point. If each byte of the sum is in the range A1..FE the sum is returned,
 * otherwise the 2022 code point is returned unchanged.
 */
static U_INLINE uint32_t
_2022ToGR94DBCS(uint32_t value) {
    const uint32_t shifted = value + 0x8080;
    const uint16_t pairDelta = (uint16_t)(shifted - 0xa1a1);
    const uint8_t trailDelta = (uint8_t)(shifted - 0xa1);

    if(pairDelta > (0xfefe - 0xa1a1) || trailDelta > (0xfe - 0xa1)) {
        return value;
    }
    return shifted;
}
| 1191 #endif |
| 1192 |
| 1193 #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 1194 |
| 1195 /*******************************************************************************
*** |
| 1196 * ISO-2022 Converter |
| 1197 * |
| 1198 * |
| 1199 */ |
| 1200 |
| 1201 static void |
| 1202 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, |
| 1203 UErrorCode* err){ |
| 1204 const char* mySourceLimit, *realSourceLimit; |
| 1205 const char* sourceStart; |
| 1206 const UChar* myTargetStart; |
| 1207 UConverter* saveThis; |
| 1208 UConverterDataISO2022* myData; |
| 1209 int8_t length; |
| 1210 |
| 1211 saveThis = args->converter; |
| 1212 myData=((UConverterDataISO2022*)(saveThis->extraInfo)); |
| 1213 |
| 1214 realSourceLimit = args->sourceLimit; |
| 1215 while (args->source < realSourceLimit) { |
| 1216 if(myData->key == 0) { /* are we in the middle of an escape sequence? */ |
| 1217 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ |
| 1218 mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit
, args->flush); |
| 1219 |
| 1220 if(args->source < mySourceLimit) { |
| 1221 if(myData->currentConverter==NULL) { |
| 1222 myData->currentConverter = ucnv_open("ASCII",err); |
| 1223 if(U_FAILURE(*err)){ |
| 1224 return; |
| 1225 } |
| 1226 |
| 1227 myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U
_CALLBACK_STOP; |
| 1228 saveThis->mode = UCNV_SO; |
| 1229 } |
| 1230 |
| 1231 /* convert to before the ESC or until the end of the buffer */ |
| 1232 myData->isFirstBuffer=FALSE; |
| 1233 sourceStart = args->source; |
| 1234 myTargetStart = args->target; |
| 1235 args->converter = myData->currentConverter; |
| 1236 ucnv_toUnicode(args->converter, |
| 1237 &args->target, |
| 1238 args->targetLimit, |
| 1239 &args->source, |
| 1240 mySourceLimit, |
| 1241 args->offsets, |
| 1242 (UBool)(args->flush && mySourceLimit == realSourceLimit), |
| 1243 err); |
| 1244 args->converter = saveThis; |
| 1245 |
| 1246 if (*err == U_BUFFER_OVERFLOW_ERROR) { |
| 1247 /* move the overflow buffer */ |
| 1248 length = saveThis->UCharErrorBufferLength = myData->currentC
onverter->UCharErrorBufferLength; |
| 1249 myData->currentConverter->UCharErrorBufferLength = 0; |
| 1250 if(length > 0) { |
| 1251 uprv_memcpy(saveThis->UCharErrorBuffer, |
| 1252 myData->currentConverter->UCharErrorBuffer, |
| 1253 length*U_SIZEOF_UCHAR); |
| 1254 } |
| 1255 return; |
| 1256 } |
| 1257 |
| 1258 /* |
| 1259 * At least one of: |
| 1260 * -Error while converting |
| 1261 * -Done with entire buffer |
| 1262 * -Need to write offsets or update the current offset |
| 1263 * (leave that up to the code in ucnv.c) |
| 1264 * |
| 1265 * or else we just stopped at an ESC byte and continue with chan
geState_2022() |
| 1266 */ |
| 1267 if (U_FAILURE(*err) || |
| 1268 (args->source == realSourceLimit) || |
| 1269 (args->offsets != NULL && (args->target != myTargetStart ||
args->source != sourceStart) || |
| 1270 (mySourceLimit < realSourceLimit && myData->currentConverter
->toULength > 0)) |
| 1271 ) { |
| 1272 /* copy partial or error input for truncated detection and e
rror handling */ |
| 1273 if(U_FAILURE(*err)) { |
| 1274 length = saveThis->invalidCharLength = myData->currentCo
nverter->invalidCharLength; |
| 1275 if(length > 0) { |
| 1276 uprv_memcpy(saveThis->invalidCharBuffer, myData->cur
rentConverter->invalidCharBuffer, length); |
| 1277 } |
| 1278 } else { |
| 1279 length = saveThis->toULength = myData->currentConverter-
>toULength; |
| 1280 if(length > 0) { |
| 1281 uprv_memcpy(saveThis->toUBytes, myData->currentConve
rter->toUBytes, length); |
| 1282 if(args->source < mySourceLimit) { |
| 1283 *err = U_TRUNCATED_CHAR_FOUND; /* truncated inpu
t before ESC */ |
| 1284 } |
| 1285 } |
| 1286 } |
| 1287 return; |
| 1288 } |
| 1289 } |
| 1290 } |
| 1291 |
| 1292 sourceStart = args->source; |
| 1293 changeState_2022(args->converter, |
| 1294 &(args->source), |
| 1295 realSourceLimit, |
| 1296 ISO_2022, |
| 1297 err); |
| 1298 if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets !=
NULL)) { |
| 1299 /* let the ucnv.c code update its current offset */ |
| 1300 return; |
| 1301 } |
| 1302 } |
| 1303 } |
| 1304 |
| 1305 #endif |
| 1306 |
| 1307 /* |
| 1308 * To Unicode Callback helper function |
| 1309 */ |
| 1310 static void |
| 1311 toUnicodeCallback(UConverter *cnv, |
| 1312 const uint32_t sourceChar, const uint32_t targetUniChar, |
| 1313 UErrorCode* err){ |
| 1314 if(sourceChar>0xff){ |
| 1315 cnv->toUBytes[0] = (uint8_t)(sourceChar>>8); |
| 1316 cnv->toUBytes[1] = (uint8_t)sourceChar; |
| 1317 cnv->toULength = 2; |
| 1318 } |
| 1319 else{ |
| 1320 cnv->toUBytes[0] =(char) sourceChar; |
| 1321 cnv->toULength = 1; |
| 1322 } |
| 1323 |
| 1324 if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ |
| 1325 *err = U_INVALID_CHAR_FOUND; |
| 1326 } |
| 1327 else{ |
| 1328 *err = U_ILLEGAL_CHAR_FOUND; |
| 1329 } |
| 1330 } |
| 1331 |
| 1332 /**************************************ISO-2022-JP******************************
*******************/ |
| 1333 |
| 1334 /************************************** IMPORTANT ******************************
******************** |
| 1335 * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode(
) functions for SBCS,DBCS and |
| 1336 * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32(). |
| 1337 * The converter iterates over each Unicode codepoint |
| 1338 * to obtain the equivalent codepoints from the codepages supported. Since the so
urce buffer is |
| 1339 * processed one char at a time it would make sense to reduce the extra processin
g a canned converter |
| 1340 * would do as far as possible. |
| 1341 * |
| 1342 * If the implementation of these macros or structure of sharedData struct change
in the future, make |
| 1343 * sure that ISO-2022 is also changed. |
| 1344 ********************************************************************************
******************* |
| 1345 */ |
| 1346 |
| 1347 /*******************************************************************************
******************** |
| 1348 * Rules for ISO-2022-jp encoding |
| 1349 * (i) Escape sequences must be fully contained within a line they should not |
| 1350 * span new lines or CRs |
| 1351 * (ii) If the last character on a line is represented by two bytes then an ASCI
I or |
| 1352 * JIS-Roman character escape sequence should follow before the line termin
ates |
| 1353 * (iii) If the first character on the line is represented by two bytes then a tw
o |
| 1354 * byte character escape sequence should precede it |
| 1355 * (iv) If no escape sequence is encountered then the characters are ASCII |
| 1356 * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to
G2, |
| 1357 * and invoked with SS2 (ESC N). |
| 1358 * (vi) If there is any G0 designation in text, there must be a switch to |
| 1359 * ASCII or to JIS X 0201-Roman before a space character (but not |
| 1360 * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control |
| 1361 * characters such as tab or CRLF. |
| 1362 * (vii) Supported encodings: |
| 1363 * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-885
9-7 |
| 1364 * |
| 1365 * source : RFC-1554 |
| 1366 * |
| 1367 * JISX201, JISX208,JISX212 : new .cnv data files created |
| 1368 * KSC5601 : alias to ibm-949 mapping table |
| 1369 * GB2312 : alias to ibm-1386 mapping table |
| 1370 * ISO-8859-1 : Algorithmic implemented as LATIN1 case |
| 1371 * ISO-8859-7 : alias to ibm-9409 mapping table |
| 1372 */ |
| 1373 |
| 1374 /* preference order of JP charsets */ |
| 1375 static const StateEnum jpCharsetPref[]={ |
| 1376 ASCII, |
| 1377 JISX201, |
| 1378 ISO8859_1, |
| 1379 ISO8859_7, |
| 1380 JISX208, |
| 1381 JISX212, |
| 1382 GB2312, |
| 1383 KSC5601, |
| 1384 HWKANA_7BIT |
| 1385 }; |
| 1386 |
/*
 * Escape sequences, one per charset.
 * The entries must be in order of the enum constants (like JISX201 = 3),
 * not in order of jpCharsetPref[]!
 */
static const char escSeqChars[][6] ={
    "\x1B(B",   /* 1B 28 42     ASCII       */
    "\x1B.A",   /* 1B 2E 41     ISO-8859-1  */
    "\x1B.F",   /* 1B 2E 46     ISO-8859-7  */
    "\x1B(J",   /* 1B 28 4A     JISX-201    */
    "\x1B$B",   /* 1B 24 42     JISX-208    */
    "\x1B$(D",  /* 1B 24 28 44  JISX-212    */
    "\x1B$A",   /* 1B 24 41     GB2312      */
    "\x1B$(C",  /* 1B 24 28 43  KSC5601     */
    "\x1B(I"    /* 1B 28 49     HWKANA_7BIT */
};
/* Lengths in bytes of the escape sequences, in the same order as the sequences. */
static const int8_t escSeqCharsLen[] ={
    3,  /* <ESC>(B   ASCII       */
    3,  /* <ESC>.A   ISO-8859-1  */
    3,  /* <ESC>.F   ISO-8859-7  */
    3,  /* <ESC>(J   JISX-201    */
    3,  /* <ESC>$B   JISX-208    */
    4,  /* <ESC>$(D  JISX-212    */
    3,  /* <ESC>$A   GB2312      */
    4,  /* <ESC>$(C  KSC5601     */
    3   /* <ESC>(I   HWKANA_7BIT */
};
| 1414 |
| 1415 /* |
| 1416 * The iteration over various code pages works this way: |
| 1417 * i) Get the currentState from myConverterData->currentState |
| 1418 * ii) Check if the character is mapped to a valid character in the currentState |
| 1419 * Yes -> a) set the initIterState to currentState |
| 1420 * b) remain in this state until an invalid character is found |
| 1421 * No -> a) go to the next code page and find the character |
| 1422 * iii) Before changing the state increment the current state check if the curren
t state |
| 1423 * is equal to the initIterState |
| 1424 * Yes -> A character that cannot be represented in any of the supported en
codings |
| 1425 * break and return a U_INVALID_CHARACTER error |
| 1426 * No -> Continue and find the character in next code page |
| 1427 * |
| 1428 * |
| 1429 * TODO: Implement a priority technique where the users are allowed to set the pr
iority of code pages |
| 1430 */ |
| 1431 |
/*
 * Map 00..7F to Unicode according to JIS X 0201:
 * 5C maps to U+00A5, 7E maps to U+203E, every other value maps to itself.
 */
static U_INLINE uint32_t
jisx201ToU(uint32_t value) {
    switch(value) {
    case 0x5c:
        return 0xa5;
    case 0x7e:
        return 0x203e;
    default:
        return value;
    }
}
| 1445 |
/*
 * Map Unicode to 00..7F according to JIS X 0201.
 * U+00A5 maps to 5C and U+203E maps to 7E; U+0000..U+007F map to themselves
 * except for U+005C and U+007E.
 * Return U+FFFE if unmappable.
 */
static U_INLINE uint32_t
jisx201FromU(uint32_t value) {
    switch(value) {
    case 0xa5:
        return 0x5c;
    case 0x203e:
        return 0x7e;
    case 0x5c:
    case 0x7e:
        /* these two positions are taken by U+00A5 and U+203E */
        return 0xfffe;
    default:
        return value<=0x7f ? value : 0xfffe;
    }
}
| 1460 |
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range
 * corresponding to JIS X 0208 (not above 0xEFFC), and convert it to a pair
 * of 21..7E bytes.
 * Return 0 if the byte pair is out of range.
 */
static U_INLINE uint32_t
_2022FromSJIS(uint32_t value) {
    uint32_t lead;
    uint32_t result;
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;
    lead = value & 0xff00;

    /* lead bytes up to 9F and lead bytes E0..EF use different offsets */
    result = (lead - (lead <= 0x9f00 ? 0x7000 : 0xb000)) << 1;

    if(trail > 0x9e) {
        /* trail <= 0xfc */
        return result | (trail - 0x7e);
    }

    result -= 0x100;
    return result | (trail - (trail <= 0x7e ? 0x1f : 0x20));
}
| 1496 |
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS and store the
 * result in bytes[0..1].
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it: a byte detected as
 * invalid is stored as 0.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 */
static U_INLINE void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    uint8_t lead;
    uint8_t trail;

    if((c1&1)==0) {
        /* even lead byte */
        lead = c1;
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            trail = (uint8_t)(c2 + 0x7e);
        } else {
            trail = 0;  /* invalid */
        }
    } else {
        /* odd lead byte */
        lead = (uint8_t)(c1 + 1);
        if(c2 <= 0x5f) {
            trail = (uint8_t)(c2 + 0x1f);
        } else if(c2 <= 0x7e) {
            trail = (uint8_t)(c2 + 0x20);
        } else {
            trail = 0;  /* invalid */
        }
    }

    lead = (uint8_t)(lead >> 1);
    if(lead <= 0x2f) {
        lead = (uint8_t)(lead + 0x70);
    } else if(lead <= 0x3f) {
        lead = (uint8_t)(lead + 0xb0);
    } else {
        lead = 0;  /* invalid */
    }

    bytes[0] = (char)lead;
    bytes[1] = (char)trail;
}
| 1533 |
/*
 * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width
 * (DBCS) Katakana.
 * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these
 * fallbacks because Shift-JIS roundtrips half-width Katakana to single bytes.
 * These were the only fallbacks in ICU's jisx-208.ucm file.
 *
 * One entry per code point, starting at U+FF61 and ending at U+FF9F.
 */
static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
    /* U+FF61..U+FF6F */
    0x2123, 0x2156, 0x2157, 0x2122, 0x2126, 0x2572, 0x2521, 0x2523,
    0x2525, 0x2527, 0x2529, 0x2563, 0x2565, 0x2567, 0x2543,
    /* U+FF70..U+FF7F */
    0x213C, 0x2522, 0x2524, 0x2526, 0x2528, 0x252A, 0x252B, 0x252D,
    0x252F, 0x2531, 0x2533, 0x2535, 0x2537, 0x2539, 0x253B, 0x253D,
    /* U+FF80..U+FF8F */
    0x253F, 0x2541, 0x2544, 0x2546, 0x2548, 0x254A, 0x254B, 0x254C,
    0x254D, 0x254E, 0x254F, 0x2552, 0x2555, 0x2558, 0x255B, 0x255E,
    /* U+FF90..U+FF9F */
    0x255F, 0x2560, 0x2561, 0x2562, 0x2564, 0x2566, 0x2568, 0x2569,
    0x256A, 0x256B, 0x256C, 0x256D, 0x256F, 0x2573, 0x212B, 0x212C
};
| 1606 /* Converts the UTF-16 text in args->source..args->sourceLimit to ISO-2022-JP bytes at args->target. For each code point it tries the charsets in choices[] (a roundtrip mapping ends the search, a fallback is kept only if nothing better is found), then writes SI, a designation escape and/or a shift sequence as needed before the character bytes. Offsets are recorded when args->offsets is non-NULL; errors are reported through *err with the offending code point stored in cnv->fromUChar32. On flush at the end of the input the output is returned to ASCII in G0. */ |
| 1607 static void |
| 1608 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
, UErrorCode* err) { |
| 1609 UConverter *cnv = args->converter; |
| 1610 UConverterDataISO2022 *converterData; |
| 1611 ISO2022State *pFromU2022State; |
| 1612 uint8_t *target = (uint8_t *) args->target; |
| 1613 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; |
| 1614 const UChar* source = args->source; |
| 1615 const UChar* sourceLimit = args->sourceLimit; |
| 1616 int32_t* offsets = args->offsets; |
| 1617 UChar32 sourceChar; |
| 1618 char buffer[8]; /* staging area: control/escape bytes plus the character bytes for one code point */ |
| 1619 int32_t len, outLen; |
| 1620 int8_t choices[10]; /* charsets to try, in order; rebuilt inside the loop whenever choiceCount==0 */ |
| 1621 int32_t choiceCount; |
| 1622 uint32_t targetValue = 0; |
| 1623 UBool useFallback; |
| 1624 |
| 1625 int32_t i; |
| 1626 int8_t cs, g; |
| 1627 |
| 1628 /* set up the state */ |
| 1629 converterData = (UConverterDataISO2022*)cnv->extraInfo; |
| 1630 pFromU2022State = &converterData->fromU2022State; |
| 1631 |
| 1632 choiceCount = 0; |
| 1633 |
| 1634 /* check if the last codepoint of previous buffer was a lead surrogate: a non-zero cnv->fromUChar32 resumes at getTrail */ |
| 1635 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { |
| 1636 goto getTrail; |
| 1637 } |
| 1638 |
| 1639 while(source < sourceLimit) { |
| 1640 if(target < targetLimit) { |
| 1641 |
| 1642 sourceChar = *(source++); |
| 1643 /*check if the char is a First surrogate*/ |
| 1644 if(UTF_IS_SURROGATE(sourceChar)) { |
| 1645 if(UTF_IS_SURROGATE_FIRST(sourceChar)) { |
| 1646 getTrail: |
| 1647 /*look ahead to find the trail surrogate*/ |
| 1648 if(source < sourceLimit) { |
| 1649 /* test the following code unit */ |
| 1650 UChar trail=(UChar) *source; |
| 1651 if(UTF_IS_SECOND_SURROGATE(trail)) { |
| 1652 source++; |
| 1653 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); |
| 1654 cnv->fromUChar32=0x00; |
| 1655 /* convert this supplementary code point */ |
| 1656 /* exit this condition tree */ |
| 1657 } else { |
| 1658 /* this is an unmatched lead code unit (1st surrogat
e) */ |
| 1659 /* callback(illegal) */ |
| 1660 *err=U_ILLEGAL_CHAR_FOUND; |
| 1661 cnv->fromUChar32=sourceChar; |
| 1662 break; |
| 1663 } |
| 1664 } else { |
| 1665 /* no more input: keep the lead surrogate in fromUChar32 for the next call */ |
| 1666 cnv->fromUChar32=sourceChar; |
| 1667 break; |
| 1668 } |
| 1669 } else { |
| 1670 /* this is an unmatched trail code unit (2nd surrogate) */ |
| 1671 /* callback(illegal) */ |
| 1672 *err=U_ILLEGAL_CHAR_FOUND; |
| 1673 cnv->fromUChar32=sourceChar; |
| 1674 break; |
| 1675 } |
| 1676 } |
| 1677 |
| 1678 /* do not convert SO/SI/ESC */ |
| 1679 if(IS_2022_CONTROL(sourceChar)) { |
| 1680 /* callback(illegal) */ |
| 1681 *err=U_ILLEGAL_CHAR_FOUND; |
| 1682 cnv->fromUChar32=sourceChar; |
| 1683 break; |
| 1684 } |
| 1685 |
| 1686 /* do the conversion */ |
| 1687 |
| 1688 if(choiceCount == 0) { |
| 1689 uint16_t csm; |
| 1690 |
| 1691 /* |
| 1692 * The csm variable keeps track of which charsets are allowed |
| 1693 * and not used yet while building the choices[]. |
| 1694 */ |
| 1695 csm = jpCharsetMasks[converterData->version]; |
| 1696 choiceCount = 0; |
| 1697 |
| 1698 /* JIS7/8: try single-byte half-width Katakana before JISX208 */ |
| 1699 if(converterData->version == 3 || converterData->version == 4) { |
| 1700 choices[choiceCount++] = (int8_t)HWKANA_7BIT; |
| 1701 } |
| 1702 /* Do not try single-byte half-width Katakana for other versions
. */ |
| 1703 csm &= ~CSM(HWKANA_7BIT); |
| 1704 |
| 1705 /* try the current G0 charset */ |
| 1706 choices[choiceCount++] = cs = pFromU2022State->cs[0]; |
| 1707 csm &= ~CSM(cs); |
| 1708 |
| 1709 /* try the current G2 charset */ |
| 1710 if((cs = pFromU2022State->cs[2]) != 0) { |
| 1711 choices[choiceCount++] = cs; |
| 1712 csm &= ~CSM(cs); |
| 1713 } |
| 1714 |
| 1715 /* try all the other possible charsets */ |
| 1716 for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) { |
| 1717 cs = (int8_t)jpCharsetPref[i]; |
| 1718 if(CSM(cs) & csm) { |
| 1719 choices[choiceCount++] = cs; |
| 1720 csm &= ~CSM(cs); |
| 1721 } |
| 1722 } |
| 1723 } |
| 1724 /* cs and g receive the charset and the G-set index selected for sourceChar by the loop below */ |
| 1725 cs = g = 0; |
| 1726 /* |
| 1727 * len==0: no mapping found yet |
| 1728 * len<0: found a fallback result: continue looking for a roundtrip
but no further fallbacks |
| 1729 * len>0: found a roundtrip result, done |
| 1730 */ |
| 1731 len = 0; |
| 1732 /* |
| 1733 * We will turn off useFallback after finding a fallback, |
| 1734 * but we still get fallbacks from PUA code points as usual. |
| 1735 * Therefore, we will also need to check that we don't overwrite |
| 1736 * an early fallback with a later one. |
| 1737 */ |
| 1738 useFallback = cnv->useFallback; |
| 1739 |
| 1740 for(i = 0; i < choiceCount && len <= 0; ++i) { |
| 1741 uint32_t value; |
| 1742 int32_t len2; |
| 1743 int8_t cs0 = choices[i]; |
| 1744 switch(cs0) { |
| 1745 case ASCII: |
| 1746 if(sourceChar <= 0x7f) { |
| 1747 targetValue = (uint32_t)sourceChar; |
| 1748 len = 1; |
| 1749 cs = cs0; |
| 1750 g = 0; |
| 1751 } |
| 1752 break; |
| 1753 case ISO8859_1: |
| 1754 if(GR96_START <= sourceChar && sourceChar <= GR96_END) { |
| 1755 targetValue = (uint32_t)sourceChar - 0x80; |
| 1756 len = 1; |
| 1757 cs = cs0; |
| 1758 g = 2; |
| 1759 } |
| 1760 break; |
| 1761 case HWKANA_7BIT: |
| 1762 if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HW
KANA_START)) { |
| 1763 if(converterData->version==3) { |
| 1764 /* JIS7: use G1 (SO) */ |
| 1765 /* Shift U+FF61..U+FF9F to bytes 21..5F. */ |
| 1766 targetValue = (uint32_t)(sourceChar - (HWKANA_START
- 0x21)); |
| 1767 len = 1; |
| 1768 pFromU2022State->cs[1] = cs = cs0; /* do not output
an escape sequence */ |
| 1769 g = 1; |
| 1770 } else if(converterData->version==4) { |
| 1771 /* JIS8: use 8-bit bytes with any single-byte charse
t, see escape sequence output below */ |
| 1772 /* Shift U+FF61..U+FF9F to bytes A1..DF. */ |
| 1773 targetValue = (uint32_t)(sourceChar - (HWKANA_START
- 0xa1)); |
| 1774 len = 1; |
| 1775 |
| 1776 cs = pFromU2022State->cs[0]; |
| 1777 if(IS_JP_DBCS(cs)) { |
| 1778 /* switch from a DBCS charset to JISX201 */ |
| 1779 cs = (int8_t)JISX201; |
| 1780 } |
| 1781 /* else stay in the current G0 charset */ |
| 1782 g = 0; |
| 1783 } |
| 1784 /* else do not use HWKANA_7BIT with other versions */ |
| 1785 } |
| 1786 break; |
| 1787 case JISX201: |
| 1788 /* G0 SBCS */ |
| 1789 value = jisx201FromU(sourceChar); |
| 1790 if(value <= 0x7f) { |
| 1791 targetValue = value; |
| 1792 len = 1; |
| 1793 cs = cs0; |
| 1794 g = 0; |
| 1795 useFallback = FALSE; |
| 1796 } |
| 1797 break; |
| 1798 case JISX208: |
| 1799 /* G0 DBCS from Shift-JIS table */ |
| 1800 len2 = MBCS_FROM_UCHAR32_ISO2022( |
| 1801 converterData->myConverterArray[cs0], |
| 1802 sourceChar, &value, |
| 1803 useFallback, MBCS_OUTPUT_2); |
| 1804 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept
DBCS: abs(len)==2 */ |
| 1805 value = _2022FromSJIS(value); |
| 1806 if(value != 0) { |
| 1807 targetValue = value; |
| 1808 len = len2; |
| 1809 cs = cs0; |
| 1810 g = 0; |
| 1811 useFallback = FALSE; |
| 1812 } |
| 1813 } else if(len == 0 && useFallback && |
| 1814 (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_E
ND - HWKANA_START)) { |
| 1815 targetValue = hwkana_fb[sourceChar - HWKANA_START]; |
| 1816 len = -2; |
| 1817 cs = cs0; |
| 1818 g = 0; |
| 1819 useFallback = FALSE; |
| 1820 } |
| 1821 break; |
| 1822 case ISO8859_7: |
| 1823 /* G0 SBCS forced to 7-bit output */ |
| 1824 len2 = MBCS_SINGLE_FROM_UCHAR32( |
| 1825 converterData->myConverterArray[cs0], |
| 1826 sourceChar, &value, |
| 1827 useFallback); |
| 1828 if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= val
ue && value <= GR96_END) { |
| 1829 targetValue = value - 0x80; |
| 1830 len = len2; |
| 1831 cs = cs0; |
| 1832 g = 2; |
| 1833 useFallback = FALSE; |
| 1834 } |
| 1835 break; |
| 1836 default: |
| 1837 /* G0 DBCS */ |
| 1838 len2 = MBCS_FROM_UCHAR32_ISO2022( |
| 1839 converterData->myConverterArray[cs0], |
| 1840 sourceChar, &value, |
| 1841 useFallback, MBCS_OUTPUT_2); |
| 1842 if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept
DBCS: abs(len)==2 */ |
| 1843 if(cs0 == KSC5601) { |
| 1844 /* |
| 1845 * Check for valid bytes for the encoding scheme. |
| 1846 * This is necessary because the sub-converter (wind
ows-949) |
| 1847 * has a broader encoding scheme than is valid for 2
022. |
| 1848 */ |
| 1849 value = _2022FromGR94DBCS(value); |
| 1850 if(value == 0) { |
| 1851 break; |
| 1852 } |
| 1853 } |
| 1854 targetValue = value; |
| 1855 len = len2; |
| 1856 cs = cs0; |
| 1857 g = 0; |
| 1858 useFallback = FALSE; |
| 1859 } |
| 1860 break; |
| 1861 } |
| 1862 } |
| 1863 /* a mapping was found if len != 0: assemble SI, designation escape, shift sequence and character bytes in buffer[] */ |
| 1864 if(len != 0) { |
| 1865 if(len < 0) { |
| 1866 len = -len; /* fallback */ |
| 1867 } |
| 1868 outLen = 0; /* count output bytes */ |
| 1869 |
| 1870 /* write SI if necessary (only for JIS7) */ |
| 1871 if(pFromU2022State->g == 1 && g == 0) { |
| 1872 buffer[outLen++] = UCNV_SI; |
| 1873 pFromU2022State->g = 0; |
| 1874 } |
| 1875 |
| 1876 /* write the designation sequence if necessary */ |
| 1877 if(cs != pFromU2022State->cs[g]) { |
| 1878 int32_t escLen = escSeqCharsLen[cs]; |
| 1879 uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen); |
| 1880 outLen += escLen; |
| 1881 pFromU2022State->cs[g] = cs; |
| 1882 |
| 1883 /* invalidate the choices[] */ |
| 1884 choiceCount = 0; |
| 1885 } |
| 1886 |
| 1887 /* write the shift sequence if necessary */ |
| 1888 if(g != pFromU2022State->g) { |
| 1889 switch(g) { |
| 1890 /* case 0 handled before writing escapes */ |
| 1891 case 1: |
| 1892 buffer[outLen++] = UCNV_SO; |
| 1893 pFromU2022State->g = 1; |
| 1894 break; |
| 1895 default: /* case 2: writes ESC 0x4e; pFromU2022State->g is not updated here */ |
| 1896 buffer[outLen++] = 0x1b; |
| 1897 buffer[outLen++] = 0x4e; |
| 1898 break; |
| 1899 /* no case 3: no SS3 in ISO-2022-JP-x */ |
| 1900 } |
| 1901 } |
| 1902 |
| 1903 /* write the output bytes */ |
| 1904 if(len == 1) { |
| 1905 buffer[outLen++] = (char)targetValue; |
| 1906 } else /* len == 2 */ { |
| 1907 buffer[outLen++] = (char)(targetValue >> 8); |
| 1908 buffer[outLen++] = (char)targetValue; |
| 1909 } |
| 1910 } else { |
| 1911 /* |
| 1912 * if we cannot find the character after checking all codepages |
| 1913 * then this is an error |
| 1914 */ |
| 1915 *err = U_INVALID_CHAR_FOUND; |
| 1916 cnv->fromUChar32=sourceChar; |
| 1917 break; |
| 1918 } |
| 1919 |
| 1920 if(sourceChar == CR || sourceChar == LF) { |
| 1921 /* reset the G2 state at the end of a line (conversion got us in
to ASCII or JISX201 already) */ |
| 1922 pFromU2022State->cs[2] = 0; |
| 1923 choiceCount = 0; |
| 1924 } |
| 1925 |
| 1926 /* output outLen>0 bytes in buffer[] */ |
| 1927 if(outLen == 1) { |
| 1928 *target++ = buffer[0]; |
| 1929 if(offsets) { |
| 1930 *offsets++ = (int32_t)(source - args->source - 1); /* -1: kn
own to be ASCII */ |
| 1931 } |
| 1932 } else if(outLen == 2 && (target + 2) <= targetLimit) { |
| 1933 *target++ = buffer[0]; |
| 1934 *target++ = buffer[1]; |
| 1935 if(offsets) { |
| 1936 int32_t sourceIndex = (int32_t)(source - args->source - U16_
LENGTH(sourceChar)); |
| 1937 *offsets++ = sourceIndex; |
| 1938 *offsets++ = sourceIndex; |
| 1939 } |
| 1940 } else { |
| 1941 fromUWriteUInt8( |
| 1942 cnv, |
| 1943 buffer, outLen, |
| 1944 &target, (const char *)targetLimit, |
| 1945 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourc
eChar)), |
| 1946 err); |
| 1947 if(U_FAILURE(*err)) { |
| 1948 break; |
| 1949 } |
| 1950 } |
| 1951 } /* end if(target < targetLimit) */ |
| 1952 else{ |
| 1953 *err =U_BUFFER_OVERFLOW_ERROR; |
| 1954 break; |
| 1955 } |
| 1956 |
| 1957 }/* end while(source < sourceLimit) */ |
| 1958 |
| 1959 /* |
| 1960 * the end of the input stream and detection of truncated input |
| 1961 * are handled by the framework, but for ISO-2022-JP conversion |
| 1962 * we need to be in ASCII mode at the very end |
| 1963 * |
| 1964 * conditions: |
| 1965 * successful |
| 1966 * in SO mode or not in ASCII mode |
| 1967 * end of input and no truncated input |
| 1968 */ |
| 1969 if( U_SUCCESS(*err) && |
| 1970 (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) && |
| 1971 args->flush && source>=sourceLimit && cnv->fromUChar32==0 |
| 1972 ) { |
| 1973 int32_t sourceIndex; |
| 1974 |
| 1975 outLen = 0; |
| 1976 |
| 1977 if(pFromU2022State->g != 0) { |
| 1978 buffer[outLen++] = UCNV_SI; |
| 1979 pFromU2022State->g = 0; |
| 1980 } |
| 1981 |
| 1982 if(pFromU2022State->cs[0] != ASCII) { |
| 1983 int32_t escLen = escSeqCharsLen[ASCII]; |
| 1984 uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen); |
| 1985 outLen += escLen; |
| 1986 pFromU2022State->cs[0] = (int8_t)ASCII; |
| 1987 } |
| 1988 |
| 1989 /* get the source index of the last input character */ |
| 1990 /* |
| 1991 * TODO this would be simpler and more reliable if we used a pair |
| 1992 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c |
| 1993 * so that we could simply use the prevSourceIndex here; |
| 1994 * this code gives an incorrect result for the rare case of an unmatched |
| 1995 * trail surrogate that is alone in the last buffer of the text stream |
| 1996 */ |
| 1997 sourceIndex=(int32_t)(source-args->source); |
| 1998 if(sourceIndex>0) { |
| 1999 --sourceIndex; |
| 2000 if( U16_IS_TRAIL(args->source[sourceIndex]) && |
| 2001 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) |
| 2002 ) { |
| 2003 --sourceIndex; |
| 2004 } |
| 2005 } else { |
| 2006 sourceIndex=-1; |
| 2007 } |
| 2008 |
| 2009 fromUWriteUInt8( |
| 2010 cnv, |
| 2011 buffer, outLen, |
| 2012 &target, (const char *)targetLimit, |
| 2013 &offsets, sourceIndex, |
| 2014 err); |
| 2015 } |
| 2016 |
| 2017 /*save the state and return */ |
| 2018 args->source = source; |
| 2019 args->target = (char*)target; |
| 2020 } |
| 2021 |
| 2022 /*************** to unicode *******************/ |
| 2023 /* Converts ISO-2022-JP bytes from args->source to UTF-16 at args->target. SI/SO (honoured only when myData->version==3) and ESC sequences update myData->toU2022State, CR/LF reset the shift state, and every other byte is decoded as one or two bytes according to the current charset. Offsets are recorded when args->offsets is non-NULL; unmappable or illegal input is passed to toUnicodeCallback(); a DBCS lead byte with no following byte in this buffer is saved in toUBytes for the next call. */ |
| 2024 static void |
| 2025 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, |
| 2026 UErrorCode* err){ |
| 2027 char tempBuf[2]; /* byte pair handed to ucnv_MBCSSimpleGetNextUChar() for DBCS lookups */ |
| 2028 const char *mySource = (char *) args->source; |
| 2029 UChar *myTarget = args->target; |
| 2030 const char *mySourceLimit = args->sourceLimit; |
| 2031 uint32_t targetUniChar = 0x0000; |
| 2032 uint32_t mySourceChar = 0x0000; |
| 2033 uint32_t tmpSourceChar = 0x0000; |
| 2034 UConverterDataISO2022* myData; |
| 2035 ISO2022State *pToU2022State; |
| 2036 StateEnum cs; |
| 2037 |
| 2038 myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
| 2039 pToU2022State = &myData->toU2022State; |
| 2040 /* resume state left over from the previous call (partial escape sequence or saved lead byte) by jumping into the loop body */ |
| 2041 if(myData->key != 0) { |
| 2042 /* continue with a partial escape sequence */ |
| 2043 goto escape; |
| 2044 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myT
arget < args->targetLimit) { |
| 2045 /* continue with a partial double-byte character */ |
| 2046 mySourceChar = args->converter->toUBytes[0]; |
| 2047 args->converter->toULength = 0; |
| 2048 cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
| 2049 targetUniChar = missingCharMarker; |
| 2050 goto getTrailByte; |
| 2051 } |
| 2052 |
| 2053 while(mySource < mySourceLimit){ |
| 2054 |
| 2055 targetUniChar =missingCharMarker; |
| 2056 |
| 2057 if(myTarget < args->targetLimit){ |
| 2058 |
| 2059 mySourceChar= (unsigned char) *mySource++; |
| 2060 |
| 2061 switch(mySourceChar) { |
| 2062 case UCNV_SI: |
| 2063 if(myData->version==3) { |
| 2064 pToU2022State->g=0; |
| 2065 continue; |
| 2066 } else { |
| 2067 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ |
| 2068 myData->isEmptySegment = FALSE; /* reset this, we have a
different error */ |
| 2069 break; |
| 2070 } |
| 2071 |
| 2072 case UCNV_SO: |
| 2073 if(myData->version==3) { |
| 2074 /* JIS7: switch to G1 half-width Katakana */ |
| 2075 pToU2022State->cs[1] = (int8_t)HWKANA_7BIT; |
| 2076 pToU2022State->g=1; |
| 2077 continue; |
| 2078 } else { |
| 2079 /* only JIS7 uses SI/SO, not ISO-2022-JP-x */ |
| 2080 myData->isEmptySegment = FALSE; /* reset this, we have a
different error */ |
| 2081 break; |
| 2082 } |
| 2083 |
| 2084 case ESC_2022: |
| 2085 mySource--; |
| 2086 escape: |
| 2087 { |
| 2088 const char * mySourceBefore = mySource; |
| 2089 int8_t toULengthBefore = args->converter->toULength; |
| 2090 |
| 2091 changeState_2022(args->converter,&(mySource), |
| 2092 mySourceLimit, ISO_2022_JP,err); |
| 2093 |
| 2094 /* If in ISO-2022-JP only and we successfully completed an es
cape sequence, but previous segment was empty, create an error */ |
| 2095 if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) &
& myData->isEmptySegment) { |
| 2096 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 2097 args->converter->toUCallbackReason = UCNV_IRREGULAR; |
| 2098 args->converter->toULength = (int8_t)(toULengthBefore +
(mySource - mySourceBefore)); |
| 2099 } |
| 2100 } |
| 2101 |
| 2102 /* invalid or illegal escape sequence */ |
| 2103 if(U_FAILURE(*err)){ |
| 2104 args->target = myTarget; |
| 2105 args->source = mySource; |
| 2106 myData->isEmptySegment = FALSE; /* Reset to avoid future
spurious errors */ |
| 2107 return; |
| 2108 } |
| 2109 /* If we successfully completed an escape sequence, we begin a n
ew segment, empty so far */ |
| 2110 if(myData->key==0) { |
| 2111 myData->isEmptySegment = TRUE; |
| 2112 } |
| 2113 continue; |
| 2114 |
| 2115 /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */ |
| 2116 |
| 2117 case CR: |
| 2118 /*falls through*/ |
| 2119 case LF: |
| 2120 /* automatically reset to single-byte mode */ |
| 2121 if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU20
22State->cs[0] != JISX201) { |
| 2122 pToU2022State->cs[0] = (int8_t)ASCII; |
| 2123 } |
| 2124 pToU2022State->cs[2] = 0; |
| 2125 pToU2022State->g = 0; |
| 2126 /* falls through */ |
| 2127 default: |
| 2128 /* convert one or two bytes */ |
| 2129 myData->isEmptySegment = FALSE; |
| 2130 cs = (StateEnum)pToU2022State->cs[pToU2022State->g]; |
| 2131 if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->v
ersion==4 && |
| 2132 !IS_JP_DBCS(cs) |
| 2133 ) { |
| 2134 /* 8-bit halfwidth katakana in any single-byte mode for JIS8
*/ |
| 2135 targetUniChar = mySourceChar + (HWKANA_START - 0xa1); |
| 2136 |
| 2137 /* return from a single-shift state to the previous one */ |
| 2138 if(pToU2022State->g >= 2) { |
| 2139 pToU2022State->g=pToU2022State->prevG; |
| 2140 } |
| 2141 } else switch(cs) { |
| 2142 case ASCII: |
| 2143 if(mySourceChar <= 0x7f) { |
| 2144 targetUniChar = mySourceChar; |
| 2145 } |
| 2146 break; |
| 2147 case ISO8859_1: |
| 2148 if(mySourceChar <= 0x7f) { |
| 2149 targetUniChar = mySourceChar + 0x80; |
| 2150 } |
| 2151 /* return from a single-shift state to the previous one */ |
| 2152 pToU2022State->g=pToU2022State->prevG; |
| 2153 break; |
| 2154 case ISO8859_7: |
| 2155 if(mySourceChar <= 0x7f) { |
| 2156 /* convert mySourceChar+0x80 to use a normal 8-bit table
*/ |
| 2157 targetUniChar = |
| 2158 _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( |
| 2159 myData->myConverterArray[cs], |
| 2160 mySourceChar + 0x80); |
| 2161 } |
| 2162 /* return from a single-shift state to the previous one */ |
| 2163 pToU2022State->g=pToU2022State->prevG; |
| 2164 break; |
| 2165 case JISX201: |
| 2166 if(mySourceChar <= 0x7f) { |
| 2167 targetUniChar = jisx201ToU(mySourceChar); |
| 2168 } |
| 2169 break; |
| 2170 case HWKANA_7BIT: |
| 2171 if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) { |
| 2172 /* 7-bit halfwidth Katakana */ |
| 2173 targetUniChar = mySourceChar + (HWKANA_START - 0x21); |
| 2174 } |
| 2175 break; |
| 2176 default: |
| 2177 /* G0 DBCS */ |
| 2178 if(mySource < mySourceLimit) { |
| 2179 int leadIsOk, trailIsOk; |
| 2180 uint8_t trailByte; |
| 2181 getTrailByte: |
| 2182 trailByte = (uint8_t)*mySource; |
| 2183 /* |
| 2184 * Ticket 5691: consistent illegal sequences: |
| 2185 * - We include at least the first byte in the illegal s
equence. |
| 2186 * - If any of the non-initial bytes could be the start
of a character, |
| 2187 * we stop the illegal sequence before the first one o
f those. |
| 2188 * |
| 2189 * In ISO-2022 DBCS, if the second byte is in the 21..7e
range or is |
| 2190 * an ESC/SO/SI, we report only the first byte as the il
legal sequence. |
| 2191 * Otherwise we convert or report the pair of bytes. |
| 2192 */ |
| 2193 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x2
1); |
| 2194 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21)
; |
| 2195 if (leadIsOk && trailIsOk) { |
| 2196 ++mySource; |
| 2197 tmpSourceChar = (mySourceChar << 8) | trailByte; |
| 2198 if(cs == JISX208) { |
| 2199 _2022ToSJIS((uint8_t)mySourceChar, trailByte, te
mpBuf); |
| 2200 mySourceChar = tmpSourceChar; |
| 2201 } else { |
| 2202 /* Copy before we modify tmpSourceChar so toUnic
odeCallback() sees the correct bytes. */ |
| 2203 mySourceChar = tmpSourceChar; |
| 2204 if (cs == KSC5601) { |
| 2205 tmpSourceChar += 0x8080; /* = _2022ToGR94DB
CS(tmpSourceChar) */ |
| 2206 } |
| 2207 tempBuf[0] = (char)(tmpSourceChar >> 8); |
| 2208 tempBuf[1] = (char)(tmpSourceChar); |
| 2209 } |
| 2210 targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->
myConverterArray[cs], tempBuf, 2, FALSE); |
| 2211 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { |
| 2212 /* report a pair of illegal bytes if the second byte
is not a DBCS starter */ |
| 2213 ++mySource; |
| 2214 /* add another bit so that the code below writes 2 b
ytes in case of error */ |
| 2215 mySourceChar = 0x10000 | (mySourceChar << 8) | trail
Byte; |
| 2216 } |
| 2217 } else { |
| 2218 args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
| 2219 args->converter->toULength = 1; |
| 2220 goto endloop; |
| 2221 } |
| 2222 } /* End of inner switch */ |
| 2223 break; |
| 2224 } /* End of outer switch */ |
| 2225 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ |
| 2226 if(args->offsets){ |
| 2227 args->offsets[myTarget - args->target] = (int32_t)(mySource
- args->source - (mySourceChar <= 0xff ? 1 : 2)); |
| 2228 } |
| 2229 *(myTarget++)=(UChar)targetUniChar; |
| 2230 } |
| 2231 else if(targetUniChar > missingCharMarker){ |
| 2232 /* disassemble the surrogate pair and write to output*/ |
| 2233 targetUniChar-=0x0010000; |
| 2234 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); |
| 2235 if(args->offsets){ |
| 2236 args->offsets[myTarget - args->target] = (int32_t)(mySource
- args->source - (mySourceChar <= 0xff ? 1 : 2)); |
| 2237 } |
| 2238 ++myTarget; |
| 2239 if(myTarget< args->targetLimit){ |
| 2240 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
| 2241 if(args->offsets){ |
| 2242 args->offsets[myTarget - args->target] = (int32_t)(mySou
rce - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
| 2243 } |
| 2244 ++myTarget; |
| 2245 }else{ |
| 2246 args->converter->UCharErrorBuffer[args->converter->UCharErro
rBufferLength++]= |
| 2247 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff))
; |
| 2248 } |
| 2249 |
| 2250 } |
| 2251 else{ |
| 2252 /* Call the callback function*/ |
| 2253 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err
); |
| 2254 break; |
| 2255 } |
| 2256 } |
| 2257 else{ /* goes with "if(myTarget < args->targetLimit)" way up near to
p of function */ |
| 2258 *err =U_BUFFER_OVERFLOW_ERROR; |
| 2259 break; |
| 2260 } |
| 2261 } |
| 2262 endloop: /* write the advanced source/target pointers back to the caller's args */ |
| 2263 args->target = myTarget; |
| 2264 args->source = mySource; |
| 2265 } |
| 2266 |
| 2267 |
| 2268 /*************************************************************** |
| 2269 * Rules for ISO-2022-KR encoding |
| 2270 * i) The KSC5601 designator sequence should appear only once in a file, |
| 2271 * at the beginning of a line before any KSC5601 characters. This usually |
| 2272 * means that it appears by itself on the first line of the file |
| 2273 * ii) There are only 2 shifting sequences SO to shift into double byte mode |
| 2274 * and SI to shift into single byte mode |
| 2275 */ |
| 2276 static void |
| 2277 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs*
args, UErrorCode* err){ |
| 2278 /* Runs the from-Unicode conversion through myConverterData->currentConverter: temporarily points args->converter at it, calls ucnv_MBCSFromUnicodeWithOffsets(), then copies the pending code point and any overflow bytes back to the public converter and restores args->converter. */ |
| 2279 UConverter* saveConv = args->converter; |
| 2280 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->ext
raInfo; |
| 2281 args->converter=myConverterData->currentConverter; |
| 2282 /* hand the pending code point to the sub-converter and take it back after the call */ |
| 2283 myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32; |
| 2284 ucnv_MBCSFromUnicodeWithOffsets(args,err); |
| 2285 saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32; |
| 2286 /* on overflow, move the sub-converter's charErrorBuffer contents to the public converter and clear the sub-converter's count */ |
| 2287 if(*err == U_BUFFER_OVERFLOW_ERROR) { |
| 2288 if(myConverterData->currentConverter->charErrorBufferLength > 0) { |
| 2289 uprv_memcpy( |
| 2290 saveConv->charErrorBuffer, |
| 2291 myConverterData->currentConverter->charErrorBuffer, |
| 2292 myConverterData->currentConverter->charErrorBufferLength); |
| 2293 } |
| 2294 saveConv->charErrorBufferLength = myConverterData->currentConverter->cha
rErrorBufferLength; |
| 2295 myConverterData->currentConverter->charErrorBufferLength = 0; |
| 2296 } |
| 2297 args->converter=saveConv; |
| 2298 } |
| 2299 /* Converts the UTF-16 text in args->source to ISO-2022-KR bytes at args->target. When converterData->version==1 the work is delegated to the _IBM variant. Otherwise each code point is mapped through the current sub-converter's shared data, SO/SI is written whenever the output switches between double-byte and single-byte, and double-byte values are written with 0x80 subtracted from each byte. Offsets are recorded when args->offsets is non-NULL; on flush at the end of the input a final SHIFT_IN_STR byte is written if the converter is still in double-byte mode. */ |
| 2300 static void |
| 2301 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
, UErrorCode* err){ |
| 2302 |
| 2303 const UChar *source = args->source; |
| 2304 const UChar *sourceLimit = args->sourceLimit; |
| 2305 unsigned char *target = (unsigned char *) args->target; |
| 2306 unsigned char *targetLimit = (unsigned char *) args->targetLimit; |
| 2307 int32_t* offsets = args->offsets; |
| 2308 uint32_t targetByteUnit = 0x0000; |
| 2309 UChar32 sourceChar = 0x0000; |
| 2310 UBool isTargetByteDBCS; |
| 2311 UBool oldIsTargetByteDBCS; |
| 2312 UConverterDataISO2022 *converterData; |
| 2313 UConverterSharedData* sharedData; |
| 2314 UBool useFallback; |
| 2315 int32_t length =0; |
| 2316 |
| 2317 converterData=(UConverterDataISO2022*)args->converter->extraInfo; |
| 2318 /* if the version is 1 then the user is requesting |
| 2319 * conversion with ibm-25546 pass the arguments to |
| 2320 * MBCS converter and return |
| 2321 */ |
| 2322 if(converterData->version==1){ |
| 2323 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); |
| 2324 return; |
| 2325 } |
| 2326 |
| 2327 /* initialize data */ |
| 2328 sharedData = converterData->currentConverter->sharedData; |
| 2329 useFallback = args->converter->useFallback; |
| 2330 isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus; |
| 2331 oldIsTargetByteDBCS = isTargetByteDBCS; |
| 2332 /* NOTE(review): the next statement repeats the assignment made three lines above; it appears redundant */ |
| 2333 isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus; |
| 2334 if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) { |
| 2335 goto getTrail; |
| 2336 } |
| 2337 while(source < sourceLimit){ |
| 2338 |
| 2339 targetByteUnit = missingCharMarker; |
| 2340 |
| 2341 if(target < (unsigned char*) args->targetLimit){ |
| 2342 sourceChar = *source++; |
| 2343 |
| 2344 /* do not convert SO/SI/ESC */ |
| 2345 if(IS_2022_CONTROL(sourceChar)) { |
| 2346 /* callback(illegal) */ |
| 2347 *err=U_ILLEGAL_CHAR_FOUND; |
| 2348 args->converter->fromUChar32=sourceChar; |
| 2349 break; |
| 2350 } |
| 2351 |
| 2352 length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByte
Unit,useFallback,MBCS_OUTPUT_2); |
| 2353 if(length < 0) { |
| 2354 length = -length; /* fallback */ |
| 2355 } |
| 2356 /* only DBCS or SBCS characters are expected*/ |
| 2357 /* DB characters with high bit set to 1 are expected */ |
| 2358 if( length > 2 || length==0 || |
| 2359 (length == 1 && targetByteUnit > 0x7f) || |
| 2360 (length == 2 && |
| 2361 ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) || |
| 2362 (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1))) |
| 2363 ) { |
| 2364 targetByteUnit=missingCharMarker; |
| 2365 } |
| 2366 if (targetByteUnit != missingCharMarker){ |
| 2367 |
| 2368 oldIsTargetByteDBCS = isTargetByteDBCS; |
| 2369 isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF); |
| 2370 /* append the shift sequence */ |
| 2371 if (oldIsTargetByteDBCS != isTargetByteDBCS ){ |
| 2372 |
| 2373 if (isTargetByteDBCS) |
| 2374 *target++ = UCNV_SO; |
| 2375 else |
| 2376 *target++ = UCNV_SI; |
| 2377 if(offsets) |
| 2378 *(offsets++) = (int32_t)(source - args->source-1); |
| 2379 } |
| 2380 /* write targetByteUnit to target; bytes that do not fit go to charErrorBuffer */ |
| 2381 if(targetByteUnit <= 0x00FF){ |
| 2382 if( target < targetLimit){ |
| 2383 *(target++) = (unsigned char) targetByteUnit; |
| 2384 if(offsets){ |
| 2385 *(offsets++) = (int32_t)(source - args->source-1); |
| 2386 } |
| 2387 |
| 2388 }else{ |
| 2389 args->converter->charErrorBuffer[args->converter->charEr
rorBufferLength++] = (unsigned char) (targetByteUnit); |
| 2390 *err = U_BUFFER_OVERFLOW_ERROR; |
| 2391 } |
| 2392 }else{ |
| 2393 if(target < targetLimit){ |
| 2394 *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80)
; |
| 2395 if(offsets){ |
| 2396 *(offsets++) = (int32_t)(source - args->source-1); |
| 2397 } |
| 2398 if(target < targetLimit){ |
| 2399 *(target++) =(unsigned char) (targetByteUnit -0x80); |
| 2400 if(offsets){ |
| 2401 *(offsets++) = (int32_t)(source - args->source-1
); |
| 2402 } |
| 2403 }else{ |
| 2404 args->converter->charErrorBuffer[args->converter->ch
arErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80); |
| 2405 *err = U_BUFFER_OVERFLOW_ERROR; |
| 2406 } |
| 2407 }else{ |
| 2408 args->converter->charErrorBuffer[args->converter->charEr
rorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80); |
| 2409 args->converter->charErrorBuffer[args->converter->charEr
rorBufferLength++] = (unsigned char) (targetByteUnit-0x80); |
| 2410 *err = U_BUFFER_OVERFLOW_ERROR; |
| 2411 } |
| 2412 } |
| 2413 |
| 2414 } |
| 2415 else{ |
| 2416 /* oops.. the code point is unassigned |
| 2417 * set the error and reason |
| 2418 */ |
| 2419 |
| 2420 /*check if the char is a First surrogate*/ |
| 2421 if(UTF_IS_SURROGATE(sourceChar)) { |
| 2422 if(UTF_IS_SURROGATE_FIRST(sourceChar)) { |
| 2423 getTrail: |
| 2424 /*look ahead to find the trail surrogate*/ |
| 2425 if(source < sourceLimit) { |
| 2426 /* test the following code unit */ |
| 2427 UChar trail=(UChar) *source; |
| 2428 if(UTF_IS_SECOND_SURROGATE(trail)) { |
| 2429 source++; |
| 2430 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trai
l); |
| 2431 *err = U_INVALID_CHAR_FOUND; |
| 2432 /* convert this surrogate code point */ |
| 2433 /* exit this condition tree */ |
| 2434 } else { |
| 2435 /* this is an unmatched lead code unit (1st surr
ogate) */ |
| 2436 /* callback(illegal) */ |
| 2437 *err=U_ILLEGAL_CHAR_FOUND; |
| 2438 } |
| 2439 } else { |
| 2440 /* no more input */ |
| 2441 *err = U_ZERO_ERROR; |
| 2442 } |
| 2443 } else { |
| 2444 /* this is an unmatched trail code unit (2nd surrogate)
*/ |
| 2445 /* callback(illegal) */ |
| 2446 *err=U_ILLEGAL_CHAR_FOUND; |
| 2447 } |
| 2448 } else { |
| 2449 /* callback(unassigned) for a BMP code point */ |
| 2450 *err = U_INVALID_CHAR_FOUND; |
| 2451 } |
| 2452 |
| 2453 args->converter->fromUChar32=sourceChar; |
| 2454 break; |
| 2455 } |
| 2456 } /* end if(target < targetLimit) */ |
| 2457 else{ |
| 2458 *err =U_BUFFER_OVERFLOW_ERROR; |
| 2459 break; |
| 2460 } |
| 2461 |
| 2462 }/* end while(source < sourceLimit) */ |
| 2463 |
| 2464 /* |
| 2465 * the end of the input stream and detection of truncated input |
| 2466 * are handled by the framework, but for ISO-2022-KR conversion |
| 2467 * we need to be in ASCII mode at the very end |
| 2468 * |
| 2469 * conditions: |
| 2470 * successful |
| 2471 * not in ASCII mode |
| 2472 * end of input and no truncated input |
| 2473 */ |
| 2474 if( U_SUCCESS(*err) && |
| 2475 isTargetByteDBCS && |
| 2476 args->flush && source>=sourceLimit && args->converter->fromUChar32==0 |
| 2477 ) { |
| 2478 int32_t sourceIndex; |
| 2479 |
| 2480 /* we are switching to ASCII */ |
| 2481 isTargetByteDBCS=FALSE; |
| 2482 |
| 2483 /* get the source index of the last input character */ |
| 2484 /* |
| 2485 * TODO this would be simpler and more reliable if we used a pair |
| 2486 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c |
| 2487 * so that we could simply use the prevSourceIndex here; |
| 2488 * this code gives an incorrect result for the rare case of an unmatched |
| 2489 * trail surrogate that is alone in the last buffer of the text stream |
| 2490 */ |
| 2491 sourceIndex=(int32_t)(source-args->source); |
| 2492 if(sourceIndex>0) { |
| 2493 --sourceIndex; |
| 2494 if( U16_IS_TRAIL(args->source[sourceIndex]) && |
| 2495 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) |
| 2496 ) { |
| 2497 --sourceIndex; |
| 2498 } |
| 2499 } else { |
| 2500 sourceIndex=-1; |
| 2501 } |
| 2502 |
| 2503 fromUWriteUInt8( |
| 2504 args->converter, |
| 2505 SHIFT_IN_STR, 1, |
| 2506 &target, (const char *)targetLimit, |
| 2507 &offsets, sourceIndex, |
| 2508 err); |
| 2509 } |
| 2510 |
| 2511 /*save the state and return */ |
| 2512 args->source = source; |
| 2513 args->target = (char*)target; |
| 2514 args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS; |
| 2515 } |
| 2516 |
| 2517 /************************ To Unicode ***************************************/ |
| 2518 |
| 2519 static void |
| 2520 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args
, |
| 2521 UErrorCode* err){ |
| 2522 char const* sourceStart; |
| 2523 UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extr
aInfo); |
| 2524 |
| 2525 UConverterToUnicodeArgs subArgs; |
| 2526 int32_t minArgsSize; |
| 2527 |
| 2528 /* set up the subconverter arguments */ |
| 2529 if(args->size<sizeof(UConverterToUnicodeArgs)) { |
| 2530 minArgsSize = args->size; |
| 2531 } else { |
| 2532 minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs); |
| 2533 } |
| 2534 |
| 2535 uprv_memcpy(&subArgs, args, minArgsSize); |
| 2536 subArgs.size = (uint16_t)minArgsSize; |
| 2537 subArgs.converter = myData->currentConverter; |
| 2538 |
| 2539 /* remember the original start of the input for offsets */ |
| 2540 sourceStart = args->source; |
| 2541 |
| 2542 if(myData->key != 0) { |
| 2543 /* continue with a partial escape sequence */ |
| 2544 goto escape; |
| 2545 } |
| 2546 |
| 2547 while(U_SUCCESS(*err) && args->source < args->sourceLimit) { |
| 2548 /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/ |
| 2549 subArgs.source = args->source; |
| 2550 subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceL
imit, args->flush); |
| 2551 if(subArgs.source != subArgs.sourceLimit) { |
| 2552 /* |
| 2553 * get the current partial byte sequence |
| 2554 * |
| 2555 * it needs to be moved between the public and the subconverter |
| 2556 * so that the conversion framework, which only sees the public |
| 2557 * converter, can handle truncated and illegal input etc. |
| 2558 */ |
| 2559 if(args->converter->toULength > 0) { |
| 2560 uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUByt
es, args->converter->toULength); |
| 2561 } |
| 2562 subArgs.converter->toULength = args->converter->toULength; |
| 2563 |
| 2564 /* |
| 2565 * Convert up to the end of the input, or to before the next escape
character. |
| 2566 * Does not handle conversion extensions because the preToU[] state
etc. |
| 2567 * is not copied. |
| 2568 */ |
| 2569 ucnv_MBCSToUnicodeWithOffsets(&subArgs, err); |
| 2570 |
| 2571 if(args->offsets != NULL && sourceStart != args->source) { |
| 2572 /* update offsets to base them on the actual start of the input
*/ |
| 2573 int32_t *offsets = args->offsets; |
| 2574 UChar *target = args->target; |
| 2575 int32_t delta = (int32_t)(args->source - sourceStart); |
| 2576 while(target < subArgs.target) { |
| 2577 if(*offsets >= 0) { |
| 2578 *offsets += delta; |
| 2579 } |
| 2580 ++offsets; |
| 2581 ++target; |
| 2582 } |
| 2583 } |
| 2584 args->source = subArgs.source; |
| 2585 args->target = subArgs.target; |
| 2586 args->offsets = subArgs.offsets; |
| 2587 |
| 2588 /* copy input/error/overflow buffers */ |
| 2589 if(subArgs.converter->toULength > 0) { |
| 2590 uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUByt
es, subArgs.converter->toULength); |
| 2591 } |
| 2592 args->converter->toULength = subArgs.converter->toULength; |
| 2593 |
| 2594 if(*err == U_BUFFER_OVERFLOW_ERROR) { |
| 2595 if(subArgs.converter->UCharErrorBufferLength > 0) { |
| 2596 uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.conve
rter->UCharErrorBuffer, |
| 2597 subArgs.converter->UCharErrorBufferLength); |
| 2598 } |
| 2599 args->converter->UCharErrorBufferLength=subArgs.converter->UChar
ErrorBufferLength; |
| 2600 subArgs.converter->UCharErrorBufferLength = 0; |
| 2601 } |
| 2602 } |
| 2603 |
| 2604 if (U_FAILURE(*err) || (args->source == args->sourceLimit)) { |
| 2605 return; |
| 2606 } |
| 2607 |
| 2608 escape: |
| 2609 changeState_2022(args->converter, |
| 2610 &(args->source), |
| 2611 args->sourceLimit, |
| 2612 ISO_2022_KR, |
| 2613 err); |
| 2614 } |
| 2615 } |
| 2616 |
| 2617 static void |
| 2618 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, |
| 2619 UErrorCode* err){ |
| 2620 char tempBuf[2]; |
| 2621 const char *mySource = ( char *) args->source; |
| 2622 UChar *myTarget = args->target; |
| 2623 const char *mySourceLimit = args->sourceLimit; |
| 2624 UChar32 targetUniChar = 0x0000; |
| 2625 UChar mySourceChar = 0x0000; |
| 2626 UConverterDataISO2022* myData; |
| 2627 UConverterSharedData* sharedData ; |
| 2628 UBool useFallback; |
| 2629 |
| 2630 myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
| 2631 if(myData->version==1){ |
| 2632 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err); |
| 2633 return; |
| 2634 } |
| 2635 |
| 2636 /* initialize state */ |
| 2637 sharedData = myData->currentConverter->sharedData; |
| 2638 useFallback = args->converter->useFallback; |
| 2639 |
| 2640 if(myData->key != 0) { |
| 2641 /* continue with a partial escape sequence */ |
| 2642 goto escape; |
| 2643 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myT
arget < args->targetLimit) { |
| 2644 /* continue with a partial double-byte character */ |
| 2645 mySourceChar = args->converter->toUBytes[0]; |
| 2646 args->converter->toULength = 0; |
| 2647 goto getTrailByte; |
| 2648 } |
| 2649 |
| 2650 while(mySource< mySourceLimit){ |
| 2651 |
| 2652 if(myTarget < args->targetLimit){ |
| 2653 |
| 2654 mySourceChar= (unsigned char) *mySource++; |
| 2655 |
| 2656 if(mySourceChar==UCNV_SI){ |
| 2657 myData->toU2022State.g = 0; |
| 2658 if (myData->isEmptySegment) { |
| 2659 myData->isEmptySegment = FALSE; /* we are handling it, r
eset to avoid future spurious errors */ |
| 2660 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 2661 args->converter->toUCallbackReason = UCNV_IRREGULAR; |
| 2662 args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
| 2663 args->converter->toULength = 1; |
| 2664 args->target = myTarget; |
| 2665 args->source = mySource; |
| 2666 return; |
| 2667 } |
| 2668 /*consume the source */ |
| 2669 continue; |
| 2670 }else if(mySourceChar==UCNV_SO){ |
| 2671 myData->toU2022State.g = 1; |
| 2672 myData->isEmptySegment = TRUE; /* Begin a new segment, empty so
far */ |
| 2673 /*consume the source */ |
| 2674 continue; |
| 2675 }else if(mySourceChar==ESC_2022){ |
| 2676 mySource--; |
| 2677 escape: |
| 2678 myData->isEmptySegment = FALSE; /* Any invalid ESC sequences wil
l be detected separately, so just reset this */ |
| 2679 changeState_2022(args->converter,&(mySource), |
| 2680 mySourceLimit, ISO_2022_KR, err); |
| 2681 if(U_FAILURE(*err)){ |
| 2682 args->target = myTarget; |
| 2683 args->source = mySource; |
| 2684 return; |
| 2685 } |
| 2686 continue; |
| 2687 } |
| 2688 |
| 2689 myData->isEmptySegment = FALSE; /* Any invalid char errors will
be detected separately, so just reset this */ |
| 2690 if(myData->toU2022State.g == 1) { |
| 2691 if(mySource < mySourceLimit) { |
| 2692 int leadIsOk, trailIsOk; |
| 2693 uint8_t trailByte; |
| 2694 getTrailByte: |
| 2695 targetUniChar = missingCharMarker; |
| 2696 trailByte = (uint8_t)*mySource; |
| 2697 /* |
| 2698 * Ticket 5691: consistent illegal sequences: |
| 2699 * - We include at least the first byte in the illegal seque
nce. |
| 2700 * - If any of the non-initial bytes could be the start of a
character, |
| 2701 * we stop the illegal sequence before the first one of th
ose. |
| 2702 * |
| 2703 * In ISO-2022 DBCS, if the second byte is in the 21..7e ran
ge or is |
| 2704 * an ESC/SO/SI, we report only the first byte as the illega
l sequence. |
| 2705 * Otherwise we convert or report the pair of bytes. |
| 2706 */ |
| 2707 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21); |
| 2708 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21); |
| 2709 if (leadIsOk && trailIsOk) { |
| 2710 ++mySource; |
| 2711 tempBuf[0] = (char)(mySourceChar + 0x80); |
| 2712 tempBuf[1] = (char)(trailByte + 0x80); |
| 2713 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData,
tempBuf, 2, useFallback); |
| 2714 mySourceChar = (mySourceChar << 8) | trailByte; |
| 2715 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { |
| 2716 /* report a pair of illegal bytes if the second byte is
not a DBCS starter */ |
| 2717 ++mySource; |
| 2718 /* add another bit so that the code below writes 2 bytes
in case of error */ |
| 2719 mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte
; |
| 2720 } |
| 2721 } else { |
| 2722 args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
| 2723 args->converter->toULength = 1; |
| 2724 break; |
| 2725 } |
| 2726 } |
| 2727 else if(mySourceChar <= 0x7f) { |
| 2728 targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource
- 1, 1, useFallback); |
| 2729 } else { |
| 2730 targetUniChar = 0xffff; |
| 2731 } |
| 2732 if(targetUniChar < 0xfffe){ |
| 2733 if(args->offsets) { |
| 2734 args->offsets[myTarget - args->target] = (int32_t)(mySource
- args->source - (mySourceChar <= 0xff ? 1 : 2)); |
| 2735 } |
| 2736 *(myTarget++)=(UChar)targetUniChar; |
| 2737 } |
| 2738 else { |
| 2739 /* Call the callback function*/ |
| 2740 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err
); |
| 2741 break; |
| 2742 } |
| 2743 } |
| 2744 else{ |
| 2745 *err =U_BUFFER_OVERFLOW_ERROR; |
| 2746 break; |
| 2747 } |
| 2748 } |
| 2749 args->target = myTarget; |
| 2750 args->source = mySource; |
| 2751 } |
| 2752 |
| 2753 /*************************** END ISO2022-KR *********************************/ |
| 2754 |
| 2755 /*************************** ISO-2022-CN ********************************* |
| 2756 * |
| 2757 * Rules for ISO-2022-CN Encoding: |
| 2758 * i) The designator sequence must appear once on a line before any instance |
| 2759 * of character set it designates. |
| 2760 * ii) If two lines contain characters from the same character set, both lines |
| 2761 * must include the designator sequence. |
| 2762 * iii) Once the designator sequence is known, a shifting sequence has to be foun
d |
| 2763 * to invoke the shifting |
| 2764 * iv) All lines start in ASCII and end in ASCII. |
| 2765 * v) Four shifting sequences are employed for this purpose: |
| 2766 * |
| 2767 * Sequence ASCII Eq Charsets |
| 2768 * ---------- ------- --------- |
| 2769 * SI <SI> US-ASCII |
| 2770 * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165 |
| 2771 * SS2 <ESC>N CNS-11643-1992 Plane 2 |
| 2772 * SS3 <ESC>O CNS-11643-1992 Planes 3-7 |
| 2773 * |
| 2774 * vi) |
| 2775 * SOdesignator : ESC "$" ")" finalchar_for_SO |
| 2776 * SS2designator : ESC "$" "*" finalchar_for_SS2 |
| 2777 * SS3designator : ESC "$" "+" finalchar_for_SS3 |
| 2778 * |
| 2779 * ESC $ ) A Indicates the bytes following SO are Chinese |
| 2780 * characters as defined in GB 2312-80, until |
| 2781 * another SOdesignation appears |
| 2782 * |
| 2783 * |
| 2784 * ESC $ ) E Indicates the bytes following SO are as defined |
| 2785 * in ISO-IR-165 (for details, see section 2.1), |
| 2786 * until another SOdesignation appears |
| 2787 * |
| 2788 * ESC $ ) G Indicates the bytes following SO are as defined |
| 2789 * in CNS 11643-plane-1, until another |
| 2790 * SOdesignation appears |
| 2791 * |
| 2792 * ESC $ * H Indicates the two bytes immediately following |
| 2793 * SS2 is a Chinese character as defined in CNS |
| 2794 * 11643-plane-2, until another SS2designation |
| 2795 * appears |
| 2796 * (Meaning <ESC>N must precede every 2 byte |
| 2797 * sequence.) |
| 2798 * |
| 2799 * ESC $ + I Indicates the immediate two bytes following SS3 |
| 2800 * is a Chinese character as defined in CNS |
| 2801 * 11643-plane-3, until another SS3designation |
| 2802 * appears |
| 2803 * (Meaning <ESC>O must precede every 2 byte |
| 2804 * sequence.) |
| 2805 * |
| 2806 * ESC $ + J Indicates the immediate two bytes following SS3 |
| 2807 * is a Chinese character as defined in CNS |
| 2808 * 11643-plane-4, until another SS3designation |
| 2809 * appears |
| 2810 * (In English: <ESC>O must precede every 2 byte |
| 2811 * sequence.) |
| 2812 * |
| 2813 * ESC $ + K Indicates the immediate two bytes following SS3 |
| 2814 * is a Chinese character as defined in CNS |
| 2815 * 11643-plane-5, until another SS3designation |
| 2816 * appears |
| 2817 * |
| 2818 * ESC $ + L Indicates the immediate two bytes following SS3 |
| 2819 * is a Chinese character as defined in CNS |
| 2820 * 11643-plane-6, until another SS3designation |
| 2821 * appears |
| 2822 * |
| 2823 * ESC $ + M Indicates the immediate two bytes following SS3 |
| 2824 * is a Chinese character as defined in CNS |
| 2825 * 11643-plane-7, until another SS3designation |
| 2826 * appears |
| 2827 * |
| 2828 * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and |
| 2829 * has its own designation information before any Chinese characters |
| 2830 * appear |
| 2831 * |
| 2832 */ |
| 2833 |
/*
 * ISO-2022-CN designator escape sequences (ESC '$' <intermediate> <final>).
 * They are declared as const char arrays so that the strings are truly read-only.
 */
static const char GB_2312_80_STR[]             = "\x1B\x24\x29\x41"; /* ESC $ ) A */
static const char ISO_IR_165_STR[]             = "\x1B\x24\x29\x45"; /* ESC $ ) E */
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
| 2844 |
| 2845 /********************** ISO2022-CN Data **************************/ |
| 2846 static const char* const escSeqCharsCN[10] ={ |
| 2847 SHIFT_IN_STR, /* ASCII */ |
| 2848 GB_2312_80_STR, |
| 2849 ISO_IR_165_STR, |
| 2850 CNS_11643_1992_Plane_1_STR, |
| 2851 CNS_11643_1992_Plane_2_STR, |
| 2852 CNS_11643_1992_Plane_3_STR, |
| 2853 CNS_11643_1992_Plane_4_STR, |
| 2854 CNS_11643_1992_Plane_5_STR, |
| 2855 CNS_11643_1992_Plane_6_STR, |
| 2856 CNS_11643_1992_Plane_7_STR |
| 2857 }; |
| 2858 |
| 2859 static void |
| 2860 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
, UErrorCode* err){ |
| 2861 UConverter *cnv = args->converter; |
| 2862 UConverterDataISO2022 *converterData; |
| 2863 ISO2022State *pFromU2022State; |
| 2864 uint8_t *target = (uint8_t *) args->target; |
| 2865 const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; |
| 2866 const UChar* source = args->source; |
| 2867 const UChar* sourceLimit = args->sourceLimit; |
| 2868 int32_t* offsets = args->offsets; |
| 2869 UChar32 sourceChar; |
| 2870 char buffer[8]; |
| 2871 int32_t len; |
| 2872 int8_t choices[3]; |
| 2873 int32_t choiceCount; |
| 2874 uint32_t targetValue = 0; |
| 2875 UBool useFallback; |
| 2876 |
| 2877 /* set up the state */ |
| 2878 converterData = (UConverterDataISO2022*)cnv->extraInfo; |
| 2879 pFromU2022State = &converterData->fromU2022State; |
| 2880 |
| 2881 choiceCount = 0; |
| 2882 |
| 2883 /* check if the last codepoint of previous buffer was a lead surrogate*/ |
| 2884 if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { |
| 2885 goto getTrail; |
| 2886 } |
| 2887 |
| 2888 while( source < sourceLimit){ |
| 2889 if(target < targetLimit){ |
| 2890 |
| 2891 sourceChar = *(source++); |
| 2892 /*check if the char is a First surrogate*/ |
| 2893 if(UTF_IS_SURROGATE(sourceChar)) { |
| 2894 if(UTF_IS_SURROGATE_FIRST(sourceChar)) { |
| 2895 getTrail: |
| 2896 /*look ahead to find the trail surrogate*/ |
| 2897 if(source < sourceLimit) { |
| 2898 /* test the following code unit */ |
| 2899 UChar trail=(UChar) *source; |
| 2900 if(UTF_IS_SECOND_SURROGATE(trail)) { |
| 2901 source++; |
| 2902 sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); |
| 2903 cnv->fromUChar32=0x00; |
| 2904 /* convert this supplementary code point */ |
| 2905 /* exit this condition tree */ |
| 2906 } else { |
| 2907 /* this is an unmatched lead code unit (1st surrogat
e) */ |
| 2908 /* callback(illegal) */ |
| 2909 *err=U_ILLEGAL_CHAR_FOUND; |
| 2910 cnv->fromUChar32=sourceChar; |
| 2911 break; |
| 2912 } |
| 2913 } else { |
| 2914 /* no more input */ |
| 2915 cnv->fromUChar32=sourceChar; |
| 2916 break; |
| 2917 } |
| 2918 } else { |
| 2919 /* this is an unmatched trail code unit (2nd surrogate) */ |
| 2920 /* callback(illegal) */ |
| 2921 *err=U_ILLEGAL_CHAR_FOUND; |
| 2922 cnv->fromUChar32=sourceChar; |
| 2923 break; |
| 2924 } |
| 2925 } |
| 2926 |
| 2927 /* do the conversion */ |
| 2928 if(sourceChar <= 0x007f ){ |
| 2929 /* do not convert SO/SI/ESC */ |
| 2930 if(IS_2022_CONTROL(sourceChar)) { |
| 2931 /* callback(illegal) */ |
| 2932 *err=U_ILLEGAL_CHAR_FOUND; |
| 2933 cnv->fromUChar32=sourceChar; |
| 2934 break; |
| 2935 } |
| 2936 |
| 2937 /* US-ASCII */ |
| 2938 if(pFromU2022State->g == 0) { |
| 2939 buffer[0] = (char)sourceChar; |
| 2940 len = 1; |
| 2941 } else { |
| 2942 buffer[0] = UCNV_SI; |
| 2943 buffer[1] = (char)sourceChar; |
| 2944 len = 2; |
| 2945 pFromU2022State->g = 0; |
| 2946 choiceCount = 0; |
| 2947 } |
| 2948 if(sourceChar == CR || sourceChar == LF) { |
| 2949 /* reset the state at the end of a line */ |
| 2950 uprv_memset(pFromU2022State, 0, sizeof(ISO2022State)); |
| 2951 choiceCount = 0; |
| 2952 } |
| 2953 } |
| 2954 else{ |
| 2955 /* convert U+0080..U+10ffff */ |
| 2956 int32_t i; |
| 2957 int8_t cs, g; |
| 2958 |
| 2959 if(choiceCount == 0) { |
| 2960 /* try the current SO/G1 converter first */ |
| 2961 choices[0] = pFromU2022State->cs[1]; |
| 2962 |
| 2963 /* default to GB2312_1 if none is designated yet */ |
| 2964 if(choices[0] == 0) { |
| 2965 choices[0] = GB2312_1; |
| 2966 } |
| 2967 |
| 2968 if(converterData->version == 0) { |
| 2969 /* ISO-2022-CN */ |
| 2970 |
| 2971 /* try the other SO/G1 converter; a CNS_11643_1 lookup m
ay result in any plane */ |
| 2972 if(choices[0] == GB2312_1) { |
| 2973 choices[1] = (int8_t)CNS_11643_1; |
| 2974 } else { |
| 2975 choices[1] = (int8_t)GB2312_1; |
| 2976 } |
| 2977 |
| 2978 choiceCount = 2; |
| 2979 } else if (converterData->version == 1) { |
| 2980 /* ISO-2022-CN-EXT */ |
| 2981 |
| 2982 /* try one of the other converters */ |
| 2983 switch(choices[0]) { |
| 2984 case GB2312_1: |
| 2985 choices[1] = (int8_t)CNS_11643_1; |
| 2986 choices[2] = (int8_t)ISO_IR_165; |
| 2987 break; |
| 2988 case ISO_IR_165: |
| 2989 choices[1] = (int8_t)GB2312_1; |
| 2990 choices[2] = (int8_t)CNS_11643_1; |
| 2991 break; |
| 2992 default: /* CNS_11643_x */ |
| 2993 choices[1] = (int8_t)GB2312_1; |
| 2994 choices[2] = (int8_t)ISO_IR_165; |
| 2995 break; |
| 2996 } |
| 2997 |
| 2998 choiceCount = 3; |
| 2999 } else { |
| 3000 choices[0] = (int8_t)CNS_11643_1; |
| 3001 choices[1] = (int8_t)GB2312_1; |
| 3002 } |
| 3003 } |
| 3004 |
| 3005 cs = g = 0; |
| 3006 /* |
| 3007 * len==0: no mapping found yet |
| 3008 * len<0: found a fallback result: continue looking for a roundt
rip but no further fallbacks |
| 3009 * len>0: found a roundtrip result, done |
| 3010 */ |
| 3011 len = 0; |
| 3012 /* |
| 3013 * We will turn off useFallback after finding a fallback, |
| 3014 * but we still get fallbacks from PUA code points as usual. |
| 3015 * Therefore, we will also need to check that we don't overwrite |
| 3016 * an early fallback with a later one. |
| 3017 */ |
| 3018 useFallback = cnv->useFallback; |
| 3019 |
| 3020 for(i = 0; i < choiceCount && len <= 0; ++i) { |
| 3021 int8_t cs0 = choices[i]; |
| 3022 if(cs0 > 0) { |
| 3023 uint32_t value; |
| 3024 int32_t len2; |
| 3025 if(cs0 >= CNS_11643_0) { |
| 3026 len2 = MBCS_FROM_UCHAR32_ISO2022( |
| 3027 converterData->myConverterArray[CNS_1164
3], |
| 3028 sourceChar, |
| 3029 &value, |
| 3030 useFallback, |
| 3031 MBCS_OUTPUT_3); |
| 3032 if(len2 == 3 || (len2 == -3 && len == 0)) { |
| 3033 targetValue = value; |
| 3034 cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80
); |
| 3035 if(len2 >= 0) { |
| 3036 len = 2; |
| 3037 } else { |
| 3038 len = -2; |
| 3039 useFallback = FALSE; |
| 3040 } |
| 3041 if(cs == CNS_11643_1) { |
| 3042 g = 1; |
| 3043 } else if(cs == CNS_11643_2) { |
| 3044 g = 2; |
| 3045 } else /* plane 3..7 */ if(converterData->versio
n == 1) { |
| 3046 g = 3; |
| 3047 } else { |
| 3048 /* ISO-2022-CN (without -EXT) does not suppo
rt plane 3..7 */ |
| 3049 len = 0; |
| 3050 } |
| 3051 } |
| 3052 } else { |
| 3053 /* GB2312_1 or ISO-IR-165 */ |
| 3054 len2 = MBCS_FROM_UCHAR32_ISO2022( |
| 3055 converterData->myConverterArray[cs0], |
| 3056 sourceChar, |
| 3057 &value, |
| 3058 useFallback, |
| 3059 MBCS_OUTPUT_2); |
| 3060 if(len2 == 2 || (len2 == -2 && len == 0)) { |
| 3061 targetValue = value; |
| 3062 len = len2; |
| 3063 cs = cs0; |
| 3064 g = 1; |
| 3065 useFallback = FALSE; |
| 3066 } |
| 3067 } |
| 3068 } |
| 3069 } |
| 3070 |
| 3071 if(len != 0) { |
| 3072 len = 0; /* count output bytes; it must have been abs(len) =
= 2 */ |
| 3073 |
| 3074 /* write the designation sequence if necessary */ |
| 3075 if(cs != pFromU2022State->cs[g]) { |
| 3076 if(cs < CNS_11643) { |
| 3077 uprv_memcpy(buffer, escSeqCharsCN[cs], 4); |
| 3078 } else { |
| 3079 uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs -
CNS_11643_1)], 4); |
| 3080 } |
| 3081 len = 4; |
| 3082 pFromU2022State->cs[g] = cs; |
| 3083 if(g == 1) { |
| 3084 /* changing the SO/G1 charset invalidates the choice
s[] */ |
| 3085 choiceCount = 0; |
| 3086 } |
| 3087 } |
| 3088 |
| 3089 /* write the shift sequence if necessary */ |
| 3090 if(g != pFromU2022State->g) { |
| 3091 switch(g) { |
| 3092 case 1: |
| 3093 buffer[len++] = UCNV_SO; |
| 3094 |
| 3095 /* set the new state only if it is the locking shift
SO/G1, not for SS2 or SS3 */ |
| 3096 pFromU2022State->g = 1; |
| 3097 break; |
| 3098 case 2: |
| 3099 buffer[len++] = 0x1b; |
| 3100 buffer[len++] = 0x4e; |
| 3101 break; |
| 3102 default: /* case 3 */ |
| 3103 buffer[len++] = 0x1b; |
| 3104 buffer[len++] = 0x4f; |
| 3105 break; |
| 3106 } |
| 3107 } |
| 3108 |
| 3109 /* write the two output bytes */ |
| 3110 buffer[len++] = (char)(targetValue >> 8); |
| 3111 buffer[len++] = (char)targetValue; |
| 3112 } else { |
| 3113 /* if we cannot find the character after checking all codepa
ges |
| 3114 * then this is an error |
| 3115 */ |
| 3116 *err = U_INVALID_CHAR_FOUND; |
| 3117 cnv->fromUChar32=sourceChar; |
| 3118 break; |
| 3119 } |
| 3120 } |
| 3121 |
| 3122 /* output len>0 bytes in buffer[] */ |
| 3123 if(len == 1) { |
| 3124 *target++ = buffer[0]; |
| 3125 if(offsets) { |
| 3126 *offsets++ = (int32_t)(source - args->source - 1); /* -1: kn
own to be ASCII */ |
| 3127 } |
| 3128 } else if(len == 2 && (target + 2) <= targetLimit) { |
| 3129 *target++ = buffer[0]; |
| 3130 *target++ = buffer[1]; |
| 3131 if(offsets) { |
| 3132 int32_t sourceIndex = (int32_t)(source - args->source - U16_
LENGTH(sourceChar)); |
| 3133 *offsets++ = sourceIndex; |
| 3134 *offsets++ = sourceIndex; |
| 3135 } |
| 3136 } else { |
| 3137 fromUWriteUInt8( |
| 3138 cnv, |
| 3139 buffer, len, |
| 3140 &target, (const char *)targetLimit, |
| 3141 &offsets, (int32_t)(source - args->source - U16_LENGTH(sourc
eChar)), |
| 3142 err); |
| 3143 if(U_FAILURE(*err)) { |
| 3144 break; |
| 3145 } |
| 3146 } |
| 3147 } /* end if(myTargetIndex<myTargetLength) */ |
| 3148 else{ |
| 3149 *err =U_BUFFER_OVERFLOW_ERROR; |
| 3150 break; |
| 3151 } |
| 3152 |
| 3153 }/* end while(mySourceIndex<mySourceLength) */ |
| 3154 |
| 3155 /* |
| 3156 * the end of the input stream and detection of truncated input |
| 3157 * are handled by the framework, but for ISO-2022-CN conversion |
| 3158 * we need to be in ASCII mode at the very end |
| 3159 * |
| 3160 * conditions: |
| 3161 * successful |
| 3162 * not in ASCII mode |
| 3163 * end of input and no truncated input |
| 3164 */ |
| 3165 if( U_SUCCESS(*err) && |
| 3166 pFromU2022State->g!=0 && |
| 3167 args->flush && source>=sourceLimit && cnv->fromUChar32==0 |
| 3168 ) { |
| 3169 int32_t sourceIndex; |
| 3170 |
| 3171 /* we are switching to ASCII */ |
| 3172 pFromU2022State->g=0; |
| 3173 |
| 3174 /* get the source index of the last input character */ |
| 3175 /* |
| 3176 * TODO this would be simpler and more reliable if we used a pair |
| 3177 * of sourceIndex/prevSourceIndex like in ucnvmbcs.c |
| 3178 * so that we could simply use the prevSourceIndex here; |
| 3179 * this code gives an incorrect result for the rare case of an unmatched |
| 3180 * trail surrogate that is alone in the last buffer of the text stream |
| 3181 */ |
| 3182 sourceIndex=(int32_t)(source-args->source); |
| 3183 if(sourceIndex>0) { |
| 3184 --sourceIndex; |
| 3185 if( U16_IS_TRAIL(args->source[sourceIndex]) && |
| 3186 (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1])) |
| 3187 ) { |
| 3188 --sourceIndex; |
| 3189 } |
| 3190 } else { |
| 3191 sourceIndex=-1; |
| 3192 } |
| 3193 |
| 3194 fromUWriteUInt8( |
| 3195 cnv, |
| 3196 SHIFT_IN_STR, 1, |
| 3197 &target, (const char *)targetLimit, |
| 3198 &offsets, sourceIndex, |
| 3199 err); |
| 3200 } |
| 3201 |
| 3202 /*save the state and return */ |
| 3203 args->source = source; |
| 3204 args->target = (char*)target; |
| 3205 } |
| 3206 |
| 3207 |
| 3208 static void |
| 3209 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, |
| 3210 UErrorCode* err){ |
| 3211 char tempBuf[3]; |
| 3212 const char *mySource = (char *) args->source; |
| 3213 UChar *myTarget = args->target; |
| 3214 const char *mySourceLimit = args->sourceLimit; |
| 3215 uint32_t targetUniChar = 0x0000; |
| 3216 uint32_t mySourceChar = 0x0000; |
| 3217 UConverterDataISO2022* myData; |
| 3218 ISO2022State *pToU2022State; |
| 3219 |
| 3220 myData=(UConverterDataISO2022*)(args->converter->extraInfo); |
| 3221 pToU2022State = &myData->toU2022State; |
| 3222 |
| 3223 if(myData->key != 0) { |
| 3224 /* continue with a partial escape sequence */ |
| 3225 goto escape; |
| 3226 } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myT
arget < args->targetLimit) { |
| 3227 /* continue with a partial double-byte character */ |
| 3228 mySourceChar = args->converter->toUBytes[0]; |
| 3229 args->converter->toULength = 0; |
| 3230 targetUniChar = missingCharMarker; |
| 3231 goto getTrailByte; |
| 3232 } |
| 3233 |
| 3234 while(mySource < mySourceLimit){ |
| 3235 |
| 3236 targetUniChar =missingCharMarker; |
| 3237 |
| 3238 if(myTarget < args->targetLimit){ |
| 3239 |
| 3240 mySourceChar= (unsigned char) *mySource++; |
| 3241 |
| 3242 switch(mySourceChar){ |
| 3243 case UCNV_SI: |
| 3244 pToU2022State->g=0; |
| 3245 if (myData->isEmptySegment) { |
| 3246 myData->isEmptySegment = FALSE; /* we are handling it, r
eset to avoid future spurious errors */ |
| 3247 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 3248 args->converter->toUCallbackReason = UCNV_IRREGULAR; |
| 3249 args->converter->toUBytes[0] = mySourceChar; |
| 3250 args->converter->toULength = 1; |
| 3251 args->target = myTarget; |
| 3252 args->source = mySource; |
| 3253 return; |
| 3254 } |
| 3255 continue; |
| 3256 |
| 3257 case UCNV_SO: |
| 3258 if(pToU2022State->cs[1] != 0) { |
| 3259 pToU2022State->g=1; |
| 3260 myData->isEmptySegment = TRUE; /* Begin a new segment,
empty so far */ |
| 3261 continue; |
| 3262 } else { |
| 3263 /* illegal to have SO before a matching designator */ |
| 3264 myData->isEmptySegment = FALSE; /* Handling a different
error, reset this to avoid future spurious errs */ |
| 3265 break; |
| 3266 } |
| 3267 |
| 3268 case ESC_2022: |
| 3269 mySource--; |
| 3270 escape: |
| 3271 { |
| 3272 const char * mySourceBefore = mySource; |
| 3273 int8_t toULengthBefore = args->converter->toULength; |
| 3274 |
| 3275 changeState_2022(args->converter,&(mySource), |
| 3276 mySourceLimit, ISO_2022_CN,err); |
| 3277 |
| 3278 /* After SO there must be at least one character before a de
signator (designator error handled separately) */ |
| 3279 if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegme
nt) { |
| 3280 *err = U_ILLEGAL_ESCAPE_SEQUENCE; |
| 3281 args->converter->toUCallbackReason = UCNV_IRREGULAR; |
| 3282 args->converter->toULength = (int8_t)(toULengthBefore +
(mySource - mySourceBefore)); |
| 3283 } |
| 3284 } |
| 3285 |
| 3286 /* invalid or illegal escape sequence */ |
| 3287 if(U_FAILURE(*err)){ |
| 3288 args->target = myTarget; |
| 3289 args->source = mySource; |
| 3290 myData->isEmptySegment = FALSE; /* Reset to avoid future
spurious errors */ |
| 3291 return; |
| 3292 } |
| 3293 continue; |
| 3294 |
| 3295 /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */ |
| 3296 |
| 3297 case CR: |
| 3298 /*falls through*/ |
| 3299 case LF: |
| 3300 uprv_memset(pToU2022State, 0, sizeof(ISO2022State)); |
| 3301 /* falls through */ |
| 3302 default: |
| 3303 /* convert one or two bytes */ |
| 3304 myData->isEmptySegment = FALSE; |
| 3305 if(pToU2022State->g != 0) { |
| 3306 if(mySource < mySourceLimit) { |
| 3307 UConverterSharedData *cnv; |
| 3308 StateEnum tempState; |
| 3309 int32_t tempBufLen; |
| 3310 int leadIsOk, trailIsOk; |
| 3311 uint8_t trailByte; |
| 3312 getTrailByte: |
| 3313 trailByte = (uint8_t)*mySource; |
| 3314 /* |
| 3315 * Ticket 5691: consistent illegal sequences: |
| 3316 * - We include at least the first byte in the illegal s
equence. |
| 3317 * - If any of the non-initial bytes could be the start
of a character, |
| 3318 * we stop the illegal sequence before the first one o
f those. |
| 3319 * |
| 3320 * In ISO-2022 DBCS, if the second byte is in the 21..7e
range or is |
| 3321 * an ESC/SO/SI, we report only the first byte as the il
legal sequence. |
| 3322 * Otherwise we convert or report the pair of bytes. |
| 3323 */ |
| 3324 leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x2
1); |
| 3325 trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21)
; |
| 3326 if (leadIsOk && trailIsOk) { |
| 3327 ++mySource; |
| 3328 tempState = (StateEnum)pToU2022State->cs[pToU2022Sta
te->g]; |
| 3329 if(tempState >= CNS_11643_0) { |
| 3330 cnv = myData->myConverterArray[CNS_11643]; |
| 3331 tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0
)); |
| 3332 tempBuf[1] = (char) (mySourceChar); |
| 3333 tempBuf[2] = (char) trailByte; |
| 3334 tempBufLen = 3; |
| 3335 |
| 3336 }else{ |
| 3337 cnv = myData->myConverterArray[tempState]; |
| 3338 tempBuf[0] = (char) (mySourceChar); |
| 3339 tempBuf[1] = (char) trailByte; |
| 3340 tempBufLen = 2; |
| 3341 } |
| 3342 targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tem
pBuf, tempBufLen, FALSE); |
| 3343 mySourceChar = (mySourceChar << 8) | trailByte; |
| 3344 } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) { |
| 3345 /* report a pair of illegal bytes if the second byte
is not a DBCS starter */ |
| 3346 ++mySource; |
| 3347 /* add another bit so that the code below writes 2 b
ytes in case of error */ |
| 3348 mySourceChar = 0x10000 | (mySourceChar << 8) | trail
Byte; |
| 3349 } |
| 3350 if(pToU2022State->g>=2) { |
| 3351 /* return from a single-shift state to the previous
one */ |
| 3352 pToU2022State->g=pToU2022State->prevG; |
| 3353 } |
| 3354 } else { |
| 3355 args->converter->toUBytes[0] = (uint8_t)mySourceChar; |
| 3356 args->converter->toULength = 1; |
| 3357 goto endloop; |
| 3358 } |
| 3359 } |
| 3360 else{ |
| 3361 if(mySourceChar <= 0x7f) { |
| 3362 targetUniChar = (UChar) mySourceChar; |
| 3363 } |
| 3364 } |
| 3365 break; |
| 3366 } |
| 3367 if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ |
| 3368 if(args->offsets){ |
| 3369 args->offsets[myTarget - args->target] = (int32_t)(mySource
- args->source - (mySourceChar <= 0xff ? 1 : 2)); |
| 3370 } |
| 3371 *(myTarget++)=(UChar)targetUniChar; |
| 3372 } |
| 3373 else if(targetUniChar > missingCharMarker){ |
| 3374 /* disassemble the surrogate pair and write to output*/ |
| 3375 targetUniChar-=0x0010000; |
| 3376 *myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10)); |
| 3377 if(args->offsets){ |
| 3378 args->offsets[myTarget - args->target] = (int32_t)(mySource
- args->source - (mySourceChar <= 0xff ? 1 : 2)); |
| 3379 } |
| 3380 ++myTarget; |
| 3381 if(myTarget< args->targetLimit){ |
| 3382 *myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff)); |
| 3383 if(args->offsets){ |
| 3384 args->offsets[myTarget - args->target] = (int32_t)(mySou
rce - args->source - (mySourceChar <= 0xff ? 1 : 2)); |
| 3385 } |
| 3386 ++myTarget; |
| 3387 }else{ |
| 3388 args->converter->UCharErrorBuffer[args->converter->UCharErro
rBufferLength++]= |
| 3389 (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff))
; |
| 3390 } |
| 3391 |
| 3392 } |
| 3393 else{ |
| 3394 /* Call the callback function*/ |
| 3395 toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err
); |
| 3396 break; |
| 3397 } |
| 3398 } |
| 3399 else{ |
| 3400 *err =U_BUFFER_OVERFLOW_ERROR; |
| 3401 break; |
| 3402 } |
| 3403 } |
| 3404 endloop: |
| 3405 args->target = myTarget; |
| 3406 args->source = mySource; |
| 3407 } |
| 3408 |
| 3409 static void |
| 3410 _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorC
ode *err) { |
| 3411 UConverter *cnv = args->converter; |
| 3412 UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraI
nfo; |
| 3413 ISO2022State *pFromU2022State=&myConverterData->fromU2022State; |
| 3414 char *p, *subchar; |
| 3415 char buffer[8]; |
| 3416 int32_t length; |
| 3417 |
| 3418 subchar=(char *)cnv->subChars; |
| 3419 length=cnv->subCharLen; /* assume length==1 for most variants */ |
| 3420 |
| 3421 p = buffer; |
| 3422 switch(myConverterData->locale[0]){ |
| 3423 case 'j': |
| 3424 { |
| 3425 int8_t cs; |
| 3426 |
| 3427 if(pFromU2022State->g == 1) { |
| 3428 /* JIS7: switch from G1 to G0 */ |
| 3429 pFromU2022State->g = 0; |
| 3430 *p++ = UCNV_SI; |
| 3431 } |
| 3432 |
| 3433 cs = pFromU2022State->cs[0]; |
| 3434 if(cs != ASCII && cs != JISX201) { |
| 3435 /* not in ASCII or JIS X 0201: switch to ASCII */ |
| 3436 pFromU2022State->cs[0] = (int8_t)ASCII; |
| 3437 *p++ = '\x1b'; |
| 3438 *p++ = '\x28'; |
| 3439 *p++ = '\x42'; |
| 3440 } |
| 3441 |
| 3442 *p++ = subchar[0]; |
| 3443 break; |
| 3444 } |
| 3445 case 'c': |
| 3446 if(pFromU2022State->g != 0) { |
| 3447 /* not in ASCII mode: switch to ASCII */ |
| 3448 pFromU2022State->g = 0; |
| 3449 *p++ = UCNV_SI; |
| 3450 } |
| 3451 *p++ = subchar[0]; |
| 3452 break; |
| 3453 case 'k': |
| 3454 if(myConverterData->version == 0) { |
| 3455 if(length == 1) { |
| 3456 if((UBool)args->converter->fromUnicodeStatus) { |
| 3457 /* in DBCS mode: switch to SBCS */ |
| 3458 args->converter->fromUnicodeStatus = 0; |
| 3459 *p++ = UCNV_SI; |
| 3460 } |
| 3461 *p++ = subchar[0]; |
| 3462 } else /* length == 2*/ { |
| 3463 if(!(UBool)args->converter->fromUnicodeStatus) { |
| 3464 /* in SBCS mode: switch to DBCS */ |
| 3465 args->converter->fromUnicodeStatus = 1; |
| 3466 *p++ = UCNV_SO; |
| 3467 } |
| 3468 *p++ = subchar[0]; |
| 3469 *p++ = subchar[1]; |
| 3470 } |
| 3471 break; |
| 3472 } else { |
| 3473 /* save the subconverter's substitution string */ |
| 3474 uint8_t *currentSubChars = myConverterData->currentConverter->subCha
rs; |
| 3475 int8_t currentSubCharLen = myConverterData->currentConverter->subCha
rLen; |
| 3476 |
| 3477 /* set our substitution string into the subconverter */ |
| 3478 myConverterData->currentConverter->subChars = (uint8_t *)subchar; |
| 3479 myConverterData->currentConverter->subCharLen = (int8_t)length; |
| 3480 |
| 3481 /* let the subconverter write the subchar, set/retrieve fromUChar32
state */ |
| 3482 args->converter = myConverterData->currentConverter; |
| 3483 myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32; |
| 3484 ucnv_cbFromUWriteSub(args, 0, err); |
| 3485 cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32; |
| 3486 args->converter = cnv; |
| 3487 |
| 3488 /* restore the subconverter's substitution string */ |
| 3489 myConverterData->currentConverter->subChars = currentSubChars; |
| 3490 myConverterData->currentConverter->subCharLen = currentSubCharLen; |
| 3491 |
| 3492 if(*err == U_BUFFER_OVERFLOW_ERROR) { |
| 3493 if(myConverterData->currentConverter->charErrorBufferLength > 0)
{ |
| 3494 uprv_memcpy( |
| 3495 cnv->charErrorBuffer, |
| 3496 myConverterData->currentConverter->charErrorBuffer, |
| 3497 myConverterData->currentConverter->charErrorBufferLength
); |
| 3498 } |
| 3499 cnv->charErrorBufferLength = myConverterData->currentConverter->
charErrorBufferLength; |
| 3500 myConverterData->currentConverter->charErrorBufferLength = 0; |
| 3501 } |
| 3502 return; |
| 3503 } |
| 3504 default: |
| 3505 /* not expected */ |
| 3506 break; |
| 3507 } |
| 3508 ucnv_cbFromUWriteBytes(args, |
| 3509 buffer, (int32_t)(p - buffer), |
| 3510 offsetIndex, err); |
| 3511 } |
| 3512 |
| 3513 /* |
| 3514 * Structure for cloning an ISO 2022 converter into a single memory block. |
| 3515 * ucnv_safeClone() of the converter will align the entire cloneStruct, |
| 3516 * and then ucnv_safeClone() of the sub-converter may additionally align |
| 3517 * currentConverter inside the cloneStruct, for which we need the deadSpace |
| 3518 * after currentConverter. |
| 3519 * This is because UAlignedMemory may be larger than the actually |
| 3520 * necessary alignment size for the platform. |
| 3521 * The other cloneStruct fields will not be moved around, |
| 3522 * and are aligned properly with cloneStruct's alignment. |
| 3523 */ |
| 3524 struct cloneStruct |
| 3525 { |
| 3526 UConverter cnv; /* the cloned ISO 2022 converter; _ISO_2022_SafeClone() returns the address of this member */ |
| 3527 UConverter currentConverter; /* storage passed to ucnv_safeClone() for the clone of the sub-converter */ |
| 3528 UAlignedMemory deadSpace; /* padding counted into the size offered to ucnv_safeClone() for the sub-converter */ |
| 3529 UConverterDataISO2022 mydata; /* copy of the original's extraInfo; cnv.extraInfo is pointed here */ |
| 3530 }; |
| 3531 |
| 3532 |
| 3533 static UConverter * |
| 3534 _ISO_2022_SafeClone( |
| 3535 const UConverter *cnv, |
| 3536 void *stackBuffer, |
| 3537 int32_t *pBufferSize, |
| 3538 UErrorCode *status) |
| 3539 { |
| 3540 struct cloneStruct * localClone; |
| 3541 UConverterDataISO2022 *cnvData; |
| 3542 int32_t i, size; |
| 3543 |
| 3544 if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *p
BufferSize */ |
| 3545 *pBufferSize = (int32_t)sizeof(struct cloneStruct); |
| 3546 return NULL; |
| 3547 } |
| 3548 |
| 3549 cnvData = (UConverterDataISO2022 *)cnv->extraInfo; |
| 3550 localClone = (struct cloneStruct *)stackBuffer; |
| 3551 |
| 3552 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ |
| 3553 |
| 3554 uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022)); |
| 3555 localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra dat
a */ |
| 3556 localClone->cnv.isExtraLocal = TRUE; |
| 3557 |
| 3558 /* share the subconverters */ |
| 3559 |
| 3560 if(cnvData->currentConverter != NULL) { |
| 3561 size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* includ
e size of padding */ |
| 3562 localClone->mydata.currentConverter = |
| 3563 ucnv_safeClone(cnvData->currentConverter, |
| 3564 &localClone->currentConverter, |
| 3565 &size, status); |
| 3566 if(U_FAILURE(*status)) { |
| 3567 return NULL; |
| 3568 } |
| 3569 } |
| 3570 |
| 3571 for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) { |
| 3572 if(cnvData->myConverterArray[i] != NULL) { |
| 3573 ucnv_incrementRefCount(cnvData->myConverterArray[i]); |
| 3574 } |
| 3575 } |
| 3576 |
| 3577 return &localClone->cnv; |
| 3578 } |
| 3579 |
| 3580 static void |
| 3581 _ISO_2022_GetUnicodeSet(const UConverter *cnv, |
| 3582 const USetAdder *sa, |
| 3583 UConverterUnicodeSet which, |
| 3584 UErrorCode *pErrorCode) |
| 3585 { |
| 3586 int32_t i; |
| 3587 UConverterDataISO2022* cnvData; |
| 3588 |
| 3589 if (U_FAILURE(*pErrorCode)) { |
| 3590 return; |
| 3591 } |
| 3592 #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 3593 if (cnv->sharedData == &_ISO2022Data) { |
| 3594 /* We use UTF-8 in this case */ |
| 3595 sa->addRange(sa->set, 0, 0xd7FF); |
| 3596 sa->addRange(sa->set, 0xE000, 0x10FFFF); |
| 3597 return; |
| 3598 } |
| 3599 #endif |
| 3600 |
| 3601 cnvData = (UConverterDataISO2022*)cnv->extraInfo; |
| 3602 |
| 3603 /* open a set and initialize it with code points that are algorithmically ro
und-tripped */ |
| 3604 switch(cnvData->locale[0]){ |
| 3605 case 'j': |
| 3606 /* include JIS X 0201 which is hardcoded */ |
| 3607 sa->add(sa->set, 0xa5); |
| 3608 sa->add(sa->set, 0x203e); |
| 3609 if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { |
| 3610 /* include Latin-1 for some variants of JP */ |
| 3611 sa->addRange(sa->set, 0, 0xff); |
| 3612 } else { |
| 3613 /* include ASCII for JP */ |
| 3614 sa->addRange(sa->set, 0, 0x7f); |
| 3615 } |
| 3616 if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_A
ND_FALLBACK_SET) { |
| 3617 /* |
| 3618 * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=
0 |
| 3619 * because the bit is on for all JP versions although only versions
3 & 4 (JIS7 & JIS8) |
| 3620 * use half-width Katakana. |
| 3621 * This is because all ISO-2022-JP variants are lenient in that they
accept (in toUnicode) |
| 3622 * half-width Katakana via the ESC ( I sequence. |
| 3623 * However, we only emit (fromUnicode) half-width Katakana according
to the |
| 3624 * definition of each variant. |
| 3625 * |
| 3626 * When including fallbacks, |
| 3627 * we need to include half-width Katakana Unicode code points for al
l JP variants because |
| 3628 * JIS X 0208 has hardcoded fallbacks for them (which map to full-wi
dth Katakana). |
| 3629 */ |
| 3630 /* include half-width Katakana for JP */ |
| 3631 sa->addRange(sa->set, HWKANA_START, HWKANA_END); |
| 3632 } |
| 3633 break; |
| 3634 case 'c': |
| 3635 case 'z': |
| 3636 /* include ASCII for CN */ |
| 3637 sa->addRange(sa->set, 0, 0x7f); |
| 3638 break; |
| 3639 case 'k': |
| 3640 /* there is only one converter for KR, and it is not in the myConverterA
rray[] */ |
| 3641 cnvData->currentConverter->sharedData->impl->getUnicodeSet( |
| 3642 cnvData->currentConverter, sa, which, pErrorCode); |
| 3643 /* the loop over myConverterArray[] will simply not find another convert
er */ |
| 3644 break; |
| 3645 default: |
| 3646 break; |
| 3647 } |
| 3648 |
| 3649 #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implem
ent ucnv_getUnicodeSet() with reverse fallbacks. */ |
| 3650 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && |
| 3651 cnvData->version==0 && i==CNS_11643 |
| 3652 ) { |
| 3653 /* special handling for non-EXT ISO-2022-CN: add only code point
s for CNS planes 1 and 2 */ |
| 3654 ucnv_MBCSGetUnicodeSetForBytes( |
| 3655 cnvData->myConverterArray[i], |
| 3656 sa, UCNV_ROUNDTRIP_SET, |
| 3657 0, 0x81, 0x82, |
| 3658 pErrorCode); |
| 3659 } |
| 3660 #endif |
| 3661 |
| 3662 for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { |
| 3663 UConverterSetFilter filter; |
| 3664 if(cnvData->myConverterArray[i]!=NULL) { |
| 3665 if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && |
| 3666 cnvData->version==0 && i==CNS_11643 |
| 3667 ) { |
| 3668 /* |
| 3669 * Version-specific for CN: |
| 3670 * CN version 0 does not map CNS planes 3..7 although |
| 3671 * they are all available in the CNS conversion table; |
| 3672 * CN version 1 (-EXT) does map them all. |
| 3673 * The two versions create different Unicode sets. |
| 3674 */ |
| 3675 filter=UCNV_SET_FILTER_2022_CN; |
| 3676 } else if(cnvData->locale[0]=='j' && i==JISX208) { |
| 3677 /* |
| 3678 * Only add code points that map to Shift-JIS codes |
| 3679 * corresponding to JIS X 0208. |
| 3680 */ |
| 3681 filter=UCNV_SET_FILTER_SJIS; |
| 3682 } else if(i==KSC5601) { |
| 3683 /* |
| 3684 * Some of the KSC 5601 tables (convrtrs.txt has this aliases on
multiple tables) |
| 3685 * are broader than GR94. |
| 3686 */ |
| 3687 filter=UCNV_SET_FILTER_GR94DBCS; |
| 3688 } else { |
| 3689 filter=UCNV_SET_FILTER_NONE; |
| 3690 } |
| 3691 ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i
], sa, which, filter, pErrorCode); |
| 3692 } |
| 3693 } |
| 3694 |
| 3695 /* |
| 3696 * ISO 2022 converters must not convert SO/SI/ESC despite what |
| 3697 * sub-converters do by themselves. |
| 3698 * Remove these characters from the set. |
| 3699 */ |
| 3700 sa->remove(sa->set, 0x0e); |
| 3701 sa->remove(sa->set, 0x0f); |
| 3702 sa->remove(sa->set, 0x1b); |
| 3703 |
| 3704 /* ISO 2022 converters do not convert C1 controls either */ |
| 3705 sa->removeRange(sa->set, 0x80, 0x9f); |
| 3706 } |
| 3707 |
| 3708 static const UConverterImpl _ISO2022Impl={ /* implementation table of the generic ISO_2022 converter; referenced by _ISO2022Data */ |
| 3709 UCNV_ISO_2022, /* converter type */ |
| 3710 |
| 3711 NULL, /* these two entries are not provided */ |
| 3712 NULL, |
| 3713 |
| 3714 _ISO2022Open, /* open, close and reset functions, going by their names */ |
| 3715 _ISO2022Close, |
| 3716 _ISO2022Reset, |
| 3717 |
| 3718 #ifdef U_ENABLE_GENERIC_ISO_2022 |
| 3719 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, /* generic converter enabled: the same toUnicode function fills two entries */ |
| 3720 T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, |
| 3721 ucnv_fromUnicode_UTF8, /* fromUnicode is delegated to the UTF-8 functions, without and with offsets */ |
| 3722 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, |
| 3723 #else |
| 3724 NULL, /* generic converter disabled: no conversion functions at all */ |
| 3725 NULL, |
| 3726 NULL, |
| 3727 NULL, |
| 3728 #endif |
| 3729 NULL, /* the next two entries are not provided; NOTE(review): confirm their meaning against the UConverterImpl declaration */ |
| 3730 |
| 3731 NULL, |
| 3732 _ISO2022getName, |
| 3733 _ISO_2022_WriteSub, /* writes the substitution character with the necessary shift/escape bytes */ |
| 3734 _ISO_2022_SafeClone, /* clones into a struct cloneStruct */ |
| 3735 _ISO_2022_GetUnicodeSet /* reports the set of convertible code points */ |
| 3736 }; |
| 3737 static const UConverterStaticData _ISO2022StaticData={ /* static description of the generic ISO_2022 converter; referenced by _ISO2022Data */ |
| 3738 sizeof(UConverterStaticData), /* size of this structure */ |
| 3739 "ISO_2022", /* converter name */ |
| 3740 2022, /* NOTE(review): presumably the codepage number -- confirm against UConverterStaticData */ |
| 3741 UCNV_IBM, |
| 3742 UCNV_ISO_2022, /* same type constant as in the implementation tables */ |
| 3743 1, /* presumably the minimum number of bytes per UChar, paired with the maximum below -- confirm */ |
| 3744 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ |
| 3745 { 0x1a, 0, 0, 0 }, /* presumably the default substitution bytes, with their length on the next line -- confirm */ |
| 3746 1, |
| 3747 FALSE, |
| 3748 FALSE, |
| 3749 0, |
| 3750 0, |
| 3751 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
| 3752 }; |
| 3753 const UConverterSharedData _ISO2022Data={ /* shared data of the generic ISO_2022 converter; not static, and compared against cnv->sharedData in _ISO_2022_GetUnicodeSet() */ |
| 3754 sizeof(UConverterSharedData), /* size of this structure */ |
| 3755 ~((uint32_t) 0), /* all bits set; NOTE(review): presumably a reference counter that marks static data -- confirm */ |
| 3756 NULL, |
| 3757 NULL, |
| 3758 &_ISO2022StaticData, /* the static description of this converter */ |
| 3759 FALSE, |
| 3760 &_ISO2022Impl, /* the implementation table of this converter */ |
| 3761 0 |
| 3762 }; |
| 3763 |
| 3764 /*************JP****************/ |
| 3765 static const UConverterImpl _ISO2022JPImpl={ /* implementation table of the ISO_2022_JP converter; referenced by _ISO2022JPData */ |
| 3766 UCNV_ISO_2022, /* converter type */ |
| 3767 |
| 3768 NULL, /* these two entries are not provided */ |
| 3769 NULL, |
| 3770 |
| 3771 _ISO2022Open, /* open, close and reset are shared with the other ISO 2022 variants */ |
| 3772 _ISO2022Close, |
| 3773 _ISO2022Reset, |
| 3774 |
| 3775 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* the same function fills two adjacent entries; presumably the plain and the with-offsets slots -- confirm */ |
| 3776 UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, |
| 3777 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* likewise for fromUnicode */ |
| 3778 UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, |
| 3779 NULL, /* the next two entries are not provided */ |
| 3780 |
| 3781 NULL, |
| 3782 _ISO2022getName, |
| 3783 _ISO_2022_WriteSub, /* takes the 'j' branch for this variant */ |
| 3784 _ISO_2022_SafeClone, |
| 3785 _ISO_2022_GetUnicodeSet |
| 3786 }; |
| 3787 static const UConverterStaticData _ISO2022JPStaticData={ /* static description of the ISO_2022_JP converter; referenced by _ISO2022JPData */ |
| 3788 sizeof(UConverterStaticData), /* size of this structure */ |
| 3789 "ISO_2022_JP", /* converter name */ |
| 3790 0, /* NOTE(review): presumably the codepage number (none here) -- confirm against UConverterStaticData */ |
| 3791 UCNV_IBM, |
| 3792 UCNV_ISO_2022, /* same type constant as in the implementation tables */ |
| 3793 1, /* presumably the minimum number of bytes per UChar, paired with the maximum below -- confirm */ |
| 3794 6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */ |
| 3795 { 0x1a, 0, 0, 0 }, /* presumably the default substitution bytes, with their length on the next line -- confirm */ |
| 3796 1, |
| 3797 FALSE, |
| 3798 FALSE, |
| 3799 0, |
| 3800 0, |
| 3801 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
| 3802 }; |
| 3803 static const UConverterSharedData _ISO2022JPData={ /* shared data of the ISO_2022_JP converter */ |
| 3804 sizeof(UConverterSharedData), /* size of this structure */ |
| 3805 ~((uint32_t) 0), /* all bits set; NOTE(review): presumably a reference counter that marks static data -- confirm */ |
| 3806 NULL, |
| 3807 NULL, |
| 3808 &_ISO2022JPStaticData, /* the static description of this converter */ |
| 3809 FALSE, |
| 3810 &_ISO2022JPImpl, /* the implementation table of this converter */ |
| 3811 0 |
| 3812 }; |
| 3813 |
| 3814 /************* KR ***************/ |
| 3815 static const UConverterImpl _ISO2022KRImpl={ /* implementation table of the ISO_2022_KR converter; referenced by _ISO2022KRData */ |
| 3816 UCNV_ISO_2022, /* converter type */ |
| 3817 |
| 3818 NULL, /* these two entries are not provided */ |
| 3819 NULL, |
| 3820 |
| 3821 _ISO2022Open, /* open, close and reset are shared with the other ISO 2022 variants */ |
| 3822 _ISO2022Close, |
| 3823 _ISO2022Reset, |
| 3824 |
| 3825 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* the same function fills two adjacent entries; presumably the plain and the with-offsets slots -- confirm */ |
| 3826 UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, |
| 3827 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* likewise for fromUnicode */ |
| 3828 UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, |
| 3829 NULL, /* the next two entries are not provided */ |
| 3830 |
| 3831 NULL, |
| 3832 _ISO2022getName, |
| 3833 _ISO_2022_WriteSub, /* takes the 'k' branch for this variant */ |
| 3834 _ISO_2022_SafeClone, |
| 3835 _ISO_2022_GetUnicodeSet |
| 3836 }; |
| 3837 static const UConverterStaticData _ISO2022KRStaticData={ /* static description of the ISO_2022_KR converter; referenced by _ISO2022KRData */ |
| 3838 sizeof(UConverterStaticData), /* size of this structure */ |
| 3839 "ISO_2022_KR", /* converter name */ |
| 3840 0, /* NOTE(review): presumably the codepage number (none here) -- confirm against UConverterStaticData */ |
| 3841 UCNV_IBM, |
| 3842 UCNV_ISO_2022, /* same type constant as in the implementation tables */ |
| 3843 1, /* presumably the minimum number of bytes per UChar, paired with the maximum below -- confirm */ |
| 3844 3, /* max 3 bytes per UChar: SO+DBCS */ |
| 3845 { 0x1a, 0, 0, 0 }, /* presumably the default substitution bytes, with their length on the next line -- confirm */ |
| 3846 1, |
| 3847 FALSE, |
| 3848 FALSE, |
| 3849 0, |
| 3850 0, |
| 3851 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
| 3852 }; |
| 3853 static const UConverterSharedData _ISO2022KRData={ /* shared data of the ISO_2022_KR converter */ |
| 3854 sizeof(UConverterSharedData), /* size of this structure */ |
| 3855 ~((uint32_t) 0), /* all bits set; NOTE(review): presumably a reference counter that marks static data -- confirm */ |
| 3856 NULL, |
| 3857 NULL, |
| 3858 &_ISO2022KRStaticData, /* the static description of this converter */ |
| 3859 FALSE, |
| 3860 &_ISO2022KRImpl, /* the implementation table of this converter */ |
| 3861 0 |
| 3862 }; |
| 3863 |
| 3864 /*************** CN ***************/ |
| 3865 static const UConverterImpl _ISO2022CNImpl={ /* implementation table of the ISO_2022_CN converter; referenced by _ISO2022CNData */ |
| 3866 |
| 3867 UCNV_ISO_2022, /* converter type */ |
| 3868 |
| 3869 NULL, /* these two entries are not provided */ |
| 3870 NULL, |
| 3871 |
| 3872 _ISO2022Open, /* open, close and reset are shared with the other ISO 2022 variants */ |
| 3873 _ISO2022Close, |
| 3874 _ISO2022Reset, |
| 3875 |
| 3876 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* the same function fills two adjacent entries; presumably the plain and the with-offsets slots -- confirm */ |
| 3877 UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, |
| 3878 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* likewise for fromUnicode */ |
| 3879 UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, |
| 3880 NULL, /* the next two entries are not provided */ |
| 3881 |
| 3882 NULL, |
| 3883 _ISO2022getName, |
| 3884 _ISO_2022_WriteSub, /* takes the 'c' branch for this variant */ |
| 3885 _ISO_2022_SafeClone, |
| 3886 _ISO_2022_GetUnicodeSet |
| 3887 }; |
| 3888 static const UConverterStaticData _ISO2022CNStaticData={ /* static description of the ISO_2022_CN converter; referenced by _ISO2022CNData */ |
| 3889 sizeof(UConverterStaticData), /* size of this structure */ |
| 3890 "ISO_2022_CN", /* converter name */ |
| 3891 0, /* NOTE(review): presumably the codepage number (none here) -- confirm against UConverterStaticData */ |
| 3892 UCNV_IBM, |
| 3893 UCNV_ISO_2022, /* same type constant as in the implementation tables */ |
| 3894 1, /* presumably the minimum number of bytes per UChar, paired with the maximum below -- confirm */ |
| 3895 8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */ |
| 3896 { 0x1a, 0, 0, 0 }, /* presumably the default substitution bytes, with their length on the next line -- confirm */ |
| 3897 1, |
| 3898 FALSE, |
| 3899 FALSE, |
| 3900 0, |
| 3901 0, |
| 3902 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
| 3903 }; |
| 3904 static const UConverterSharedData _ISO2022CNData={ /* shared data of the ISO_2022_CN converter */ |
| 3905 sizeof(UConverterSharedData), /* size of this structure */ |
| 3906 ~((uint32_t) 0), /* all bits set; NOTE(review): presumably a reference counter that marks static data -- confirm */ |
| 3907 NULL, |
| 3908 NULL, |
| 3909 &_ISO2022CNStaticData, /* the static description of this converter */ |
| 3910 FALSE, |
| 3911 &_ISO2022CNImpl, /* the implementation table of this converter */ |
| 3912 0 |
| 3913 }; |
| 3914 |
| 3915 |
| 3916 |
| 3917 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ |
OLD | NEW |