OLD | NEW |
1 /* | 1 /* |
2 ****************************************************************************** | 2 ****************************************************************************** |
3 * | 3 * |
4 * Copyright (C) 1999-2006, International Business Machines | 4 * Copyright (C) 1999-2006, International Business Machines |
5 * Corporation and others. All Rights Reserved. | 5 * Corporation and others. All Rights Reserved. |
6 * | 6 * |
7 ****************************************************************************** | 7 ****************************************************************************** |
8 * file name: utf_impl.c | 8 * file name: utf_impl.c |
9 * encoding: US-ASCII | 9 * encoding: US-ASCII |
10 * tab size: 8 (not used) | 10 * tab size: 8 (not used) |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
67 * - MOV AX, 6 (result) | 67 * - MOV AX, 6 (result) |
68 * - JZ finish (ZF==1 if leadByte==0xff) | 68 * - JZ finish (ZF==1 if leadByte==0xff) |
69 * - SUB AX, BX (result) | 69 * - SUB AX, BX (result) |
70 * -finish: | 70 * -finish: |
71 * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB) | 71 * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB) |
72 * | 72 * |
73 * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal; | 73 * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal; |
74 * lead bytes above 0xf4 are illegal. | 74 * lead bytes above 0xf4 are illegal. |
75 * We keep them in this table for skipping long ISO 10646-UTF-8 sequences. | 75 * We keep them in this table for skipping long ISO 10646-UTF-8 sequences. |
76 */ | 76 */ |
77 const uint8 | 77 const uint8_t utf8_countTrailBytes[256] = |
78 utf8_countTrailBytes[256]={ | 78 { |
79 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 79 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
80 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 80 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
82 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
83 | 82 |
84 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 83 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
85 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 84 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
86 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 85 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
87 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
88 | 86 |
89 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 87 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
90 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 88 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
91 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 89 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
92 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
93 | 90 |
94 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 91 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
95 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 92 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
96 | 93 |
97 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 94 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, |
98 3, 3, 3, 3, 3, | 95 3, 3, /* illegal in Unicode */ |
99 3, 3, 3, /* illegal in Unicode */ | 96 4, 4, 4, 4, /* illegal in Unicode */ |
100 4, 4, 4, 4, /* illegal in Unicode */ | 97 5, 5, /* illegal in Unicode */ |
101 5, 5, /* illegal in Unicode */ | 98 0, 0 /* illegal bytes 0xfe and 0xff */ |
102 0, 0 /* illegal bytes 0xfe and 0xff */ | |
103 }; | 99 }; |
104 | 100 |
105 static const UChar32 | 101 static const UChar32 |
106 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; | 102 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; |
107 | 103 |
108 static const UChar32 | 104 static const UChar32 |
109 utf8_errorValue[6]={ | 105 utf8_errorValue[6]={ |
110 CBUTF8_ERROR_VALUE_1, CBUTF8_ERROR_VALUE_2, CBUTF_ERROR_VALUE, 0x10ffff, | 106 CBUTF8_ERROR_VALUE_1, CBUTF8_ERROR_VALUE_2, CBUTF_ERROR_VALUE, 0x10ffff, |
111 0x3ffffff, 0x7fffffff | 107 0x3ffffff, 0x7fffffff |
112 }; | 108 }; |
(...skipping 13 matching lines...) Expand all Loading... |
126 * Same as the obsolete "safe" behavior, but non-characters are also treated | 122 * Same as the obsolete "safe" behavior, but non-characters are also treated |
127 * like illegal sequences. | 123 * like illegal sequences. |
128 * | 124 * |
129 * The special negative (<0) value -2 is used for lenient treatment of surrogate | 125 * The special negative (<0) value -2 is used for lenient treatment of surrogate |
130 * code points as legal. Some implementations use this for roundtripping of | 126 * code points as legal. Some implementations use this for roundtripping of |
131 * Unicode 16-bit strings that are not well-formed UTF-16, that is, they | 127 * Unicode 16-bit strings that are not well-formed UTF-16, that is, they |
132 * contain unpaired surrogates. | 128 * contain unpaired surrogates. |
133 * | 129 * |
134 * Note that a UBool is the same as an int8_t. | 130 * Note that a UBool is the same as an int8_t. |
135 */ | 131 */ |
136 UChar32 | 132 UChar32 utf8_nextCharSafeBody(const uint8_t* s, |
137 utf8_nextCharSafeBody(const uint8 *s, int32 *pi, int32 length, UChar32 c, UBool
strict) { | 133 int32_t* pi, |
138 int32 i=*pi; | 134 int32_t length, |
139 uint8 count=CBU8_COUNT_TRAIL_BYTES(c); | 135 UChar32 c, |
| 136 UBool strict) { |
| 137 int32_t i = *pi; |
| 138 uint8_t count = CBU8_COUNT_TRAIL_BYTES(c); |
140 if((i)+count<=(length)) { | 139 if((i)+count<=(length)) { |
141 uint8 trail, illegal=0; | 140 uint8_t trail, illegal = 0; |
142 | 141 |
143 CBU8_MASK_LEAD_BYTE((c), count); | 142 CBU8_MASK_LEAD_BYTE((c), count); |
144 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe
and 0xff */ | 143 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe
and 0xff */ |
145 switch(count) { | 144 switch(count) { |
146 /* each branch falls through to the next one */ | 145 /* each branch falls through to the next one */ |
147 case 5: | 146 case 5: |
148 case 4: | 147 case 4: |
149 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode
's UTF-8 */ | 148 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode
's UTF-8 */ |
150 illegal=1; | 149 illegal=1; |
151 break; | 150 break; |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
185 * Starting with Unicode 3.2, surrogate code points must not be | 184 * Starting with Unicode 3.2, surrogate code points must not be |
186 * encoded in UTF-8, and there are no irregular sequences any more. | 185 * encoded in UTF-8, and there are no irregular sequences any more. |
187 * | 186 * |
188 * U8_ macros (new in ICU 2.4) return negative values for error conditio
ns. | 187 * U8_ macros (new in ICU 2.4) return negative values for error conditio
ns. |
189 */ | 188 */ |
190 | 189 |
191 /* correct sequence - all trail bytes have (b7..b6)==(10)? */ | 190 /* correct sequence - all trail bytes have (b7..b6)==(10)? */ |
192 /* illegal is also set if count>=4 */ | 191 /* illegal is also set if count>=4 */ |
193 if(illegal || (c)<utf8_minLegal[count] || (CBU_IS_SURROGATE(c) && strict
!=-2)) { | 192 if(illegal || (c)<utf8_minLegal[count] || (CBU_IS_SURROGATE(c) && strict
!=-2)) { |
194 /* error handling */ | 193 /* error handling */ |
195 uint8 errorCount=count; | 194 uint8_t errorCount = count; |
196 /* don't go beyond this sequence */ | 195 /* don't go beyond this sequence */ |
197 i=*pi; | 196 i=*pi; |
198 while(count>0 && CBU8_IS_TRAIL(s[i])) { | 197 while(count>0 && CBU8_IS_TRAIL(s[i])) { |
199 ++(i); | 198 ++(i); |
200 --count; | 199 --count; |
201 } | 200 } |
202 if(strict>=0) { | 201 if(strict>=0) { |
203 c=utf8_errorValue[errorCount-count]; | 202 c=utf8_errorValue[errorCount-count]; |
204 } else { | 203 } else { |
205 c=CBU_SENTINEL; | 204 c=CBU_SENTINEL; |
206 } | 205 } |
207 } else if((strict)>0 && CBU_IS_UNICODE_NONCHAR(c)) { | 206 } else if((strict)>0 && CBU_IS_UNICODE_NONCHAR(c)) { |
208 /* strict: forbid non-characters like U+fffe */ | 207 /* strict: forbid non-characters like U+fffe */ |
209 c=utf8_errorValue[count]; | 208 c=utf8_errorValue[count]; |
210 } | 209 } |
211 } else /* too few bytes left */ { | 210 } else /* too few bytes left */ { |
212 /* error handling */ | 211 /* error handling */ |
213 int32 i0=i; | 212 int32_t i0 = i; |
214 /* don't just set (i)=(length) in case there is an illegal sequence */ | 213 /* don't just set (i)=(length) in case there is an illegal sequence */ |
215 while((i)<(length) && CBU8_IS_TRAIL(s[i])) { | 214 while((i)<(length) && CBU8_IS_TRAIL(s[i])) { |
216 ++(i); | 215 ++(i); |
217 } | 216 } |
218 if(strict>=0) { | 217 if(strict>=0) { |
219 c=utf8_errorValue[i-i0]; | 218 c=utf8_errorValue[i-i0]; |
220 } else { | 219 } else { |
221 c=CBU_SENTINEL; | 220 c=CBU_SENTINEL; |
222 } | 221 } |
223 } | 222 } |
224 *pi=i; | 223 *pi=i; |
225 return c; | 224 return c; |
226 } | 225 } |
227 | 226 |
228 } // namespace base_icu | 227 } // namespace base_icu |
OLD | NEW |