OLD | NEW |
| (Empty) |
1 /* | |
2 * The authors of this software are Rob Pike and Ken Thompson. | |
3 * Copyright (c) 2002 by Lucent Technologies. | |
4 * Permission to use, copy, modify, and distribute this software for any | |
5 * purpose without fee is hereby granted, provided that this entire notice | |
6 * is included in all copies of any software which is or includes a copy | |
7 * or modification of this software and in all copies of the supporting | |
8 * documentation for such software. | |
9 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED | |
10 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY | |
11 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY | |
12 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. | |
13 */ | |
14 #include <stdarg.h> | |
15 #include <string.h> | |
16 #include "utf.h" | |
17 #include "utfdef.h" | |
18 | |
19 enum | |
20 { | |
21 Bit1 = 7, | |
22 Bitx = 6, | |
23 Bit2 = 5, | |
24 Bit3 = 4, | |
25 Bit4 = 3, | |
26 Bit5 = 2, | |
27 | |
28 T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ | |
29 Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ | |
30 T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ | |
31 T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ | |
32 T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ | |
33 T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ | |
34 | |
35 Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ | |
36 Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ | |
37 Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ | |
38 Rune4 = (1<<(Bit4+3*Bitx))-1, | |
39 /* 0001 1111 1111 1111 1111 1111 */ | |
40 | |
41 Maskx = (1<<Bitx)-1, /* 0011 1111 */ | |
42 Testx = Maskx ^ 0xFF, /* 1100 0000 */ | |
43 | |
44 Bad = Runeerror, | |
45 }; | |
46 | |
47 /* | |
48 * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24 | |
49 * This is a slower but "safe" version of the old chartorune | |
50 * that works on strings that are not necessarily null-terminated. | |
51 * | |
52 * If you know for sure that your string is null-terminated, | |
53 * chartorune will be a bit faster. | |
54 * | |
55 * It is guaranteed not to attempt to access "length" | |
56 * past the incoming pointer. This is to avoid | |
57 * possible access violations. If the string appears to be | |
58 * well-formed but incomplete (i.e., to get the whole Rune | |
59 * we'd need to read past str+length) then we'll set the Rune | |
60 * to Bad and return 0. | |
61 * | |
62 * Note that if we have decoding problems for other | |
63 * reasons, we return 1 instead of 0. | |
64 */ | |
65 int | |
66 charntorune(Rune *rune, const char *str, int length) | |
67 { | |
68 int c, c1, c2, c3; | |
69 long l; | |
70 | |
71 /* When we're not allowed to read anything */ | |
72 if(length <= 0) { | |
73 goto badlen; | |
74 } | |
75 | |
76 /* | |
77 * one character sequence (7-bit value) | |
78 * 00000-0007F => T1 | |
79 */ | |
80 c = *(uchar*)str; | |
81 if(c < Tx) { | |
82 *rune = c; | |
83 return 1; | |
84 } | |
85 | |
86 // If we can't read more than one character we must stop | |
87 if(length <= 1) { | |
88 goto badlen; | |
89 } | |
90 | |
91 /* | |
92 * two character sequence (11-bit value) | |
93 * 0080-07FF => T2 Tx | |
94 */ | |
95 c1 = *(uchar*)(str+1) ^ Tx; | |
96 if(c1 & Testx) | |
97 goto bad; | |
98 if(c < T3) { | |
99 if(c < T2) | |
100 goto bad; | |
101 l = ((c << Bitx) | c1) & Rune2; | |
102 if(l <= Rune1) | |
103 goto bad; | |
104 *rune = l; | |
105 return 2; | |
106 } | |
107 | |
108 // If we can't read more than two characters we must stop | |
109 if(length <= 2) { | |
110 goto badlen; | |
111 } | |
112 | |
113 /* | |
114 * three character sequence (16-bit value) | |
115 * 0800-FFFF => T3 Tx Tx | |
116 */ | |
117 c2 = *(uchar*)(str+2) ^ Tx; | |
118 if(c2 & Testx) | |
119 goto bad; | |
120 if(c < T4) { | |
121 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; | |
122 if(l <= Rune2) | |
123 goto bad; | |
124 *rune = l; | |
125 return 3; | |
126 } | |
127 | |
128 if (length <= 3) | |
129 goto badlen; | |
130 | |
131 /* | |
132 * four character sequence (21-bit value) | |
133 * 10000-1FFFFF => T4 Tx Tx Tx | |
134 */ | |
135 c3 = *(uchar*)(str+3) ^ Tx; | |
136 if (c3 & Testx) | |
137 goto bad; | |
138 if (c < T5) { | |
139 l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4
; | |
140 if (l <= Rune3) | |
141 goto bad; | |
142 *rune = l; | |
143 return 4; | |
144 } | |
145 | |
146 // Support for 5-byte or longer UTF-8 would go here, but | |
147 // since we don't have that, we'll just fall through to bad. | |
148 | |
149 /* | |
150 * bad decoding | |
151 */ | |
152 bad: | |
153 *rune = Bad; | |
154 return 1; | |
155 badlen: | |
156 *rune = Bad; | |
157 return 0; | |
158 | |
159 } | |
160 | |
161 | |
162 /* | |
163 * This is the older "unsafe" version, which works fine on | |
164 * null-terminated strings. | |
165 */ | |
166 int | |
167 chartorune(Rune *rune, const char *str) | |
168 { | |
169 int c, c1, c2, c3; | |
170 long l; | |
171 | |
172 /* | |
173 * one character sequence | |
174 * 00000-0007F => T1 | |
175 */ | |
176 c = *(uchar*)str; | |
177 if(c < Tx) { | |
178 *rune = c; | |
179 return 1; | |
180 } | |
181 | |
182 /* | |
183 * two character sequence | |
184 * 0080-07FF => T2 Tx | |
185 */ | |
186 c1 = *(uchar*)(str+1) ^ Tx; | |
187 if(c1 & Testx) | |
188 goto bad; | |
189 if(c < T3) { | |
190 if(c < T2) | |
191 goto bad; | |
192 l = ((c << Bitx) | c1) & Rune2; | |
193 if(l <= Rune1) | |
194 goto bad; | |
195 *rune = l; | |
196 return 2; | |
197 } | |
198 | |
199 /* | |
200 * three character sequence | |
201 * 0800-FFFF => T3 Tx Tx | |
202 */ | |
203 c2 = *(uchar*)(str+2) ^ Tx; | |
204 if(c2 & Testx) | |
205 goto bad; | |
206 if(c < T4) { | |
207 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; | |
208 if(l <= Rune2) | |
209 goto bad; | |
210 *rune = l; | |
211 return 3; | |
212 } | |
213 | |
214 /* | |
215 * four character sequence (21-bit value) | |
216 * 10000-1FFFFF => T4 Tx Tx Tx | |
217 */ | |
218 c3 = *(uchar*)(str+3) ^ Tx; | |
219 if (c3 & Testx) | |
220 goto bad; | |
221 if (c < T5) { | |
222 l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4
; | |
223 if (l <= Rune3) | |
224 goto bad; | |
225 *rune = l; | |
226 return 4; | |
227 } | |
228 | |
229 /* | |
230 * Support for 5-byte or longer UTF-8 would go here, but | |
231 * since we don't have that, we'll just fall through to bad. | |
232 */ | |
233 | |
234 /* | |
235 * bad decoding | |
236 */ | |
237 bad: | |
238 *rune = Bad; | |
239 return 1; | |
240 } | |
241 | |
242 int | |
243 isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) { | |
244 *consumed = charntorune(rune, str, length); | |
245 return *rune != Runeerror || *consumed == 3; | |
246 } | |
247 | |
248 int | |
249 runetochar(char *str, const Rune *rune) | |
250 { | |
251 /* Runes are signed, so convert to unsigned for range check. */ | |
252 unsigned long c; | |
253 | |
254 /* | |
255 * one character sequence | |
256 * 00000-0007F => 00-7F | |
257 */ | |
258 c = *rune; | |
259 if(c <= Rune1) { | |
260 str[0] = c; | |
261 return 1; | |
262 } | |
263 | |
264 /* | |
265 * two character sequence | |
266 * 0080-07FF => T2 Tx | |
267 */ | |
268 if(c <= Rune2) { | |
269 str[0] = T2 | (c >> 1*Bitx); | |
270 str[1] = Tx | (c & Maskx); | |
271 return 2; | |
272 } | |
273 | |
274 /* | |
275 * If the Rune is out of range, convert it to the error rune. | |
276 * Do this test here because the error rune encodes to three bytes. | |
277 * Doing it earlier would duplicate work, since an out of range | |
278 * Rune wouldn't have fit in one or two bytes. | |
279 */ | |
280 if (c > Runemax) | |
281 c = Runeerror; | |
282 | |
283 /* | |
284 * three character sequence | |
285 * 0800-FFFF => T3 Tx Tx | |
286 */ | |
287 if (c <= Rune3) { | |
288 str[0] = T3 | (c >> 2*Bitx); | |
289 str[1] = Tx | ((c >> 1*Bitx) & Maskx); | |
290 str[2] = Tx | (c & Maskx); | |
291 return 3; | |
292 } | |
293 | |
294 /* | |
295 * four character sequence (21-bit value) | |
296 * 10000-1FFFFF => T4 Tx Tx Tx | |
297 */ | |
298 str[0] = T4 | (c >> 3*Bitx); | |
299 str[1] = Tx | ((c >> 2*Bitx) & Maskx); | |
300 str[2] = Tx | ((c >> 1*Bitx) & Maskx); | |
301 str[3] = Tx | (c & Maskx); | |
302 return 4; | |
303 } | |
304 | |
305 int | |
306 runelen(Rune rune) | |
307 { | |
308 char str[10]; | |
309 | |
310 return runetochar(str, &rune); | |
311 } | |
312 | |
313 int | |
314 runenlen(const Rune *r, int nrune) | |
315 { | |
316 int nb, c; | |
317 | |
318 nb = 0; | |
319 while(nrune--) { | |
320 c = *r++; | |
321 if (c <= Rune1) | |
322 nb++; | |
323 else if (c <= Rune2) | |
324 nb += 2; | |
325 else if (c <= Rune3) | |
326 nb += 3; | |
327 else /* assert(c <= Rune4) */ | |
328 nb += 4; | |
329 } | |
330 return nb; | |
331 } | |
332 | |
333 int | |
334 fullrune(const char *str, int n) | |
335 { | |
336 if (n > 0) { | |
337 int c = *(uchar*)str; | |
338 if (c < Tx) | |
339 return 1; | |
340 if (n > 1) { | |
341 if (c < T3) | |
342 return 1; | |
343 if (n > 2) { | |
344 if (c < T4 || n > 3) | |
345 return 1; | |
346 } | |
347 } | |
348 } | |
349 return 0; | |
350 } | |
OLD | NEW |