OLD | NEW |
| (Empty) |
1 /* | |
2 * The authors of this software are Rob Pike and Ken Thompson. | |
3 * Copyright (c) 2002 by Lucent Technologies. | |
4 * Permission to use, copy, modify, and distribute this software for any | |
5 * purpose without fee is hereby granted, provided that this entire notice | |
6 * is included in all copies of any software which is or includes a copy | |
7 * or modification of this software and in all copies of the supporting | |
8 * documentation for such software. | |
9 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED | |
10 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE AN
Y | |
11 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY | |
12 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. | |
13 */ | |
14 #include <stdarg.h> | |
15 #include <string.h> | |
16 #include "util/utf.h" | |
17 | |
18 namespace re2 { | |
19 | |
20 enum | |
21 { | |
22 Bit1 = 7, | |
23 Bitx = 6, | |
24 Bit2 = 5, | |
25 Bit3 = 4, | |
26 Bit4 = 3, | |
27 Bit5 = 2, | |
28 | |
29 T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ | |
30 Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ | |
31 T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ | |
32 T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ | |
33 T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ | |
34 T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ | |
35 | |
36 Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ | |
37 Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ | |
38 Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ | |
39 Rune4 = (1<<(Bit4+3*Bitx))-1, | |
40 /* 0001 1111 1111 1111 1111 1111 */ | |
41 | |
42 Maskx = (1<<Bitx)-1, /* 0011 1111 */ | |
43 Testx = Maskx ^ 0xFF, /* 1100 0000 */ | |
44 | |
45 Bad = Runeerror, | |
46 }; | |
47 | |
48 int | |
49 chartorune(Rune *rune, const char *str) | |
50 { | |
51 int c, c1, c2, c3; | |
52 long l; | |
53 | |
54 /* | |
55 * one character sequence | |
56 * 00000-0007F => T1 | |
57 */ | |
58 c = *(unsigned char*)str; | |
59 if(c < Tx) { | |
60 *rune = c; | |
61 return 1; | |
62 } | |
63 | |
64 /* | |
65 * two character sequence | |
66 * 0080-07FF => T2 Tx | |
67 */ | |
68 c1 = *(unsigned char*)(str+1) ^ Tx; | |
69 if(c1 & Testx) | |
70 goto bad; | |
71 if(c < T3) { | |
72 if(c < T2) | |
73 goto bad; | |
74 l = ((c << Bitx) | c1) & Rune2; | |
75 if(l <= Rune1) | |
76 goto bad; | |
77 *rune = l; | |
78 return 2; | |
79 } | |
80 | |
81 /* | |
82 * three character sequence | |
83 * 0800-FFFF => T3 Tx Tx | |
84 */ | |
85 c2 = *(unsigned char*)(str+2) ^ Tx; | |
86 if(c2 & Testx) | |
87 goto bad; | |
88 if(c < T4) { | |
89 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; | |
90 if(l <= Rune2) | |
91 goto bad; | |
92 *rune = l; | |
93 return 3; | |
94 } | |
95 | |
96 /* | |
97 * four character sequence (21-bit value) | |
98 * 10000-1FFFFF => T4 Tx Tx Tx | |
99 */ | |
100 c3 = *(unsigned char*)(str+3) ^ Tx; | |
101 if (c3 & Testx) | |
102 goto bad; | |
103 if (c < T5) { | |
104 l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4
; | |
105 if (l <= Rune3) | |
106 goto bad; | |
107 *rune = l; | |
108 return 4; | |
109 } | |
110 | |
111 /* | |
112 * Support for 5-byte or longer UTF-8 would go here, but | |
113 * since we don't have that, we'll just fall through to bad. | |
114 */ | |
115 | |
116 /* | |
117 * bad decoding | |
118 */ | |
119 bad: | |
120 *rune = Bad; | |
121 return 1; | |
122 } | |
123 | |
124 int | |
125 runetochar(char *str, const Rune *rune) | |
126 { | |
127 /* Runes are signed, so convert to unsigned for range check. */ | |
128 unsigned long c; | |
129 | |
130 /* | |
131 * one character sequence | |
132 * 00000-0007F => 00-7F | |
133 */ | |
134 c = *rune; | |
135 if(c <= Rune1) { | |
136 str[0] = static_cast<char>(c); | |
137 return 1; | |
138 } | |
139 | |
140 /* | |
141 * two character sequence | |
142 * 0080-07FF => T2 Tx | |
143 */ | |
144 if(c <= Rune2) { | |
145 str[0] = T2 | static_cast<char>(c >> 1*Bitx); | |
146 str[1] = Tx | (c & Maskx); | |
147 return 2; | |
148 } | |
149 | |
150 /* | |
151 * If the Rune is out of range, convert it to the error rune. | |
152 * Do this test here because the error rune encodes to three bytes. | |
153 * Doing it earlier would duplicate work, since an out of range | |
154 * Rune wouldn't have fit in one or two bytes. | |
155 */ | |
156 if (c > Runemax) | |
157 c = Runeerror; | |
158 | |
159 /* | |
160 * three character sequence | |
161 * 0800-FFFF => T3 Tx Tx | |
162 */ | |
163 if (c <= Rune3) { | |
164 str[0] = T3 | static_cast<char>(c >> 2*Bitx); | |
165 str[1] = Tx | ((c >> 1*Bitx) & Maskx); | |
166 str[2] = Tx | (c & Maskx); | |
167 return 3; | |
168 } | |
169 | |
170 /* | |
171 * four character sequence (21-bit value) | |
172 * 10000-1FFFFF => T4 Tx Tx Tx | |
173 */ | |
174 str[0] = T4 | static_cast<char>(c >> 3*Bitx); | |
175 str[1] = Tx | ((c >> 2*Bitx) & Maskx); | |
176 str[2] = Tx | ((c >> 1*Bitx) & Maskx); | |
177 str[3] = Tx | (c & Maskx); | |
178 return 4; | |
179 } | |
180 | |
181 int | |
182 runelen(Rune rune) | |
183 { | |
184 char str[10]; | |
185 | |
186 return runetochar(str, &rune); | |
187 } | |
188 | |
189 int | |
190 fullrune(const char *str, int n) | |
191 { | |
192 if (n > 0) { | |
193 int c = *(unsigned char*)str; | |
194 if (c < Tx) | |
195 return 1; | |
196 if (n > 1) { | |
197 if (c < T3) | |
198 return 1; | |
199 if (n > 2) { | |
200 if (c < T4 || n > 3) | |
201 return 1; | |
202 } | |
203 } | |
204 } | |
205 return 0; | |
206 } | |
207 | |
208 | |
209 int | |
210 utflen(const char *s) | |
211 { | |
212 int c; | |
213 long n; | |
214 Rune rune; | |
215 | |
216 n = 0; | |
217 for(;;) { | |
218 c = *(unsigned char*)s; | |
219 if(c < Runeself) { | |
220 if(c == 0) | |
221 return n; | |
222 s++; | |
223 } else | |
224 s += chartorune(&rune, s); | |
225 n++; | |
226 } | |
227 return 0; | |
228 } | |
229 | |
230 char* | |
231 utfrune(const char *s, Rune c) | |
232 { | |
233 long c1; | |
234 Rune r; | |
235 int n; | |
236 | |
237 if(c < Runesync) /* not part of utf sequence */ | |
238 return strchr((char*)s, c); | |
239 | |
240 for(;;) { | |
241 c1 = *(unsigned char*)s; | |
242 if(c1 < Runeself) { /* one byte rune */ | |
243 if(c1 == 0) | |
244 return 0; | |
245 if(c1 == c) | |
246 return (char*)s; | |
247 s++; | |
248 continue; | |
249 } | |
250 n = chartorune(&r, s); | |
251 if(r == c) | |
252 return (char*)s; | |
253 s += n; | |
254 } | |
255 return 0; | |
256 } | |
257 | |
258 } // namespace re2 | |
OLD | NEW |