OLD | NEW |
---|---|
1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 // | 4 // |
5 // This file was generated at 2014-10-08 15:25:47.940335 | 5 // This file was generated at 2014-10-08 15:25:47.940335 |
6 | 6 |
7 #include "src/unicode-inl.h" | 7 #include "src/unicode-inl.h" |
8 #include <stdio.h> | 8 #include <stdio.h> |
9 #include <stdlib.h> | 9 #include <stdlib.h> |
10 | 10 |
(...skipping 172 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
183 return 0; | 183 return 0; |
184 } | 184 } |
185 return -1; | 185 return -1; |
186 } | 186 } |
187 } else { | 187 } else { |
188 return 0; | 188 return 0; |
189 } | 189 } |
190 } | 190 } |
191 | 191 |
192 | 192 |
193 uchar Utf8::CalculateValue(const byte* str, size_t length, size_t* cursor) { | 193 static inline size_t NonASCIISequenceLength(byte first) { |
194 // We only get called for non-ASCII characters. | 194 static const uint8_t lengths[256] = { |
195 if (length == 1) { | 195 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
196 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
200 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
201 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
202 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
203 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
204 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, | |
205 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; | |
vogelheim
2015/05/21 16:58:38
The table is difficult to read. It also leaves me
jochen (gone - plz use gerrit)
2015/05/22 12:38:12
I updated the matrix to be 16 x 16, and added comm
| |
206 return lengths[first]; | |
207 } | |
208 | |
209 | |
210 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) { | |
vogelheim
2015/05/21 16:58:38
This might also benefit from a unit test that will
vogelheim
2015/05/21 16:58:38
I believe this deserves some commentary, and if on
jochen (gone - plz use gerrit)
2015/05/22 12:38:12
yeah, actually, it's supposed to be consistent wit
| |
211 DCHECK((str[0] & 0x80) == 0x80); | |
212 size_t length = NonASCIISequenceLength(str[0]); | |
213 if (length == 0 || max_length < length) { | |
196 *cursor += 1; | 214 *cursor += 1; |
197 return kBadChar; | 215 return kBadChar; |
198 } | 216 } |
199 byte first = str[0]; | 217 if (length == 2) { |
vogelheim
2015/05/21 16:58:38
I was trying to figure out *why* these characters
vogelheim
2015/05/21 16:58:38
I find the code below to be somewhat confusing. If
jochen (gone - plz use gerrit)
2015/05/22 12:38:12
right. It's just that UTF-8 cannot encode all of u
| |
200 byte second = str[1] ^ 0x80; | 218 DCHECK(str[0] <= 0xDF); |
201 if (second & 0xC0) { | 219 if (str[0] < 0xC2) { |
202 *cursor += 1; | |
203 return kBadChar; | |
204 } | |
205 if (first < 0xE0) { | |
206 if (first < 0xC0) { | |
207 *cursor += 1; | 220 *cursor += 1; |
208 return kBadChar; | 221 return kBadChar; |
209 } | 222 } |
210 uchar code_point = ((first << 6) | second) & kMaxTwoByteChar; | 223 if (str[1] < 0x80 || str[1] > 0xBF) { |
211 if (code_point <= kMaxOneByteChar) { | |
212 *cursor += 1; | 224 *cursor += 1; |
213 return kBadChar; | 225 return kBadChar; |
214 } | 226 } |
215 *cursor += 2; | 227 *cursor += 2; |
216 return code_point; | 228 return ((str[0] << 6) + str[1]) - 0x00003080; |
217 } | 229 } |
218 if (length == 2) { | 230 if (length == 3) { |
219 *cursor += 1; | 231 DCHECK(str[0] >= 0xE0 && str[0] <= 0xEF); |
220 return kBadChar; | 232 switch (str[0]) { |
221 } | 233 case 0xE0: |
222 byte third = str[2] ^ 0x80; | 234 if (str[1] < 0xA0 || str[1] > 0xBF) { |
223 if (third & 0xC0) { | 235 *cursor += 1; |
224 *cursor += 1; | 236 return kBadChar; |
225 return kBadChar; | 237 } |
226 } | 238 break; |
227 if (first < 0xF0) { | 239 case 0xED: |
228 uchar code_point = ((((first << 6) | second) << 6) | third) | 240 if (str[1] < 0x80 || str[1] > 0x9F) { |
229 & kMaxThreeByteChar; | 241 *cursor += 1; |
230 if (code_point <= kMaxTwoByteChar) { | 242 return kBadChar; |
243 } | |
244 break; | |
245 default: | |
246 if (str[1] < 0x80 || str[1] > 0xBF) { | |
247 *cursor += 1; | |
248 return kBadChar; | |
249 } | |
250 } | |
251 if (str[2] < 0x80 || str[2] > 0xBF) { | |
231 *cursor += 1; | 252 *cursor += 1; |
232 return kBadChar; | 253 return kBadChar; |
233 } | 254 } |
234 *cursor += 3; | 255 *cursor += 3; |
235 return code_point; | 256 return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080; |
236 } | 257 } |
237 if (length == 3) { | 258 DCHECK(length == 4); |
259 DCHECK(str[0] >= 0xF0 && str[0] <= 0xF4); | |
260 switch (str[0]) { | |
261 case 0xF0: | |
262 if (str[1] < 0x90 || str[1] > 0xBF) { | |
263 *cursor += 1; | |
264 return kBadChar; | |
265 } | |
266 break; | |
267 case 0xF4: | |
268 if (str[1] < 0x80 || str[1] > 0x8F) { | |
269 *cursor += 1; | |
270 return kBadChar; | |
271 } | |
272 break; | |
273 default: | |
274 if (str[1] < 0x80 || str[1] > 0xBF) { | |
275 *cursor += 1; | |
276 return kBadChar; | |
277 } | |
278 } | |
279 if (str[2] < 0x80 || str[2] > 0xBF) { | |
238 *cursor += 1; | 280 *cursor += 1; |
239 return kBadChar; | 281 return kBadChar; |
240 } | 282 } |
241 byte fourth = str[3] ^ 0x80; | 283 if (str[3] < 0x80 || str[3] > 0xBF) { |
242 if (fourth & 0xC0) { | |
243 *cursor += 1; | 284 *cursor += 1; |
244 return kBadChar; | 285 return kBadChar; |
245 } | 286 } |
246 if (first < 0xF8) { | 287 *cursor += 4; |
247 uchar code_point = (((((first << 6 | second) << 6) | third) << 6) | fourth) | 288 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - |
248 & kMaxFourByteChar; | 289 0x03C82080; |
249 if (code_point <= kMaxThreeByteChar) { | |
250 *cursor += 1; | |
251 return kBadChar; | |
252 } | |
253 *cursor += 4; | |
254 return code_point; | |
255 } | |
256 *cursor += 1; | |
257 return kBadChar; | |
258 } | 290 } |
259 | 291 |
260 | 292 |
261 // Uppercase: point.category == 'Lu' | 293 // Uppercase: point.category == 'Lu' |
262 | 294 |
263 static const uint16_t kUppercaseTable0Size = 455; | 295 static const uint16_t kUppercaseTable0Size = 455; |
264 static const int32_t kUppercaseTable0[455] = { | 296 static const int32_t kUppercaseTable0[455] = { |
265 1073741889, 90, 1073742016, 214, | 297 1073741889, 90, 1073742016, 214, |
266 1073742040, 222, 256, 258, // NOLINT | 298 1073742040, 222, 256, 258, // NOLINT |
267 260, 262, 264, 266, | 299 260, 262, 264, 266, |
(...skipping 3117 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3385 sizeof(MultiCharacterSpecialCase<1>) // NOLINT | 3417 sizeof(MultiCharacterSpecialCase<1>) // NOLINT |
3386 + | 3418 + |
3387 kCanonicalizationRangeMultiStrings1Size * | 3419 kCanonicalizationRangeMultiStrings1Size * |
3388 sizeof(MultiCharacterSpecialCase<1>) // NOLINT | 3420 sizeof(MultiCharacterSpecialCase<1>) // NOLINT |
3389 + | 3421 + |
3390 kCanonicalizationRangeMultiStrings7Size * | 3422 kCanonicalizationRangeMultiStrings7Size * |
3391 sizeof(MultiCharacterSpecialCase<1>); // NOLINT | 3423 sizeof(MultiCharacterSpecialCase<1>); // NOLINT |
3392 } | 3424 } |
3393 | 3425 |
3394 } // namespace unibrow | 3426 } // namespace unibrow |
OLD | NEW |