OLD | NEW |
---|---|
1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 // | 4 // |
5 // This file was generated at 2014-10-08 15:25:47.940335 | 5 // This file was generated at 2014-10-08 15:25:47.940335 |
6 | 6 |
7 #include "src/unicode-inl.h" | 7 #include "src/unicode-inl.h" |
8 #include <stdio.h> | 8 #include <stdio.h> |
9 #include <stdlib.h> | 9 #include <stdlib.h> |
10 | 10 |
(...skipping 172 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
183 return 0; | 183 return 0; |
184 } | 184 } |
185 return -1; | 185 return -1; |
186 } | 186 } |
187 } else { | 187 } else { |
188 return 0; | 188 return 0; |
189 } | 189 } |
190 } | 190 } |
191 | 191 |
192 | 192 |
193 uchar Utf8::CalculateValue(const byte* str, size_t length, size_t* cursor) { | 193 static inline size_t NonASCIISequenceLength(byte first) { |
194 // We only get called for non-ASCII characters. | 194 // clang-format off |
195 if (length == 1) { | 195 static const uint8_t lengths[256] = { |
196 // The first 128 entries correspond to ASCII characters. | |
197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
203 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
204 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
205 // The following 64 entries correspond to continuation bytes. | |
206 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
207 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
208 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
209 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
210 // The next 32 entries are two invalid overlong encodings and 30 two-byte sequences. | |
211 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
212 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
213 // 16 three-byte sequences. | |
214 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, | |
215 // 5 four-byte sequences, followed by sequences that could only encode | |
216 // code points outside of the unicode range. | |
217 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; | |
218 // clang-format on | |
219 return lengths[first]; | |
220 } | |
221 | |
222 | |
223 static inline bool IsContinuationCharacter(byte chr) { | |
224 return chr >= 0x80 && chr <= 0xBF; | |
225 } | |
226 | |
227 | |
228 // This method decodes a UTF-8 value according to RFC 3629. | |
229 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) { | |
vogelheim
2015/05/22 15:53:38
Not sure if worth the effort, but this might be mo
jochen (gone - plz use gerrit)
2015/05/22 18:12:39
then I pick this version :)
| |
230 size_t length = NonASCIISequenceLength(str[0]); | |
231 if (length == 0 || max_length < length) { | |
196 *cursor += 1; | 232 *cursor += 1; |
197 return kBadChar; | 233 return kBadChar; |
198 } | 234 } |
199 byte first = str[0]; | 235 if (length == 2) { |
200 byte second = str[1] ^ 0x80; | 236 if (!IsContinuationCharacter(str[1])) { |
201 if (second & 0xC0) { | |
202 *cursor += 1; | |
203 return kBadChar; | |
204 } | |
205 if (first < 0xE0) { | |
206 if (first < 0xC0) { | |
207 *cursor += 1; | |
208 return kBadChar; | |
209 } | |
210 uchar code_point = ((first << 6) | second) & kMaxTwoByteChar; | |
211 if (code_point <= kMaxOneByteChar) { | |
212 *cursor += 1; | 237 *cursor += 1; |
213 return kBadChar; | 238 return kBadChar; |
214 } | 239 } |
215 *cursor += 2; | 240 *cursor += 2; |
216 return code_point; | 241 return ((str[0] << 6) + str[1]) - 0x00003080; |
217 } | 242 } |
218 if (length == 2) { | 243 if (length == 3) { |
219 *cursor += 1; | 244 switch (str[0]) { |
220 return kBadChar; | 245 case 0xE0: |
221 } | 246 // Overlong three-byte sequence. |
222 byte third = str[2] ^ 0x80; | 247 if (str[1] < 0xA0 || str[1] > 0xBF) { |
223 if (third & 0xC0) { | 248 *cursor += 1; |
224 *cursor += 1; | 249 return kBadChar; |
225 return kBadChar; | 250 } |
226 } | 251 break; |
227 if (first < 0xF0) { | 252 case 0xED: |
228 uchar code_point = ((((first << 6) | second) << 6) | third) | 253 // High and low surrogate halves. |
229 & kMaxThreeByteChar; | 254 if (str[1] < 0x80 || str[1] > 0x9F) { |
230 if (code_point <= kMaxTwoByteChar) { | 255 *cursor += 1; |
256 return kBadChar; | |
257 } | |
258 break; | |
259 default: | |
260 if (!IsContinuationCharacter(str[1])) { | |
261 *cursor += 1; | |
262 return kBadChar; | |
263 } | |
264 } | |
265 if (!IsContinuationCharacter(str[2])) { | |
231 *cursor += 1; | 266 *cursor += 1; |
232 return kBadChar; | 267 return kBadChar; |
233 } | 268 } |
234 *cursor += 3; | 269 *cursor += 3; |
235 return code_point; | 270 return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080; |
236 } | 271 } |
237 if (length == 3) { | 272 DCHECK(length == 4); |
273 switch (str[0]) { | |
274 case 0xF0: | |
275 // Overlong four-byte sequence. | |
276 if (str[1] < 0x90 || str[1] > 0xBF) { | |
277 *cursor += 1; | |
278 return kBadChar; | |
279 } | |
280 break; | |
281 case 0xF4: | |
282 // Code points outside of the unicode range.
vogelheim
2015/05/22 15:53:38
poits -> points
| |
283 if (str[1] < 0x80 || str[1] > 0x8F) { | |
284 *cursor += 1; | |
285 return kBadChar; | |
286 } | |
287 break; | |
288 default: | |
289 if (!IsContinuationCharacter(str[1])) { | |
290 *cursor += 1; | |
291 return kBadChar; | |
292 } | |
293 } | |
294 if (!IsContinuationCharacter(str[2])) { | |
238 *cursor += 1; | 295 *cursor += 1; |
239 return kBadChar; | 296 return kBadChar; |
240 } | 297 } |
241 byte fourth = str[3] ^ 0x80; | 298 if (!IsContinuationCharacter(str[3])) { |
242 if (fourth & 0xC0) { | |
243 *cursor += 1; | 299 *cursor += 1; |
244 return kBadChar; | 300 return kBadChar; |
245 } | 301 } |
246 if (first < 0xF8) { | 302 *cursor += 4; |
247 uchar code_point = (((((first << 6 | second) << 6) | third) << 6) | fourth) | 303 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - |
248 & kMaxFourByteChar; | 304 0x03C82080; |
249 if (code_point <= kMaxThreeByteChar) { | |
250 *cursor += 1; | |
251 return kBadChar; | |
252 } | |
253 *cursor += 4; | |
254 return code_point; | |
255 } | |
256 *cursor += 1; | |
257 return kBadChar; | |
258 } | 305 } |
259 | 306 |
260 | 307 |
261 // Uppercase: point.category == 'Lu' | 308 // Uppercase: point.category == 'Lu' |
262 | 309 |
263 static const uint16_t kUppercaseTable0Size = 455; | 310 static const uint16_t kUppercaseTable0Size = 455; |
264 static const int32_t kUppercaseTable0[455] = { | 311 static const int32_t kUppercaseTable0[455] = { |
265 1073741889, 90, 1073742016, 214, | 312 1073741889, 90, 1073742016, 214, |
266 1073742040, 222, 256, 258, // NOLINT | 313 1073742040, 222, 256, 258, // NOLINT |
267 260, 262, 264, 266, | 314 260, 262, 264, 266, |
(...skipping 3117 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3385 sizeof(MultiCharacterSpecialCase<1>) // NOLINT | 3432 sizeof(MultiCharacterSpecialCase<1>) // NOLINT |
3386 + | 3433 + |
3387 kCanonicalizationRangeMultiStrings1Size * | 3434 kCanonicalizationRangeMultiStrings1Size * |
3388 sizeof(MultiCharacterSpecialCase<1>) // NOLINT | 3435 sizeof(MultiCharacterSpecialCase<1>) // NOLINT |
3389 + | 3436 + |
3390 kCanonicalizationRangeMultiStrings7Size * | 3437 kCanonicalizationRangeMultiStrings7Size * |
3391 sizeof(MultiCharacterSpecialCase<1>); // NOLINT | 3438 sizeof(MultiCharacterSpecialCase<1>); // NOLINT |
3392 } | 3439 } |
3393 | 3440 |
3394 } // namespace unibrow | 3441 } // namespace unibrow |
OLD | NEW |