OLD | NEW |
1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 // | 4 // |
5 // This file was generated at 2014-10-08 15:25:47.940335 | 5 // This file was generated at 2014-10-08 15:25:47.940335 |
6 | 6 |
7 #include "src/unicode-inl.h" | 7 #include "src/unicode-inl.h" |
8 #include <stdio.h> | 8 #include <stdio.h> |
9 #include <stdlib.h> | 9 #include <stdlib.h> |
10 | 10 |
(...skipping 172 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
183 return 0; | 183 return 0; |
184 } | 184 } |
185 return -1; | 185 return -1; |
186 } | 186 } |
187 } else { | 187 } else { |
188 return 0; | 188 return 0; |
189 } | 189 } |
190 } | 190 } |
191 | 191 |
192 | 192 |
// NEW column: NonASCIISequenceLength maps the first byte of a UTF-8 sequence
// to the total number of bytes in that sequence through a 256-entry table
// indexed by the byte value.
// Returns 0 for every byte that cannot start a valid multi-byte sequence:
// ASCII (0x00-0x7F), continuation bytes (0x80-0xBF), the lead bytes 0xC0 and
// 0xC1, and 0xF5-0xFF. Returns 2 for 0xC2-0xDF, 3 for 0xE0-0xEF and 4 for
// 0xF0-0xF4.
// OLD column: the rows below also carry the opening lines of the previous
// Utf8::CalculateValue implementation.
193 uchar Utf8::CalculateValue(const byte* str, size_t length, size_t* cursor) { | 193 static inline size_t NonASCIISequenceLength(byte first) { |
194 // We only get called for non-ASCII characters. | 194 // clang-format off |
195 if (length == 1) { | 195 static const uint8_t lengths[256] = { |
| 196 // The first 128 entries correspond to ASCII characters. |
| 197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 203 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 204 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 205 // The following 64 entries correspond to continuation bytes. |
| 206 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 207 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 208 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 209 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
// NEW: the next two rows cover indices 0xC0-0xDF; only 0xC0 and 0xC1 are 0.
| 210 // The next are two invalid overlong encodings and 30 two-byte sequences. |
| 211 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 212 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
// NEW: indices 0xE0-0xEF.
| 213 // 16 three-byte sequences. |
| 214 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
// NEW: indices 0xF0-0xFF; 0xF0-0xF4 map to 4 and 0xF5-0xFF map to 0.
| 215 // 5 four-byte sequences, followed by sequences that could only encode |
| 216 // code points outside of the unicode range. |
| 217 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; |
| 218 // clang-format on |
| 219 return lengths[first]; |
| 220 } |
| 221 |
| 222 |
// NEW column: reports whether chr is a UTF-8 continuation byte, that is,
// whether it lies in the inclusive range 0x80-0xBF (bit pattern 10xxxxxx).
| 223 static inline bool IsContinuationCharacter(byte chr) { |
| 224 return chr >= 0x80 && chr <= 0xBF; |
| 225 } |
| 226 |
| 227 |
// NEW column: Utf8::CalculateValue decodes the multi-byte UTF-8 sequence whose
// lead byte is str[0].
//   str        - points at the lead byte of the sequence.
//   max_length - upper bound on the sequence length; a lead byte announcing
//                more bytes than this is rejected (presumably the number of
//                bytes available at str; confirm against callers).
//   cursor     - advanced by the sequence length (2, 3 or 4) on success and
//                by exactly 1 on every failure path.
// Returns the decoded code point, or kBadChar when the lead byte has table
// length 0, the sequence exceeds max_length, a trailing byte is not a
// continuation byte, the encoding is overlong, the value is a surrogate half,
// or the value lies outside the Unicode range.
// OLD column: the previous implementation, shown alongside, stripped the 0x80
// marker from each trailing byte with XOR, rejected trailing bytes that still
// had either of the top two bits set, and rejected overlong forms by comparing
// the decoded value against kMaxOneByteChar, kMaxTwoByteChar and
// kMaxThreeByteChar.
| 228 // This method decodes a UTF-8 value according to RFC 3629. |
| 229 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) { |
| 230 size_t length = NonASCIISequenceLength(str[0]); |
// NEW: length 0 marks an invalid lead byte and max_length < length marks a
// sequence that does not fit; both consume a single byte.
| 231 if (length == 0 || max_length < length) { |
196 *cursor += 1; | 232 *cursor += 1; |
197 return kBadChar; | 233 return kBadChar; |
198 } | 234 } |
199 byte first = str[0]; | 235 if (length == 2) { |
200 byte second = str[1] ^ 0x80; | 236 if (!IsContinuationCharacter(str[1])) { |
201 if (second & 0xC0) { | |
202 *cursor += 1; | |
203 return kBadChar; | |
204 } | |
205 if (first < 0xE0) { | |
206 if (first < 0xC0) { | |
207 *cursor += 1; | |
208 return kBadChar; | |
209 } | |
210 uchar code_point = ((first << 6) | second) & kMaxTwoByteChar; | |
211 if (code_point <= kMaxOneByteChar) { | |
212 *cursor += 1; | 237 *cursor += 1; |
213 return kBadChar; | 238 return kBadChar; |
214 } | 239 } |
215 *cursor += 2; | 240 *cursor += 2; |
// NEW: 0x3080 == (0xC0 << 6) + 0x80, so one subtraction removes the marker
// bits of both the lead byte and the continuation byte.
216 return code_point; | 241 return ((str[0] << 6) + str[1]) - 0x00003080; |
217 } | 242 } |
218 if (length == 2) { | 243 if (length == 3) { |
// NEW: the switch validates the second byte, whose allowed range depends on
// the lead byte.
219 *cursor += 1; | 244 switch (str[0]) { |
// NEW: after 0xE0 a second byte below 0xA0 would decode to a value below
// U+0800, which fits in a shorter encoding.
220 return kBadChar; | 245 case 0xE0: |
221 } | 246 // Overlong three-byte sequence. |
222 byte third = str[2] ^ 0x80; | 247 if (str[1] < 0xA0 || str[1] > 0xBF) { |
223 if (third & 0xC0) { | 248 *cursor += 1; |
224 *cursor += 1; | 249 return kBadChar; |
225 return kBadChar; | 250 } |
226 } | 251 break; |
// NEW: after 0xED a second byte above 0x9F would decode to U+D800-U+DFFF.
227 if (first < 0xF0) { | 252 case 0xED: |
228 uchar code_point = ((((first << 6) | second) << 6) | third) | 253 // High and low surrogate halves. |
229 & kMaxThreeByteChar; | 254 if (str[1] < 0x80 || str[1] > 0x9F) { |
230 if (code_point <= kMaxTwoByteChar) { | 255 *cursor += 1; |
| 256 return kBadChar; |
| 257 } |
| 258 break; |
| 259 default: |
| 260 if (!IsContinuationCharacter(str[1])) { |
| 261 *cursor += 1; |
| 262 return kBadChar; |
| 263 } |
| 264 } |
| 265 if (!IsContinuationCharacter(str[2])) { |
231 *cursor += 1; | 266 *cursor += 1; |
232 return kBadChar; | 267 return kBadChar; |
233 } | 268 } |
234 *cursor += 3; | 269 *cursor += 3; |
// NEW: 0xE2080 == (0xE0 << 12) + (0x80 << 6) + 0x80.
235 return code_point; | 270 return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080; |
236 } | 271 } |
// NEW: lengths 0, 2 and 3 have returned above, so only 4 remains.
237 if (length == 3) { | 272 DCHECK(length == 4); |
| 273 switch (str[0]) { |
// NEW: after 0xF0 a second byte below 0x90 would decode to a value below
// U+10000, which fits in a shorter encoding.
| 274 case 0xF0: |
| 275 // Overlong four-byte sequence. |
| 276 if (str[1] < 0x90 || str[1] > 0xBF) { |
| 277 *cursor += 1; |
| 278 return kBadChar; |
| 279 } |
| 280 break; |
// NEW: after 0xF4 a second byte above 0x8F would decode to a value above
// U+10FFFF.
| 281 case 0xF4: |
| 282 // Code points outside of the unicode range. |
| 283 if (str[1] < 0x80 || str[1] > 0x8F) { |
| 284 *cursor += 1; |
| 285 return kBadChar; |
| 286 } |
| 287 break; |
| 288 default: |
| 289 if (!IsContinuationCharacter(str[1])) { |
| 290 *cursor += 1; |
| 291 return kBadChar; |
| 292 } |
| 293 } |
| 294 if (!IsContinuationCharacter(str[2])) { |
238 *cursor += 1; | 295 *cursor += 1; |
239 return kBadChar; | 296 return kBadChar; |
240 } | 297 } |
241 byte fourth = str[3] ^ 0x80; | 298 if (!IsContinuationCharacter(str[3])) { |
242 if (fourth & 0xC0) { | |
243 *cursor += 1; | 299 *cursor += 1; |
244 return kBadChar; | 300 return kBadChar; |
245 } | 301 } |
246 if (first < 0xF8) { | 302 *cursor += 4; |
// NEW: 0x3C82080 == (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80.
247 uchar code_point = (((((first << 6 | second) << 6) | third) << 6) | fourth) | 303 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - |
248 & kMaxFourByteChar; | 304 0x03C82080; |
249 if (code_point <= kMaxThreeByteChar) { | |
250 *cursor += 1; | |
251 return kBadChar; | |
252 } | |
253 *cursor += 4; | |
254 return code_point; | |
255 } | |
256 *cursor += 1; | |
257 return kBadChar; | |
258 } | 305 } |
259 | 306 |
260 | 307 |
261 // Uppercase: point.category == 'Lu' | 308 // Uppercase: point.category == 'Lu' |
262 | 309 |
263 static const uint16_t kUppercaseTable0Size = 455; | 310 static const uint16_t kUppercaseTable0Size = 455; |
264 static const int32_t kUppercaseTable0[455] = { | 311 static const int32_t kUppercaseTable0[455] = { |
265 1073741889, 90, 1073742016, 214, | 312 1073741889, 90, 1073742016, 214, |
266 1073742040, 222, 256, 258, // NOLINT | 313 1073742040, 222, 256, 258, // NOLINT |
267 260, 262, 264, 266, | 314 260, 262, 264, 266, |
(...skipping 3117 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3385 sizeof(MultiCharacterSpecialCase<1>) // NOLINT | 3432 sizeof(MultiCharacterSpecialCase<1>) // NOLINT |
3386 + | 3433 + |
3387 kCanonicalizationRangeMultiStrings1Size * | 3434 kCanonicalizationRangeMultiStrings1Size * |
3388 sizeof(MultiCharacterSpecialCase<1>) // NOLINT | 3435 sizeof(MultiCharacterSpecialCase<1>) // NOLINT |
3389 + | 3436 + |
3390 kCanonicalizationRangeMultiStrings7Size * | 3437 kCanonicalizationRangeMultiStrings7Size * |
3391 sizeof(MultiCharacterSpecialCase<1>); // NOLINT | 3438 sizeof(MultiCharacterSpecialCase<1>); // NOLINT |
3392 } | 3439 } |
3393 | 3440 |
3394 } // namespace unibrow | 3441 } // namespace unibrow |
OLD | NEW |