Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 // | 4 // |
| 5 // This file was generated at 2014-10-08 15:25:47.940335 | 5 // This file was generated at 2014-10-08 15:25:47.940335 |
| 6 | 6 |
| 7 #include "src/unicode-inl.h" | 7 #include "src/unicode-inl.h" |
| 8 #include <stdio.h> | 8 #include <stdio.h> |
| 9 #include <stdlib.h> | 9 #include <stdlib.h> |
| 10 | 10 |
| (...skipping 172 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading... | |
| 183 return 0; | 183 return 0; |
| 184 } | 184 } |
| 185 return -1; | 185 return -1; |
| 186 } | 186 } |
| 187 } else { | 187 } else { |
| 188 return 0; | 188 return 0; |
| 189 } | 189 } |
| 190 } | 190 } |
| 191 | 191 |
| 192 | 192 |
| 193 uchar Utf8::CalculateValue(const byte* str, size_t length, size_t* cursor) { | 193 static inline size_t NonASCIISequenceLength(byte first) { |
| 194 // We only get called for non-ASCII characters. | 194 static const uint8_t lengths[256] = { |
| 195 if (length == 1) { | 195 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 196 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 200 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
| 201 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
| 202 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
| 203 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
| 204 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, | |
| 205 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; | |
|
vogelheim
2015/05/21 16:58:38
The table is difficult to read. It also leaves me
jochen (gone - plz use gerrit)
2015/05/22 12:38:12
I updated the matrix to be 16 x 16, and added comm
| |
| 206 return lengths[first]; | |
| 207 } | |
| 208 | |
| 209 | |
| 210 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) { | |
|
vogelheim
2015/05/21 16:58:38
This might also benefit from a unit test that will
vogelheim
2015/05/21 16:58:38
I believe this deserves some commentary, and if on
jochen (gone - plz use gerrit)
2015/05/22 12:38:12
yeah, actually, it's supposed to be consistent wit
| |
| 211 DCHECK((str[0] & 0x80) == 0x80); | |
| 212 size_t length = NonASCIISequenceLength(str[0]); | |
| 213 if (length == 0 || max_length < length) { | |
| 196 *cursor += 1; | 214 *cursor += 1; |
| 197 return kBadChar; | 215 return kBadChar; |
| 198 } | 216 } |
| 199 byte first = str[0]; | 217 if (length == 2) { |
|
vogelheim
2015/05/21 16:58:38
I was trying to figure out *why* these characters
vogelheim
2015/05/21 16:58:38
I find the code below to be somewhat confusing. If
jochen (gone - plz use gerrit)
2015/05/22 12:38:12
right. It's just that UTF-8 cannot encode all of u
| |
| 200 byte second = str[1] ^ 0x80; | 218 DCHECK(str[0] <= 0xDF); |
| 201 if (second & 0xC0) { | 219 if (str[0] < 0xC2) { |
| 202 *cursor += 1; | |
| 203 return kBadChar; | |
| 204 } | |
| 205 if (first < 0xE0) { | |
| 206 if (first < 0xC0) { | |
| 207 *cursor += 1; | 220 *cursor += 1; |
| 208 return kBadChar; | 221 return kBadChar; |
| 209 } | 222 } |
| 210 uchar code_point = ((first << 6) | second) & kMaxTwoByteChar; | 223 if (str[1] < 0x80 || str[1] > 0xBF) { |
| 211 if (code_point <= kMaxOneByteChar) { | |
| 212 *cursor += 1; | 224 *cursor += 1; |
| 213 return kBadChar; | 225 return kBadChar; |
| 214 } | 226 } |
| 215 *cursor += 2; | 227 *cursor += 2; |
| 216 return code_point; | 228 return ((str[0] << 6) + str[1]) - 0x00003080; |
| 217 } | 229 } |
| 218 if (length == 2) { | 230 if (length == 3) { |
| 219 *cursor += 1; | 231 DCHECK(str[0] >= 0xE0 && str[0] <= 0xEF); |
| 220 return kBadChar; | 232 switch (str[0]) { |
| 221 } | 233 case 0xE0: |
| 222 byte third = str[2] ^ 0x80; | 234 if (str[1] < 0xA0 || str[1] > 0xBF) { |
| 223 if (third & 0xC0) { | 235 *cursor += 1; |
| 224 *cursor += 1; | 236 return kBadChar; |
| 225 return kBadChar; | 237 } |
| 226 } | 238 break; |
| 227 if (first < 0xF0) { | 239 case 0xED: |
| 228 uchar code_point = ((((first << 6) | second) << 6) | third) | 240 if (str[1] < 0x80 || str[1] > 0x9F) { |
| 229 & kMaxThreeByteChar; | 241 *cursor += 1; |
| 230 if (code_point <= kMaxTwoByteChar) { | 242 return kBadChar; |
| 243 } | |
| 244 break; | |
| 245 default: | |
| 246 if (str[1] < 0x80 || str[1] > 0xBF) { | |
| 247 *cursor += 1; | |
| 248 return kBadChar; | |
| 249 } | |
| 250 } | |
| 251 if (str[2] < 0x80 || str[2] > 0xBF) { | |
| 231 *cursor += 1; | 252 *cursor += 1; |
| 232 return kBadChar; | 253 return kBadChar; |
| 233 } | 254 } |
| 234 *cursor += 3; | 255 *cursor += 3; |
| 235 return code_point; | 256 return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080; |
| 236 } | 257 } |
| 237 if (length == 3) { | 258 DCHECK(length == 4); |
| 259 DCHECK(str[0] >= 0xF0 && str[0] <= 0xF4); | |
| 260 switch (str[0]) { | |
| 261 case 0xF0: | |
| 262 if (str[1] < 0x90 || str[1] > 0xBF) { | |
| 263 *cursor += 1; | |
| 264 return kBadChar; | |
| 265 } | |
| 266 break; | |
| 267 case 0xF4: | |
| 268 if (str[1] < 0x80 || str[1] > 0x8F) { | |
| 269 *cursor += 1; | |
| 270 return kBadChar; | |
| 271 } | |
| 272 break; | |
| 273 default: | |
| 274 if (str[1] < 0x80 || str[1] > 0xBF) { | |
| 275 *cursor += 1; | |
| 276 return kBadChar; | |
| 277 } | |
| 278 } | |
| 279 if (str[2] < 0x80 || str[2] > 0xBF) { | |
| 238 *cursor += 1; | 280 *cursor += 1; |
| 239 return kBadChar; | 281 return kBadChar; |
| 240 } | 282 } |
| 241 byte fourth = str[3] ^ 0x80; | 283 if (str[3] < 0x80 || str[3] > 0xBF) { |
| 242 if (fourth & 0xC0) { | |
| 243 *cursor += 1; | 284 *cursor += 1; |
| 244 return kBadChar; | 285 return kBadChar; |
| 245 } | 286 } |
| 246 if (first < 0xF8) { | 287 *cursor += 4; |
| 247 uchar code_point = (((((first << 6 | second) << 6) | third) << 6) | fourth) | 288 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - |
| 248 & kMaxFourByteChar; | 289 0x03C82080; |
| 249 if (code_point <= kMaxThreeByteChar) { | |
| 250 *cursor += 1; | |
| 251 return kBadChar; | |
| 252 } | |
| 253 *cursor += 4; | |
| 254 return code_point; | |
| 255 } | |
| 256 *cursor += 1; | |
| 257 return kBadChar; | |
| 258 } | 290 } |
| 259 | 291 |
| 260 | 292 |
| 261 // Uppercase: point.category == 'Lu' | 293 // Uppercase: point.category == 'Lu' |
| 262 | 294 |
| 263 static const uint16_t kUppercaseTable0Size = 455; | 295 static const uint16_t kUppercaseTable0Size = 455; |
| 264 static const int32_t kUppercaseTable0[455] = { | 296 static const int32_t kUppercaseTable0[455] = { |
| 265 1073741889, 90, 1073742016, 214, | 297 1073741889, 90, 1073742016, 214, |
| 266 1073742040, 222, 256, 258, // NOLINT | 298 1073742040, 222, 256, 258, // NOLINT |
| 267 260, 262, 264, 266, | 299 260, 262, 264, 266, |
| (...skipping 3117 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading... | |
| 3385 sizeof(MultiCharacterSpecialCase<1>) // NOLINT | 3417 sizeof(MultiCharacterSpecialCase<1>) // NOLINT |
| 3386 + | 3418 + |
| 3387 kCanonicalizationRangeMultiStrings1Size * | 3419 kCanonicalizationRangeMultiStrings1Size * |
| 3388 sizeof(MultiCharacterSpecialCase<1>) // NOLINT | 3420 sizeof(MultiCharacterSpecialCase<1>) // NOLINT |
| 3389 + | 3421 + |
| 3390 kCanonicalizationRangeMultiStrings7Size * | 3422 kCanonicalizationRangeMultiStrings7Size * |
| 3391 sizeof(MultiCharacterSpecialCase<1>); // NOLINT | 3423 sizeof(MultiCharacterSpecialCase<1>); // NOLINT |
| 3392 } | 3424 } |
| 3393 | 3425 |
| 3394 } // namespace unibrow | 3426 } // namespace unibrow |
| OLD | NEW |