Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(616)

Side by Side Diff: src/unicode.cc

Issue 1148653007: Update UTF-8 decoder to detect more special cases. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: updates Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | test/cctest/test-api.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 // 4 //
5 // This file was generated at 2014-10-08 15:25:47.940335 5 // This file was generated at 2014-10-08 15:25:47.940335
6 6
7 #include "src/unicode-inl.h" 7 #include "src/unicode-inl.h"
8 #include <stdio.h> 8 #include <stdio.h>
9 #include <stdlib.h> 9 #include <stdlib.h>
10 10
(...skipping 172 matching lines...) Expand 10 before | Expand all | Expand 10 after
183 return 0; 183 return 0;
184 } 184 }
185 return -1; 185 return -1;
186 } 186 }
187 } else { 187 } else {
188 return 0; 188 return 0;
189 } 189 }
190 } 190 }
191 191
192 192
193 uchar Utf8::CalculateValue(const byte* str, size_t length, size_t* cursor) { 193 static inline size_t NonASCIISequenceLength(byte first) {
194 // We only get called for non-ASCII characters. 194 // clang-format off
195 if (length == 1) { 195 static const uint8_t lengths[256] = {
196 // The first 128 entries correspond to ASCII characters.
197 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
203 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
204 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
205 // The following 64 entries correspond to continuation bytes.
206 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
207 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
208 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
209 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
210 // The next 32 entries are two invalid overlong encodings and 30 two-byte sequences.
211 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
212 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
213 // 16 three-byte sequences.
214 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
215 // 5 four-byte sequences, followed by sequences that could only encode
216 // code points outside of the Unicode range.
217 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
218 // clang-format on
219 return lengths[first];
220 }
221
222
223 static inline bool IsContinuationCharacter(byte chr) {
224 return chr >= 0x80 && chr <= 0xBF;
225 }
226
227
228 // This method decodes a UTF-8 value according to RFC 3629.
229 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
230 size_t length = NonASCIISequenceLength(str[0]);
231 if (length == 0 || max_length < length) {
196 *cursor += 1; 232 *cursor += 1;
197 return kBadChar; 233 return kBadChar;
198 } 234 }
199 byte first = str[0]; 235 if (length == 2) {
200 byte second = str[1] ^ 0x80; 236 if (!IsContinuationCharacter(str[1])) {
201 if (second & 0xC0) {
202 *cursor += 1;
203 return kBadChar;
204 }
205 if (first < 0xE0) {
206 if (first < 0xC0) {
207 *cursor += 1;
208 return kBadChar;
209 }
210 uchar code_point = ((first << 6) | second) & kMaxTwoByteChar;
211 if (code_point <= kMaxOneByteChar) {
212 *cursor += 1; 237 *cursor += 1;
213 return kBadChar; 238 return kBadChar;
214 } 239 }
215 *cursor += 2; 240 *cursor += 2;
216 return code_point; 241 return ((str[0] << 6) + str[1]) - 0x00003080;
217 } 242 }
218 if (length == 2) { 243 if (length == 3) {
219 *cursor += 1; 244 switch (str[0]) {
220 return kBadChar; 245 case 0xE0:
221 } 246 // Overlong three-byte sequence.
222 byte third = str[2] ^ 0x80; 247 if (str[1] < 0xA0 || str[1] > 0xBF) {
223 if (third & 0xC0) { 248 *cursor += 1;
224 *cursor += 1; 249 return kBadChar;
225 return kBadChar; 250 }
226 } 251 break;
227 if (first < 0xF0) { 252 case 0xED:
228 uchar code_point = ((((first << 6) | second) << 6) | third) 253 // High and low surrogate halves.
229 & kMaxThreeByteChar; 254 if (str[1] < 0x80 || str[1] > 0x9F) {
230 if (code_point <= kMaxTwoByteChar) { 255 *cursor += 1;
256 return kBadChar;
257 }
258 break;
259 default:
260 if (!IsContinuationCharacter(str[1])) {
261 *cursor += 1;
262 return kBadChar;
263 }
264 }
265 if (!IsContinuationCharacter(str[2])) {
231 *cursor += 1; 266 *cursor += 1;
232 return kBadChar; 267 return kBadChar;
233 } 268 }
234 *cursor += 3; 269 *cursor += 3;
235 return code_point; 270 return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
236 } 271 }
237 if (length == 3) { 272 DCHECK(length == 4);
273 switch (str[0]) {
274 case 0xF0:
275 // Overlong four-byte sequence.
276 if (str[1] < 0x90 || str[1] > 0xBF) {
277 *cursor += 1;
278 return kBadChar;
279 }
280 break;
281 case 0xF4:
282 // Code points outside of the unicode range.
283 if (str[1] < 0x80 || str[1] > 0x8F) {
284 *cursor += 1;
285 return kBadChar;
286 }
287 break;
288 default:
289 if (!IsContinuationCharacter(str[1])) {
290 *cursor += 1;
291 return kBadChar;
292 }
293 }
294 if (!IsContinuationCharacter(str[2])) {
238 *cursor += 1; 295 *cursor += 1;
239 return kBadChar; 296 return kBadChar;
240 } 297 }
241 byte fourth = str[3] ^ 0x80; 298 if (!IsContinuationCharacter(str[3])) {
242 if (fourth & 0xC0) {
243 *cursor += 1; 299 *cursor += 1;
244 return kBadChar; 300 return kBadChar;
245 } 301 }
246 if (first < 0xF8) { 302 *cursor += 4;
247 uchar code_point = (((((first << 6 | second) << 6) | third) << 6) | fourth) 303 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
248 & kMaxFourByteChar; 304 0x03C82080;
249 if (code_point <= kMaxThreeByteChar) {
250 *cursor += 1;
251 return kBadChar;
252 }
253 *cursor += 4;
254 return code_point;
255 }
256 *cursor += 1;
257 return kBadChar;
258 } 305 }
259 306
260 307
261 // Uppercase: point.category == 'Lu' 308 // Uppercase: point.category == 'Lu'
262 309
263 static const uint16_t kUppercaseTable0Size = 455; 310 static const uint16_t kUppercaseTable0Size = 455;
264 static const int32_t kUppercaseTable0[455] = { 311 static const int32_t kUppercaseTable0[455] = {
265 1073741889, 90, 1073742016, 214, 312 1073741889, 90, 1073742016, 214,
266 1073742040, 222, 256, 258, // NOLINT 313 1073742040, 222, 256, 258, // NOLINT
267 260, 262, 264, 266, 314 260, 262, 264, 266,
(...skipping 3117 matching lines...) Expand 10 before | Expand all | Expand 10 after
3385 sizeof(MultiCharacterSpecialCase<1>) // NOLINT 3432 sizeof(MultiCharacterSpecialCase<1>) // NOLINT
3386 + 3433 +
3387 kCanonicalizationRangeMultiStrings1Size * 3434 kCanonicalizationRangeMultiStrings1Size *
3388 sizeof(MultiCharacterSpecialCase<1>) // NOLINT 3435 sizeof(MultiCharacterSpecialCase<1>) // NOLINT
3389 + 3436 +
3390 kCanonicalizationRangeMultiStrings7Size * 3437 kCanonicalizationRangeMultiStrings7Size *
3391 sizeof(MultiCharacterSpecialCase<1>); // NOLINT 3438 sizeof(MultiCharacterSpecialCase<1>); // NOLINT
3392 } 3439 }
3393 3440
3394 } // namespace unibrow 3441 } // namespace unibrow
OLDNEW
« no previous file with comments | « no previous file | test/cctest/test-api.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698