Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(233)

Side by Side Diff: runtime/vm/unicode.cc

Issue 11411092: Revert "Add some support for the code-point code-unit distinction." (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 8 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « runtime/vm/unicode.h ('k') | runtime/vm/unicode_test.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 #include "vm/unicode.h" 5 #include "vm/unicode.h"
6 6
7 #include "vm/allocation.h" 7 #include "vm/allocation.h"
8 #include "vm/globals.h" 8 #include "vm/globals.h"
9 #include "vm/object.h" 9 #include "vm/object.h"
10 10
(...skipping 30 matching lines...) Expand all
41 }; 41 };
42 42
43 43
44 // Minimum values of code points used to check shortest form. 44 // Minimum values of code points used to check shortest form.
45 static const uint32_t kOverlongMinimum[7] = { 45 static const uint32_t kOverlongMinimum[7] = {
46 0, // padding 46 0, // padding
47 0x0, 47 0x0,
48 0x80, 48 0x80,
49 0x800, 49 0x800,
50 0x10000, 50 0x10000,
51 0xFFFFFFFF, // We never allow 5 byte sequences. 51 0xFFFFFFFF,
52 0xFFFFFFFF // We never allow 6 byte sequences. 52 0xFFFFFFFF
53 }; 53 };
54 54
55 55
56 static bool IsTrailByte(uint8_t code_unit) { 56 static bool IsTrailByte(uint8_t code_unit) {
57 return (code_unit & 0xc0) == 0x80; 57 return (code_unit & 0xc0) == 0x80;
58 } 58 }
59 59
60 60
61 static bool IsLatin1SequenceStart(uint8_t code_unit) { 61 static bool IsLatin1SequenceStart(uint8_t code_unit) {
62 // Check if codepoint is <= U+00FF 62 // Check if codepoint is <= U+00FF
63 return (code_unit <= Utf8::kMaxOneByteChar); 63 return (code_unit <= Utf8::kMaxOneByteChar);
64 } 64 }
65 65
66 66
67 static bool IsSupplementarySequenceStart(uint8_t code_unit) { 67 static bool IsSupplementarySequenceStart(uint8_t code_unit) {
68 // Check the UTF-8 code unit to determine if it is a sequence start for a 68 // Check if codepoint is >= U+10000.
69 // code point >= U+10000.
70 return (code_unit >= 0xF0); 69 return (code_unit >= 0xF0);
71 } 70 }
72 71
73 72
74 // Returns true if the code point value is above Plane 17. 73 // Returns true if the code point value is above Plane 17.
75 static bool IsOutOfRange(int32_t code_point) { 74 static bool IsOutOfRange(uint32_t code_point) {
76 return (code_point > Utf16::kMaxCodePoint); 75 return (code_point > 0x10FFFF);
77 } 76 }
78 77
79 78
80 // Returns true if the byte sequence is ill-formed. 79 // Returns true if the byte sequence is ill-formed.
81 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { 80 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) {
82 return code_point < kOverlongMinimum[num_bytes]; 81 return code_point < kOverlongMinimum[num_bytes];
83 } 82 }
84 83
85 84
86 // Returns a count of the number of UTF-16 code units represented by this UTF-8 85 // Returns a count of the number of UTF-8 trail bytes.
87 // array. Type is kASCII for 7-bit-only. If there are surrogate pairs then 86 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array,
88 // the type is kSupplementary. Otherwise it is kBMP. 87 intptr_t array_len,
89 intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array, 88 Type* type) {
90 intptr_t array_len,
91 Type* type) {
92 intptr_t len = 0; 89 intptr_t len = 0;
93 Type char_type = kLatin1; 90 Type char_type = kLatin1;
94 for (intptr_t i = 0; i < array_len; i++) { 91 for (intptr_t i = 0; i < array_len; i++) {
95 uint8_t code_unit = utf8_array[i]; 92 uint8_t code_unit = utf8_array[i];
96 if (!IsTrailByte(code_unit)) { 93 if (!IsTrailByte(code_unit)) {
97 ++len; 94 ++len;
98 } 95 }
99 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF 96 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF
100 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000 97 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000
101 char_type = kSupplementary; 98 char_type = kSupplementary;
102 ++len; // Surrogate pair in the UTF-16 encoding. 99 ++len;
103 } else if (char_type == kLatin1) { 100 } else if (char_type == kLatin1) {
104 char_type = kBMP; 101 char_type = kBMP;
105 } 102 }
106 } 103 }
107 } 104 }
108 *type = char_type; 105 *type = char_type;
109 return len; 106 return len;
110 } 107 }
111 108
112 109
113 // Returns true if str is a valid UTF-8 string. 110 // Returns true if str is a valid NUL-terminated UTF-8 string.
114 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) { 111 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) {
115 intptr_t i = 0; 112 intptr_t i = 0;
116 while (i < array_len) { 113 while (i < array_len) {
117 uint32_t ch = utf8_array[i] & 0xFF; 114 uint32_t ch = utf8_array[i] & 0xFF;
118 intptr_t j = 1; 115 intptr_t j = 1;
119 if (ch >= 0x80) { 116 if (ch >= 0x80) {
120 int8_t num_trail_bytes = kTrailBytes[ch]; 117 int8_t num_trail_bytes = kTrailBytes[ch];
121 bool is_malformed = false; 118 bool is_malformed = false;
122 for (; j < num_trail_bytes; ++j) { 119 for (; j < num_trail_bytes; ++j) {
123 if ((i + j) < array_len) { 120 if ((i + j) < array_len) {
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after
205 Utf8::Encode(ch, &dst[pos]); 202 Utf8::Encode(ch, &dst[pos]);
206 pos += num_bytes; 203 pos += num_bytes;
207 } 204 }
208 return pos; 205 return pos;
209 } 206 }
210 207
211 208
212 intptr_t Utf8::Decode(const uint8_t* utf8_array, 209 intptr_t Utf8::Decode(const uint8_t* utf8_array,
213 intptr_t array_len, 210 intptr_t array_len,
214 int32_t* dst) { 211 int32_t* dst) {
215 int32_t ch = utf8_array[0] & 0xFF; 212 uint32_t ch = utf8_array[0] & 0xFF;
216 intptr_t i = 1; 213 intptr_t i = 1;
217 if (ch >= 0x80) { 214 if (ch >= 0x80) {
218 intptr_t num_trail_bytes = kTrailBytes[ch]; 215 intptr_t num_trail_bytes = kTrailBytes[ch];
219 bool is_malformed = false; 216 bool is_malformed = false;
220 for (; i < num_trail_bytes; ++i) { 217 for (; i < num_trail_bytes; ++i) {
221 if (i < array_len) { 218 if (i < array_len) {
222 uint8_t code_unit = utf8_array[i]; 219 uint8_t code_unit = utf8_array[i];
223 is_malformed |= !IsTrailByte(code_unit); 220 is_malformed |= !IsTrailByte(code_unit);
224 ch = (ch << 6) + code_unit; 221 ch = (ch << 6) + code_unit;
225 } else { 222 } else {
226 *dst = kInvalidCodePoint; 223 *dst = -1;
227 return 0; 224 return 0;
228 } 225 }
229 } 226 }
230 ch -= kMagicBits[num_trail_bytes]; 227 ch -= kMagicBits[num_trail_bytes];
231 if (!((is_malformed == false) && 228 if (!((is_malformed == false) &&
232 (i == num_trail_bytes) && 229 (i == num_trail_bytes) &&
233 !IsOutOfRange(ch) && 230 !IsOutOfRange(ch) &&
234 !IsNonShortestForm(ch, i) && 231 !IsNonShortestForm(ch, i) &&
235 !Utf16::IsSurrogate(ch))) { 232 !Utf16::IsSurrogate(ch))) {
236 *dst = kInvalidCodePoint; 233 *dst = -1;
237 return 0; 234 return 0;
238 } 235 }
239 } 236 }
240 *dst = ch; 237 *dst = ch;
241 return i; 238 return i;
242 } 239 }
243 240
244 241
245 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array, 242 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array,
246 intptr_t array_len, 243 intptr_t array_len,
(...skipping 23 matching lines...) Expand all
270 intptr_t array_len, 267 intptr_t array_len,
271 uint16_t* dst, 268 uint16_t* dst,
272 intptr_t len) { 269 intptr_t len) {
273 intptr_t i = 0; 270 intptr_t i = 0;
274 intptr_t j = 0; 271 intptr_t j = 0;
275 intptr_t num_bytes; 272 intptr_t num_bytes;
276 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { 273 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
277 int32_t ch; 274 int32_t ch;
278 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]); 275 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]);
279 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); 276 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
280 if (ch == kInvalidCodePoint) { 277 if (ch == -1) {
281 return false; // invalid input 278 return false; // invalid input
282 } 279 }
283 if (is_supplementary) { 280 if (is_supplementary) {
284 Utf16::Encode(ch, &dst[j]); 281 Utf16::Encode(ch, &dst[j]);
285 j = j + 1; 282 j = j + 1;
286 } else { 283 } else {
287 dst[j] = ch; 284 dst[j] = ch;
288 } 285 }
289 } 286 }
290 if ((i < array_len) && (j == len)) { 287 if ((i < array_len) && (j == len)) {
291 return false; // output overflow 288 return false; // output overflow
292 } 289 }
293 return true; // success 290 return true; // success
294 } 291 }
295 292
296 293
297 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, 294 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,
298 intptr_t array_len, 295 intptr_t array_len,
299 int32_t* dst, 296 uint32_t* dst,
300 intptr_t len) { 297 intptr_t len) {
301 intptr_t i = 0; 298 intptr_t i = 0;
302 intptr_t j = 0; 299 intptr_t j = 0;
303 intptr_t num_bytes; 300 intptr_t num_bytes;
304 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { 301 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
305 int32_t ch; 302 int32_t ch;
306 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); 303 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
307 if (ch == kInvalidCodePoint) { 304 if (ch == -1) {
308 return false; // invalid input 305 return false; // invalid input
309 } 306 }
310 dst[j] = ch; 307 dst[j] = ch;
311 } 308 }
312 if ((i < array_len) && (j == len)) { 309 if ((i < array_len) && (j == len)) {
313 return false; // output overflow 310 return false; // output overflow
314 } 311 }
315 return true; // success 312 return true; // success
316 } 313 }
317 314
318 315
319 int32_t Utf16::CodePointAt(const String& str, int index) { 316 void Utf16::Encode(int32_t codepoint, uint16_t* dst) {
320 int32_t code = str.CharAt(index); 317 ASSERT(codepoint > kMaxBmpCodepoint);
321 if (!IsLeadSurrogate(code)) return code;
322 if (index + 1 == str.Length()) return code;
323 int32_t trail = str.CharAt(index + 1);
324 if (!IsTrailSurrogate(trail)) return code;
325 return Decode(code, trail);
326 }
327
328
329 void Utf16::Encode(int32_t codePoint, uint16_t* dst) {
330 ASSERT(codePoint > kMaxBmpCodepoint);
331 ASSERT(dst != NULL); 318 ASSERT(dst != NULL);
332 dst[0] = LeadFromCodePoint(codePoint); 319 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10));
333 dst[1] = TrailFromCodePoint(codePoint); 320 dst[1] = (0xDC00 + (codepoint & 0x3FF));
334 } 321 }
335 322
336 } // namespace dart 323 } // namespace dart
OLDNEW
« no previous file with comments | « runtime/vm/unicode.h ('k') | runtime/vm/unicode_test.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698