Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2)

Side by Side Diff: runtime/vm/unicode.cc

Issue 11318018: - Represent strings internally in UTF-16 format, this makes it (Closed) Base URL: http://dart.googlecode.com/svn/branches/bleeding_edge/dart/
Patch Set: Created 8 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 #include "vm/unicode.h" 5 #include "vm/unicode.h"
6 6
7 #include "vm/allocation.h" 7 #include "vm/allocation.h"
8 #include "vm/globals.h" 8 #include "vm/globals.h"
9 #include "vm/object.h" 9 #include "vm/object.h"
10 10
11 namespace dart { 11 namespace dart {
12 12
13 static const uint8_t kTrailBytes[256] = { 13 static const int8_t kTrailBytes[256] = {
14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
(...skipping 27 matching lines...) Expand all
51 0xFFFFFFFF, 51 0xFFFFFFFF,
52 0xFFFFFFFF 52 0xFFFFFFFF
53 }; 53 };
54 54
55 55
56 static bool IsTrailByte(uint8_t code_unit) { 56 static bool IsTrailByte(uint8_t code_unit) {
57 return (code_unit & 0xc0) == 0x80; 57 return (code_unit & 0xc0) == 0x80;
58 } 58 }
59 59
60 60
61 static bool IsAsciiSequenceStart(uint8_t code_unit) {
62 // Check if codepoint is <= U+007F
63 return (code_unit <= Utf8::kMaxOneByteChar);
64 }
65
66
67 static bool IsSmpSequenceStart(uint8_t code_unit) {
68 // Check if codepoint is >= U+10000.
69 return (code_unit >= 0xF0);
70 }
71
72
61 // Returns true if the code point is a high- or low-surrogate. 73 // Returns true if the code point is a high- or low-surrogate.
62 static bool IsSurrogate(uint32_t code_point) { 74 static bool IsSurrogate(uint32_t code_point) {
63 return (code_point & 0xfffff800) == 0xd800; 75 return (code_point & 0xfffff800) == 0xd800;
64 } 76 }
65 77
66 78
67 // Returns true if the code point value is above Plane 17. 79 // Returns true if the code point value is above Plane 17.
68 static bool IsOutOfRange(uint32_t code_point) { 80 static bool IsOutOfRange(uint32_t code_point) {
69 return code_point > 0x10FFFF; 81 return (code_point > 0x10FFFF);
70 } 82 }
71 83
72 84
73 // Returns true if the byte sequence is ill-formed. 85 // Returns true if the byte sequence is ill-formed.
74 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { 86 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) {
75 return code_point < kOverlongMinimum[num_bytes]; 87 return code_point < kOverlongMinimum[num_bytes];
76 } 88 }
77 89
78 90
91 void Utf8::ConvertUTF32ToUTF16(int32_t codepoint, uint16_t* dst) {
92 ASSERT(codepoint > kMaxBmpCodepoint);
93 ASSERT(dst != NULL);
94 dst[0] = (Utf8::kLeadOffset + (codepoint >> 10));
95 dst[1] = (0xDC00 + (codepoint & 0x3FF));
96 }
97
98
79 // Returns a count of the number of UTF-8 trail bytes. 99 // Returns a count of the number of UTF-8 trail bytes.
80 intptr_t Utf8::CodePointCount(const char* str, intptr_t* width) { 100 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array,
81 bool is_two_byte_string = false; 101 intptr_t array_len,
82 bool is_four_byte_string = false; 102 Type* type) {
83 intptr_t len = 0; 103 intptr_t len = 0;
84 for (; *str != '\0'; ++str) { 104 Type char_type = kAscii;
85 uint8_t code_unit = *str; 105 for (intptr_t i = 0; i < array_len; i++) {
106 uint8_t code_unit = utf8_array[i];
86 if (!IsTrailByte(code_unit)) { 107 if (!IsTrailByte(code_unit)) {
87 ++len; 108 ++len;
88 } 109 }
89 if (code_unit > 0xC3) { // > U+00FF 110 if (!IsAsciiSequenceStart(code_unit)) { // > U+007F
90 if (code_unit < 0xF0) { // < U+10000 111 if (IsSmpSequenceStart(code_unit)) { // >= U+10000
91 is_two_byte_string = true; 112 char_type = kSMP;
92 } else { 113 ++len;
93 is_four_byte_string = true; 114 } else if (char_type == kAscii) {
115 char_type = kBMP;
94 } 116 }
95 } 117 }
96 } 118 }
97 if (is_four_byte_string) { 119 *type = char_type;
98 *width = 4;
99 } else if (is_two_byte_string) {
100 *width = 2;
101 } else {
102 *width = 1;
103 }
104 return len; 120 return len;
105 } 121 }
106 122
107 123
108 // Returns true if str is a valid NUL-terminated UTF-8 string. 124 // Returns true if str is a valid NUL-terminated UTF-8 string.
109 bool Utf8::IsValid(const char* str) { 125 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) {
110 intptr_t i = 0; 126 intptr_t i = 0;
111 while (str[i] != '\0') { 127 while (i < array_len) {
112 uint32_t ch = str[i] & 0xFF; 128 uint32_t ch = utf8_array[i] & 0xFF;
113 intptr_t j = 1; 129 intptr_t j = 1;
114 if (ch >= 0x80) { 130 if (ch >= 0x80) {
115 uint8_t num_trail_bytes = kTrailBytes[ch]; 131 int8_t num_trail_bytes = kTrailBytes[ch];
116 bool is_malformed = false; 132 bool is_malformed = false;
117 for (; j < num_trail_bytes; ++j) { 133 for (; j < num_trail_bytes; ++j) {
118 if (str[i + j] != '\0') { 134 if ((i + j) < array_len) {
119 uint8_t code_unit = str[i + j]; 135 uint8_t code_unit = utf8_array[i + j];
120 is_malformed |= !IsTrailByte(code_unit); 136 is_malformed |= !IsTrailByte(code_unit);
121 ch = (ch << 6) + code_unit; 137 ch = (ch << 6) + code_unit;
122 } else { 138 } else {
123 return false; 139 return false;
124 } 140 }
125 } 141 }
126 ch -= kMagicBits[num_trail_bytes]; 142 ch -= kMagicBits[num_trail_bytes];
127 if (!((is_malformed == false) && 143 if (!((is_malformed == false) &&
128 (j == num_trail_bytes) && 144 (j == num_trail_bytes) &&
129 !IsOutOfRange(ch) && 145 !IsOutOfRange(ch) &&
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after
195 if (pos + num_bytes > len) { 211 if (pos + num_bytes > len) {
196 break; 212 break;
197 } 213 }
198 Utf8::Encode(ch, &dst[pos]); 214 Utf8::Encode(ch, &dst[pos]);
199 pos += num_bytes; 215 pos += num_bytes;
200 } 216 }
201 return pos; 217 return pos;
202 } 218 }
203 219
204 220
205 intptr_t Utf8::Decode(const char* src, int32_t* dst) { 221 intptr_t Utf8::Decode(const uint8_t* utf8_array,
206 uint32_t ch = src[0] & 0xFF; 222 intptr_t array_len,
207 uint32_t i = 1; 223 int32_t* dst) {
224 uint32_t ch = utf8_array[0] & 0xFF;
225 intptr_t i = 1;
208 if (ch >= 0x80) { 226 if (ch >= 0x80) {
209 uint32_t num_trail_bytes = kTrailBytes[ch]; 227 int32_t num_trail_bytes = kTrailBytes[ch];
210 bool is_malformed = false; 228 bool is_malformed = false;
211 for (; i < num_trail_bytes; ++i) { 229 for (; i < num_trail_bytes; ++i) {
212 if (src[i] != '\0') { 230 if (i < array_len) {
213 uint8_t code_unit = src[i]; 231 uint8_t code_unit = utf8_array[i];
214 is_malformed |= !IsTrailByte(code_unit); 232 is_malformed |= !IsTrailByte(code_unit);
215 ch = (ch << 6) + code_unit; 233 ch = (ch << 6) + code_unit;
216 } else { 234 } else {
217 *dst = -1; 235 *dst = -1;
218 return 0; 236 return 0;
219 } 237 }
220 } 238 }
221 ch -= kMagicBits[num_trail_bytes]; 239 ch -= kMagicBits[num_trail_bytes];
222 if (!((is_malformed == false) && 240 if (!((is_malformed == false) &&
223 (i == num_trail_bytes) && 241 (i == num_trail_bytes) &&
224 !IsOutOfRange(ch) && 242 !IsOutOfRange(ch) &&
225 !IsNonShortestForm(ch, i) && 243 !IsNonShortestForm(ch, i) &&
226 !IsSurrogate(ch))) { 244 !IsSurrogate(ch))) {
227 *dst = -1; 245 *dst = -1;
228 return 0; 246 return 0;
229 } 247 }
230 } 248 }
231 *dst = ch; 249 *dst = ch;
232 return i; 250 return i;
233 } 251 }
234 252
235 253
236 template<typename T> 254 bool Utf8::DecodeToAscii(const uint8_t* utf8_array,
237 static bool DecodeImpl(const char* src, T* dst, intptr_t len) { 255 intptr_t array_len,
256 uint8_t* dst,
257 intptr_t len) {
258 if (len < array_len) {
259 return false; // output overflow
260 }
261 #ifdef DEBUG
262 for (intptr_t i = 0; i < array_len; i++) {
263 ASSERT(IsAsciiSequenceStart(utf8_array[i]));
264 }
265 #endif
266 memmove(dst, utf8_array, array_len);
267 return true; // success
268 }
269
270
271 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array,
272 intptr_t array_len,
273 uint16_t* dst,
274 intptr_t len) {
238 intptr_t i = 0; 275 intptr_t i = 0;
239 intptr_t j = 0; 276 intptr_t j = 0;
240 intptr_t num_bytes; 277 intptr_t num_bytes;
241 for (; src[i] != '\0' && j < len; i += num_bytes, ++j) { 278 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
242 int32_t ch; 279 int32_t ch;
243 num_bytes = Utf8::Decode(&src[i], &ch); 280 bool is_smp = IsSmpSequenceStart(utf8_array[i]);
281 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
244 if (ch == -1) { 282 if (ch == -1) {
245 return false; // invalid input 283 return false; // invalid input
246 } 284 }
247 dst[j] = ch; 285 if (is_smp) {
286 ConvertUTF32ToUTF16(ch, &(dst[j]));
287 j = j + 1;
288 } else {
289 dst[j] = ch;
290 }
248 } 291 }
249 if (src[i] != '\0' && j == len) { 292 if ((i < array_len) && (j == len)) {
250 return false; // output overflow 293 return false; // output overflow
251 } 294 }
252 return true; // success 295 return true; // success
253 } 296 }
254 297
255 298
256 bool Utf8::Decode(const char* src, uint8_t* dst, intptr_t len) { 299 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,
257 return DecodeImpl(src, dst, len); 300 intptr_t array_len,
258 } 301 uint32_t* dst,
259 302 intptr_t len) {
260 303 intptr_t i = 0;
261 bool Utf8::Decode(const char* src, uint16_t* dst, intptr_t len) { 304 intptr_t j = 0;
262 return DecodeImpl(src, dst, len); 305 intptr_t num_bytes;
263 } 306 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
264 307 int32_t ch;
265 308 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
266 bool Utf8::Decode(const char* src, uint32_t* dst, intptr_t len) { 309 if (ch == -1) {
267 return DecodeImpl(src, dst, len); 310 return false; // invalid input
311 }
312 dst[j] = ch;
313 }
314 if ((i < array_len) && (j == len)) {
315 return false; // output overflow
316 }
317 return true; // success
268 } 318 }
269 319
270 } // namespace dart 320 } // namespace dart
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698