Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(336)

Side by Side Diff: runtime/vm/unicode.cc

Issue 11368138: Add some support for the code-point code-unit distinction. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: New version integrates feedback, adds less to standard String class. Created 8 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 #include "vm/unicode.h" 5 #include "vm/unicode.h"
6 6
7 #include "vm/allocation.h" 7 #include "vm/allocation.h"
8 #include "vm/globals.h" 8 #include "vm/globals.h"
9 #include "vm/object.h" 9 #include "vm/object.h"
10 10
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after
58 } 58 }
59 59
60 60
61 static bool IsAsciiSequenceStart(uint8_t code_unit) { 61 static bool IsAsciiSequenceStart(uint8_t code_unit) {
62 // Check is codepoint is <= U+007F 62 // Check is codepoint is <= U+007F
63 return (code_unit <= Utf8::kMaxOneByteChar); 63 return (code_unit <= Utf8::kMaxOneByteChar);
64 } 64 }
65 65
66 66
67 static bool IsSmpSequenceStart(uint8_t code_unit) { 67 static bool IsSmpSequenceStart(uint8_t code_unit) {
68 // Check is codepoint is >= U+10000. 68 // Check the UTF-8 code unit to determine if it is a sequence start for a
69 // code point >= U+10000.
69 return (code_unit >= 0xF0); 70 return (code_unit >= 0xF0);
70 } 71 }
71 72
72 73
73 // Returns true if the code point is a high- or low-surrogate.
74 static bool IsSurrogate(uint32_t code_point) {
75 return (code_point & 0xfffff800) == 0xd800;
76 }
77
78
79 // Returns true if the code point value is above Plane 17. 74 // Returns true if the code point value is above Plane 17.
80 static bool IsOutOfRange(uint32_t code_point) { 75 static bool IsOutOfRange(uint32_t code_point) {
81 return (code_point > 0x10FFFF); 76 return (code_point > Utf16::kMaxCodePoint);
82 } 77 }
83 78
84 79
85 // Returns true if the byte sequence is ill-formed. 80 // Returns true if the byte sequence is ill-formed.
86 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { 81 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) {
87 return code_point < kOverlongMinimum[num_bytes]; 82 return code_point < kOverlongMinimum[num_bytes];
88 } 83 }
89 84
90 85
91 void Utf8::ConvertUTF32ToUTF16(int32_t codepoint, uint16_t* dst) { 86 // Returns a count of the number of UTF-16 code units represented by this UTF-8
92 ASSERT(codepoint > kMaxBmpCodepoint); 87 // array. Type is kASCII for 7-bit-only. If there are surrogate pairs then
93 ASSERT(dst != NULL); 88 // the type is kSMP. Otherwise it is kBMP.
94 dst[0] = (Utf8::kLeadOffset + (codepoint >> 10)); 89 intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array,
95 dst[1] = (0xDC00 + (codepoint & 0x3FF)); 90 intptr_t array_len,
96 } 91 Type* type) {
97
98
99 // Returns a count of the number of UTF-8 trail bytes.
100 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array,
101 intptr_t array_len,
102 Type* type) {
103 intptr_t len = 0; 92 intptr_t len = 0;
104 Type char_type = kAscii; 93 Type char_type = kAscii;
105 for (intptr_t i = 0; i < array_len; i++) { 94 for (intptr_t i = 0; i < array_len; i++) {
106 uint8_t code_unit = utf8_array[i]; 95 uint8_t code_unit = utf8_array[i];
107 if (!IsTrailByte(code_unit)) { 96 if (!IsTrailByte(code_unit)) {
108 ++len; 97 ++len;
109 } 98 }
110 if (!IsAsciiSequenceStart(code_unit)) { // > U+007F 99 if (!IsAsciiSequenceStart(code_unit)) { // > U+007F
111 if (IsSmpSequenceStart(code_unit)) { // >= U+10000 100 if (IsSmpSequenceStart(code_unit)) { // >= U+10000
112 char_type = kSMP; 101 char_type = kSMP;
113 ++len; 102 ++len; // Surrogate pair in the UTF-16 encoding.
114 } else if (char_type == kAscii) { 103 } else if (char_type == kAscii) {
115 char_type = kBMP; 104 char_type = kBMP;
116 } 105 }
117 } 106 }
118 } 107 }
119 *type = char_type; 108 *type = char_type;
120 return len; 109 return len;
121 } 110 }
122 111
123 112
124 // Returns true if str is a valid NUL-terminated UTF-8 string. 113 // Returns true if str is a valid UTF-8 string.
125 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) { 114 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) {
126 intptr_t i = 0; 115 intptr_t i = 0;
127 while (i < array_len) { 116 while (i < array_len) {
128 uint32_t ch = utf8_array[i] & 0xFF; 117 uint32_t ch = utf8_array[i] & 0xFF;
129 intptr_t j = 1; 118 intptr_t j = 1;
130 if (ch >= 0x80) { 119 if (ch >= 0x80) {
131 int8_t num_trail_bytes = kTrailBytes[ch]; 120 int8_t num_trail_bytes = kTrailBytes[ch];
132 bool is_malformed = false; 121 bool is_malformed = false;
133 for (; j < num_trail_bytes; ++j) { 122 for (; j < num_trail_bytes; ++j) {
134 if ((i + j) < array_len) { 123 if ((i + j) < array_len) {
135 uint8_t code_unit = utf8_array[i + j]; 124 uint8_t code_unit = utf8_array[i + j];
136 is_malformed |= !IsTrailByte(code_unit); 125 is_malformed |= !IsTrailByte(code_unit);
137 ch = (ch << 6) + code_unit; 126 ch = (ch << 6) + code_unit;
138 } else { 127 } else {
139 return false; 128 return false;
140 } 129 }
141 } 130 }
142 ch -= kMagicBits[num_trail_bytes]; 131 ch -= kMagicBits[num_trail_bytes];
143 if (!((is_malformed == false) && 132 if (!((is_malformed == false) &&
144 (j == num_trail_bytes) && 133 (j == num_trail_bytes) &&
145 !IsOutOfRange(ch) && 134 !IsOutOfRange(ch) &&
146 !IsNonShortestForm(ch, j) && 135 !IsNonShortestForm(ch, j) &&
147 !IsSurrogate(ch))) { 136 !Utf16::IsSurrogate(ch))) {
148 return false; 137 return false;
149 } 138 }
150 } 139 }
151 i += j; 140 i += j;
152 } 141 }
153 return true; 142 return true;
154 } 143 }
155 144
156 145
157 intptr_t Utf8::Length(int32_t ch) { 146 intptr_t Utf8::Length(int32_t ch) {
158 if (ch <= kMaxOneByteChar) { 147 if (ch <= kMaxOneByteChar) {
159 return 1; 148 return 1;
160 } else if (ch <= kMaxTwoByteChar) { 149 } else if (ch <= kMaxTwoByteChar) {
161 return 2; 150 return 2;
162 } else if (ch <= kMaxThreeByteChar) { 151 } else if (ch <= kMaxThreeByteChar) {
163 return 3; 152 return 3;
164 } 153 }
165 ASSERT(ch <= kMaxFourByteChar); 154 ASSERT(ch <= kMaxFourByteChar);
166 return 4; 155 return 4;
167 } 156 }
168 157
169 158
170 intptr_t Utf8::Length(const String& str) { 159 intptr_t Utf8::Length(const String& str) {
171 intptr_t length = 0; 160 intptr_t length = 0;
172 for (intptr_t i = 0; i < str.Length(); ++i) { 161 for (intptr_t i = 0; i < str.Length(); ++i) {
173 int32_t ch = str.CharAt(i); 162 int32_t ch = Utf16::CodePointAt(str, i);
174 length += Utf8::Length(ch); 163 length += Utf8::Length(ch);
164 if (ch >= 0x10000) i++; // Surrogate pair in input
175 } 165 }
176 return length; 166 return length;
177 } 167 }
178 168
179 169
180 intptr_t Utf8::Encode(int32_t ch, char* dst) { 170 intptr_t Utf8::Encode(int32_t ch, char* dst) {
181 static const int kMask = ~(1 << 6); 171 static const int kMask = ~(1 << 6);
182 if (ch <= kMaxOneByteChar) { 172 if (ch <= kMaxOneByteChar) {
183 dst[0] = ch; 173 dst[0] = ch;
184 return 1; 174 return 1;
(...skipping 14 matching lines...) Expand all
199 dst[1] = 0x80 | ((ch >> 12) & kMask); 189 dst[1] = 0x80 | ((ch >> 12) & kMask);
200 dst[2] = 0x80 | ((ch >> 6) & kMask); 190 dst[2] = 0x80 | ((ch >> 6) & kMask);
201 dst[3] = 0x80 | (ch & kMask); 191 dst[3] = 0x80 | (ch & kMask);
202 return 4; 192 return 4;
203 } 193 }
204 194
205 195
206 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) { 196 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) {
207 intptr_t pos = 0; 197 intptr_t pos = 0;
208 for (intptr_t i = 0; i < src.Length(); ++i) { 198 for (intptr_t i = 0; i < src.Length(); ++i) {
209 intptr_t ch = src.CharAt(i); 199 intptr_t ch = Utf16::CodePointAt(src, i);
210 intptr_t num_bytes = Utf8::Length(ch); 200 intptr_t num_bytes = Utf8::Length(ch);
211 if (pos + num_bytes > len) { 201 if (pos + num_bytes > len) {
212 break; 202 break;
213 } 203 }
214 Utf8::Encode(ch, &dst[pos]); 204 Utf8::Encode(ch, &dst[pos]);
215 pos += num_bytes; 205 pos += num_bytes;
206 if (num_bytes > 3) i++; // Surrogate pair in input.
216 } 207 }
217 return pos; 208 return pos;
218 } 209 }
219 210
220 211
221 intptr_t Utf8::Decode(const uint8_t* utf8_array, 212 intptr_t Utf8::Decode(const uint8_t* utf8_array,
222 intptr_t array_len, 213 intptr_t array_len,
223 int32_t* dst) { 214 uint32_t* dst) {
224 uint32_t ch = utf8_array[0] & 0xFF; 215 uint32_t ch = utf8_array[0] & 0xFF;
225 intptr_t i = 1; 216 intptr_t i = 1;
226 if (ch >= 0x80) { 217 if (ch >= 0x80) {
227 int32_t num_trail_bytes = kTrailBytes[ch]; 218 int32_t num_trail_bytes = kTrailBytes[ch];
228 bool is_malformed = false; 219 bool is_malformed = false;
229 for (; i < num_trail_bytes; ++i) { 220 for (; i < num_trail_bytes; ++i) {
230 if (i < array_len) { 221 if (i < array_len) {
231 uint8_t code_unit = utf8_array[i]; 222 uint8_t code_unit = utf8_array[i];
232 is_malformed |= !IsTrailByte(code_unit); 223 is_malformed |= !IsTrailByte(code_unit);
233 ch = (ch << 6) + code_unit; 224 ch = (ch << 6) + code_unit;
234 } else { 225 } else {
235 *dst = -1; 226 *dst = kInvalidCodePoint;
236 return 0; 227 return 0;
237 } 228 }
238 } 229 }
239 ch -= kMagicBits[num_trail_bytes]; 230 ch -= kMagicBits[num_trail_bytes];
240 if (!((is_malformed == false) && 231 if (!((is_malformed == false) &&
241 (i == num_trail_bytes) && 232 (i == num_trail_bytes) &&
242 !IsOutOfRange(ch) && 233 !IsOutOfRange(ch) &&
243 !IsNonShortestForm(ch, i) && 234 !IsNonShortestForm(ch, i) &&
244 !IsSurrogate(ch))) { 235 !Utf16::IsSurrogate(ch))) {
245 *dst = -1; 236 *dst = kInvalidCodePoint;
246 return 0; 237 return 0;
247 } 238 }
248 } 239 }
249 *dst = ch; 240 *dst = ch;
250 return i; 241 return i;
251 } 242 }
252 243
253 244
254 bool Utf8::DecodeToAscii(const uint8_t* utf8_array, 245 bool Utf8::DecodeToAscii(const uint8_t* utf8_array,
255 intptr_t array_len, 246 intptr_t array_len,
(...skipping 13 matching lines...) Expand all
269 260
270 261
271 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array, 262 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array,
272 intptr_t array_len, 263 intptr_t array_len,
273 uint16_t* dst, 264 uint16_t* dst,
274 intptr_t len) { 265 intptr_t len) {
275 intptr_t i = 0; 266 intptr_t i = 0;
276 intptr_t j = 0; 267 intptr_t j = 0;
277 intptr_t num_bytes; 268 intptr_t num_bytes;
278 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { 269 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
279 int32_t ch; 270 uint32_t ch;
280 bool is_smp = IsSmpSequenceStart(utf8_array[i]); 271 bool is_smp = IsSmpSequenceStart(utf8_array[i]);
281 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); 272 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
282 if (ch == -1) { 273 if (ch == kInvalidCodePoint) {
283 return false; // invalid input 274 return false; // invalid input
284 } 275 }
285 if (is_smp) { 276 if (is_smp) {
286 ConvertUTF32ToUTF16(ch, &(dst[j])); 277 dst[j] = Utf16::LeadFromCodePoint(ch);
287 j = j + 1; 278 dst[j + 1] = Utf16::TrailFromCodePoint(ch);
279 ++j;
288 } else { 280 } else {
289 dst[j] = ch; 281 dst[j] = ch;
290 } 282 }
291 } 283 }
292 if ((i < array_len) && (j == len)) { 284 if ((i < array_len) && (j == len)) {
293 return false; // output overflow 285 return false; // output overflow
294 } 286 }
295 return true; // success 287 return true; // success
296 } 288 }
297 289
298 290
299 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, 291 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,
300 intptr_t array_len, 292 intptr_t array_len,
301 uint32_t* dst, 293 uint32_t* dst,
302 intptr_t len) { 294 intptr_t len) {
303 intptr_t i = 0; 295 intptr_t i = 0;
304 intptr_t j = 0; 296 intptr_t j = 0;
305 intptr_t num_bytes; 297 intptr_t num_bytes;
306 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { 298 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
307 int32_t ch; 299 uint32_t ch;
308 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); 300 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
309 if (ch == -1) { 301 if (ch == kInvalidCodePoint) {
310 return false; // invalid input 302 return false; // invalid input
311 } 303 }
312 dst[j] = ch; 304 dst[j] = ch;
313 } 305 }
314 if ((i < array_len) && (j == len)) { 306 if ((i < array_len) && (j == len)) {
315 return false; // output overflow 307 return false; // output overflow
316 } 308 }
317 return true; // success 309 return true; // success
318 } 310 }
319 311
312
313 uint32_t Utf16::CodePointAt(const String& str, int index) {
314 uint32_t code = str.CodeUnitAt(index);
315 if (!IsLeadSurrogate(code)) return code;
316 if (index + 1 == str.Length()) return code;
317 uint32_t trail = str.CodeUnitAt(index + 1);
318 if (!IsTrailSurrogate(trail)) return code;
319 return CodePointFromCodeUnits(code, trail);
320 }
321
320 } // namespace dart 322 } // namespace dart
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698