Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(64)

Side by Side Diff: runtime/vm/unicode.cc

Issue 11368138: Add some support for the code-point code-unit distinction. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Implemented feedback from patch set 3 Created 8 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 #include "vm/unicode.h" 5 #include "vm/unicode.h"
6 6
7 #include "vm/allocation.h" 7 #include "vm/allocation.h"
8 #include "vm/globals.h" 8 #include "vm/globals.h"
9 #include "vm/object.h" 9 #include "vm/object.h"
10 10
(...skipping 30 matching lines...) Expand all
41 }; 41 };
42 42
43 43
44 // Minimum values of code points used to check shortest form. 44 // Minimum values of code points used to check shortest form.
45 static const uint32_t kOverlongMinimum[7] = { 45 static const uint32_t kOverlongMinimum[7] = {
46 0, // padding 46 0, // padding
47 0x0, 47 0x0,
48 0x80, 48 0x80,
49 0x800, 49 0x800,
50 0x10000, 50 0x10000,
51 0xFFFFFFFF, 51 0xFFFFFFFF, // We never allow 5 byte sequences.
52 0xFFFFFFFF 52 0xFFFFFFFF // We never allow 6 byte sequences.
53 }; 53 };
54 54
55 55
56 static bool IsTrailByte(uint8_t code_unit) { 56 static bool IsTrailByte(uint8_t code_unit) {
57 return (code_unit & 0xc0) == 0x80; 57 return (code_unit & 0xc0) == 0x80;
58 } 58 }
59 59
60 60
61 static bool IsLatin1SequenceStart(uint8_t code_unit) { 61 static bool IsLatin1SequenceStart(uint8_t code_unit) {
62 // Check if codepoint is <= U+00FF 62 // Check if codepoint is <= U+00FF
63 return (code_unit <= Utf8::kMaxOneByteChar); 63 return (code_unit <= Utf8::kMaxOneByteChar);
64 } 64 }
65 65
66 66
67 static bool IsSmpSequenceStart(uint8_t code_unit) { 67 static bool IsSmpSequenceStart(uint8_t code_unit) {
68 // Check is codepoint is >= U+10000. 68 // Check the UTF-8 code unit to determine if it is a sequence start for a
69 // code point >= U+10000.
69 return (code_unit >= 0xF0); 70 return (code_unit >= 0xF0);
70 } 71 }
71 72
72 73
73 // Returns true if the code point is a high- or low-surrogate.
74 static bool IsSurrogate(uint32_t code_point) {
75 return (code_point & 0xfffff800) == 0xd800;
76 }
77
78
79 // Returns true if the code point value is above Plane 17. 74 // Returns true if the code point value is above Plane 17.
80 static bool IsOutOfRange(uint32_t code_point) { 75 static bool IsOutOfRange(int32_t code_point) {
81 return (code_point > 0x10FFFF); 76 return (code_point > Utf16::kMaxCodePoint);
82 } 77 }
83 78
84 79
85 // Returns true if the byte sequence is ill-formed. 80 // Returns true if the byte sequence is ill-formed.
86 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { 81 static bool IsNonShortestForm(int32_t code_point, size_t num_bytes) {
87 return code_point < kOverlongMinimum[num_bytes]; 82 return static_cast<uint32_t>(code_point) < kOverlongMinimum[num_bytes];
88 } 83 }
89 84
90 85
91 void Utf8::ConvertUTF32ToUTF16(int32_t codepoint, uint16_t* dst) { 86 // Returns a count of the number of UTF-16 code units represented by this UTF-8
92 ASSERT(codepoint > kMaxBmpCodepoint); 87 // array. Type is kLatin1 for 7-bit-only. If there are surrogate pairs then
93 ASSERT(dst != NULL); 88 // the type is kSMP. Otherwise it is kBMP.
94 dst[0] = (Utf8::kLeadOffset + (codepoint >> 10)); 89 intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array,
95 dst[1] = (0xDC00 + (codepoint & 0x3FF)); 90 intptr_t array_len,
96 } 91 Type* type) {
97
98
99 // Returns a count of the number of UTF-8 trail bytes.
100 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array,
101 intptr_t array_len,
102 Type* type) {
103 intptr_t len = 0; 92 intptr_t len = 0;
104 Type char_type = kLatin1; 93 Type char_type = kLatin1;
105 for (intptr_t i = 0; i < array_len; i++) { 94 for (intptr_t i = 0; i < array_len; i++) {
106 uint8_t code_unit = utf8_array[i]; 95 uint8_t code_unit = utf8_array[i];
107 if (!IsTrailByte(code_unit)) { 96 if (!IsTrailByte(code_unit)) {
108 ++len; 97 ++len;
109 } 98 }
110 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF 99 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF
111 if (IsSmpSequenceStart(code_unit)) { // >= U+10000 100 if (IsSmpSequenceStart(code_unit)) { // >= U+10000
112 char_type = kSMP; 101 char_type = kSMP;
113 ++len; 102 ++len; // Surrogate pair in the UTF-16 encoding.
114 } else if (char_type == kLatin1) { 103 } else if (char_type == kLatin1) {
115 char_type = kBMP; 104 char_type = kBMP;
116 } 105 }
117 } 106 }
118 } 107 }
119 *type = char_type; 108 *type = char_type;
120 return len; 109 return len;
121 } 110 }
122 111
123 112
124 // Returns true if str is a valid NUL-terminated UTF-8 string. 113 // Returns true if str is a valid UTF-8 string.
125 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) { 114 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) {
126 intptr_t i = 0; 115 intptr_t i = 0;
127 while (i < array_len) { 116 while (i < array_len) {
128 uint32_t ch = utf8_array[i] & 0xFF; 117 uint32_t ch = utf8_array[i] & 0xFF;
129 intptr_t j = 1; 118 intptr_t j = 1;
130 if (ch >= 0x80) { 119 if (ch >= 0x80) {
131 int8_t num_trail_bytes = kTrailBytes[ch]; 120 int8_t num_trail_bytes = kTrailBytes[ch];
132 bool is_malformed = false; 121 bool is_malformed = false;
133 for (; j < num_trail_bytes; ++j) { 122 for (; j < num_trail_bytes; ++j) {
134 if ((i + j) < array_len) { 123 if ((i + j) < array_len) {
135 uint8_t code_unit = utf8_array[i + j]; 124 uint8_t code_unit = utf8_array[i + j];
136 is_malformed |= !IsTrailByte(code_unit); 125 is_malformed |= !IsTrailByte(code_unit);
137 ch = (ch << 6) + code_unit; 126 ch = (ch << 6) + code_unit;
138 } else { 127 } else {
139 return false; 128 return false;
140 } 129 }
141 } 130 }
142 ch -= kMagicBits[num_trail_bytes]; 131 ch -= kMagicBits[num_trail_bytes];
143 if (!((is_malformed == false) && 132 if (!((is_malformed == false) &&
144 (j == num_trail_bytes) && 133 (j == num_trail_bytes) &&
145 !IsOutOfRange(ch) && 134 !IsOutOfRange(ch) &&
146 !IsNonShortestForm(ch, j) && 135 !IsNonShortestForm(ch, j) &&
147 !IsSurrogate(ch))) { 136 !Utf16::IsSurrogate(ch))) {
148 return false; 137 return false;
149 } 138 }
150 } 139 }
151 i += j; 140 i += j;
152 } 141 }
153 return true; 142 return true;
154 } 143 }
155 144
156 145
157 intptr_t Utf8::Length(int32_t ch) { 146 intptr_t Utf8::Length(int32_t ch) {
158 if (ch <= kMaxOneByteChar) { 147 if (ch <= kMaxOneByteChar) {
159 return 1; 148 return 1;
160 } else if (ch <= kMaxTwoByteChar) { 149 } else if (ch <= kMaxTwoByteChar) {
161 return 2; 150 return 2;
162 } else if (ch <= kMaxThreeByteChar) { 151 } else if (ch <= kMaxThreeByteChar) {
163 return 3; 152 return 3;
164 } 153 }
165 ASSERT(ch <= kMaxFourByteChar); 154 ASSERT(ch <= kMaxFourByteChar);
166 return 4; 155 return 4;
167 } 156 }
168 157
169 158
170 intptr_t Utf8::Length(const String& str) { 159 intptr_t Utf8::Length(const String& str) {
171 intptr_t length = 0; 160 intptr_t length = 0;
172 for (intptr_t i = 0; i < str.Length(); ++i) { 161 for (intptr_t i = 0; i < str.Length(); ++i) {
173 int32_t ch = str.CharAt(i); 162 int32_t ch = Utf16::CodePointAt(str, i);
174 length += Utf8::Length(ch); 163 length += Utf8::Length(ch);
164 if (ch >= 0x10000) i++; // Surrogate pair in input.
175 } 165 }
176 return length; 166 return length;
177 } 167 }
178 168
179 169
180 intptr_t Utf8::Encode(int32_t ch, char* dst) { 170 intptr_t Utf8::Encode(int32_t ch, char* dst) {
181 static const int kMask = ~(1 << 6); 171 static const int kMask = ~(1 << 6);
182 if (ch <= kMaxOneByteChar) { 172 if (ch <= kMaxOneByteChar) {
183 dst[0] = ch; 173 dst[0] = ch;
184 return 1; 174 return 1;
(...skipping 14 matching lines...) Expand all
199 dst[1] = 0x80 | ((ch >> 12) & kMask); 189 dst[1] = 0x80 | ((ch >> 12) & kMask);
200 dst[2] = 0x80 | ((ch >> 6) & kMask); 190 dst[2] = 0x80 | ((ch >> 6) & kMask);
201 dst[3] = 0x80 | (ch & kMask); 191 dst[3] = 0x80 | (ch & kMask);
202 return 4; 192 return 4;
203 } 193 }
204 194
205 195
206 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) { 196 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) {
207 intptr_t pos = 0; 197 intptr_t pos = 0;
208 for (intptr_t i = 0; i < src.Length(); ++i) { 198 for (intptr_t i = 0; i < src.Length(); ++i) {
209 intptr_t ch = src.CharAt(i); 199 intptr_t ch = Utf16::CodePointAt(src, i);
210 intptr_t num_bytes = Utf8::Length(ch); 200 intptr_t num_bytes = Utf8::Length(ch);
211 if (pos + num_bytes > len) { 201 if (pos + num_bytes > len) {
212 break; 202 break;
213 } 203 }
214 Utf8::Encode(ch, &dst[pos]); 204 Utf8::Encode(ch, &dst[pos]);
215 pos += num_bytes; 205 pos += num_bytes;
206 if (num_bytes > 3) i++; // Surrogate pair in input.
216 } 207 }
217 return pos; 208 return pos;
218 } 209 }
219 210
220 211
221 intptr_t Utf8::Decode(const uint8_t* utf8_array, 212 intptr_t Utf8::Decode(const uint8_t* utf8_array,
222 intptr_t array_len, 213 intptr_t array_len,
223 int32_t* dst) { 214 int32_t* dst) {
224 uint32_t ch = utf8_array[0] & 0xFF; 215 int32_t ch = utf8_array[0] & 0xFF;
225 intptr_t i = 1; 216 intptr_t i = 1;
226 if (ch >= 0x80) { 217 if (ch >= 0x80) {
227 int32_t num_trail_bytes = kTrailBytes[ch]; 218 int32_t num_trail_bytes = kTrailBytes[ch];
228 bool is_malformed = false; 219 bool is_malformed = false;
229 for (; i < num_trail_bytes; ++i) { 220 for (; i < num_trail_bytes; ++i) {
230 if (i < array_len) { 221 if (i < array_len) {
231 uint8_t code_unit = utf8_array[i]; 222 uint8_t code_unit = utf8_array[i];
232 is_malformed |= !IsTrailByte(code_unit); 223 is_malformed |= !IsTrailByte(code_unit);
233 ch = (ch << 6) + code_unit; 224 ch = (ch << 6) + code_unit;
234 } else { 225 } else {
235 *dst = -1; 226 *dst = kInvalidCodePoint;
236 return 0; 227 return 0;
237 } 228 }
238 } 229 }
239 ch -= kMagicBits[num_trail_bytes]; 230 ch -= kMagicBits[num_trail_bytes];
240 if (!((is_malformed == false) && 231 if (!((is_malformed == false) &&
241 (i == num_trail_bytes) && 232 (i == num_trail_bytes) &&
242 !IsOutOfRange(ch) && 233 !IsOutOfRange(ch) &&
243 !IsNonShortestForm(ch, i) && 234 !IsNonShortestForm(ch, i) &&
244 !IsSurrogate(ch))) { 235 !Utf16::IsSurrogate(ch))) {
245 *dst = -1; 236 *dst = kInvalidCodePoint;
246 return 0; 237 return 0;
247 } 238 }
248 } 239 }
249 *dst = ch; 240 *dst = ch;
250 return i; 241 return i;
251 } 242 }
252 243
253 244
254 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array, 245 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array,
255 intptr_t array_len, 246 intptr_t array_len,
(...skipping 23 matching lines...) Expand all
279 intptr_t array_len, 270 intptr_t array_len,
280 uint16_t* dst, 271 uint16_t* dst,
281 intptr_t len) { 272 intptr_t len) {
282 intptr_t i = 0; 273 intptr_t i = 0;
283 intptr_t j = 0; 274 intptr_t j = 0;
284 intptr_t num_bytes; 275 intptr_t num_bytes;
285 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { 276 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
286 int32_t ch; 277 int32_t ch;
287 bool is_smp = IsSmpSequenceStart(utf8_array[i]); 278 bool is_smp = IsSmpSequenceStart(utf8_array[i]);
288 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); 279 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
289 if (ch == -1) { 280 if (ch == kInvalidCodePoint) {
290 return false; // invalid input 281 return false; // invalid input
291 } 282 }
292 if (is_smp) { 283 if (is_smp) {
293 ConvertUTF32ToUTF16(ch, &(dst[j])); 284 dst[j] = Utf16::LeadFromCodePoint(ch);
294 j = j + 1; 285 dst[j + 1] = Utf16::TrailFromCodePoint(ch);
286 ++j;
295 } else { 287 } else {
296 dst[j] = ch; 288 dst[j] = ch;
297 } 289 }
298 } 290 }
299 if ((i < array_len) && (j == len)) { 291 if ((i < array_len) && (j == len)) {
300 return false; // output overflow 292 return false; // output overflow
301 } 293 }
302 return true; // success 294 return true; // success
303 } 295 }
304 296
305 297
306 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, 298 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,
307 intptr_t array_len, 299 intptr_t array_len,
308 uint32_t* dst, 300 int32_t* dst,
309 intptr_t len) { 301 intptr_t len) {
310 intptr_t i = 0; 302 intptr_t i = 0;
311 intptr_t j = 0; 303 intptr_t j = 0;
312 intptr_t num_bytes; 304 intptr_t num_bytes;
313 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { 305 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
314 int32_t ch; 306 int32_t ch;
315 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); 307 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
316 if (ch == -1) { 308 if (ch == kInvalidCodePoint) {
317 return false; // invalid input 309 return false; // invalid input
318 } 310 }
319 dst[j] = ch; 311 dst[j] = ch;
320 } 312 }
321 if ((i < array_len) && (j == len)) { 313 if ((i < array_len) && (j == len)) {
322 return false; // output overflow 314 return false; // output overflow
323 } 315 }
324 return true; // success 316 return true; // success
325 } 317 }
326 318
319
320 int32_t Utf16::CodePointAt(const String& str, int index) {
321 int32_t code = str.CharAt(index);
322 if (!IsLeadSurrogate(code)) return code;
323 if (index + 1 == str.Length()) return code;
324 int32_t trail = str.CharAt(index + 1);
325 if (!IsTrailSurrogate(trail)) return code;
326 return CodePointFromCodeUnits(code, trail);
327 }
328
327 } // namespace dart 329 } // namespace dart
OLDNEW
« runtime/vm/unicode.h ('K') | « runtime/vm/unicode.h ('k') | runtime/vm/unicode_test.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698