Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(977)

Side by Side Diff: runtime/vm/unicode.cc

Issue 11368138: Add some support for the code-point code-unit distinction. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 8 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLD | NEW
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 #include "vm/unicode.h" 5 #include "vm/unicode.h"
6 6
7 #include "vm/allocation.h" 7 #include "vm/allocation.h"
8 #include "vm/globals.h" 8 #include "vm/globals.h"
9 #include "vm/object.h" 9 #include "vm/object.h"
10 10
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after
58 } 58 }
59 59
60 60
61 static bool IsAsciiSequenceStart(uint8_t code_unit) { 61 static bool IsAsciiSequenceStart(uint8_t code_unit) {
62 // Check if codepoint is <= U+007F 62 // Check if codepoint is <= U+007F
63 return (code_unit <= Utf8::kMaxOneByteChar); 63 return (code_unit <= Utf8::kMaxOneByteChar);
64 } 64 }
65 65
66 66
67 static bool IsSmpSequenceStart(uint8_t code_unit) { 67 static bool IsSmpSequenceStart(uint8_t code_unit) {
68 // Check if codepoint is >= U+10000. 68 // Check the UTF-8 code unit to determine if it is a sequence start for a
69 // code point >= U+10000.
69 return (code_unit >= 0xF0); 70 return (code_unit >= 0xF0);
70 } 71 }
71 72
72 73
73 // Returns true if the code point is a high- or low-surrogate.
74 static bool IsSurrogate(uint32_t code_point) {
75 return (code_point & 0xfffff800) == 0xd800;
76 }
77
78
79 // Returns true if the code point value is above Plane 16 (> U+10FFFF). 74 // Returns true if the code point value is above Plane 16 (> U+10FFFF).
80 static bool IsOutOfRange(uint32_t code_point) { 75 static bool IsOutOfRange(uint32_t code_point) {
81 return (code_point > 0x10FFFF); 76 return (code_point > Utf16::kMaxCodePoint);
82 } 77 }
83 78
84 79
85 // Returns true if the byte sequence is ill-formed. 80 // Returns true if the byte sequence is ill-formed.
86 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { 81 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) {
87 return code_point < kOverlongMinimum[num_bytes]; 82 return code_point < kOverlongMinimum[num_bytes];
88 } 83 }
89 84
90 85
91 void Utf8::ConvertUTF32ToUTF16(int32_t codepoint, uint16_t* dst) { 86 // Returns a count of the number of UTF-16 code units represented by this UTF-8
92 ASSERT(codepoint > kMaxBmpCodepoint); 87 // array. Type is kAscii for 7-bit-only. If there are surrogate pairs then
93 ASSERT(dst != NULL); 88 // the type is kSMP. Otherwise it is kBMP.
94 dst[0] = (Utf8::kLeadOffset + (codepoint >> 10)); 89 intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array,
95 dst[1] = (0xDC00 + (codepoint & 0x3FF)); 90 intptr_t array_len,
96 } 91 Type* type) {
97
98
99 // Returns a count of the number of UTF-8 trail bytes.
100 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array,
101 intptr_t array_len,
102 Type* type) {
103 intptr_t len = 0; 92 intptr_t len = 0;
104 Type char_type = kAscii; 93 Type char_type = kAscii;
105 for (intptr_t i = 0; i < array_len; i++) { 94 for (intptr_t i = 0; i < array_len; i++) {
106 uint8_t code_unit = utf8_array[i]; 95 uint8_t code_unit = utf8_array[i];
107 if (!IsTrailByte(code_unit)) { 96 if (!IsTrailByte(code_unit)) {
108 ++len; 97 ++len;
109 } 98 }
110 if (!IsAsciiSequenceStart(code_unit)) { // > U+007F 99 if (!IsAsciiSequenceStart(code_unit)) { // > U+007F
111 if (IsSmpSequenceStart(code_unit)) { // >= U+10000 100 if (IsSmpSequenceStart(code_unit)) { // >= U+10000
112 char_type = kSMP; 101 char_type = kSMP;
113 ++len; 102 ++len; // Surrogate pair in the UTF-16 encoding.
114 } else if (char_type == kAscii) { 103 } else if (char_type == kAscii) {
115 char_type = kBMP; 104 char_type = kBMP;
116 } 105 }
117 } 106 }
118 } 107 }
119 *type = char_type; 108 *type = char_type;
120 return len; 109 return len;
121 } 110 }
122 111
123 112
(...skipping 13 matching lines...) Expand all
137 ch = (ch << 6) + code_unit; 126 ch = (ch << 6) + code_unit;
138 } else { 127 } else {
139 return false; 128 return false;
140 } 129 }
141 } 130 }
142 ch -= kMagicBits[num_trail_bytes]; 131 ch -= kMagicBits[num_trail_bytes];
143 if (!((is_malformed == false) && 132 if (!((is_malformed == false) &&
144 (j == num_trail_bytes) && 133 (j == num_trail_bytes) &&
145 !IsOutOfRange(ch) && 134 !IsOutOfRange(ch) &&
146 !IsNonShortestForm(ch, j) && 135 !IsNonShortestForm(ch, j) &&
147 !IsSurrogate(ch))) { 136 !Utf16::IsSurrogate(ch))) {
148 return false; 137 return false;
149 } 138 }
150 } 139 }
151 i += j; 140 i += j;
152 } 141 }
153 return true; 142 return true;
154 } 143 }
155 144
156 145
157 intptr_t Utf8::Length(int32_t ch) { 146 intptr_t Utf8::Length(int32_t ch) {
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after
213 } 202 }
214 Utf8::Encode(ch, &dst[pos]); 203 Utf8::Encode(ch, &dst[pos]);
215 pos += num_bytes; 204 pos += num_bytes;
216 } 205 }
217 return pos; 206 return pos;
218 } 207 }
219 208
220 209
221 intptr_t Utf8::Decode(const uint8_t* utf8_array, 210 intptr_t Utf8::Decode(const uint8_t* utf8_array,
222 intptr_t array_len, 211 intptr_t array_len,
223 int32_t* dst) { 212 uint32_t* dst) {
224 uint32_t ch = utf8_array[0] & 0xFF; 213 uint32_t ch = utf8_array[0] & 0xFF;
225 intptr_t i = 1; 214 intptr_t i = 1;
226 if (ch >= 0x80) { 215 if (ch >= 0x80) {
227 int32_t num_trail_bytes = kTrailBytes[ch]; 216 int32_t num_trail_bytes = kTrailBytes[ch];
228 bool is_malformed = false; 217 bool is_malformed = false;
229 for (; i < num_trail_bytes; ++i) { 218 for (; i < num_trail_bytes; ++i) {
230 if (i < array_len) { 219 if (i < array_len) {
231 uint8_t code_unit = utf8_array[i]; 220 uint8_t code_unit = utf8_array[i];
232 is_malformed |= !IsTrailByte(code_unit); 221 is_malformed |= !IsTrailByte(code_unit);
233 ch = (ch << 6) + code_unit; 222 ch = (ch << 6) + code_unit;
234 } else { 223 } else {
235 *dst = -1; 224 *dst = kInvalidCodePoint;
236 return 0; 225 return 0;
237 } 226 }
238 } 227 }
239 ch -= kMagicBits[num_trail_bytes]; 228 ch -= kMagicBits[num_trail_bytes];
240 if (!((is_malformed == false) && 229 if (!((is_malformed == false) &&
241 (i == num_trail_bytes) && 230 (i == num_trail_bytes) &&
242 !IsOutOfRange(ch) && 231 !IsOutOfRange(ch) &&
243 !IsNonShortestForm(ch, i) && 232 !IsNonShortestForm(ch, i) &&
244 !IsSurrogate(ch))) { 233 !Utf16::IsSurrogate(ch))) {
245 *dst = -1; 234 *dst = kInvalidCodePoint;
246 return 0; 235 return 0;
247 } 236 }
248 } 237 }
249 *dst = ch; 238 *dst = ch;
250 return i; 239 return i;
251 } 240 }
252 241
253 242
254 bool Utf8::DecodeToAscii(const uint8_t* utf8_array, 243 bool Utf8::DecodeToAscii(const uint8_t* utf8_array,
255 intptr_t array_len, 244 intptr_t array_len,
(...skipping 13 matching lines...) Expand all
269 258
270 259
271 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array, 260 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array,
272 intptr_t array_len, 261 intptr_t array_len,
273 uint16_t* dst, 262 uint16_t* dst,
274 intptr_t len) { 263 intptr_t len) {
275 intptr_t i = 0; 264 intptr_t i = 0;
276 intptr_t j = 0; 265 intptr_t j = 0;
277 intptr_t num_bytes; 266 intptr_t num_bytes;
278 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { 267 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
279 int32_t ch; 268 uint32_t ch;
280 bool is_smp = IsSmpSequenceStart(utf8_array[i]); 269 bool is_smp = IsSmpSequenceStart(utf8_array[i]);
281 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); 270 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
282 if (ch == -1) { 271 if (ch == kInvalidCodePoint) {
283 return false; // invalid input 272 return false; // invalid input
284 } 273 }
285 if (is_smp) { 274 if (is_smp) {
286 ConvertUTF32ToUTF16(ch, &(dst[j])); 275 dst[j] = Utf16::LeadFromCodePoint(ch);
287 j = j + 1; 276 dst[j + 1] = Utf16::TrailFromCodePoint(ch);
277 ++j;
288 } else { 278 } else {
289 dst[j] = ch; 279 dst[j] = ch;
290 } 280 }
291 } 281 }
292 if ((i < array_len) && (j == len)) { 282 if ((i < array_len) && (j == len)) {
293 return false; // output overflow 283 return false; // output overflow
294 } 284 }
295 return true; // success 285 return true; // success
296 } 286 }
297 287
298 288
299 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, 289 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,
300 intptr_t array_len, 290 intptr_t array_len,
301 uint32_t* dst, 291 uint32_t* dst,
302 intptr_t len) { 292 intptr_t len) {
303 intptr_t i = 0; 293 intptr_t i = 0;
304 intptr_t j = 0; 294 intptr_t j = 0;
305 intptr_t num_bytes; 295 intptr_t num_bytes;
306 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { 296 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
307 int32_t ch; 297 uint32_t ch;
308 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); 298 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
309 if (ch == -1) { 299 if (ch == kInvalidCodePoint) {
310 return false; // invalid input 300 return false; // invalid input
311 } 301 }
312 dst[j] = ch; 302 dst[j] = ch;
313 } 303 }
314 if ((i < array_len) && (j == len)) { 304 if ((i < array_len) && (j == len)) {
315 return false; // output overflow 305 return false; // output overflow
316 } 306 }
317 return true; // success 307 return true; // success
318 } 308 }
319 309
320 } // namespace dart 310 } // namespace dart
OLD | NEW

Powered by Google App Engine
This is Rietveld 408576698