Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(163)

Side by Side Diff: sdk/lib/utf/utf8.dart

Issue 11783009: Big merge from experimental to bleeding edge. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 7 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « sdk/lib/utf/utf32.dart ('k') | sdk/lib/utf/utf_core.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 part of dart.utf; 5 part of dart.utf;
6 6
7 const int _UTF8_ONE_BYTE_MAX = 0x7f; 7 const int _UTF8_ONE_BYTE_MAX = 0x7f;
8 const int _UTF8_TWO_BYTE_MAX = 0x7ff; 8 const int _UTF8_TWO_BYTE_MAX = 0x7ff;
9 const int _UTF8_THREE_BYTE_MAX = 0xffff; 9 const int _UTF8_THREE_BYTE_MAX = 0xffff;
10 10
(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after
79 encodedLength++; 79 encodedLength++;
80 } else if (value <= _UTF8_TWO_BYTE_MAX) { 80 } else if (value <= _UTF8_TWO_BYTE_MAX) {
81 encodedLength += 2; 81 encodedLength += 2;
82 } else if (value <= _UTF8_THREE_BYTE_MAX) { 82 } else if (value <= _UTF8_THREE_BYTE_MAX) {
83 encodedLength += 3; 83 encodedLength += 3;
84 } else if (value <= UNICODE_VALID_RANGE_MAX) { 84 } else if (value <= UNICODE_VALID_RANGE_MAX) {
85 encodedLength += 4; 85 encodedLength += 4;
86 } 86 }
87 } 87 }
88 88
89 List<int> encoded = new List<int>(encodedLength); 89 List<int> encoded = new List<int>.fixedLength(encodedLength);
90 int insertAt = 0; 90 int insertAt = 0;
91 for (int value in source) { 91 for (int value in source) {
92 if (value < 0 || value > UNICODE_VALID_RANGE_MAX) { 92 if (value < 0 || value > UNICODE_VALID_RANGE_MAX) {
93 encoded.setRange(insertAt, 3, [0xef, 0xbf, 0xbd]); 93 encoded.setRange(insertAt, 3, [0xef, 0xbf, 0xbd]);
94 insertAt += 3; 94 insertAt += 3;
95 } else if (value <= _UTF8_ONE_BYTE_MAX) { 95 } else if (value <= _UTF8_ONE_BYTE_MAX) {
96 encoded[insertAt] = value; 96 encoded[insertAt] = value;
97 insertAt++; 97 insertAt++;
98 } else if (value <= _UTF8_TWO_BYTE_MAX) { 98 } else if (value <= _UTF8_TWO_BYTE_MAX) {
99 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_TWO_BASE | ( 99 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_TWO_BASE | (
(...skipping 22 matching lines...) Expand all
122 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { 122 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
123 return new Utf8Decoder(utf8EncodedBytes, offset, length, 123 return new Utf8Decoder(utf8EncodedBytes, offset, length,
124 replacementCodepoint).decodeRest(); 124 replacementCodepoint).decodeRest();
125 } 125 }
126 126
127 /** 127 /**
128 * Return type of [decodeUtf8AsIterable] and variants. The Iterable type 128 * Return type of [decodeUtf8AsIterable] and variants. The Iterable type
129 * provides an iterator on demand and the iterator will only translate bytes 129 * provides an iterator on demand and the iterator will only translate bytes
130 * as requested by the user of the iterator. (Note: results are not cached.) 130 * as requested by the user of the iterator. (Note: results are not cached.)
131 */ 131 */
132 class IterableUtf8Decoder implements Iterable<int> { 132 // TODO(floitsch): Consider removing the extend and switch to implements since
133 // that's cheaper to allocate.
134 class IterableUtf8Decoder extends Iterable<int> {
133 final List<int> bytes; 135 final List<int> bytes;
134 final int offset; 136 final int offset;
135 final int length; 137 final int length;
136 final int replacementCodepoint; 138 final int replacementCodepoint;
137 139
138 IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null, 140 IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null,
139 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]); 141 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);
140 142
141 Utf8Decoder iterator() => new Utf8Decoder(bytes, offset, length, 143 Utf8Decoder get iterator =>
142 replacementCodepoint); 144 new Utf8Decoder(bytes, offset, length, replacementCodepoint);
143 } 145 }
144 146
145 /** 147 /**
146 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The 148 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The
147 * parameters can set an offset into a list of bytes (as int), limit the length 149 * parameters can set an offset into a list of bytes (as int), limit the length
148 * of the values to be decoded, and override the default Unicode replacement 150 * of the values to be decoded, and override the default Unicode replacement
149 * character. Set the replacementCodepoint to null to throw an 151 * character. Set the replacementCodepoint to null to throw an
150 * ArgumentError rather than replace the bad value. The return value 152 * ArgumentError rather than replace the bad value. The return value
151 * from this method can be used as an Iterable (e.g. in a for-loop). 153 * from this method can be used as an Iterable (e.g. in a for-loop).
152 */ 154 */
153 class Utf8Decoder implements Iterator<int> { 155 class Utf8Decoder implements Iterator<int> {
154 final _ListRangeIterator utf8EncodedBytesIterator; 156 final _ListRangeIterator utf8EncodedBytesIterator;
155 final int replacementCodepoint; 157 final int replacementCodepoint;
158 int _current = null;
156 159
157 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, 160 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length,
158 this.replacementCodepoint = 161 this.replacementCodepoint =
159 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : 162 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
160 utf8EncodedBytesIterator = (new _ListRange(utf8EncodedBytes, offset, 163 utf8EncodedBytesIterator =
161 length)).iterator(); 164 (new _ListRange(utf8EncodedBytes, offset, length)).iterator;
162 165
163 166
164 Utf8Decoder._fromListRangeIterator(_ListRange source, [ 167 Utf8Decoder._fromListRangeIterator(_ListRange source, [
165 this.replacementCodepoint = 168 this.replacementCodepoint =
166 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : 169 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
167 utf8EncodedBytesIterator = source.iterator(); 170 utf8EncodedBytesIterator = source.iterator;
168 171
169 /** Decode the remainder of the characters in this decoder 172 /** Decode the remainder of the characters in this decoder
170 * into a [List<int>]. 173 * into a [List<int>].
171 */ 174 */
172 List<int> decodeRest() { 175 List<int> decodeRest() {
173 List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining); 176 List<int> codepoints = new List<int>.fixedLength(utf8EncodedBytesIterator.remaining);
174 int i = 0; 177 int i = 0;
175 while (hasNext) { 178 while (moveNext()) {
176 codepoints[i++] = next(); 179 codepoints[i++] = current;
177 } 180 }
178 if (i == codepoints.length) { 181 if (i == codepoints.length) {
179 return codepoints; 182 return codepoints;
180 } else { 183 } else {
181 List<int> truncCodepoints = new List<int>(i); 184 List<int> truncCodepoints = new List<int>.fixedLength(i);
182 truncCodepoints.setRange(0, i, codepoints); 185 truncCodepoints.setRange(0, i, codepoints);
183 return truncCodepoints; 186 return truncCodepoints;
184 } 187 }
185 } 188 }
186 189
187 bool get hasNext => utf8EncodedBytesIterator.hasNext; 190 int get current => _current;
188 191
189 int next() { 192 bool moveNext() {
190 int value = utf8EncodedBytesIterator.next(); 193 _current = null;
194
195 if (!utf8EncodedBytesIterator.moveNext()) return false;
196
197 int value = utf8EncodedBytesIterator.current;
191 int additionalBytes = 0; 198 int additionalBytes = 0;
192 199
193 if (value < 0) { 200 if (value < 0) {
194 if (replacementCodepoint != null) { 201 if (replacementCodepoint != null) {
195 return replacementCodepoint; 202 _current = replacementCodepoint;
203 return true;
196 } else { 204 } else {
197 throw new ArgumentError( 205 throw new ArgumentError(
198 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); 206 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
199 } 207 }
200 } else if (value <= _UTF8_ONE_BYTE_MAX) { 208 } else if (value <= _UTF8_ONE_BYTE_MAX) {
201 return value; 209 _current = value;
210 return true;
202 } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) { 211 } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
203 if (replacementCodepoint != null) { 212 if (replacementCodepoint != null) {
204 return replacementCodepoint; 213 _current = replacementCodepoint;
214 return true;
205 } else { 215 } else {
206 throw new ArgumentError( 216 throw new ArgumentError(
207 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); 217 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
208 } 218 }
209 } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) { 219 } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {
210 value -= _UTF8_FIRST_BYTE_OF_TWO_BASE; 220 value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;
211 additionalBytes = 1; 221 additionalBytes = 1;
212 } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) { 222 } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {
213 value -= _UTF8_FIRST_BYTE_OF_THREE_BASE; 223 value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;
214 additionalBytes = 2; 224 additionalBytes = 2;
215 } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) { 225 } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {
216 value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE; 226 value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;
217 additionalBytes = 3; 227 additionalBytes = 3;
218 } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) { 228 } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {
219 value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE; 229 value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;
220 additionalBytes = 4; 230 additionalBytes = 4;
221 } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) { 231 } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {
222 value -= _UTF8_FIRST_BYTE_OF_SIX_BASE; 232 value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;
223 additionalBytes = 5; 233 additionalBytes = 5;
224 } else if (replacementCodepoint != null) { 234 } else if (replacementCodepoint != null) {
225 return replacementCodepoint; 235 _current = replacementCodepoint;
236 return true;
226 } else { 237 } else {
227 throw new ArgumentError( 238 throw new ArgumentError(
228 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); 239 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
229 } 240 }
230 int j = 0; 241 int j = 0;
231 while (j < additionalBytes && utf8EncodedBytesIterator.hasNext) { 242 while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) {
232 int nextValue = utf8EncodedBytesIterator.next(); 243 int nextValue = utf8EncodedBytesIterator.current;
233 if (nextValue > _UTF8_ONE_BYTE_MAX && 244 if (nextValue > _UTF8_ONE_BYTE_MAX &&
234 nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) { 245 nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
235 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK)); 246 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK));
236 } else { 247 } else {
237 // if sequence-starting code unit, reposition cursor to start here 248 // if sequence-starting code unit, reposition cursor to start here
238 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) { 249 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) {
239 utf8EncodedBytesIterator.backup(); 250 utf8EncodedBytesIterator.backup();
240 } 251 }
241 break; 252 break;
242 } 253 }
243 j++; 254 j++;
244 } 255 }
245 bool validSequence = (j == additionalBytes && ( 256 bool validSequence = (j == additionalBytes && (
246 value < UNICODE_UTF16_RESERVED_LO || 257 value < UNICODE_UTF16_RESERVED_LO ||
247 value > UNICODE_UTF16_RESERVED_HI)); 258 value > UNICODE_UTF16_RESERVED_HI));
248 bool nonOverlong = 259 bool nonOverlong =
249 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) || 260 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||
250 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) || 261 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||
251 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX); 262 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);
252 bool inRange = value <= UNICODE_VALID_RANGE_MAX; 263 bool inRange = value <= UNICODE_VALID_RANGE_MAX;
253 if (validSequence && nonOverlong && inRange) { 264 if (validSequence && nonOverlong && inRange) {
254 return value; 265 _current = value;
266 return true;
255 } else if (replacementCodepoint != null) { 267 } else if (replacementCodepoint != null) {
256 return replacementCodepoint; 268 _current = replacementCodepoint;
269 return true;
257 } else { 270 } else {
258 throw new ArgumentError( 271 throw new ArgumentError(
259 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}"); 272 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}");
260 } 273 }
261 } 274 }
262 } 275 }
OLDNEW
« no previous file with comments | « sdk/lib/utf/utf32.dart ('k') | sdk/lib/utf/utf_core.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698