Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(191)

Side by Side Diff: sdk/lib/utf/utf8.dart

Issue 11410086: Use iterator, moveNext(), current. (Closed) Base URL: https://dart.googlecode.com/svn/experimental/lib_v2/dart
Patch Set: Address comments. Created 8 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 const int _UTF8_ONE_BYTE_MAX = 0x7f; 5 const int _UTF8_ONE_BYTE_MAX = 0x7f;
6 const int _UTF8_TWO_BYTE_MAX = 0x7ff; 6 const int _UTF8_TWO_BYTE_MAX = 0x7ff;
7 const int _UTF8_THREE_BYTE_MAX = 0xffff; 7 const int _UTF8_THREE_BYTE_MAX = 0xffff;
8 8
9 const int _UTF8_LO_SIX_BIT_MASK = 0x3f; 9 const int _UTF8_LO_SIX_BIT_MASK = 0x3f;
10 10
(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after
129 */ 129 */
130 class IterableUtf8Decoder extends Iterable<int> { 130 class IterableUtf8Decoder extends Iterable<int> {
131 final List<int> bytes; 131 final List<int> bytes;
132 final int offset; 132 final int offset;
133 final int length; 133 final int length;
134 final int replacementCodepoint; 134 final int replacementCodepoint;
135 135
136 IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null, 136 IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null,
137 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]); 137 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);
138 138
139 Utf8Decoder iterator() => new Utf8Decoder(bytes, offset, length, 139 Utf8Decoder get iterator =>
140 replacementCodepoint); 140 new Utf8Decoder(bytes, offset, length, replacementCodepoint);
141 } 141 }
142 142
143 /** 143 /**
144 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The 144 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The
145 * parameters can set an offset into a list of bytes (as int), limit the length 145 * parameters can set an offset into a list of bytes (as int), limit the length
146 * of the values to be decoded, and override the default Unicode replacement 146 * of the values to be decoded, and override the default Unicode replacement
147 * character. Set the replacementCodepoint to null to throw an 147 * character. Set the replacementCodepoint to null to throw an
148 * ArgumentError rather than replace the bad value. The return value 148 * ArgumentError rather than replace the bad value. The return value
149 * from this method can be used as an Iterable (e.g. in a for-loop). 149 * from this method can be used as an Iterable (e.g. in a for-loop).
150 */ 150 */
151 class Utf8Decoder implements Iterator<int> { 151 class Utf8Decoder implements Iterator<int> {
152 final _ListRangeIterator utf8EncodedBytesIterator; 152 final _ListRangeIterator utf8EncodedBytesIterator;
153 final int replacementCodepoint; 153 final int replacementCodepoint;
154 int _current = -1;
154 155
155 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, 156 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length,
156 this.replacementCodepoint = 157 this.replacementCodepoint =
157 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : 158 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
158 utf8EncodedBytesIterator = (new _ListRange(utf8EncodedBytes, offset, 159 utf8EncodedBytesIterator =
159 length)).iterator(); 160 (new _ListRange(utf8EncodedBytes, offset, length)).iterator;
160 161
161 162
162 Utf8Decoder._fromListRangeIterator(_ListRange source, [ 163 Utf8Decoder._fromListRangeIterator(_ListRange source, [
163 this.replacementCodepoint = 164 this.replacementCodepoint =
164 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : 165 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
165 utf8EncodedBytesIterator = source.iterator(); 166 utf8EncodedBytesIterator = source.iterator;
166 167
167 /** Decode the remainder of the characters in this decoder 168 /** Decode the remainder of the characters in this decoder
168 * into a [List<int>]. 169 * into a [List<int>].
169 */ 170 */
170 List<int> decodeRest() { 171 List<int> decodeRest() {
171 List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining); 172 List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining);
172 int i = 0; 173 int i = 0;
173 while (hasNext) { 174 while (moveNext()) {
174 codepoints[i++] = next(); 175 codepoints[i++] = current;
175 } 176 }
176 if (i == codepoints.length) { 177 if (i == codepoints.length) {
177 return codepoints; 178 return codepoints;
178 } else { 179 } else {
179 List<int> truncCodepoints = new List<int>(i); 180 List<int> truncCodepoints = new List<int>(i);
180 truncCodepoints.setRange(0, i, codepoints); 181 truncCodepoints.setRange(0, i, codepoints);
181 return truncCodepoints; 182 return truncCodepoints;
182 } 183 }
183 } 184 }
184 185
185 bool get hasNext => utf8EncodedBytesIterator.hasNext; 186 int get current {
187 if (_current == -1) {
188 // TODO(floitsch): bad error message.
189 throw new StateError("No more elements");
190 }
191 return _current;
192 }
186 193
187 int next() { 194 bool moveNext() {
188 int value = utf8EncodedBytesIterator.next(); 195 _current = -1;
196
197 if (!utf8EncodedBytesIterator.moveNext()) return false;
198
199 int value = utf8EncodedBytesIterator.current;
189 int additionalBytes = 0; 200 int additionalBytes = 0;
190 201
191 if (value < 0) { 202 if (value < 0) {
192 if (replacementCodepoint != null) { 203 if (replacementCodepoint != null) {
193 return replacementCodepoint; 204 _current = replacementCodepoint;
205 return true;
194 } else { 206 } else {
195 throw new ArgumentError( 207 throw new ArgumentError(
196 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); 208 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
197 } 209 }
198 } else if (value <= _UTF8_ONE_BYTE_MAX) { 210 } else if (value <= _UTF8_ONE_BYTE_MAX) {
199 return value; 211 _current = value;
212 return true;
200 } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) { 213 } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
201 if (replacementCodepoint != null) { 214 if (replacementCodepoint != null) {
202 return replacementCodepoint; 215 _current = replacementCodepoint;
216 return true;
203 } else { 217 } else {
204 throw new ArgumentError( 218 throw new ArgumentError(
205 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); 219 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
206 } 220 }
207 } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) { 221 } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {
208 value -= _UTF8_FIRST_BYTE_OF_TWO_BASE; 222 value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;
209 additionalBytes = 1; 223 additionalBytes = 1;
210 } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) { 224 } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {
211 value -= _UTF8_FIRST_BYTE_OF_THREE_BASE; 225 value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;
212 additionalBytes = 2; 226 additionalBytes = 2;
213 } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) { 227 } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {
214 value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE; 228 value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;
215 additionalBytes = 3; 229 additionalBytes = 3;
216 } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) { 230 } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {
217 value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE; 231 value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;
218 additionalBytes = 4; 232 additionalBytes = 4;
219 } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) { 233 } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {
220 value -= _UTF8_FIRST_BYTE_OF_SIX_BASE; 234 value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;
221 additionalBytes = 5; 235 additionalBytes = 5;
222 } else if (replacementCodepoint != null) { 236 } else if (replacementCodepoint != null) {
223 return replacementCodepoint; 237 _current = replacementCodepoint;
238 return true;
224 } else { 239 } else {
225 throw new ArgumentError( 240 throw new ArgumentError(
226 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); 241 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");
227 } 242 }
228 int j = 0; 243 int j = 0;
229 while (j < additionalBytes && utf8EncodedBytesIterator.hasNext) { 244 while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) {
230 int nextValue = utf8EncodedBytesIterator.next(); 245 int nextValue = utf8EncodedBytesIterator.current;
231 if (nextValue > _UTF8_ONE_BYTE_MAX && 246 if (nextValue > _UTF8_ONE_BYTE_MAX &&
232 nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) { 247 nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
233 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK)); 248 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK));
234 } else { 249 } else {
235 // if sequence-starting code unit, reposition cursor to start here 250 // if sequence-starting code unit, reposition cursor to start here
236 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) { 251 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) {
237 utf8EncodedBytesIterator.backup(); 252 utf8EncodedBytesIterator.backup();
238 } 253 }
239 break; 254 break;
240 } 255 }
241 j++; 256 j++;
242 } 257 }
243 bool validSequence = (j == additionalBytes && ( 258 bool validSequence = (j == additionalBytes && (
244 value < UNICODE_UTF16_RESERVED_LO || 259 value < UNICODE_UTF16_RESERVED_LO ||
245 value > UNICODE_UTF16_RESERVED_HI)); 260 value > UNICODE_UTF16_RESERVED_HI));
246 bool nonOverlong = 261 bool nonOverlong =
247 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) || 262 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||
248 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) || 263 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||
249 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX); 264 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);
250 bool inRange = value <= UNICODE_VALID_RANGE_MAX; 265 bool inRange = value <= UNICODE_VALID_RANGE_MAX;
251 if (validSequence && nonOverlong && inRange) { 266 if (validSequence && nonOverlong && inRange) {
252 return value; 267 _current = value;
268 return true;
253 } else if (replacementCodepoint != null) { 269 } else if (replacementCodepoint != null) {
254 return replacementCodepoint; 270 _current = replacementCodepoint;
271 return true;
255 } else { 272 } else {
256 throw new ArgumentError( 273 throw new ArgumentError(
257 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}"); 274 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}");
258 } 275 }
259 } 276 }
260 } 277 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698