sdk/lib/utf/utf8.dart - Issue 11410086: Use iterator, moveNext(), current.

Side by Side Diff: sdk/lib/utf/utf8.dart

Issue 11410086: Use iterator, moveNext(), current. (Closed) Base URL: https://dart.googlecode.com/svn/experimental/lib_v2/dart

Patch Set: Address comments. Created 8 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 const int _UTF8_ONE_BYTE_MAX = 0x7f;	5 const int _UTF8_ONE_BYTE_MAX = 0x7f;

6 const int _UTF8_TWO_BYTE_MAX = 0x7ff;	6 const int _UTF8_TWO_BYTE_MAX = 0x7ff;

7 const int _UTF8_THREE_BYTE_MAX = 0xffff;	7 const int _UTF8_THREE_BYTE_MAX = 0xffff;

8	8

9 const int _UTF8_LO_SIX_BIT_MASK = 0x3f;	9 const int _UTF8_LO_SIX_BIT_MASK = 0x3f;

10	10

(...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
129 */	129 */

130 class IterableUtf8Decoder extends Iterable<int> {	130 class IterableUtf8Decoder extends Iterable<int> {

131 final List<int> bytes;	131 final List<int> bytes;

132 final int offset;	132 final int offset;

133 final int length;	133 final int length;

134 final int replacementCodepoint;	134 final int replacementCodepoint;

135	135

136 IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null,	136 IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null,

137 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);	137 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

138	138

139 Utf8Decoder iterator() => new Utf8Decoder(bytes, offset, length,	139 Utf8Decoder get iterator =>

140 replacementCodepoint);	140 new Utf8Decoder(bytes, offset, length, replacementCodepoint);

141 }	141 }

142	142

143 /**	143 /**

144 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The	144 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The

145 * parameters can set an offset into a list of bytes (as int), limit the length	145 * parameters can set an offset into a list of bytes (as int), limit the length

146 * of the values to be decoded, and override the default Unicode replacement	146 * of the values to be decoded, and override the default Unicode replacement

147 * character. Set the replacementCodepoint to null to throw an	147 * character. Set the replacementCodepoint to null to throw an

148 * ArgumentError rather than replace the bad value. The return value	148 * ArgumentError rather than replace the bad value. The return value

149 * from this method can be used as an Iterable (e.g. in a for-loop).	149 * from this method can be used as an Iterable (e.g. in a for-loop).

150 */	150 */

151 class Utf8Decoder implements Iterator<int> {	151 class Utf8Decoder implements Iterator<int> {

152 final _ListRangeIterator utf8EncodedBytesIterator;	152 final _ListRangeIterator utf8EncodedBytesIterator;

153 final int replacementCodepoint;	153 final int replacementCodepoint;

	154 int _current = -1;

154	155

155 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length,	156 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length,

156 this.replacementCodepoint =	157 this.replacementCodepoint =

157 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :	158 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :

158 utf8EncodedBytesIterator = (new _ListRange(utf8EncodedBytes, offset,	159 utf8EncodedBytesIterator =

159 length)).iterator();	160 (new _ListRange(utf8EncodedBytes, offset, length)).iterator;

160	161

161	162

162 Utf8Decoder._fromListRangeIterator(_ListRange source, [	163 Utf8Decoder._fromListRangeIterator(_ListRange source, [

163 this.replacementCodepoint =	164 this.replacementCodepoint =

164 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :	165 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :

165 utf8EncodedBytesIterator = source.iterator();	166 utf8EncodedBytesIterator = source.iterator;

166	167

167 /** Decode the remainder of the characters in this decoder	168 /** Decode the remainder of the characters in this decoder

168 * into a [List<int>].	169 * into a [List<int>].

169 */	170 */

170 List<int> decodeRest() {	171 List<int> decodeRest() {

171 List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining);	172 List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining);

172 int i = 0;	173 int i = 0;

173 while (hasNext) {	174 while (moveNext()) {

174 codepoints[i++] = next();	175 codepoints[i++] = current;

175 }	176 }

176 if (i == codepoints.length) {	177 if (i == codepoints.length) {

177 return codepoints;	178 return codepoints;

178 } else {	179 } else {

179 List<int> truncCodepoints = new List<int>(i);	180 List<int> truncCodepoints = new List<int>(i);

180 truncCodepoints.setRange(0, i, codepoints);	181 truncCodepoints.setRange(0, i, codepoints);

181 return truncCodepoints;	182 return truncCodepoints;

182 }	183 }

183 }	184 }

184	185

185 bool get hasNext => utf8EncodedBytesIterator.hasNext;	186 int get current {

	187 if (_current == -1) {

	188 // TODO(floitsch): bad error message.

	189 throw new StateError("No more elements");

	190 }

	191 return _current;

	192 }

186	193

187 int next() {	194 bool moveNext() {

188 int value = utf8EncodedBytesIterator.next();	195 _current = -1;

	196

	197 if (!utf8EncodedBytesIterator.moveNext()) return false;

	198

	199 int value = utf8EncodedBytesIterator.current;

189 int additionalBytes = 0;	200 int additionalBytes = 0;

190	201

191 if (value < 0) {	202 if (value < 0) {

192 if (replacementCodepoint != null) {	203 if (replacementCodepoint != null) {

193 return replacementCodepoint;	204 _current = replacementCodepoint;

	205 return true;

194 } else {	206 } else {

195 throw new ArgumentError(	207 throw new ArgumentError(

196 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");	208 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");

197 }	209 }

198 } else if (value <= _UTF8_ONE_BYTE_MAX) {	210 } else if (value <= _UTF8_ONE_BYTE_MAX) {

199 return value;	211 _current = value;

	212 return true;

200 } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {	213 } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {

201 if (replacementCodepoint != null) {	214 if (replacementCodepoint != null) {

202 return replacementCodepoint;	215 _current = replacementCodepoint;

	216 return true;

203 } else {	217 } else {

204 throw new ArgumentError(	218 throw new ArgumentError(

205 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");	219 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");

206 }	220 }

207 } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {	221 } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {

208 value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;	222 value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;

209 additionalBytes = 1;	223 additionalBytes = 1;

210 } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {	224 } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {

211 value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;	225 value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;

212 additionalBytes = 2;	226 additionalBytes = 2;

213 } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {	227 } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {

214 value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;	228 value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;

215 additionalBytes = 3;	229 additionalBytes = 3;

216 } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {	230 } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {

217 value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;	231 value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;

218 additionalBytes = 4;	232 additionalBytes = 4;

219 } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {	233 } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {

220 value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;	234 value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;

221 additionalBytes = 5;	235 additionalBytes = 5;

222 } else if (replacementCodepoint != null) {	236 } else if (replacementCodepoint != null) {

223 return replacementCodepoint;	237 _current = replacementCodepoint;

	238 return true;

224 } else {	239 } else {

225 throw new ArgumentError(	240 throw new ArgumentError(

226 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");	241 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");

227 }	242 }

228 int j = 0;	243 int j = 0;

229 while (j < additionalBytes && utf8EncodedBytesIterator.hasNext) {	244 while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) {

230 int nextValue = utf8EncodedBytesIterator.next();	245 int nextValue = utf8EncodedBytesIterator.current;

231 if (nextValue > _UTF8_ONE_BYTE_MAX &&	246 if (nextValue > _UTF8_ONE_BYTE_MAX &&

232 nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) {	247 nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) {

233 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK));	248 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK));

234 } else {	249 } else {

235 // if sequence-starting code unit, reposition cursor to start here	250 // if sequence-starting code unit, reposition cursor to start here

236 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) {	251 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) {

237 utf8EncodedBytesIterator.backup();	252 utf8EncodedBytesIterator.backup();

238 }	253 }

239 break;	254 break;

240 }	255 }

241 j++;	256 j++;

242 }	257 }

243 bool validSequence = (j == additionalBytes && (	258 bool validSequence = (j == additionalBytes && (

244 value < UNICODE_UTF16_RESERVED_LO ||	259 value < UNICODE_UTF16_RESERVED_LO ||

245 value > UNICODE_UTF16_RESERVED_HI));	260 value > UNICODE_UTF16_RESERVED_HI));

246 bool nonOverlong =	261 bool nonOverlong =

247 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||	262 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||

248 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||	263 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||

249 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);	264 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);

250 bool inRange = value <= UNICODE_VALID_RANGE_MAX;	265 bool inRange = value <= UNICODE_VALID_RANGE_MAX;

251 if (validSequence && nonOverlong && inRange) {	266 if (validSequence && nonOverlong && inRange) {

252 return value;	267 _current = value;

	268 return true;

253 } else if (replacementCodepoint != null) {	269 } else if (replacementCodepoint != null) {

254 return replacementCodepoint;	270 _current = replacementCodepoint;

	271 return true;

255 } else {	272 } else {

256 throw new ArgumentError(	273 throw new ArgumentError(

257 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}");	274 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}");

258 }	275 }

259 }	276 }

260 }	277 }

OLD	NEW

« runtime/vm/intrinsifier.h ('K') | « sdk/lib/utf/utf32.dart ('k') | sdk/lib/utf/utf_core.dart » ('j') | no next file with comments »