sdk/lib/utf/utf8.dart - Issue 11783009: Big merge from experimental to bleeding edge.

Side by Side Diff: sdk/lib/utf/utf8.dart

Issue 11783009: Big merge from experimental to bleeding edge. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 7 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 part of dart.utf;	5 part of dart.utf;

6	6

7 const int _UTF8_ONE_BYTE_MAX = 0x7f;	7 const int _UTF8_ONE_BYTE_MAX = 0x7f;

8 const int _UTF8_TWO_BYTE_MAX = 0x7ff;	8 const int _UTF8_TWO_BYTE_MAX = 0x7ff;

9 const int _UTF8_THREE_BYTE_MAX = 0xffff;	9 const int _UTF8_THREE_BYTE_MAX = 0xffff;

10	10

(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
79 encodedLength++;	79 encodedLength++;

80 } else if (value <= _UTF8_TWO_BYTE_MAX) {	80 } else if (value <= _UTF8_TWO_BYTE_MAX) {

81 encodedLength += 2;	81 encodedLength += 2;

82 } else if (value <= _UTF8_THREE_BYTE_MAX) {	82 } else if (value <= _UTF8_THREE_BYTE_MAX) {

83 encodedLength += 3;	83 encodedLength += 3;

84 } else if (value <= UNICODE_VALID_RANGE_MAX) {	84 } else if (value <= UNICODE_VALID_RANGE_MAX) {

85 encodedLength += 4;	85 encodedLength += 4;

86 }	86 }

87 }	87 }

88	88

89 List<int> encoded = new List<int>(encodedLength);	89 List<int> encoded = new List<int>.fixedLength(encodedLength);

90 int insertAt = 0;	90 int insertAt = 0;

91 for (int value in source) {	91 for (int value in source) {

92 if (value < 0 || value > UNICODE_VALID_RANGE_MAX) {	92 if (value < 0 || value > UNICODE_VALID_RANGE_MAX) {

93 encoded.setRange(insertAt, 3, [0xef, 0xbf, 0xbd]);	93 encoded.setRange(insertAt, 3, [0xef, 0xbf, 0xbd]);

94 insertAt += 3;	94 insertAt += 3;

95 } else if (value <= _UTF8_ONE_BYTE_MAX) {	95 } else if (value <= _UTF8_ONE_BYTE_MAX) {

96 encoded[insertAt] = value;	96 encoded[insertAt] = value;

97 insertAt++;	97 insertAt++;

98 } else if (value <= _UTF8_TWO_BYTE_MAX) {	98 } else if (value <= _UTF8_TWO_BYTE_MAX) {

99 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_TWO_BASE | (	99 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_TWO_BASE | (

(...skipping 22 matching lines...) Expand all Loading...
122 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {	122 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {

123 return new Utf8Decoder(utf8EncodedBytes, offset, length,	123 return new Utf8Decoder(utf8EncodedBytes, offset, length,

124 replacementCodepoint).decodeRest();	124 replacementCodepoint).decodeRest();

125 }	125 }

126	126

127 /**	127 /**

128 * Return type of [decodeUtf8AsIterable] and variants. The Iterable type	128 * Return type of [decodeUtf8AsIterable] and variants. The Iterable type

129 * provides an iterator on demand and the iterator will only translate bytes	129 * provides an iterator on demand and the iterator will only translate bytes

130 * as requested by the user of the iterator. (Note: results are not cached.)	130 * as requested by the user of the iterator. (Note: results are not cached.)

131 */	131 */

132 class IterableUtf8Decoder implements Iterable<int> {	132 // TODO(floitsch): Consider removing the extend and switch to implements since

	133 // that's cheaper to allocate.

	134 class IterableUtf8Decoder extends Iterable<int> {

133 final List<int> bytes;	135 final List<int> bytes;

134 final int offset;	136 final int offset;

135 final int length;	137 final int length;

136 final int replacementCodepoint;	138 final int replacementCodepoint;

137	139

138 IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null,	140 IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null,

139 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);	141 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

140	142

141 Utf8Decoder iterator() => new Utf8Decoder(bytes, offset, length,	143 Utf8Decoder get iterator =>

142 replacementCodepoint);	144 new Utf8Decoder(bytes, offset, length, replacementCodepoint);

143 }	145 }

144	146

145 /**	147 /**

146 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The	148 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The

147 * parameters can set an offset into a list of bytes (as int), limit the length	149 * parameters can set an offset into a list of bytes (as int), limit the length

148 * of the values to be decoded, and override the default Unicode replacement	150 * of the values to be decoded, and override the default Unicode replacement

149 * character. Set the replacementCharacter to null to throw an	151 * character. Set the replacementCharacter to null to throw an

150 * ArgumentError rather than replace the bad value. The return value	152 * ArgumentError rather than replace the bad value. The return value

151 * from this method can be used as an Iterable (e.g. in a for-loop).	153 * from this method can be used as an Iterable (e.g. in a for-loop).

152 */	154 */

153 class Utf8Decoder implements Iterator<int> {	155 class Utf8Decoder implements Iterator<int> {

154 final _ListRangeIterator utf8EncodedBytesIterator;	156 final _ListRangeIterator utf8EncodedBytesIterator;

155 final int replacementCodepoint;	157 final int replacementCodepoint;

	158 int _current = null;

156	159

157 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length,	160 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length,

158 this.replacementCodepoint =	161 this.replacementCodepoint =

159 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :	162 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :

160 utf8EncodedBytesIterator = (new _ListRange(utf8EncodedBytes, offset,	163 utf8EncodedBytesIterator =

161 length)).iterator();	164 (new _ListRange(utf8EncodedBytes, offset, length)).iterator;

162	165

163	166

164 Utf8Decoder._fromListRangeIterator(_ListRange source, [	167 Utf8Decoder._fromListRangeIterator(_ListRange source, [

165 this.replacementCodepoint =	168 this.replacementCodepoint =

166 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :	169 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :

167 utf8EncodedBytesIterator = source.iterator();	170 utf8EncodedBytesIterator = source.iterator;

168	171

169 /** Decode the remainder of the characters in this decoder	172 /** Decode the remainder of the characters in this decoder

170 * into a [List<int>].	173 * into a [List<int>].

171 */	174 */

172 List<int> decodeRest() {	175 List<int> decodeRest() {

173 List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining);	176 List<int> codepoints = new List<int>.fixedLength(utf8EncodedBytesIterator.remaining);

174 int i = 0;	177 int i = 0;

175 while (hasNext) {	178 while (moveNext()) {

176 codepoints[i++] = next();	179 codepoints[i++] = current;

177 }	180 }

178 if (i == codepoints.length) {	181 if (i == codepoints.length) {

179 return codepoints;	182 return codepoints;

180 } else {	183 } else {

181 List<int> truncCodepoints = new List<int>(i);	184 List<int> truncCodepoints = new List<int>.fixedLength(i);

182 truncCodepoints.setRange(0, i, codepoints);	185 truncCodepoints.setRange(0, i, codepoints);

183 return truncCodepoints;	186 return truncCodepoints;

184 }	187 }

185 }	188 }

186	189

187 bool get hasNext => utf8EncodedBytesIterator.hasNext;	190 int get current => _current;

188	191

189 int next() {	192 bool moveNext() {

190 int value = utf8EncodedBytesIterator.next();	193 _current = null;

	194

	195 if (!utf8EncodedBytesIterator.moveNext()) return false;

	196

	197 int value = utf8EncodedBytesIterator.current;

191 int additionalBytes = 0;	198 int additionalBytes = 0;

192	199

193 if (value < 0) {	200 if (value < 0) {

194 if (replacementCodepoint != null) {	201 if (replacementCodepoint != null) {

195 return replacementCodepoint;	202 _current = replacementCodepoint;

	203 return true;

196 } else {	204 } else {

197 throw new ArgumentError(	205 throw new ArgumentError(

198 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");	206 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");

199 }	207 }

200 } else if (value <= _UTF8_ONE_BYTE_MAX) {	208 } else if (value <= _UTF8_ONE_BYTE_MAX) {

201 return value;	209 _current = value;

	210 return true;

202 } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {	211 } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {

203 if (replacementCodepoint != null) {	212 if (replacementCodepoint != null) {

204 return replacementCodepoint;	213 _current = replacementCodepoint;

	214 return true;

205 } else {	215 } else {

206 throw new ArgumentError(	216 throw new ArgumentError(

207 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");	217 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");

208 }	218 }

209 } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {	219 } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {

210 value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;	220 value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;

211 additionalBytes = 1;	221 additionalBytes = 1;

212 } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {	222 } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {

213 value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;	223 value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;

214 additionalBytes = 2;	224 additionalBytes = 2;

215 } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {	225 } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {

216 value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;	226 value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;

217 additionalBytes = 3;	227 additionalBytes = 3;

218 } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {	228 } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {

219 value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;	229 value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;

220 additionalBytes = 4;	230 additionalBytes = 4;

221 } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {	231 } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {

222 value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;	232 value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;

223 additionalBytes = 5;	233 additionalBytes = 5;

224 } else if (replacementCodepoint != null) {	234 } else if (replacementCodepoint != null) {

225 return replacementCodepoint;	235 _current = replacementCodepoint;

	236 return true;

226 } else {	237 } else {

227 throw new ArgumentError(	238 throw new ArgumentError(

228 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");	239 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}");

229 }	240 }

230 int j = 0;	241 int j = 0;

231 while (j < additionalBytes && utf8EncodedBytesIterator.hasNext) {	242 while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) {

232 int nextValue = utf8EncodedBytesIterator.next();	243 int nextValue = utf8EncodedBytesIterator.current;

233 if (nextValue > _UTF8_ONE_BYTE_MAX &&	244 if (nextValue > _UTF8_ONE_BYTE_MAX &&

234 nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) {	245 nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) {

235 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK));	246 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK));

236 } else {	247 } else {

237 // if sequence-starting code unit, reposition cursor to start here	248 // if sequence-starting code unit, reposition cursor to start here

238 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) {	249 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) {

239 utf8EncodedBytesIterator.backup();	250 utf8EncodedBytesIterator.backup();

240 }	251 }

241 break;	252 break;

242 }	253 }

243 j++;	254 j++;

244 }	255 }

245 bool validSequence = (j == additionalBytes && (	256 bool validSequence = (j == additionalBytes && (

246 value < UNICODE_UTF16_RESERVED_LO ||	257 value < UNICODE_UTF16_RESERVED_LO ||

247 value > UNICODE_UTF16_RESERVED_HI));	258 value > UNICODE_UTF16_RESERVED_HI));

248 bool nonOverlong =	259 bool nonOverlong =

249 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||	260 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||

250 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||	261 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||

251 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);	262 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);

252 bool inRange = value <= UNICODE_VALID_RANGE_MAX;	263 bool inRange = value <= UNICODE_VALID_RANGE_MAX;

253 if (validSequence && nonOverlong && inRange) {	264 if (validSequence && nonOverlong && inRange) {

254 return value;	265 _current = value;

	266 return true;

255 } else if (replacementCodepoint != null) {	267 } else if (replacementCodepoint != null) {

256 return replacementCodepoint;	268 _current = replacementCodepoint;

	269 return true;

257 } else {	270 } else {

258 throw new ArgumentError(	271 throw new ArgumentError(

259 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}");	272 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}");

260 }	273 }

261 }	274 }

262 }	275 }

OLD	NEW

« no previous file with comments | « sdk/lib/utf/utf32.dart ('k') | sdk/lib/utf/utf_core.dart » ('j') | no next file with comments »