pkg/utf/lib/utf8.dart - Issue 418433003: pkg/utf: fixed layout, added todos, updated docs and homepage pubspec links

Side by Side Diff: pkg/utf/lib/utf8.dart

Issue 418433003: pkg/utf: fixed layout, added todos, updated docs and homepage pubspec links (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 6 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.

4

5 part of utf;

6

// Largest code point that fits in a one-, two- and three-byte UTF-8 sequence.
// The encoder uses these as thresholds when choosing a sequence length and the
// decoder uses them to reject overlong encodings.
const int _UTF8_ONE_BYTE_MAX = 0x7f;
const int _UTF8_TWO_BYTE_MAX = 0x7ff;
const int _UTF8_THREE_BYTE_MAX = 0xffff;

// Selects the six payload bits carried by each continuation byte.
const int _UTF8_LO_SIX_BIT_MASK = 0x3f;

// Smallest lead byte of a sequence of the given total length. The encoder ORs
// the payload into these; the decoder uses them as range boundaries when
// classifying a lead byte.
const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xc0;
const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xe0;
const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xf0;
const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xf8;
const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xfc;

// Selects the payload bits the encoder places in the lead byte of a two-,
// three- and four-byte sequence.
const int _UTF8_FIRST_BYTE_OF_TWO_MASK = 0x1f;
const int _UTF8_FIRST_BYTE_OF_THREE_MASK = 0xf;
const int _UTF8_FIRST_BYTE_OF_FOUR_MASK = 0x7;

// Exclusive upper bound for a byte the decoder accepts as a lead byte.
const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xfe;
// Marker bits ORed into every continuation byte by the encoder.
const int _UTF8_SUBSEQUENT_BYTE_BASE = 0x80;

25

/**
 * Returns a lazily decoding view over the UTF-8 encoded [bytes].
 *
 * Decoding happens while the result is iterated, so a consumer that stops
 * early only converts as much of the input as it reads. [offset], [length]
 * and [replacementCodepoint] are handed to [IterableUtf8Decoder] unchanged.
 * Set [replacementCodepoint] to null to get an [ArgumentError] for malformed
 * input instead of a substituted code point.
 */
IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes, [int offset = 0,
    int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
    new IterableUtf8Decoder(bytes, offset, length, replacementCodepoint);

36

/**
 * Builds a [String] from a list of UTF-8 encoded [bytes].
 *
 * [offset] selects where in [bytes] decoding starts and [length] limits how
 * many values are decoded. [replacementCodepoint] overrides the code point
 * substituted for malformed input; set it to null to get an [ArgumentError]
 * for malformed input instead.
 */
String decodeUtf8(List<int> bytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf8Decoder decoder =
      new Utf8Decoder(bytes, offset, length, replacementCodepoint);
  List<int> codepoints = decoder.decodeRest();
  return new String.fromCharCodes(codepoints);
}

50

/**
 * Returns the UTF-8 encoded bytes for [str].
 *
 * The string is first converted to code points with [stringToCodepoints] and
 * those are then encoded with [codepointsToUtf8].
 */
List<int> encodeUtf8(String str) {
  var codepoints = stringToCodepoints(str);
  return codepointsToUtf8(codepoints);
}

56

// Writes the [bytes] continuation bytes of one encoded code point into
// [buffer] at indices offset + 1 through offset + bytes, filling from the last
// index backwards so the lowest six bits of [value] land in the final byte.
// Returns the bits of [value] left over after six bits per continuation byte
// have been shifted out; the caller places them in the lead byte at [offset].
int _addToEncoding(int offset, int bytes, int value, List<int> buffer) {
  int remainingBits = value;
  for (int index = offset + bytes; index > offset; index--) {
    buffer[index] =
        _UTF8_SUBSEQUENT_BYTE_BASE | (remainingBits & _UTF8_LO_SIX_BIT_MASK);
    remainingBits >>= 6;
  }
  return remainingBits;
}

66

/**
 * Encodes Unicode code points as UTF-8 code units.
 *
 * [offset] and [length] select the part of [codepoints] to encode; they are
 * passed to the library's list-range helper unchanged.
 *
 * A value that is not an encodable code point is written as the three bytes
 * 0xEF 0xBF 0xBD (the UTF-8 form of U+FFFD). That covers negative values,
 * values above [UNICODE_VALID_RANGE_MAX], and values in the UTF-16 reserved
 * range [UNICODE_UTF16_RESERVED_LO]..[UNICODE_UTF16_RESERVED_HI]. The
 * reserved range must not appear in UTF-8, and [Utf8Decoder] rejects it, so
 * encoding it verbatim would produce output this library cannot read back.
 */
List<int> codepointsToUtf8(
    List<int> codepoints, [int offset = 0, int length]) {
  _ListRange source = new _ListRange(codepoints, offset, length);

  // True when [value] may be encoded as itself; anything else is replaced.
  bool isEncodable(int value) =>
      value >= 0 &&
      value <= UNICODE_VALID_RANGE_MAX &&
      (value < UNICODE_UTF16_RESERVED_LO ||
       value > UNICODE_UTF16_RESERVED_HI);

  // First pass: compute the exact output size so the result can be allocated
  // once as a fixed-length list.
  int encodedLength = 0;
  for (int value in source) {
    if (!isEncodable(value)) {
      encodedLength += 3;  // Size of the encoded replacement character.
    } else if (value <= _UTF8_ONE_BYTE_MAX) {
      encodedLength++;
    } else if (value <= _UTF8_TWO_BYTE_MAX) {
      encodedLength += 2;
    } else if (value <= _UTF8_THREE_BYTE_MAX) {
      encodedLength += 3;
    } else {
      encodedLength += 4;
    }
  }

  // Second pass: write the bytes. _addToEncoding fills in the continuation
  // bytes and returns the bits that belong in the lead byte.
  List<int> encoded = new List<int>(encodedLength);
  int insertAt = 0;
  for (int value in source) {
    if (!isEncodable(value)) {
      encoded.setRange(insertAt, insertAt + 3, [0xef, 0xbf, 0xbd]);
      insertAt += 3;
    } else if (value <= _UTF8_ONE_BYTE_MAX) {
      encoded[insertAt] = value;
      insertAt++;
    } else if (value <= _UTF8_TWO_BYTE_MAX) {
      encoded[insertAt] = _UTF8_FIRST_BYTE_OF_TWO_BASE | (
          _UTF8_FIRST_BYTE_OF_TWO_MASK &
          _addToEncoding(insertAt, 1, value, encoded));
      insertAt += 2;
    } else if (value <= _UTF8_THREE_BYTE_MAX) {
      encoded[insertAt] = _UTF8_FIRST_BYTE_OF_THREE_BASE | (
          _UTF8_FIRST_BYTE_OF_THREE_MASK &
          _addToEncoding(insertAt, 2, value, encoded));
      insertAt += 3;
    } else {
      encoded[insertAt] = _UTF8_FIRST_BYTE_OF_FOUR_BASE | (
          _UTF8_FIRST_BYTE_OF_FOUR_MASK &
          _addToEncoding(insertAt, 3, value, encoded));
      insertAt += 4;
    }
  }
  return encoded;
}

117

/**
 * Decodes UTF-8 encoded bytes into a list of Unicode code points.
 *
 * [offset] and [length] select the part of [utf8EncodedBytes] to decode.
 * [replacementCodepoint] is substituted for malformed input; set it to null
 * to get an [ArgumentError] for malformed input instead.
 */
// Because UTF-8 specifies byte order, we do not have to follow the pattern
// used by UTF-16 & UTF-32 regarding byte order.
List<int> utf8ToCodepoints(
    List<int> utf8EncodedBytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf8Decoder decoder = new Utf8Decoder(
      utf8EncodedBytes, offset, length, replacementCodepoint);
  return decoder.decodeRest();
}

126

/**
 * The [Iterable] returned by [decodeUtf8AsIterable] and its variants.
 *
 * Every request for [iterator] creates a new [Utf8Decoder] over the same
 * bytes, and that decoder only translates bytes as its user advances it.
 * Results are not cached, so iterating twice decodes twice.
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class IterableUtf8Decoder extends IterableBase<int> {
  final List<int> bytes;
  final int offset;
  final int length;
  final int replacementCodepoint;

  IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

  Utf8Decoder get iterator {
    return new Utf8Decoder(bytes, offset, length, replacementCodepoint);
  }
}

146

/**
 * An [Iterator] over the Unicode code points encoded in a list of UTF-8
 * bytes.
 *
 * The constructor parameters select an offset into the list of bytes, limit
 * the number of values decoded, and override the code point substituted for
 * malformed input. Set [replacementCodepoint] to null to throw an
 * [ArgumentError] on malformed input rather than substituting for it.
 *
 * This class is an [Iterator], not an [Iterable]; use [IterableUtf8Decoder]
 * where an [Iterable] (e.g. for a for-in loop) is needed.
 */
class Utf8Decoder implements Iterator<int> {
  final _ListRangeIterator utf8EncodedBytesIterator;
  final int replacementCodepoint;
  int _current = null;

  Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length,
      this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf8EncodedBytesIterator =
          (new _ListRange(utf8EncodedBytes, offset, length)).iterator;


  Utf8Decoder._fromListRangeIterator(_ListRange source, [
      this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf8EncodedBytesIterator = source.iterator;

  /**
   * Decodes the remainder of the characters in this decoder
   * into a [List<int>].
   */
  List<int> decodeRest() {
    // Allocate for the iterator's `remaining` count, then trim below if fewer
    // code points were produced (a multi-byte sequence yields one code point).
    List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining);
    int i = 0;
    while (moveNext()) {
      codepoints[i++] = current;
    }
    if (i == codepoints.length) {
      return codepoints;
    } else {
      List<int> truncCodepoints = new List<int>(i);
      truncCodepoints.setRange(0, i, codepoints);
      return truncCodepoints;
    }
  }

  int get current => _current;

  /**
   * Advances to the next code point and returns whether there was one.
   *
   * Malformed input makes [current] the [replacementCodepoint], or throws an
   * [ArgumentError] if [replacementCodepoint] is null. Malformed input is a
   * negative value, a stray continuation byte, a byte that cannot start a
   * sequence, a truncated or overlong sequence, a value in the UTF-16
   * reserved range, or a value above [UNICODE_VALID_RANGE_MAX].
   *
   * When a sequence is cut short by a value that is not a continuation byte,
   * that value is left unconsumed and is decoded by the next call.
   */
  bool moveNext() {
    _current = null;

    if (!utf8EncodedBytesIterator.moveNext()) return false;

    int value = utf8EncodedBytesIterator.current;
    int additionalBytes = 0;

    // Classify the lead byte: either finish immediately (single byte or
    // malformed) or strip the length marker and record how many continuation
    // bytes must follow.
    if (value < 0) {
      return _replaceOrThrow();
    } else if (value <= _UTF8_ONE_BYTE_MAX) {
      _current = value;
      return true;
    } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
      // A continuation byte where a lead byte was expected.
      return _replaceOrThrow();
    } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_TWO_BASE;
      additionalBytes = 1;
    } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_THREE_BASE;
      additionalBytes = 2;
    } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {
      value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE;
      additionalBytes = 3;
    } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) {
      // Five- and six-byte forms are consumed as a unit but never accepted:
      // the overlong check below has no case for them.
      value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE;
      additionalBytes = 4;
    } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) {
      value -= _UTF8_FIRST_BYTE_OF_SIX_BASE;
      additionalBytes = 5;
    } else {
      return _replaceOrThrow();
    }

    // Fold in the continuation bytes, six payload bits each. [j] counts how
    // many were consumed.
    int j = 0;
    while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) {
      int nextValue = utf8EncodedBytesIterator.current;
      if (nextValue > _UTF8_ONE_BYTE_MAX &&
          nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) {
        value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK));
      } else {
        // Not a continuation byte, so the sequence is truncated. Step back so
        // the next call decodes this value on its own; otherwise a valid
        // single-byte character following a truncated sequence would be
        // silently dropped.
        utf8EncodedBytesIterator.backup();
        break;
      }
      j++;
    }
    bool validSequence = (j == additionalBytes && (
        value < UNICODE_UTF16_RESERVED_LO ||
        value > UNICODE_UTF16_RESERVED_HI));
    // Reject encodings that use more bytes than the value needs.
    bool nonOverlong =
        (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||
        (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||
        (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);
    bool inRange = value <= UNICODE_VALID_RANGE_MAX;
    if (validSequence && nonOverlong && inRange) {
      _current = value;
      return true;
    }
    return _replaceOrThrow(j);
  }

  // Handles malformed input: makes [replacementCodepoint] the current value
  // and returns true, or throws an [ArgumentError] if no replacement is
  // configured. [consumed] is subtracted from the iterator position in the
  // error message so it refers to the start of the offending sequence.
  bool _replaceOrThrow([int consumed = 0]) {
    if (replacementCodepoint == null) {
      throw new ArgumentError(
          "Invalid UTF8 at ${utf8EncodedBytesIterator.position - consumed}");
    }
    _current = replacementCodepoint;
    return true;
  }
}

OLD	NEW

« no previous file with comments | « pkg/utf/lib/utf32.dart ('k') | pkg/utf/lib/utf_stream.dart » ('j') | no next file with comments »