pkg/utf/lib/utf32.dart - Issue 418433003: pkg/utf: fixed layout, added todos, updated docs and homepage pubspec links

Side by Side Diff: pkg/utf/lib/utf32.dart

Issue 418433003: pkg/utf: fixed layout, added todos, updated docs and homepage pubspec links (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 6 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.

4

5 part of utf;

6

/**
 * Returns a lazy iterable over the codepoints encoded in the UTF-32 [bytes].
 *
 * Nothing is decoded up front: every iterator obtained from the result is a
 * fresh [Utf32BytesDecoder] over the same range, so a consumer only converts
 * as much of the input as it actually reads. Decoding starts at [offset] and
 * covers [length] bytes (up to the end of [bytes] when [length] is omitted).
 *
 * The byte order is determined from a leading BOM, which is always stripped.
 * Without a BOM the input is treated as big-endian.
 *
 * Invalid input is replaced by [replacementCodepoint]. Pass `null` as
 * [replacementCodepoint] to have an [ArgumentError] thrown instead.
 */
IterableUtf32Decoder decodeUtf32AsIterable(List<int> bytes, [
    int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Invoked once per `iterator` request on the returned iterable.
  Utf32BytesDecoder createDecoder() {
    return new Utf32BytesDecoder(bytes, offset, length, replacementCodepoint);
  }
  return new IterableUtf32Decoder._(createDecoder);
}

20

/**
 * Returns a lazy iterable over the codepoints encoded in the UTF-32BE [bytes].
 *
 * Nothing is decoded up front: every iterator obtained from the result is a
 * fresh [Utf32beBytesDecoder] over the same range, so a consumer only converts
 * as much of the input as it actually reads.
 *
 * A leading big-endian BOM is stripped by default; set [stripBom] to false to
 * keep it. Invalid input is replaced by [replacementCodepoint]. Pass `null` as
 * [replacementCodepoint] to have an [ArgumentError] thrown instead.
 */
IterableUtf32Decoder decodeUtf32beAsIterable(List<int> bytes, [
    int offset = 0, int length, bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Invoked once per `iterator` request on the returned iterable.
  Utf32BytesDecoder createDecoder() {
    return new Utf32beBytesDecoder(
        bytes, offset, length, stripBom, replacementCodepoint);
  }
  return new IterableUtf32Decoder._(createDecoder);
}

35

/**
 * Returns a lazy iterable over the codepoints encoded in the UTF-32LE [bytes].
 *
 * Nothing is decoded up front: every iterator obtained from the result is a
 * fresh [Utf32leBytesDecoder] over the same range, so a consumer only converts
 * as much of the input as it actually reads.
 *
 * A leading little-endian BOM is stripped by default; set [stripBom] to false
 * to keep it. Invalid input is replaced by [replacementCodepoint]. Pass `null`
 * as [replacementCodepoint] to have an [ArgumentError] thrown instead.
 */
IterableUtf32Decoder decodeUtf32leAsIterable(List<int> bytes, [
    int offset = 0, int length, bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Invoked once per `iterator` request on the returned iterable.
  Utf32BytesDecoder createDecoder() {
    return new Utf32leBytesDecoder(
        bytes, offset, length, stripBom, replacementCodepoint);
  }
  return new IterableUtf32Decoder._(createDecoder);
}

50

/**
 * Produces a String from a sequence of UTF-32 encoded [bytes].
 *
 * Decoding starts at [offset] and covers [length] bytes (up to the end of
 * [bytes] when [length] is omitted). The byte order is taken from a leading
 * BOM, defaulting to big-endian when there is none.
 *
 * Invalid input is replaced by [replacementCodepoint]. Pass `null` as
 * [replacementCodepoint] to have an [ArgumentError] thrown instead.
 */
String decodeUtf32(List<int> bytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf32BytesDecoder decoder =
      new Utf32BytesDecoder(bytes, offset, length, replacementCodepoint);
  List<int> codepoints = decoder.decodeRest();
  return new String.fromCharCodes(codepoints);
}

/**
 * Produces a String from a sequence of UTF-32BE encoded [bytes].
 *
 * Decoding starts at [offset] and covers [length] bytes. A leading big-endian
 * BOM is stripped unless [stripBom] is false.
 *
 * Invalid input is replaced by [replacementCodepoint]. Pass `null` as
 * [replacementCodepoint] to have an [ArgumentError] thrown instead.
 */
String decodeUtf32be(
    List<int> bytes, [int offset = 0, int length, bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf32BytesDecoder decoder = new Utf32beBytesDecoder(
      bytes, offset, length, stripBom, replacementCodepoint);
  List<int> codepoints = decoder.decodeRest();
  return new String.fromCharCodes(codepoints);
}

75

/**
 * Produces a String from a sequence of UTF-32LE encoded [bytes].
 *
 * Decoding starts at [offset] and covers [length] bytes. A leading
 * little-endian BOM is stripped unless [stripBom] is false.
 *
 * Invalid input is replaced by [replacementCodepoint]. Pass `null` as
 * [replacementCodepoint] to have an [ArgumentError] thrown instead.
 */
String decodeUtf32le(
    List<int> bytes, [int offset = 0, int length, bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf32BytesDecoder decoder = new Utf32leBytesDecoder(
      bytes, offset, length, stripBom, replacementCodepoint);
  List<int> codepoints = decoder.decodeRest();
  return new String.fromCharCodes(codepoints);
}

88

/**
 * Produces a list of UTF-32 encoded bytes for [str].
 *
 * The output is big-endian and is always prefixed with a big-endian
 * byte-order marker; this simply delegates to [encodeUtf32be] with the BOM
 * enabled.
 */
List<int> encodeUtf32(String str) {
  return encodeUtf32be(str, true);
}

95

/**
 * Produces a list of UTF-32BE encoded bytes for [str].
 *
 * Each codepoint of [str] becomes four bytes, most significant byte first.
 * By default no BOM is written; pass true for [writeBOM] to prefix the output
 * with the four bytes 0, 0, UNICODE_UTF_BOM_HI, UNICODE_UTF_BOM_LO.
 */
List<int> encodeUtf32be(String str, [bool writeBOM = false]) {
  List<int> codepoints = stringToCodepoints(str);
  int bomLength = writeBOM ? 4 : 0;
  List<int> bytes = new List<int>(bomLength + 4 * codepoints.length);
  int next = 0;
  if (writeBOM) {
    bytes[next++] = 0;
    bytes[next++] = 0;
    bytes[next++] = UNICODE_UTF_BOM_HI;
    bytes[next++] = UNICODE_UTF_BOM_LO;
  }
  for (int codepoint in codepoints) {
    // Emit bits 31..24 first and bits 7..0 last.
    for (int shift = 24; shift >= 0; shift -= 8) {
      bytes[next++] = (codepoint >> shift) & UNICODE_BYTE_ZERO_MASK;
    }
  }
  return bytes;
}

119

/**
 * Produces a list of UTF-32LE encoded bytes for [str].
 *
 * Each codepoint of [str] becomes four bytes, least significant byte first.
 * By default no BOM is written; pass true for [writeBOM] to prefix the output
 * with the four bytes UNICODE_UTF_BOM_LO, UNICODE_UTF_BOM_HI, 0, 0.
 */
List<int> encodeUtf32le(String str, [bool writeBOM = false]) {
  List<int> codepoints = stringToCodepoints(str);
  int bomLength = writeBOM ? 4 : 0;
  List<int> bytes = new List<int>(bomLength + 4 * codepoints.length);
  int next = 0;
  if (writeBOM) {
    bytes[next++] = UNICODE_UTF_BOM_LO;
    bytes[next++] = UNICODE_UTF_BOM_HI;
    bytes[next++] = 0;
    bytes[next++] = 0;
  }
  for (int codepoint in codepoints) {
    // Emit bits 7..0 first and bits 31..24 last.
    for (int shift = 0; shift <= 24; shift += 8) {
      bytes[next++] = (codepoint >> shift) & UNICODE_BYTE_ZERO_MASK;
    }
  }
  return bytes;
}

143

/**
 * Identifies whether the range of [utf32EncodedBytes] that starts at [offset]
 * and spans [length] bytes begins with a UTF-32 byte-order marker (BOM) of
 * either endianness.
 *
 * The big-endian form is tested first via [hasUtf32beBom]; the little-endian
 * form via [hasUtf32leBom] is only tested when that fails.
 */
bool hasUtf32Bom(
    List<int> utf32EncodedBytes, [int offset = 0, int length]) {
  return hasUtf32beBom(utf32EncodedBytes, offset, length) ||
      hasUtf32leBom(utf32EncodedBytes, offset, length);
}

153

/**
 * Identifies whether the range of [utf32EncodedBytes] that starts at [offset]
 * begins with a big-endian UTF-32 byte-order marker (BOM), i.e. the four
 * bytes 0, 0, UNICODE_UTF_BOM_HI, UNICODE_UTF_BOM_LO.
 *
 * The range ends at `offset + length`, or at the end of the list when
 * [length] is omitted. Ranges shorter than four bytes never match. The end of
 * the range is not checked against the actual size of the list.
 */
bool hasUtf32beBom(List<int> utf32EncodedBytes, [int offset = 0, int length]) {
  int end = (length == null) ? utf32EncodedBytes.length : offset + length;
  // Not enough room for a four byte marker.
  if (offset + 4 > end) return false;
  return utf32EncodedBytes[offset] == 0 &&
      utf32EncodedBytes[offset + 1] == 0 &&
      utf32EncodedBytes[offset + 2] == UNICODE_UTF_BOM_HI &&
      utf32EncodedBytes[offset + 3] == UNICODE_UTF_BOM_LO;
}

165

/**
 * Identifies whether the range of [utf32EncodedBytes] that starts at [offset]
 * begins with a little-endian UTF-32 byte-order marker (BOM), i.e. the four
 * bytes UNICODE_UTF_BOM_LO, UNICODE_UTF_BOM_HI, 0, 0.
 *
 * The range ends at `offset + length`, or at the end of the list when
 * [length] is omitted. Ranges shorter than four bytes never match. The end of
 * the range is not checked against the actual size of the list.
 */
bool hasUtf32leBom(List<int> utf32EncodedBytes, [int offset = 0, int length]) {
  int end = (length == null) ? utf32EncodedBytes.length : offset + length;
  // Not enough room for a four byte marker.
  if (offset + 4 > end) return false;
  return utf32EncodedBytes[offset] == UNICODE_UTF_BOM_LO &&
      utf32EncodedBytes[offset + 1] == UNICODE_UTF_BOM_HI &&
      utf32EncodedBytes[offset + 2] == 0 &&
      utf32EncodedBytes[offset + 3] == 0;
}

177

/**
 * Signature of the factory callback that [IterableUtf32Decoder] invokes to
 * obtain a decoder each time an iterator is requested.
 */
typedef Utf32BytesDecoder Utf32BytesDecoderProvider();

/**
 * Return type of [decodeUtf32AsIterable] and variants.
 *
 * The iterable asks [codeunitsProvider] for a new decoder every time
 * [iterator] is read, and that decoder only translates bytes as the consumer
 * advances it. (Note: results are not cached.)
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class IterableUtf32Decoder extends IterableBase<int> {
  /** Creates the decoder that backs each iterator. */
  final Utf32BytesDecoderProvider codeunitsProvider;

  IterableUtf32Decoder._(this.codeunitsProvider);

  /** A decoder freshly obtained from [codeunitsProvider]. */
  Utf32BytesDecoder get iterator {
    return codeunitsProvider();
  }
}

194

/**
 * Abstract parent class that converts UTF-32 encoded bytes to codepoints.
 *
 * It wraps a byte-level [_ListRangeIterator] and exposes the same iterator
 * interface at codepoint granularity (four bytes per step). Subclasses supply
 * the byte order by implementing [decode].
 */
abstract class Utf32BytesDecoder implements _ListRangeIterator {
  /** Byte-level iterator over the encoded input. */
  final _ListRangeIterator utf32EncodedBytesIterator;

  /** Substituted for invalid input; `null` means throw instead. */
  final int replacementCodepoint;

  int _current;

  Utf32BytesDecoder._fromListRangeIterator(
      this.utf32EncodedBytesIterator, this.replacementCodepoint);

  /**
   * Creates a decoder whose byte order is chosen from a leading BOM in
   * [utf32EncodedBytes], falling back to big-endian when there is none.
   *
   * A detected BOM is excluded from the decoded range. [length] defaults to
   * everything from [offset] to the end of the list.
   */
  factory Utf32BytesDecoder(List<int> utf32EncodedBytes, [
      int offset = 0, int length,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
    if (length == null) {
      length = utf32EncodedBytes.length - offset;
    }
    // The BOM is stepped over here, so the concrete decoders are told not to
    // strip it again.
    if (hasUtf32beBom(utf32EncodedBytes, offset, length)) {
      return new Utf32beBytesDecoder(
          utf32EncodedBytes, offset + 4, length - 4, false,
          replacementCodepoint);
    }
    if (hasUtf32leBom(utf32EncodedBytes, offset, length)) {
      return new Utf32leBytesDecoder(
          utf32EncodedBytes, offset + 4, length - 4, false,
          replacementCodepoint);
    }
    return new Utf32beBytesDecoder(
        utf32EncodedBytes, offset, length, false, replacementCodepoint);
  }

  /**
   * Decodes everything that has not been consumed yet and returns the
   * codepoints as a list of [remaining] elements.
   */
  List<int> decodeRest() {
    List<int> codepoints = new List<int>(remaining);
    for (int index = 0; moveNext(); index++) {
      codepoints[index] = current;
    }
    return codepoints;
  }

  /** The codepoint produced by the last successful [moveNext], or null. */
  int get current {
    return _current;
  }

  /**
   * Advances to the next codepoint and returns whether there was one.
   *
   * A trailing group of fewer than four bytes is consumed and treated as
   * invalid. Invalid input yields [replacementCodepoint], or throws an
   * [ArgumentError] when [replacementCodepoint] is null.
   */
  bool moveNext() {
    _current = null;
    int bytesLeft = utf32EncodedBytesIterator.remaining;
    if (bytesLeft == 0) {
      return false;
    }

    int codepoint;
    bool valid = false;
    if (bytesLeft < 4) {
      // Incomplete trailing group: consume it so iteration ends afterwards.
      utf32EncodedBytesIterator.skip(bytesLeft);
    } else {
      codepoint = decode();
      valid = _validCodepoint(codepoint);
    }

    if (!valid) {
      if (replacementCodepoint == null) {
        throw new ArgumentError(
            "Invalid UTF32 at ${utf32EncodedBytesIterator.position}");
      }
      codepoint = replacementCodepoint;
    }
    _current = codepoint;
    return true;
  }

  /** The byte position of the underlying iterator divided by four. */
  int get position {
    return utf32EncodedBytesIterator.position ~/ 4;
  }

  /** Moves back [by] codepoints, i.e. four bytes each. */
  void backup([int by = 1]) {
    utf32EncodedBytesIterator.backup(4 * by);
  }

  /**
   * The number of codepoints still to be produced; a partial trailing group
   * counts as one.
   */
  int get remaining {
    return (utf32EncodedBytesIterator.remaining + 3) ~/ 4;
  }

  /** Skips [count] codepoints, i.e. four bytes each. */
  void skip([int count = 1]) {
    utf32EncodedBytesIterator.skip(4 * count);
  }

  /**
   * Consumes the next four bytes and combines them into a single value in
   * the byte order of the concrete decoder.
   */
  int decode();
}

279

/**
 * Converts UTF-32BE encoded bytes to codepoints by grouping four bytes,
 * most significant first, into one Unicode codepoint.
 */
class Utf32beBytesDecoder extends Utf32BytesDecoder {
  /**
   * Creates a decoder over [length] bytes of [utf32EncodedBytes] starting at
   * [offset]. A leading big-endian BOM is skipped unless [stripBom] is false.
   */
  Utf32beBytesDecoder(List<int> utf32EncodedBytes, [int offset = 0,
      int length, bool stripBom = true,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : super._fromListRangeIterator(
            new _ListRange(utf32EncodedBytes, offset, length).iterator,
            replacementCodepoint) {
    if (stripBom) {
      if (hasUtf32beBom(utf32EncodedBytes, offset, length)) {
        // Step over the four BOM bytes.
        skip();
      }
    }
  }

  /** Reads the next four bytes as one big-endian value. */
  int decode() {
    int value = 0;
    for (int i = 0; i < 4; i++) {
      utf32EncodedBytesIterator.moveNext();
      // Earlier bytes are more significant, so shift them up before adding.
      value = (value << 8) + utf32EncodedBytesIterator.current;
    }
    return value;
  }
}

308

/**
 * Converts UTF-32LE encoded bytes to codepoints by grouping four bytes,
 * least significant first, into one Unicode codepoint.
 */
class Utf32leBytesDecoder extends Utf32BytesDecoder {
  /**
   * Creates a decoder over [length] bytes of [utf32EncodedBytes] starting at
   * [offset]. A leading little-endian BOM is skipped unless [stripBom] is
   * false.
   */
  Utf32leBytesDecoder(List<int> utf32EncodedBytes, [int offset = 0,
      int length, bool stripBom = true,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : super._fromListRangeIterator(
            new _ListRange(utf32EncodedBytes, offset, length).iterator,
            replacementCodepoint) {
    if (stripBom && hasUtf32leBom(utf32EncodedBytes, offset, length)) {
      // Step over the four BOM bytes.
      skip();
    }
  }

  /** Reads the next four bytes as one little-endian value. */
  int decode() {
    int value = 0;
    for (int shift = 0; shift <= 24; shift += 8) {
      utf32EncodedBytesIterator.moveNext();
      // Later bytes are more significant.
      value += (utf32EncodedBytesIterator.current << shift);
    }
    return value;
  }
}

337

/**
 * Returns whether [codepoint] may be emitted by the UTF-32 decoders.
 *
 * A value is accepted when it is non-negative and below
 * UNICODE_UTF16_RESERVED_LO, or above UNICODE_UTF16_RESERVED_HI and below
 * UNICODE_VALID_RANGE_MAX. Everything else (negative values, the reserved
 * range itself, and values at or beyond the maximum) is rejected.
 */
bool _validCodepoint(int codepoint) {
  // NOTE(review): the upper bound is exclusive. If UNICODE_VALID_RANGE_MAX is
  // itself the last valid codepoint (e.g. 0x10FFFF), that codepoint is
  // rejected here; confirm against the constant's definition.
  return (codepoint >= 0 && codepoint < UNICODE_UTF16_RESERVED_LO) ||
      (codepoint > UNICODE_UTF16_RESERVED_HI &&
          codepoint < UNICODE_VALID_RANGE_MAX);
}

OLD	NEW

« no previous file with comments | « pkg/utf/lib/utf16.dart ('k') | pkg/utf/lib/utf8.dart » ('j') | no next file with comments »