packages/utf/lib/src/utf/utf16.dart - Issue 2989763002: Update charted to 0.4.8 and roll

Side by Side Diff: packages/utf/lib/src/utf/utf16.dart

Issue 2989763002: Update charted to 0.4.8 and roll (Closed)

Patch Set: Removed Cutch from list of reviewers Created 3 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.

4

5 part of utf;

6

7 // TODO(jmesserly): would be nice to have this on String (dartbug.com/6501).

/**
 * Returns the Unicode codepoints that make up [str].
 *
 * The string's code units are handed to [utf16CodeUnitsToCodepoints] and the
 * list it produces is returned as-is.
 */
List<int> stringToCodepoints(String str) {
  // String.codeUnits yields 16-bit code units on every Dart implementation,
  // so a conversion pass is needed to obtain codepoints.
  List<int> utf16Units = str.codeUnits;
  return utf16CodeUnitsToCodepoints(utf16Units);
}

16

/**
 * Builds a string from the given Unicode [codepoints].
 *
 * Deprecated: use [String.fromCharCodes] instead. This function simply
 * forwards to that constructor.
 */
@deprecated
String codepointsToString(List<int> codepoints) =>
    new String.fromCharCodes(codepoints);

/**
 * Lazily decodes UTF-16 [bytes], returning an iterable so that the consumer
 * converts only as much of the input as it actually walks.
 *
 * The byte order is taken from a leading BOM when one is present; otherwise
 * big-endian is assumed. A leading BOM is always stripped.
 *
 * Pass null as [replacementCodepoint] to get an ArgumentError for bad input
 * instead of a substituted value. The default [replacementCodepoint] is
 * U+FFFD.
 */
IterableUtf16Decoder decodeUtf16AsIterable(List<int> bytes, [int offset = 0,
    int length, int replacementCodepoint =
    UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Invoked by the iterable every time a new iterator is requested.
  ListRangeIterator makeCodeUnitDecoder() {
    return new Utf16BytesToCodeUnitsDecoder(
        bytes, offset, length, replacementCodepoint);
  }

  return new IterableUtf16Decoder._(makeCodeUnitDecoder, replacementCodepoint);
}

41

/**
 * Lazily decodes UTF-16BE [bytes], returning an iterable so that the consumer
 * converts only as much of the input as it actually walks.
 *
 * A leading BOM is stripped unless [stripBom] is set to false.
 *
 * Pass null as [replacementCodepoint] to get an ArgumentError for bad input
 * instead of a substituted value. The default [replacementCodepoint] is
 * U+FFFD.
 */
IterableUtf16Decoder decodeUtf16beAsIterable(List<int> bytes, [int offset = 0,
    int length, bool stripBom = true, int replacementCodepoint =
    UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Invoked by the iterable every time a new iterator is requested.
  ListRangeIterator makeCodeUnitDecoder() {
    return new Utf16beBytesToCodeUnitsDecoder(
        bytes, offset, length, stripBom, replacementCodepoint);
  }

  return new IterableUtf16Decoder._(makeCodeUnitDecoder, replacementCodepoint);
}

57

/**
 * Lazily decodes UTF-16LE [bytes], returning an iterable so that the consumer
 * converts only as much of the input as it actually walks.
 *
 * A leading BOM is stripped unless [stripBom] is set to false.
 *
 * Pass null as [replacementCodepoint] to get an ArgumentError for bad input
 * instead of a substituted value. The default [replacementCodepoint] is
 * U+FFFD.
 */
IterableUtf16Decoder decodeUtf16leAsIterable(List<int> bytes, [int offset = 0,
    int length, bool stripBom = true, int replacementCodepoint =
    UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Invoked by the iterable every time a new iterator is requested.
  ListRangeIterator makeCodeUnitDecoder() {
    return new Utf16leBytesToCodeUnitsDecoder(
        bytes, offset, length, stripBom, replacementCodepoint);
  }

  return new IterableUtf16Decoder._(makeCodeUnitDecoder, replacementCodepoint);
}

73

/**
 * Decodes a sequence of UTF-16 encoded [bytes] into a String.
 *
 * A leading BOM is always stripped. Pass null as [replacementCodepoint] to
 * get an ArgumentError for bad input instead of a substituted value. The
 * default [replacementCodepoint] is U+FFFD.
 */
String decodeUtf16(List<int> bytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Step 1: bytes -> 16-bit code units.
  List<int> units = (new Utf16BytesToCodeUnitsDecoder(
      bytes, offset, length, replacementCodepoint)).decodeRest();

  // Step 2: code units -> codepoints, over the whole list (offset 0, no
  // explicit length).
  List<int> codepoints =
      utf16CodeUnitsToCodepoints(units, 0, null, replacementCodepoint);

  return new String.fromCharCodes(codepoints);
}

88

/**
 * Decodes a sequence of UTF-16BE encoded [bytes] into a String.
 *
 * A leading BOM is stripped unless [stripBom] is set to false. Pass null as
 * [replacementCodepoint] to get an ArgumentError for bad input instead of a
 * substituted value. The default [replacementCodepoint] is U+FFFD.
 */
String decodeUtf16be(List<int> bytes, [int offset = 0, int length,
    bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Step 1: bytes -> 16-bit code units.
  Utf16beBytesToCodeUnitsDecoder byteDecoder =
      new Utf16beBytesToCodeUnitsDecoder(
          bytes, offset, length, stripBom, replacementCodepoint);
  List<int> units = byteDecoder.decodeRest();

  // Step 2: code units -> codepoints, over the whole list (offset 0, no
  // explicit length).
  List<int> codepoints =
      utf16CodeUnitsToCodepoints(units, 0, null, replacementCodepoint);

  return new String.fromCharCodes(codepoints);
}

104

/**
 * Decodes a sequence of UTF-16LE encoded [bytes] into a String.
 *
 * A leading BOM is stripped unless [stripBom] is set to false. Pass null as
 * [replacementCodepoint] to get an ArgumentError for bad input instead of a
 * substituted value. The default [replacementCodepoint] is U+FFFD.
 */
String decodeUtf16le(List<int> bytes, [int offset = 0, int length,
    bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Step 1: bytes -> 16-bit code units.
  Utf16leBytesToCodeUnitsDecoder byteDecoder =
      new Utf16leBytesToCodeUnitsDecoder(
          bytes, offset, length, stripBom, replacementCodepoint);
  List<int> units = byteDecoder.decodeRest();

  // Step 2: code units -> codepoints, over the whole list (offset 0, no
  // explicit length).
  List<int> codepoints =
      utf16CodeUnitsToCodepoints(units, 0, null, replacementCodepoint);

  return new String.fromCharCodes(codepoints);
}

120

/**
 * Encodes [str] as a list of UTF-16 bytes.
 *
 * The output is produced by [encodeUtf16be] with BOM writing switched on, so
 * it starts with a big-endian byte-order marker.
 */
List<int> encodeUtf16(String str) {
  return encodeUtf16be(str, true);
}

127

/**
 * Encodes [str] as a list of UTF-16BE bytes.
 *
 * No BOM is emitted unless [writeBOM] is true, in which case the two BOM
 * bytes come first (high byte, then low byte).
 */
List<int> encodeUtf16be(String str, [bool writeBOM = false]) {
  List<int> units = _stringToUtf16CodeUnits(str);
  int bomBytes = writeBOM ? 2 : 0;
  // Two output bytes per code unit, plus room for the optional BOM.
  List<int> bytes = new List<int>(2 * units.length + bomBytes);

  if (writeBOM) {
    bytes[0] = UNICODE_UTF_BOM_HI;
    bytes[1] = UNICODE_UTF_BOM_LO;
  }

  for (int k = 0; k < units.length; k++) {
    int unit = units[k];
    int at = bomBytes + 2 * k;
    // Big-endian: the upper byte of the code unit is written first.
    bytes[at] = (unit & UNICODE_BYTE_ONE_MASK) >> 8;
    bytes[at + 1] = unit & UNICODE_BYTE_ZERO_MASK;
  }
  return bytes;
}

147

/**
 * Encodes [str] as a list of UTF-16LE bytes.
 *
 * No BOM is emitted unless [writeBOM] is true, in which case the two BOM
 * bytes come first (low byte, then high byte).
 */
List<int> encodeUtf16le(String str, [bool writeBOM = false]) {
  List<int> units = _stringToUtf16CodeUnits(str);
  int bomBytes = writeBOM ? 2 : 0;
  // Two output bytes per code unit, plus room for the optional BOM.
  List<int> bytes = new List<int>(2 * units.length + bomBytes);

  if (writeBOM) {
    bytes[0] = UNICODE_UTF_BOM_LO;
    bytes[1] = UNICODE_UTF_BOM_HI;
  }

  for (int k = 0; k < units.length; k++) {
    int unit = units[k];
    int at = bomBytes + 2 * k;
    // Little-endian: the lower byte of the code unit is written first.
    bytes[at] = unit & UNICODE_BYTE_ZERO_MASK;
    bytes[at + 1] = (unit & UNICODE_BYTE_ONE_MASK) >> 8;
  }
  return bytes;
}

167

/**
 * Identifies whether a List of bytes starts (based on [offset]) with a
 * UTF-16 byte-order marker (BOM) of either endianness.
 *
 * Returns true when [hasUtf16beBom] or [hasUtf16leBom] reports a match;
 * [offset] and [length] are forwarded to both checks unchanged.
 *
 * Despite its name, [utf32EncodedBytes] is tested against the UTF-16 BOM
 * helpers only.
 */
bool hasUtf16Bom(List<int> utf32EncodedBytes, [int offset = 0, int length]) {
  return hasUtf16beBom(utf32EncodedBytes, offset, length) ||
      hasUtf16leBom(utf32EncodedBytes, offset, length);
}

176

/**
 * Reports whether the byte list carries a big-endian byte-order marker (BOM)
 * at [offset].
 *
 * When [length] is given, the range examined ends at `offset + length`;
 * otherwise it ends at the end of [utf16EncodedBytes]. A range with fewer
 * than two bytes never matches.
 */
bool hasUtf16beBom(List<int> utf16EncodedBytes, [int offset = 0, int length]) {
  int end;
  if (length == null) {
    end = utf16EncodedBytes.length;
  } else {
    end = offset + length;
  }

  // Not enough room for a two-byte marker.
  if (offset + 2 > end) {
    return false;
  }
  if (utf16EncodedBytes[offset] != UNICODE_UTF_BOM_HI) {
    return false;
  }
  return utf16EncodedBytes[offset + 1] == UNICODE_UTF_BOM_LO;
}

187

/**
 * Reports whether the byte list carries a little-endian byte-order marker
 * (BOM) at [offset].
 *
 * When [length] is given, the range examined ends at `offset + length`;
 * otherwise it ends at the end of [utf16EncodedBytes]. A range with fewer
 * than two bytes never matches.
 */
bool hasUtf16leBom(List<int> utf16EncodedBytes, [int offset = 0, int length]) {
  int end;
  if (length == null) {
    end = utf16EncodedBytes.length;
  } else {
    end = offset + length;
  }

  // Not enough room for a two-byte marker.
  if (offset + 2 > end) {
    return false;
  }
  if (utf16EncodedBytes[offset] != UNICODE_UTF_BOM_LO) {
    return false;
  }
  return utf16EncodedBytes[offset + 1] == UNICODE_UTF_BOM_HI;
}

198

/**
 * Passes the code units of [str] through [codepointsToUtf16CodeUnits] and
 * returns the resulting list.
 */
// NOTE(review): str.codeUnits already holds 16-bit code units, yet the callee's
// name says it expects codepoints. Verify how codepointsToUtf16CodeUnits treats
// surrogate values; if it rejects them, characters outside the BMP would be
// mangled here.
List<int> _stringToUtf16CodeUnits(String str) =>
    codepointsToUtf16CodeUnits(str.codeUnits);

202

/**
 * Signature of the zero-argument callback that [IterableUtf16Decoder] invokes
 * to obtain a [ListRangeIterator] of code units.
 */
typedef ListRangeIterator _CodeUnitsProvider();

/**
 * The type returned by [decodeUtf16AsIterable] and its variants.
 *
 * An iterator is created on demand, and that iterator translates bytes only
 * as its user asks for them. (Note: results are not cached.)
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class IterableUtf16Decoder extends IterableBase<int> {
  /** Callback invoked once per [iterator] request to get the code units. */
  final _CodeUnitsProvider codeunitsProvider;

  /** Replacement value handed on to every [Utf16CodeUnitDecoder] created. */
  final int replacementCodepoint;

  IterableUtf16Decoder._(this.codeunitsProvider, this.replacementCodepoint);

  Utf16CodeUnitDecoder get iterator {
    ListRangeIterator codeUnits = codeunitsProvider();
    return new Utf16CodeUnitDecoder.fromListRangeIterator(
        codeUnits, replacementCodepoint);
  }
}

222

/**
 * Converts UTF-16 encoded bytes into UTF-16 code units by grouping 1-2 bytes
 * into each code unit (0-(2^16)-1).
 *
 * The default factory relies on a BOM to pick the endianness and falls back
 * to big-endian when none is found.
 */
abstract class Utf16BytesToCodeUnitsDecoder implements ListRangeIterator {
  // TODO(kevmoo): should this field be private?
  final ListRangeIterator utf16EncodedBytesIterator;
  final int replacementCodepoint;
  int _current = null;

  Utf16BytesToCodeUnitsDecoder._fromListRangeIterator(
      this.utf16EncodedBytesIterator, this.replacementCodepoint);

  /**
   * Creates the decoder matching the BOM found at [offset], or a big-endian
   * decoder when there is no BOM. A detected BOM is excluded from the range
   * given to the concrete decoder.
   */
  factory Utf16BytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [
      int offset = 0, int length,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
    int byteCount =
        length == null ? utf16EncodedBytes.length - offset : length;

    if (hasUtf16beBom(utf16EncodedBytes, offset, byteCount)) {
      // Start past the two BOM bytes; no further stripping is requested.
      return new Utf16beBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2,
          byteCount - 2, false, replacementCodepoint);
    }
    if (hasUtf16leBom(utf16EncodedBytes, offset, byteCount)) {
      return new Utf16leBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2,
          byteCount - 2, false, replacementCodepoint);
    }
    // No BOM: default to big-endian over the full range.
    return new Utf16beBytesToCodeUnitsDecoder(utf16EncodedBytes, offset,
        byteCount, false, replacementCodepoint);
  }

  /**
   * Decodes everything left in the source bytes in one call.
   *
   * Trades memory for speed: the result list is sized from [remaining] up
   * front and copied into an exact-size list only if fewer code units were
   * actually produced.
   */
  List<int> decodeRest() {
    List<int> buffer = new List<int>(remaining);
    int filled = 0;
    for (; moveNext(); filled++) {
      buffer[filled] = current;
    }
    if (filled != buffer.length) {
      List<int> exact = new List<int>(filled);
      exact.setRange(0, filled, buffer);
      return exact;
    }
    return buffer;
  }

  int get current {
    return _current;
  }

  /**
   * Advances to the next code unit, returning false once no bytes are left.
   *
   * A single trailing byte is consumed and reported as
   * [replacementCodepoint], or raises an ArgumentError when
   * [replacementCodepoint] is null.
   */
  bool moveNext() {
    _current = null;
    int bytesLeft = utf16EncodedBytesIterator.remaining;
    if (bytesLeft == 0) {
      return false;
    }
    if (bytesLeft != 1) {
      _current = decode();
      return true;
    }

    // Exactly one byte left: it cannot form a code unit on its own.
    utf16EncodedBytesIterator.moveNext();
    if (replacementCodepoint == null) {
      throw new ArgumentError(
          "Invalid UTF16 at ${utf16EncodedBytesIterator.position}");
    }
    _current = replacementCodepoint;
    return true;
  }

  /** Position in code units: half the byte iterator's position. */
  int get position {
    return utf16EncodedBytesIterator.position ~/ 2;
  }

  /** Moves back [by] code units, i.e. twice as many bytes. */
  void backup([int by = 1]) {
    utf16EncodedBytesIterator.backup(by * 2);
  }

  /** Code units left, counting a trailing odd byte as one more unit. */
  int get remaining {
    return (utf16EncodedBytesIterator.remaining + 1) ~/ 2;
  }

  /** Moves forward [count] code units, i.e. twice as many bytes. */
  void skip([int count = 1]) {
    utf16EncodedBytesIterator.skip(count * 2);
  }

  /** Reads the next code unit from the byte iterator in subclass byte order. */
  int decode();
}

312

/**
 * Converts UTF-16BE encoded bytes into UTF-16 code units by grouping 1-2
 * bytes into each code unit (0-(2^16)-1).
 */
class Utf16beBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder {
  /**
   * Decodes [utf16EncodedBytes] over the range given by [offset] and
   * [length]. When [stripBom] is true and the range starts with a big-endian
   * BOM, that BOM is skipped.
   */
  Utf16beBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [
      int offset = 0, int length, bool stripBom = true,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : super._fromListRangeIterator(
            (new ListRange(utf16EncodedBytes, offset, length)).iterator,
            replacementCodepoint) {
    if (stripBom) {
      if (hasUtf16beBom(utf16EncodedBytes, offset, length)) {
        // One code unit == the two BOM bytes.
        skip();
      }
    }
  }

  /** Reads two bytes, treating the first one as the high-order byte. */
  int decode() {
    ListRangeIterator source = utf16EncodedBytesIterator;
    source.moveNext();
    int highByte = source.current;
    source.moveNext();
    int lowByte = source.current;
    return (highByte << 8) + lowByte;
  }
}

337

/**
 * Converts UTF-16LE encoded bytes into UTF-16 code units by grouping 1-2
 * bytes into each code unit (0-(2^16)-1).
 */
class Utf16leBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder {
  /**
   * Decodes [utf16EncodedBytes] over the range given by [offset] and
   * [length]. When [stripBom] is true and the range starts with a
   * little-endian BOM, that BOM is skipped.
   */
  Utf16leBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [
      int offset = 0, int length, bool stripBom = true,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : super._fromListRangeIterator(
            (new ListRange(utf16EncodedBytes, offset, length)).iterator,
            replacementCodepoint) {
    if (stripBom) {
      if (hasUtf16leBom(utf16EncodedBytes, offset, length)) {
        // One code unit == the two BOM bytes.
        skip();
      }
    }
  }

  /** Reads two bytes, treating the first one as the low-order byte. */
  int decode() {
    ListRangeIterator source = utf16EncodedBytesIterator;
    source.moveNext();
    int lowByte = source.current;
    source.moveNext();
    int highByte = source.current;
    return (highByte << 8) + lowByte;
  }
}

OLD	NEW

« no previous file with comments | « packages/utf/lib/src/shared.dart ('k') | packages/utf/lib/src/utf/utf32.dart » ('j') | no next file with comments »