pkg/utf/lib/utf16.dart - Issue 418433003: pkg/utf: fixed layout, added todos, updated docs and homepage pubspec links

Side by Side Diff: pkg/utf/lib/utf16.dart

Issue 418433003: pkg/utf: fixed layout, added todos, updated docs and homepage pubspec links (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 6 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.

4

5 part of utf;

6

// TODO(jmesserly): would be nice to have this on String (dartbug.com/6501).
/**
 * Returns the Unicode codepoints of [str].
 *
 * The string's UTF-16 code units are decoded with
 * [_utf16CodeUnitsToCodepoints] using its default offset, length and
 * replacement codepoint.
 */
List<int> stringToCodepoints(String str) {
  // String.codeUnits exposes 16-bit code units on every Dart implementation,
  // so they still have to be decoded into codepoints.
  List<int> codeUnits = str.codeUnits;
  return _utf16CodeUnitsToCodepoints(codeUnits);
}

16

/**
 * Builds a string from the given Unicode [codepoints].
 *
 * Deprecated: use [String.fromCharCodes] instead; this function only
 * forwards to it.
 */
String codepointsToString(List<int> codepoints) =>
    new String.fromCharCodes(codepoints);

25

/**
 * An [Iterator] of Unicode codepoints layered over an iterator of UTF-16
 * code units.
 *
 * A lead surrogate followed by a trail surrogate is merged into a single
 * codepoint. Ill-formed input (a negative code unit, a value above the
 * 16-bit plane, or an unpaired surrogate) is reported as
 * [replacementCodepoint]. Set [replacementCodepoint] to null to throw an
 * [ArgumentError] instead of replacing the bad value.
 */
class Utf16CodeUnitDecoder implements Iterator<int> {
  /** Source of the UTF-16 code units being decoded. */
  final _ListRangeIterator utf16CodeUnitIterator;

  /** Codepoint substituted for ill-formed input; null means "throw". */
  final int replacementCodepoint;

  int _current = null;

  /**
   * Creates a decoder over the range of [utf16CodeUnits] described by
   * [offset] and [length]; the range itself is resolved by [_ListRange].
   */
  Utf16CodeUnitDecoder(List<int> utf16CodeUnits, [int offset = 0, int length,
      int this.replacementCodepoint =
      UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      utf16CodeUnitIterator =
          (new _ListRange(utf16CodeUnits, offset, length)).iterator;

  /** Creates a decoder that pulls code units from an existing iterator. */
  Utf16CodeUnitDecoder.fromListRangeIterator(
      _ListRangeIterator this.utf16CodeUnitIterator,
      int this.replacementCodepoint);

  Iterator<int> get iterator => this;

  /** The codepoint produced by the last successful [moveNext], else null. */
  int get current => _current;

  /**
   * Decodes the next codepoint and returns true, or returns false when the
   * code units are exhausted.
   *
   * Throws an [ArgumentError] on ill-formed input when
   * [replacementCodepoint] is null.
   */
  bool moveNext() {
    _current = null;
    if (!utf16CodeUnitIterator.moveNext()) return false;

    int value = utf16CodeUnitIterator.current;
    if (value < 0) {
      _current = _replacementOrThrow();
    } else if (value < UNICODE_UTF16_RESERVED_LO ||
        (value > UNICODE_UTF16_RESERVED_HI && value <= UNICODE_PLANE_ONE_MAX)) {
      // Outside the reserved (surrogate) range: transfer directly.
      _current = value;
    } else if (value < UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&
        utf16CodeUnitIterator.moveNext()) {
      // Lead surrogate with at least one more code unit available.
      int nextValue = utf16CodeUnitIterator.current;
      if (nextValue >= UNICODE_UTF16_SURROGATE_UNIT_1_BASE &&
          nextValue <= UNICODE_UTF16_RESERVED_HI) {
        // Well-formed pair: merge both halves into one codepoint.
        _current = ((value - UNICODE_UTF16_SURROGATE_UNIT_0_BASE) << 10) +
            UNICODE_UTF16_OFFSET +
            (nextValue - UNICODE_UTF16_SURROGATE_UNIT_1_BASE);
      } else {
        // Unpaired lead surrogate. Only the lead itself is ill-formed, so
        // hand the following unit back to the source; it is decoded by the
        // next call instead of being silently dropped.
        utf16CodeUnitIterator.backup();
        _current = _replacementOrThrow();
      }
    } else {
      // Lone trail surrogate, lead surrogate at the end of the input, or a
      // value above the 16-bit range.
      _current = _replacementOrThrow();
    }
    return true;
  }

  /**
   * Returns [replacementCodepoint], or throws an [ArgumentError] naming the
   * source iterator's current position when no replacement is configured.
   */
  int _replacementOrThrow() {
    if (replacementCodepoint != null) return replacementCodepoint;
    throw new ArgumentError(
        "Invalid UTF16 at ${utf16CodeUnitIterator.position}");
  }
}

98

/**
 * Encodes [codepoints] as a list of UTF-16 code units.
 *
 * Only the range of [codepoints] described by [offset] and [length] (as
 * resolved by [_ListRange]) is encoded. A codepoint that can be encoded
 * neither as a single code unit nor as a surrogate pair is written as
 * [replacementCodepoint]; if [replacementCodepoint] is null an
 * [ArgumentError] is thrown instead.
 */
List<int> _codepointsToUtf16CodeUnits(
    List<int> codepoints,
    [int offset = 0,
    int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // True when the codepoint is stored as exactly one code unit.
  bool fitsSingleUnit(int cp) =>
      (cp >= 0 && cp < UNICODE_UTF16_RESERVED_LO) ||
      (cp > UNICODE_UTF16_RESERVED_HI && cp <= UNICODE_PLANE_ONE_MAX);

  // True when the codepoint has to be split into a surrogate pair.
  bool needsSurrogatePair(int cp) =>
      cp > UNICODE_PLANE_ONE_MAX && cp <= UNICODE_VALID_RANGE_MAX;

  _ListRange range = new _ListRange(codepoints, offset, length);

  // First pass: size the output. Invalid codepoints occupy one slot, the
  // one their replacement is written to.
  int unitCount = 0;
  for (int cp in range) {
    unitCount += (!fitsSingleUnit(cp) && needsSurrogatePair(cp)) ? 2 : 1;
  }

  // Second pass: emit the code units.
  List<int> units = new List<int>(unitCount);
  int next = 0;
  for (int cp in range) {
    if (fitsSingleUnit(cp)) {
      units[next++] = cp;
      continue;
    }
    if (needsSurrogatePair(cp)) {
      int base = cp - UNICODE_UTF16_OFFSET;
      units[next++] = UNICODE_UTF16_SURROGATE_UNIT_0_BASE +
          ((base & UNICODE_UTF16_HI_MASK) >> 10);
      units[next++] = UNICODE_UTF16_SURROGATE_UNIT_1_BASE +
          (base & UNICODE_UTF16_LO_MASK);
      continue;
    }
    if (replacementCodepoint == null) {
      throw new ArgumentError("Invalid encoding");
    }
    units[next++] = replacementCodepoint;
  }
  return units;
}

143

/**
 * Decodes UTF-16 code units into a list of Unicode codepoints.
 *
 * Decoding is delegated to [Utf16CodeUnitDecoder], which receives
 * [replacementCodepoint] unchanged. The returned list has exactly one
 * entry per decoded codepoint.
 */
List<int> _utf16CodeUnitsToCodepoints(
    List<int> utf16CodeUnits, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  _ListRangeIterator units =
      (new _ListRange(utf16CodeUnits, offset, length)).iterator;
  Utf16CodeUnitDecoder decoder =
      new Utf16CodeUnitDecoder.fromListRangeIterator(
          units, replacementCodepoint);

  // Allocate one slot per remaining code unit; merged surrogate pairs leave
  // some of those slots unused.
  List<int> buffer = new List<int>(units.remaining);
  int count = 0;
  while (decoder.moveNext()) {
    buffer[count] = decoder.current;
    count++;
  }
  if (count == buffer.length) return buffer;

  // Fewer codepoints than code units: copy into a list of the exact size.
  List<int> exact = new List<int>(count);
  exact.setRange(0, count, buffer);
  return exact;
}

167

/**
 * Decodes UTF-16 [bytes] lazily, as an iterable of codepoints, so a consumer
 * converts only as much of the input as it iterates over.
 *
 * The byte order is taken from a leading BOM, defaulting to big-endian when
 * there is none; a leading BOM is always stripped. Set
 * [replacementCodepoint] to null to throw an ArgumentError rather than
 * replace a bad value. The default [replacementCodepoint] is U+FFFD.
 */
IterableUtf16Decoder decodeUtf16AsIterable(List<int> bytes, [int offset = 0,
    int length, int replacementCodepoint =
    UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Invoked each time the iterable is asked for an iterator.
  _ListRangeIterator provideCodeUnits() {
    return new Utf16BytesToCodeUnitsDecoder(
        bytes, offset, length, replacementCodepoint);
  }

  return new IterableUtf16Decoder._(provideCodeUnits, replacementCodepoint);
}

183

/**
 * Decodes UTF-16BE [bytes] lazily, as an iterable of codepoints, so a
 * consumer converts only as much of the input as it iterates over.
 *
 * A leading BOM is stripped unless [stripBom] is false. Set
 * [replacementCodepoint] to null to throw an ArgumentError rather than
 * replace a bad value. The default [replacementCodepoint] is U+FFFD.
 */
IterableUtf16Decoder decodeUtf16beAsIterable(List<int> bytes, [int offset = 0,
    int length, bool stripBom = true, int replacementCodepoint =
    UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Invoked each time the iterable is asked for an iterator.
  _ListRangeIterator provideCodeUnits() {
    return new Utf16beBytesToCodeUnitsDecoder(
        bytes, offset, length, stripBom, replacementCodepoint);
  }

  return new IterableUtf16Decoder._(provideCodeUnits, replacementCodepoint);
}

199

/**
 * Decodes UTF-16LE [bytes] lazily, as an iterable of codepoints, so a
 * consumer converts only as much of the input as it iterates over.
 *
 * A leading BOM is stripped unless [stripBom] is false. Set
 * [replacementCodepoint] to null to throw an ArgumentError rather than
 * replace a bad value. The default [replacementCodepoint] is U+FFFD.
 */
IterableUtf16Decoder decodeUtf16leAsIterable(List<int> bytes, [int offset = 0,
    int length, bool stripBom = true, int replacementCodepoint =
    UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Invoked each time the iterable is asked for an iterator.
  _ListRangeIterator provideCodeUnits() {
    return new Utf16leBytesToCodeUnitsDecoder(
        bytes, offset, length, stripBom, replacementCodepoint);
  }

  return new IterableUtf16Decoder._(provideCodeUnits, replacementCodepoint);
}

215

/**
 * Returns the String encoded by a sequence of UTF-16 [bytes].
 *
 * The byte order comes from a leading BOM (big-endian if there is none) and
 * the BOM is always stripped. Set [replacementCodepoint] to null to throw an
 * ArgumentError rather than replace a bad value. The default
 * [replacementCodepoint] is U+FFFD.
 */
String decodeUtf16(List<int> bytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Step 1: bytes -> 16-bit code units.
  List<int> codeUnits = (new Utf16BytesToCodeUnitsDecoder(
      bytes, offset, length, replacementCodepoint)).decodeRest();
  // Step 2: code units -> codepoints (merging surrogate pairs).
  List<int> codepoints =
      _utf16CodeUnitsToCodepoints(codeUnits, 0, null, replacementCodepoint);
  return new String.fromCharCodes(codepoints);
}

230

/**
 * Returns the String encoded by a sequence of UTF-16BE [bytes].
 *
 * A leading BOM is stripped unless [stripBom] is false. Set
 * [replacementCodepoint] to null to throw an ArgumentError rather than
 * replace a bad value. The default [replacementCodepoint] is U+FFFD.
 */
String decodeUtf16be(List<int> bytes, [int offset = 0, int length,
    bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf16beBytesToCodeUnitsDecoder byteDecoder =
      new Utf16beBytesToCodeUnitsDecoder(
          bytes, offset, length, stripBom, replacementCodepoint);
  // Step 1: bytes -> 16-bit code units.
  List<int> codeUnits = byteDecoder.decodeRest();
  // Step 2: code units -> codepoints (merging surrogate pairs).
  List<int> codepoints =
      _utf16CodeUnitsToCodepoints(codeUnits, 0, null, replacementCodepoint);
  return new String.fromCharCodes(codepoints);
}

246

/**
 * Returns the String encoded by a sequence of UTF-16LE [bytes].
 *
 * A leading BOM is stripped unless [stripBom] is false. Set
 * [replacementCodepoint] to null to throw an ArgumentError rather than
 * replace a bad value. The default [replacementCodepoint] is U+FFFD.
 */
String decodeUtf16le(List<int> bytes, [int offset = 0, int length,
    bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf16leBytesToCodeUnitsDecoder byteDecoder =
      new Utf16leBytesToCodeUnitsDecoder(
          bytes, offset, length, stripBom, replacementCodepoint);
  // Step 1: bytes -> 16-bit code units.
  List<int> codeUnits = byteDecoder.decodeRest();
  // Step 2: code units -> codepoints (merging surrogate pairs).
  List<int> codepoints =
      _utf16CodeUnitsToCodepoints(codeUnits, 0, null, replacementCodepoint);
  return new String.fromCharCodes(codepoints);
}

262

/**
 * Returns [str] as a list of UTF-16 encoded bytes.
 *
 * The output is big-endian and is prefixed with a big-endian
 * byte-order marker.
 */
List<int> encodeUtf16(String str) {
  return encodeUtf16be(str, true);
}

269

/**
 * Returns [str] as a list of UTF-16BE encoded bytes.
 *
 * No BOM is written unless [writeBOM] is true, in which case the result
 * starts with the big-endian byte-order marker.
 */
List<int> encodeUtf16be(String str, [bool writeBOM = false]) {
  List<int> codeUnits = _stringToUtf16CodeUnits(str);
  int bomLength = writeBOM ? 2 : 0;
  List<int> bytes = new List<int>(bomLength + 2 * codeUnits.length);
  if (writeBOM) {
    bytes[0] = UNICODE_UTF_BOM_HI;
    bytes[1] = UNICODE_UTF_BOM_LO;
  }
  int out = bomLength;
  for (int k = 0; k < codeUnits.length; k++) {
    int unit = codeUnits[k];
    // Big-endian: high byte first, then low byte.
    bytes[out] = (unit & UNICODE_BYTE_ONE_MASK) >> 8;
    bytes[out + 1] = unit & UNICODE_BYTE_ZERO_MASK;
    out += 2;
  }
  return bytes;
}

289

/**
 * Returns [str] as a list of UTF-16LE encoded bytes.
 *
 * No BOM is written unless [writeBOM] is true, in which case the result
 * starts with the little-endian byte-order marker.
 */
List<int> encodeUtf16le(String str, [bool writeBOM = false]) {
  List<int> codeUnits = _stringToUtf16CodeUnits(str);
  int bomLength = writeBOM ? 2 : 0;
  List<int> bytes = new List<int>(bomLength + 2 * codeUnits.length);
  if (writeBOM) {
    bytes[0] = UNICODE_UTF_BOM_LO;
    bytes[1] = UNICODE_UTF_BOM_HI;
  }
  int out = bomLength;
  for (int k = 0; k < codeUnits.length; k++) {
    int unit = codeUnits[k];
    // Little-endian: low byte first, then high byte.
    bytes[out] = unit & UNICODE_BYTE_ZERO_MASK;
    bytes[out + 1] = (unit & UNICODE_BYTE_ONE_MASK) >> 8;
    out += 2;
  }
  return bytes;
}

309

/**
 * Returns whether the bytes starting at [offset] begin with a UTF-16
 * byte-order marker (BOM) of either endianness.
 *
 * Despite the parameter's name, [utf32EncodedBytes] is checked for UTF-16
 * BOMs via [hasUtf16beBom] and [hasUtf16leBom].
 */
bool hasUtf16Bom(List<int> utf32EncodedBytes, [int offset = 0, int length]) {
  if (hasUtf16beBom(utf32EncodedBytes, offset, length)) return true;
  return hasUtf16leBom(utf32EncodedBytes, offset, length);
}

318

/**
 * Returns whether the bytes starting at [offset] begin with a big-endian
 * byte-order marker (BOM).
 *
 * When [length] is given, the BOM must lie entirely within
 * `offset + length`; otherwise the end of [utf16EncodedBytes] is the limit.
 */
bool hasUtf16beBom(List<int> utf16EncodedBytes, [int offset = 0, int length]) {
  int end = (length == null) ? utf16EncodedBytes.length : offset + length;
  // Fewer than two bytes available: there is no room for a BOM.
  if (offset + 2 > end) return false;
  return utf16EncodedBytes[offset] == UNICODE_UTF_BOM_HI &&
      utf16EncodedBytes[offset + 1] == UNICODE_UTF_BOM_LO;
}

329

/**
 * Returns whether the bytes starting at [offset] begin with a little-endian
 * byte-order marker (BOM).
 *
 * When [length] is given, the BOM must lie entirely within
 * `offset + length`; otherwise the end of [utf16EncodedBytes] is the limit.
 */
bool hasUtf16leBom(List<int> utf16EncodedBytes, [int offset = 0, int length]) {
  int end = (length == null) ? utf16EncodedBytes.length : offset + length;
  // Fewer than two bytes available: there is no room for a BOM.
  if (offset + 2 > end) return false;
  return utf16EncodedBytes[offset] == UNICODE_UTF_BOM_LO &&
      utf16EncodedBytes[offset + 1] == UNICODE_UTF_BOM_HI;
}

340

/**
 * Returns the UTF-16 code units that encode [str].
 */
List<int> _stringToUtf16CodeUnits(String str) {
  // _codepointsToUtf16CodeUnits expects codepoints. Feeding it str.codeUnits
  // would present every surrogate half as a codepoint in the reserved range,
  // which the encoder replaces, so characters outside the 16-bit plane would
  // be lost. String.runes yields whole codepoints, which the encoder splits
  // back into proper surrogate pairs.
  return _codepointsToUtf16CodeUnits(str.runes.toList());
}

344

/** Signature of a callback returning the code-unit iterator to decode. */
typedef _ListRangeIterator _CodeUnitsProvider();

/**
 * Return type of [decodeUtf16AsIterable] and its variants.
 *
 * An iterator is created on demand by calling [codeunitsProvider], and that
 * iterator only translates input as the consumer advances it. Results are
 * not cached.
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class IterableUtf16Decoder extends IterableBase<int> {
  /** Called once per [iterator] request to obtain the code units. */
  final _CodeUnitsProvider codeunitsProvider;

  /** Forwarded unchanged to every [Utf16CodeUnitDecoder] that is created. */
  final int replacementCodepoint;

  IterableUtf16Decoder._(this.codeunitsProvider, this.replacementCodepoint);

  Utf16CodeUnitDecoder get iterator {
    _ListRangeIterator codeUnits = codeunitsProvider();
    return new Utf16CodeUnitDecoder.fromListRangeIterator(
        codeUnits, replacementCodepoint);
  }
}

364

/**
 * Converts UTF-16 encoded bytes to UTF-16 code units by grouping 1-2 bytes
 * to produce each code unit (0-(2^16)-1).
 *
 * The unnamed factory relies on a BOM to determine endianness and defaults
 * to big-endian. Subclasses supply the byte order through [decode].
 */
abstract class Utf16BytesToCodeUnitsDecoder implements _ListRangeIterator {
  /** Iterator over the raw encoded bytes. */
  final _ListRangeIterator utf16EncodedBytesIterator;

  /** Value reported for a dangling trailing byte; null means "throw". */
  final int replacementCodepoint;

  int _current = null;

  Utf16BytesToCodeUnitsDecoder._fromListRangeIterator(
      this.utf16EncodedBytesIterator, this.replacementCodepoint);

  /**
   * Picks the big- or little-endian decoder according to the BOM found at
   * [offset], and skips that BOM. Without a BOM the input is treated as
   * big-endian.
   */
  factory Utf16BytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [
      int offset = 0, int length,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
    int byteCount =
        (length != null) ? length : utf16EncodedBytes.length - offset;

    if (hasUtf16beBom(utf16EncodedBytes, offset, byteCount)) {
      // Start past the two BOM bytes, so the subclass need not strip again.
      return new Utf16beBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2,
          byteCount - 2, false, replacementCodepoint);
    }
    if (hasUtf16leBom(utf16EncodedBytes, offset, byteCount)) {
      return new Utf16leBytesToCodeUnitsDecoder(utf16EncodedBytes, offset + 2,
          byteCount - 2, false, replacementCodepoint);
    }
    return new Utf16beBytesToCodeUnitsDecoder(utf16EncodedBytes, offset,
        byteCount, false, replacementCodepoint);
  }

  /**
   * Provides a fast way to decode the rest of the source bytes in a single
   * call. This trades memory for speed: the result buffer is sized up front
   * from [remaining] and is copied to an exact-size list only if fewer code
   * units were produced.
   */
  List<int> decodeRest() {
    List<int> buffer = new List<int>(remaining);
    int count = 0;
    while (moveNext()) {
      buffer[count] = current;
      count++;
    }
    if (count == buffer.length) return buffer;

    List<int> exact = new List<int>(count);
    exact.setRange(0, count, buffer);
    return exact;
  }

  /** The code unit produced by the last successful [moveNext], else null. */
  int get current {
    return _current;
  }

  /**
   * Decodes the next code unit. Returns false once no bytes are left.
   *
   * A single left-over byte is consumed and reported as
   * [replacementCodepoint], or as an [ArgumentError] when that is null.
   */
  bool moveNext() {
    _current = null;
    int bytesLeft = utf16EncodedBytesIterator.remaining;
    if (bytesLeft == 0) return false;

    if (bytesLeft != 1) {
      _current = decode();
      return true;
    }

    // One dangling byte cannot form a code unit: consume it, then either
    // substitute the replacement or fail.
    utf16EncodedBytesIterator.moveNext();
    if (replacementCodepoint == null) {
      throw new ArgumentError(
          "Invalid UTF16 at ${utf16EncodedBytesIterator.position}");
    }
    _current = replacementCodepoint;
    return true;
  }

  /** Position in code units: half the byte iterator's position. */
  int get position {
    return utf16EncodedBytesIterator.position ~/ 2;
  }

  /** Moves back [by] code units, i.e. twice as many bytes. */
  void backup([int by = 1]) {
    utf16EncodedBytesIterator.backup(2 * by);
  }

  /** Code units left, counting a dangling trailing byte as one unit. */
  int get remaining {
    return (utf16EncodedBytesIterator.remaining + 1) ~/ 2;
  }

  /** Skips [count] code units, i.e. twice as many bytes. */
  void skip([int count = 1]) {
    utf16EncodedBytesIterator.skip(2 * count);
  }

  /** Reads the next two bytes and combines them in the subclass's order. */
  int decode();
}

453

/**
 * Converts UTF-16BE encoded bytes to UTF-16 code units by grouping 1-2 bytes
 * to produce each code unit (0-(2^16)-1).
 */
class Utf16beBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder {
  /**
   * Creates a decoder over the given byte range. When [stripBom] is true and
   * the range starts with a big-endian BOM, the BOM is skipped.
   */
  Utf16beBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [
      int offset = 0, int length, bool stripBom = true,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      super._fromListRangeIterator(
          (new _ListRange(utf16EncodedBytes, offset, length)).iterator,
          replacementCodepoint) {
    if (stripBom) {
      bool startsWithBom = hasUtf16beBom(utf16EncodedBytes, offset, length);
      if (startsWithBom) skip();
    }
  }

  /** Reads two bytes, high byte first, and joins them into a code unit. */
  int decode() {
    _ListRangeIterator bytes = utf16EncodedBytesIterator;
    bytes.moveNext();
    int highByte = bytes.current;
    bytes.moveNext();
    int lowByte = bytes.current;
    return (highByte << 8) + lowByte;
  }
}

478

/**
 * Converts UTF-16LE encoded bytes to UTF-16 code units by grouping 1-2 bytes
 * to produce each code unit (0-(2^16)-1).
 */
class Utf16leBytesToCodeUnitsDecoder extends Utf16BytesToCodeUnitsDecoder {
  /**
   * Creates a decoder over the given byte range. When [stripBom] is true and
   * the range starts with a little-endian BOM, the BOM is skipped.
   */
  Utf16leBytesToCodeUnitsDecoder(List<int> utf16EncodedBytes, [
      int offset = 0, int length, bool stripBom = true,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      super._fromListRangeIterator(
          (new _ListRange(utf16EncodedBytes, offset, length)).iterator,
          replacementCodepoint) {
    if (stripBom) {
      bool startsWithBom = hasUtf16leBom(utf16EncodedBytes, offset, length);
      if (startsWithBom) skip();
    }
  }

  /** Reads two bytes, low byte first, and joins them into a code unit. */
  int decode() {
    _ListRangeIterator bytes = utf16EncodedBytesIterator;
    bytes.moveNext();
    int lowByte = bytes.current;
    bytes.moveNext();
    int highByte = bytes.current;
    return (highByte << 8) + lowByte;
  }
}

OLD	NEW

« no previous file with comments | « pkg/utf/lib/utf.dart ('k') | pkg/utf/lib/utf32.dart » ('j') | no next file with comments »