pkg/dev_compiler/tool/input_sdk/lib/convert/utf.dart - Issue 2698353003: unfork DDC's copy of most SDK libraries

Side by Side Diff: pkg/dev_compiler/tool/input_sdk/lib/convert/utf.dart

Issue 2698353003: unfork DDC's copy of most SDK libraries (Closed)

Patch Set: revert core_patch Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « pkg/dev_compiler/tool/input_sdk/lib/convert/string_conversion.dart ('k') | pkg/dev_compiler/tool/input_sdk/lib/core/annotations.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	(Empty)
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.

4

5 part of dart.convert;

6

/**
 * The Unicode Replacement character `U+FFFD` (�).
 *
 * The decoder in this file writes this rune in place of invalid or
 * unterminated UTF-8 sequences when malformed input is allowed.
 */
const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD;

/**
 * The Unicode Byte Order Marker (BOM) character `U+FEFF`.
 *
 * The decoder in this file discards this rune when it is the very first
 * character decoded.
 */
const int UNICODE_BOM_CHARACTER_RUNE = 0xFEFF;

12

/**
 * An instance of the default implementation of the [Utf8Codec].
 *
 * This instance provides a convenient access to the most common UTF-8
 * use cases.
 *
 * It is created without arguments, so `allowMalformed` is `false` unless
 * overridden on the individual [Utf8Codec.decode] call.
 *
 * Examples:
 *
 *     var encoded = UTF8.encode("Îñţérñåţîöñåļîžåţîờñ");
 *     var decoded = UTF8.decode([0x62, 0x6c, 0xc3, 0xa5, 0x62, 0xc3, 0xa6,
 *                                0x72, 0x67, 0x72, 0xc3, 0xb8, 0x64]);
 */
const Utf8Codec UTF8 = const Utf8Codec();

26

/**
 * Codec pairing a [Utf8Encoder] (strings to UTF-8 bytes) with a
 * [Utf8Decoder] (UTF-8 bytes to strings).
 */
class Utf8Codec extends Encoding {
  // Codec-wide default for how decoding treats malformed input.
  final bool _allowMalformed;

  /**
   * Creates a UTF-8 codec.
   *
   * [allowMalformed] selects what [decoder] and [decode] do when they meet
   * an invalid or unterminated octet sequence: when `true` (and not
   * overridden on the call to [decode]) the sequence is replaced by the
   * Unicode Replacement character `U+FFFD` (�); otherwise a
   * [FormatException] is thrown.
   */
  const Utf8Codec({bool allowMalformed: false})
      : _allowMalformed = allowMalformed;

  /** The name of this encoding, `"utf-8"`. */
  String get name {
    return "utf-8";
  }

  /**
   * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) into
   * the string they represent.
   *
   * A leading [UNICODE_BOM_CHARACTER_RUNE] in [codeUnits] is dropped.
   *
   * When [allowMalformed] is `true`, invalid or unterminated sequences are
   * replaced by `U+FFFD` (�); when it is `false` a [FormatException] is
   * thrown instead. When it is omitted, the value given to the constructor
   * of this codec is used.
   */
  String decode(List<int> codeUnits, {bool allowMalformed}) {
    // A per-call argument wins over the codec-wide setting.
    final bool tolerant =
        (allowMalformed != null) ? allowMalformed : _allowMalformed;
    final Utf8Decoder converter = new Utf8Decoder(allowMalformed: tolerant);
    return converter.convert(codeUnits);
  }

  /** The (stateless, constant) encoder half of this codec. */
  Utf8Encoder get encoder {
    return const Utf8Encoder();
  }

  /** A decoder configured with this codec's `allowMalformed` setting. */
  Utf8Decoder get decoder => new Utf8Decoder(allowMalformed: _allowMalformed);
}

74

/**
 * Converter from strings to their UTF-8 code units (a list of unsigned
 * 8-bit integers).
 */
class Utf8Encoder extends Converter<String, List<int>> {
  const Utf8Encoder();

  /**
   * Encodes [string] as UTF-8 and returns the resulting bytes.
   *
   * When [start] and [end] are given, only
   * `string.substring(start, end)` is encoded. The range is validated
   * with [RangeError.checkValidRange].
   *
   * A lead surrogate at the very end of the range is encoded on its own
   * as three bytes.
   */
  List<int> convert(String string, [int start = 0, int end]) {
    final int sourceLength = string.length;
    RangeError.checkValidRange(start, end, sourceLength);
    if (end == null) {
      end = sourceLength;
    }
    if (end == start) {
      return new Uint8List(0);
    }
    // Worst case is 3 bytes per code unit (a surrogate pair is 2 units and
    // at most 4 bytes), so this buffer can never overflow.
    final _Utf8Encoder worker =
        new _Utf8Encoder.withBufferSize((end - start) * 3);
    final int stoppedAt = worker._fillBuffer(string, start, end);
    assert(stoppedAt >= end - 1);
    if (stoppedAt != end) {
      // The only code unit _fillBuffer leaves behind is a lead surrogate in
      // the last position; emit it by itself.
      final int danglingLead = string.codeUnitAt(end - 1);
      assert(_isLeadSurrogate(danglingLead));
      // Passing 0 (not a tail surrogate) makes _writeSurrogate write only
      // the lead surrogate.
      final bool paired = worker._writeSurrogate(danglingLead, 0);
      assert(!paired);
    }
    return worker._buffer.sublist(0, worker._bufferIndex);
  }

  /**
   * Begins a chunked conversion that forwards encoded bytes to [sink].
   *
   * A [sink] that is not already a [ByteConversionSink] is wrapped in one;
   * passing a [ByteConversionSink] directly avoids that wrapper.
   */
  StringConversionSink startChunkedConversion(Sink<List<int>> sink) {
    final ByteConversionSink byteSink = (sink is ByteConversionSink)
        ? sink
        : new ByteConversionSink.from(sink);
    return new _Utf8EncoderSink(byteSink);
  }

  // Redeclared only to expose a more precise static type than the base class.
  Stream<List<int>> bind(Stream<String> stream) {
    return super.bind(stream);
  }
}

131

/**
 * Low-level encoder that writes UTF-8 code units (unsigned 8 bit integers)
 * for string input into a fixed-size byte buffer.
 */
// TODO(floitsch): make this class public.
class _Utf8Encoder {
  // Pending lead surrogate held over between chunks; 0 when there is none.
  int _carry = 0;
  // Index of the next free slot in [_buffer].
  int _bufferIndex = 0;
  final List<int> _buffer;

  static const _DEFAULT_BYTE_BUFFER_SIZE = 1024;

  _Utf8Encoder() : this.withBufferSize(_DEFAULT_BYTE_BUFFER_SIZE);

  _Utf8Encoder.withBufferSize(int bufferSize)
      : _buffer = _createBuffer(bufferSize);

  /**
   * Single place where the byte storage is allocated, so an implementation
   * can choose the most efficient representation.
   */
  static List<int> _createBuffer(int size) {
    return new Uint8List(size);
  }

  /**
   * Writes [leadingSurrogate] to [_buffer], merged with [nextCodeUnit] when
   * the latter is a tail surrogate.
   *
   * Returns `true` when the two were combined into one 4-byte sequence.
   * Returns `false` when [nextCodeUnit] is not a tail surrogate; in that
   * case only the lead surrogate has been written (as 3 bytes) and
   * [nextCodeUnit] is still unwritten.
   *
   * Passing 0 as [nextCodeUnit] is a safe way to write just the lead
   * surrogate. The caller is responsible for buffer capacity.
   */
  bool _writeSurrogate(int leadingSurrogate, int nextCodeUnit) {
    if (!_isTailSurrogate(nextCodeUnit)) {
      // TODO(floitsch): allow to throw on malformed strings.
      // A lone half-surrogate is encoded directly. The output is invalid
      // UTF-8, but the input was already invalid UTF-16.
      // A surrogate value always takes 3 bytes.
      _buffer[_bufferIndex++] = 0xE0 | (leadingSurrogate >> 12);
      _buffer[_bufferIndex++] = 0x80 | ((leadingSurrogate >> 6) & 0x3f);
      _buffer[_bufferIndex++] = 0x80 | (leadingSurrogate & 0x3f);
      return false;
    }
    final int codePoint = _combineSurrogatePair(leadingSurrogate, nextCodeUnit);
    // Anything that needs two UTF-16 code units needs 4 bytes in UTF-8.
    assert(codePoint > _THREE_BYTE_LIMIT);
    assert(codePoint <= _FOUR_BYTE_LIMIT);
    _buffer[_bufferIndex++] = 0xF0 | (codePoint >> 18);
    _buffer[_bufferIndex++] = 0x80 | ((codePoint >> 12) & 0x3f);
    _buffer[_bufferIndex++] = 0x80 | ((codePoint >> 6) & 0x3f);
    _buffer[_bufferIndex++] = 0x80 | (codePoint & 0x3f);
    return true;
  }

  /**
   * Encodes code units of [str] from [start] towards [end] into [_buffer]
   * until either the input or the buffer space runs out.
   *
   * A lead surrogate in the last position of the range is never encoded
   * here; the caller has to handle it.
   *
   * Returns the index of the first code unit that was not encoded.
   */
  int _fillBuffer(String str, int start, int end) {
    if (start != end && _isLeadSurrogate(str.codeUnitAt(end - 1))) {
      // Leave a trailing lead surrogate for the caller.
      end--;
    }
    int cursor = start;
    while (cursor < end) {
      final int unit = str.codeUnitAt(cursor);
      final int room = _buffer.length - _bufferIndex;
      if (unit <= _ONE_BYTE_LIMIT) {
        // ASCII is represented identically in UTF-8 and UTF-16.
        if (room < 1) break;
        _buffer[_bufferIndex++] = unit;
      } else if (_isLeadSurrogate(unit)) {
        if (room < 4) break;
        // Reading one past the cursor is safe: `end` was pulled back above
        // if the final code unit was a lead surrogate.
        if (_writeSurrogate(unit, str.codeUnitAt(cursor + 1))) {
          // The tail surrogate was consumed as well.
          cursor++;
        }
      } else if (unit <= _TWO_BYTE_LIMIT) {
        if (room < 2) break;
        _buffer[_bufferIndex++] = 0xC0 | (unit >> 6);
        _buffer[_bufferIndex++] = 0x80 | (unit & 0x3f);
      } else {
        assert(unit <= _THREE_BYTE_LIMIT);
        if (room < 3) break;
        _buffer[_bufferIndex++] = 0xE0 | (unit >> 12);
        _buffer[_bufferIndex++] = 0x80 | ((unit >> 6) & 0x3f);
        _buffer[_bufferIndex++] = 0x80 | (unit & 0x3f);
      }
      cursor++;
    }
    return cursor;
  }
}

236

/**
 * Chunked string-to-UTF-8 encoder: receives string slices and forwards the
 * encoded code units (unsigned 8-bit integers) to a [ByteConversionSink].
 */
class _Utf8EncoderSink extends _Utf8Encoder with StringConversionSinkMixin {
  // Destination for the encoded bytes.
  final ByteConversionSink _sink;

  _Utf8EncoderSink(this._sink);

  /**
   * Closes the sink, first flushing a lead surrogate that is still pending
   * in [_carry], if any.
   */
  void close() {
    if (_carry == 0) {
      _sink.close();
      return;
    }
    // An empty final slice writes out the carry; addSlice then calls close
    // again, at which point the carry is 0.
    addSlice("", 0, 0, true);
  }

  /**
   * Encodes `str.substring(start, end)` and passes the bytes to the
   * underlying sink, one buffer-full at a time.
   *
   * A lead surrogate at the end of the slice is kept in [_carry] so that it
   * can be combined with the first code unit of the next slice. When
   * [isLast] is `true` the sink is closed afterwards.
   */
  void addSlice(String str, int start, int end, bool isLast) {
    _bufferIndex = 0;

    final bool sliceIsEmpty = start == end;
    if (sliceIsEmpty && !isLast) return;

    if (_carry != 0) {
      // An empty slice can only get this far when it is the last one.
      assert(!sliceIsEmpty || isLast);
      final int follower = sliceIsEmpty ? 0 : str.codeUnitAt(start);
      final bool paired = _writeSurrogate(_carry, follower);
      // Combining requires that there was a code unit to combine with.
      assert(!paired || !sliceIsEmpty);
      if (paired) start++;
      _carry = 0;
    }

    do {
      start = _fillBuffer(str, start, end);
      final bool finalChunk = isLast && start == end;
      if (start == end - 1) {
        final int trailing = str.codeUnitAt(start);
        if (_isLeadSurrogate(trailing)) {
          if (isLast && _bufferIndex < _buffer.length - 3) {
            // Enough room remains for the unpaired surrogate. Passing a
            // non-surrogate as the second argument makes _writeSurrogate
            // emit just the surrogate half.
            final bool paired = _writeSurrogate(trailing, 0);
            assert(!paired);
          } else {
            // Keep it for later. When isLast is true, close() flushes it.
            _carry = trailing;
          }
          start++;
        }
      }
      _sink.addSlice(_buffer, 0, _bufferIndex, finalChunk);
      _bufferIndex = 0;
    } while (start < end);

    if (isLast) close();
  }

  // TODO(floitsch): implement asUtf8Sink. Slightly complicated because it
  // needs to deal with malformed input.
}

302

/**
 * Converter from UTF-8 code units (lists of unsigned 8-bit integers) to
 * strings.
 */
class Utf8Decoder extends Converter<List<int>, String> {
  // Whether malformed input is replaced (true) or reported by throwing.
  final bool _allowMalformed;

  /**
   * Creates a UTF-8 decoder.
   *
   * [allowMalformed] selects what [convert] does with invalid or
   * unterminated character sequences: when `true` they are replaced by the
   * Unicode Replacement character `U+FFFD` (�); otherwise a
   * [FormatException] is thrown.
   */
  const Utf8Decoder({bool allowMalformed: false})
      : this._allowMalformed = allowMalformed;

  /**
   * Decodes the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) into
   * the string they represent.
   *
   * Only the code units from [start] up to, but not including, [end] are
   * used. An omitted [end] means `codeUnits.length`.
   *
   * A leading [UNICODE_BOM_CHARACTER_RUNE] in [codeUnits] is dropped.
   */
  String convert(List<int> codeUnits, [int start = 0, int end]) {
    // Give the platform implementation a chance to handle this input with a
    // specialized routine, based on the type of codeUnits. A null result
    // means it declined.
    final String intercepted =
        _convertIntercepted(_allowMalformed, codeUnits, start, end);
    if (intercepted != null) return intercepted;

    final int available = codeUnits.length;
    RangeError.checkValidRange(start, end, available);
    if (end == null) {
      end = available;
    }
    final StringBuffer output = new StringBuffer();
    new _Utf8Decoder(output, _allowMalformed)
      ..convert(codeUnits, start, end)
      ..close();
    return output.toString();
  }

  /**
   * Begins a chunked conversion that forwards decoded text to [sink].
   *
   * A [sink] that is not already a [StringConversionSink] is wrapped in
   * one; passing a [StringConversionSink] directly avoids that wrapper.
   */
  ByteConversionSink startChunkedConversion(Sink<String> sink) {
    final StringConversionSink target = (sink is StringConversionSink)
        ? sink
        : new StringConversionSink.from(sink);
    return target.asUtf8Sink(_allowMalformed);
  }

  // Redeclared only to expose a more precise static type than the base class.
  Stream<String> bind(Stream<List<int>> stream) {
    return super.bind(stream);
  }

  // Implemented by the platform (external).
  external Converter<List<int>, dynamic/*=T*/> fuse/*<T>*/(
      Converter<String, dynamic/*=T*/> next);

  // Platform hook used by [convert]; a null result makes [convert] fall back
  // to the generic decoder.
  external static String _convertIntercepted(
      bool allowMalformed, List<int> codeUnits, int start, int end);
}

376

// UTF-8 constants.
// Each limit is the largest value that fits in an encoding of that many
// bytes.
const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits
const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits
const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits
const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max.

// UTF-16 constants.
// NOTE(review): _SURROGATE_MASK is not referenced in the visible part of this
// file; presumably used elsewhere in the library - confirm before removing.
const int _SURROGATE_MASK = 0xF800;
// Selects the top 6 bits, which distinguish lead from tail surrogates.
const int _SURROGATE_TAG_MASK = 0xFC00;
// Selects the 10 payload bits carried by each surrogate.
const int _SURROGATE_VALUE_MASK = 0x3FF;
// Tag value (after masking with _SURROGATE_TAG_MASK) of a lead surrogate.
const int _LEAD_SURROGATE_MIN = 0xD800;
// Tag value (after masking with _SURROGATE_TAG_MASK) of a tail surrogate.
const int _TAIL_SURROGATE_MIN = 0xDC00;

389

// Whether [codeUnit] carries the lead (high) surrogate tag bits.
bool _isLeadSurrogate(int codeUnit) {
  final int tag = codeUnit & _SURROGATE_TAG_MASK;
  return tag == _LEAD_SURROGATE_MIN;
}

// Whether [codeUnit] carries the tail (low) surrogate tag bits.
bool _isTailSurrogate(int codeUnit) {
  final int tag = codeUnit & _SURROGATE_TAG_MASK;
  return tag == _TAIL_SURROGATE_MIN;
}

// Merges the 10 payload bits of [lead] and of [tail] into one code point
// offset by 0x10000.
int _combineSurrogatePair(int lead, int tail) {
  final int highBits = (lead & _SURROGATE_VALUE_MASK) << 10;
  final int lowBits = tail & _SURROGATE_VALUE_MASK;
  // `+` binds tighter than `|` in Dart, so the original unparenthesized
  // expression groups exactly like this.
  return (0x10000 + highBits) | lowBits;
}

397

/**
 * Decodes UTF-8.
 *
 * The decoder handles chunked input: a multi-byte sequence that is cut off
 * at the end of one [convert] call is remembered and completed by the next
 * call. Decoded text is written to the [StringSink] given at construction.
 */
// TODO(floitsch): make this class public.
class _Utf8Decoder {
  // Whether malformed input is replaced by U+FFFD (true) or reported by
  // throwing a FormatException (false).
  final bool _allowMalformed;
  // Receives the decoded characters.
  final StringSink _stringSink;
  // True until anything has been decoded or written; used to drop a leading
  // BOM only.
  bool _isFirstCharacter = true;
  // Bits accumulated so far for a multi-byte character that spans chunks.
  int _value = 0;
  // Continuation bytes still missing for that character; 0 if none pending.
  int _expectedUnits = 0;
  // Total number of continuation bytes of that character's sequence.
  int _extraUnits = 0;

  _Utf8Decoder(this._stringSink, this._allowMalformed);

  /** Whether a multi-byte sequence is still waiting for more bytes. */
  bool get hasPartialInput => _expectedUnits > 0;

  // Limits of one through four byte encodings.
  // Indexed by `extraUnits - 1` in [convert] to detect overlong encodings.
  static const List<int> _LIMITS = const <int>[
    _ONE_BYTE_LIMIT,
    _TWO_BYTE_LIMIT,
    _THREE_BYTE_LIMIT,
    _FOUR_BYTE_LIMIT
  ];

  /** Finishes decoding; equivalent to [flush]. */
  void close() {
    flush();
  }

  /**
   * Flushes this decoder as if closed.
   *
   * This method throws if the input was partial and the decoder was
   * constructed with `allowMalformed` set to `false`. Otherwise a pending
   * partial sequence is replaced by `U+FFFD` and the pending state is
   * cleared.
   */
  void flush() {
    if (hasPartialInput) {
      if (!_allowMalformed) {
        throw new FormatException("Unfinished UTF-8 octet sequence");
      }
      _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);
      _value = 0;
      _expectedUnits = 0;
      _extraUnits = 0;
    }
  }

  /**
   * Decodes `codeUnits[startIndex..endIndex)` and writes the result to the
   * string sink.
   *
   * A sequence left unfinished by the previous call is continued first. A
   * sequence left unfinished by this call is stored for the next call (or
   * for [flush]).
   *
   * A BOM (`U+FEFF`) is dropped only if it is the very first thing decoded.
   *
   * Throws a [FormatException] on malformed input unless the decoder was
   * created with `allowMalformed` set to `true`, in which case `U+FFFD` is
   * written instead.
   */
  void convert(List<int> codeUnits, int startIndex, int endIndex) {
    // Work on local copies of the pending state; it is stored back at the
    // end only if a sequence is still incomplete.
    int value = _value;
    int expectedUnits = _expectedUnits;
    int extraUnits = _extraUnits;
    _value = 0;
    _expectedUnits = 0;
    _extraUnits = 0;

    // Returns how many consecutive units, starting at [from] and stopping at
    // endIndex, are in the one-byte range 0..0x7f.
    int scanOneByteCharacters(units, int from) {
      final to = endIndex;
      final mask = _ONE_BYTE_LIMIT;
      for (var i = from; i < to; i++) {
        final unit = units[i];
        // Fails for negative units as well as for units above 0x7f.
        if ((unit & mask) != unit) return i - from;
      }
      return to - from;
    }

    // Writes the one-byte characters in `codeUnits[from..to)` in one go.
    void addSingleBytes(int from, int to) {
      assert(from >= startIndex && from <= endIndex);
      assert(to >= startIndex && to <= endIndex);
      _stringSink.write(new String.fromCharCodes(codeUnits, from, to));
    }

    int i = startIndex;
    loop:
    while (true) {
      // Phase 1: consume the continuation bytes of a started sequence.
      multibyte:
      if (expectedUnits > 0) {
        do {
          if (i == endIndex) {
            // Out of input in the middle of a sequence; state is saved below.
            break loop;
          }
          int unit = codeUnits[i];
          if ((unit & 0xC0) != 0x80) {
            // Not a continuation byte: the started sequence is broken.
            expectedUnits = 0;
            if (!_allowMalformed) {
              throw new FormatException(
                  "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
            }
            _isFirstCharacter = false;
            _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);
            // `i` is not advanced, so phase 2 re-examines this unit.
            break multibyte;
          } else {
            value = (value << 6) | (unit & 0x3f);
            expectedUnits--;
            i++;
          }
        } while (expectedUnits > 0);
        if (value <= _LIMITS[extraUnits - 1]) {
          // Overly long encoding. The value could be encoded with a shorter
          // encoding.
          if (!_allowMalformed) {
            throw new FormatException(
                "Overlong encoding of 0x${value.toRadixString(16)}");
          }
          expectedUnits = extraUnits = 0;
          value = UNICODE_REPLACEMENT_CHARACTER_RUNE;
        }
        if (value > _FOUR_BYTE_LIMIT) {
          if (!_allowMalformed) {
            throw new FormatException("Character outside valid Unicode range: "
                "0x${value.toRadixString(16)}");
          }
          value = UNICODE_REPLACEMENT_CHARACTER_RUNE;
        }
        // Drop a BOM only when it is the very first character.
        if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) {
          _stringSink.writeCharCode(value);
        }
        _isFirstCharacter = false;
      }

      // Phase 2: copy runs of one-byte characters and classify lead bytes.
      while (i < endIndex) {
        int oneBytes = scanOneByteCharacters(codeUnits, i);
        if (oneBytes > 0) {
          _isFirstCharacter = false;
          addSingleBytes(i, i + oneBytes);
          i += oneBytes;
          if (i == endIndex) break;
        }
        int unit = codeUnits[i++];
        // TODO(floitsch): the way we test we could potentially allow
        // units that are too large, if they happen to have the
        // right bit-pattern. (Same is true for the multibyte loop above).
        // TODO(floitsch): optimize this loop. See:
        // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80
        if (unit < 0) {
          // TODO(floitsch): should this be unit <= 0 ?
          if (!_allowMalformed) {
            throw new FormatException(
                "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}");
          }
          // Output is produced here, so a BOM that follows is no longer the
          // first character and must not be dropped. The other
          // malformed-input paths clear the flag in the same way.
          _isFirstCharacter = false;
          _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);
        } else {
          assert(unit > _ONE_BYTE_LIMIT);
          // Lead byte 110xxxxx: one continuation byte follows.
          if ((unit & 0xE0) == 0xC0) {
            value = unit & 0x1F;
            expectedUnits = extraUnits = 1;
            continue loop;
          }
          // Lead byte 1110xxxx: two continuation bytes follow.
          if ((unit & 0xF0) == 0xE0) {
            value = unit & 0x0F;
            expectedUnits = extraUnits = 2;
            continue loop;
          }
          // Lead byte 11110xxx: three continuation bytes follow.
          // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.
          if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {
            value = unit & 0x07;
            expectedUnits = extraUnits = 3;
            continue loop;
          }
          // Anything else (stray continuation byte or invalid lead byte).
          if (!_allowMalformed) {
            throw new FormatException(
                "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
          }
          value = UNICODE_REPLACEMENT_CHARACTER_RUNE;
          expectedUnits = extraUnits = 0;
          _isFirstCharacter = false;
          _stringSink.writeCharCode(value);
        }
      }
      break loop;
    }
    if (expectedUnits > 0) {
      // Remember the unfinished sequence for the next chunk or for flush().
      _value = value;
      _expectedUnits = expectedUnits;
      _extraUnits = extraUnits;
    }
  }
}

OLD	NEW