packages/utf/lib/src/utf8.dart - Issue 2989763002: Update charted to 0.4.8 and roll

Side by Side Diff: packages/utf/lib/src/utf8.dart

Issue 2989763002: Update charted to 0.4.8 and roll (Closed)

Patch Set: Removed Cutch from list of reviewers Created 3 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 part of utf;	5 library utf.utf8;

	6

	7 import "dart:collection";

	8

	9 import 'constants.dart';

	10 import 'list_range.dart';

	11 import 'shared.dart';

6	12

7 const int _UTF8_ONE_BYTE_MAX = 0x7f;	13 const int _UTF8_ONE_BYTE_MAX = 0x7f;

8 const int _UTF8_TWO_BYTE_MAX = 0x7ff;	14 const int _UTF8_TWO_BYTE_MAX = 0x7ff;

9 const int _UTF8_THREE_BYTE_MAX = 0xffff;	15 const int _UTF8_THREE_BYTE_MAX = 0xffff;

10	16

11 const int _UTF8_LO_SIX_BIT_MASK = 0x3f;	17 const int _UTF8_LO_SIX_BIT_MASK = 0x3f;

12	18

13 const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xc0;	19 const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xc0;

14 const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xe0;	20 const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xe0;

15 const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xf0;	21 const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xf0;

16 const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xf8;	22 const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xf8;

17 const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xfc;	23 const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xfc;

18	24

19 const int _UTF8_FIRST_BYTE_OF_TWO_MASK = 0x1f;	25 const int _UTF8_FIRST_BYTE_OF_TWO_MASK = 0x1f;

20 const int _UTF8_FIRST_BYTE_OF_THREE_MASK = 0xf;	26 const int _UTF8_FIRST_BYTE_OF_THREE_MASK = 0xf;

21 const int _UTF8_FIRST_BYTE_OF_FOUR_MASK = 0x7;	27 const int _UTF8_FIRST_BYTE_OF_FOUR_MASK = 0x7;

22	28

23 const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xfe;	29 const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xfe;

24 const int _UTF8_SUBSEQUENT_BYTE_BASE = 0x80;	30 const int _UTF8_SUBSEQUENT_BYTE_BASE = 0x80;

25	31

26 /**	32 /**

27 * Decodes the UTF-8 bytes as an iterable. Thus, the consumer can only convert	33 * Decodes the UTF-8 bytes as an iterable. Thus, the consumer can only convert

28 * as much of the input as needed. Set the replacementCodepoint to null to	34 * as much of the input as needed. Set the replacementCodepoint to null to

29 * throw an ArgumentError rather than replace the bad value.	35 * throw an ArgumentError rather than replace the bad value.

30 */	36 */

31 IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes, [int offset = 0,	37 IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes,

	38 [int offset = 0,

32 int length,	39 int length,

33 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {	40 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {

34 return new IterableUtf8Decoder(bytes, offset, length, replacementCodepoint);	41 return new IterableUtf8Decoder(bytes, offset, length, replacementCodepoint);

35 }	42 }

36	43

37 /**	44 /**

38 * Produce a String from a List of UTF-8 encoded bytes. The parameters	45 * Produce a String from a List of UTF-8 encoded bytes. The parameters

39 * can set an offset into a list of bytes (as int), limit the length of the	46 * can set an offset into a list of bytes (as int), limit the length of the

40 * values to be decoded, and override the default Unicode replacement character.	47 * values to be decoded, and override the default Unicode replacement character.

41 * Set the replacementCodepoint to null to throw an ArgumentError	48 * Set the replacementCodepoint to null to throw an ArgumentError

42 * rather than replace the bad value.	49 * rather than replace the bad value.

43 */	50 */

44 String decodeUtf8(List<int> bytes, [int offset = 0, int length,	51 String decodeUtf8(List<int> bytes,

	52 [int offset = 0,

	53 int length,

45 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {	54 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {

46 return new String.fromCharCodes(	55 return new String.fromCharCodes(

47 (new Utf8Decoder(bytes, offset, length, replacementCodepoint))	56 (new Utf8Decoder(bytes, offset, length, replacementCodepoint))

48 .decodeRest());	57 .decodeRest());

49 }	58 }

50	59

51 /**	60 /**

52 * Produce a sequence of UTF-8 encoded bytes from the provided string.	61 * Produce a sequence of UTF-8 encoded bytes from the provided string.

53 */	62 */

54 List<int> encodeUtf8(String str) =>	63 List<int> encodeUtf8(String str) => codepointsToUtf8(stringToCodepoints(str));

55 codepointsToUtf8(stringToCodepoints(str));

56	64

57 int _addToEncoding(int offset, int bytes, int value, List<int> buffer) {	65 int _addToEncoding(int offset, int bytes, int value, List<int> buffer) {

58 while (bytes > 0) {	66 while (bytes > 0) {

59 buffer[offset + bytes] = _UTF8_SUBSEQUENT_BYTE_BASE \|	67 buffer[offset + bytes] =

60 (value & _UTF8_LO_SIX_BIT_MASK);	68 _UTF8_SUBSEQUENT_BYTE_BASE \| (value & _UTF8_LO_SIX_BIT_MASK);

61 value = value >> 6;	69 value = value >> 6;

62 bytes--;	70 bytes--;

63 }	71 }

64 return value;	72 return value;

65 }	73 }

66	74

67 /**	75 /**

68 * Encode code points as UTF-8 code units.	76 * Encode code points as UTF-8 code units.

69 */	77 */

70 List<int> codepointsToUtf8(	78 List<int> codepointsToUtf8(List<int> codepoints, [int offset = 0, int length]) {

71 List<int> codepoints, [int offset = 0, int length]) {

72 ListRange source = new ListRange(codepoints, offset, length);	79 ListRange source = new ListRange(codepoints, offset, length);

73	80

74 int encodedLength = 0;	81 int encodedLength = 0;

75 for (int value in source) {	82 for (int value in source) {

76 if (value < 0 \|\| value > UNICODE_VALID_RANGE_MAX) {	83 if (value < 0 \|\| value > UNICODE_VALID_RANGE_MAX) {

77 encodedLength += 3;	84 encodedLength += 3;

78 } else if (value <= _UTF8_ONE_BYTE_MAX) {	85 } else if (value <= _UTF8_ONE_BYTE_MAX) {

79 encodedLength++;	86 encodedLength++;

80 } else if (value <= _UTF8_TWO_BYTE_MAX) {	87 } else if (value <= _UTF8_TWO_BYTE_MAX) {

81 encodedLength += 2;	88 encodedLength += 2;

82 } else if (value <= _UTF8_THREE_BYTE_MAX) {	89 } else if (value <= _UTF8_THREE_BYTE_MAX) {

83 encodedLength += 3;	90 encodedLength += 3;

84 } else if (value <= UNICODE_VALID_RANGE_MAX) {	91 } else if (value <= UNICODE_VALID_RANGE_MAX) {

85 encodedLength += 4;	92 encodedLength += 4;

86 }	93 }

87 }	94 }

88	95

89 List<int> encoded = new List<int>(encodedLength);	96 List<int> encoded = new List<int>(encodedLength);

90 int insertAt = 0;	97 int insertAt = 0;

91 for (int value in source) {	98 for (int value in source) {

92 if (value < 0 \|\| value > UNICODE_VALID_RANGE_MAX) {	99 if (value < 0 \|\| value > UNICODE_VALID_RANGE_MAX) {

93 encoded.setRange(insertAt, insertAt + 3, [0xef, 0xbf, 0xbd]);	100 encoded.setRange(insertAt, insertAt + 3, [0xef, 0xbf, 0xbd]);

94 insertAt += 3;	101 insertAt += 3;

95 } else if (value <= _UTF8_ONE_BYTE_MAX) {	102 } else if (value <= _UTF8_ONE_BYTE_MAX) {

96 encoded[insertAt] = value;	103 encoded[insertAt] = value;

97 insertAt++;	104 insertAt++;

98 } else if (value <= _UTF8_TWO_BYTE_MAX) {	105 } else if (value <= _UTF8_TWO_BYTE_MAX) {

99 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_TWO_BASE \| (	106 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_TWO_BASE \|

100 _UTF8_FIRST_BYTE_OF_TWO_MASK &	107 (_UTF8_FIRST_BYTE_OF_TWO_MASK &

101 _addToEncoding(insertAt, 1, value, encoded));	108 _addToEncoding(insertAt, 1, value, encoded));

102 insertAt += 2;	109 insertAt += 2;

103 } else if (value <= _UTF8_THREE_BYTE_MAX) {	110 } else if (value <= _UTF8_THREE_BYTE_MAX) {

104 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_THREE_BASE \| (	111 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_THREE_BASE \|

105 _UTF8_FIRST_BYTE_OF_THREE_MASK &	112 (_UTF8_FIRST_BYTE_OF_THREE_MASK &

106 _addToEncoding(insertAt, 2, value, encoded));	113 _addToEncoding(insertAt, 2, value, encoded));

107 insertAt += 3;	114 insertAt += 3;

108 } else if (value <= UNICODE_VALID_RANGE_MAX) {	115 } else if (value <= UNICODE_VALID_RANGE_MAX) {

109 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_FOUR_BASE \| (	116 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_FOUR_BASE \|

110 _UTF8_FIRST_BYTE_OF_FOUR_MASK &	117 (_UTF8_FIRST_BYTE_OF_FOUR_MASK &

111 _addToEncoding(insertAt, 3, value, encoded));	118 _addToEncoding(insertAt, 3, value, encoded));

112 insertAt += 4;	119 insertAt += 4;

113 }	120 }

114 }	121 }

115 return encoded;	122 return encoded;

116 }	123 }

117	124

118 // Because UTF-8 specifies byte order, we do not have to follow the pattern	125 // Because UTF-8 specifies byte order, we do not have to follow the pattern

119 // used by UTF-16 & UTF-32 regarding byte order.	126 // used by UTF-16 & UTF-32 regarding byte order.

120 List<int> utf8ToCodepoints(	127 List<int> utf8ToCodepoints(List<int> utf8EncodedBytes,

121 List<int> utf8EncodedBytes, [int offset = 0, int length,	128 [int offset = 0,

	129 int length,

122 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {	130 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {

123 return new Utf8Decoder(utf8EncodedBytes, offset, length,	131 return new Utf8Decoder(utf8EncodedBytes, offset, length, replacementCodepoint)

124 replacementCodepoint).decodeRest();	132 .decodeRest();

125 }	133 }

126	134

127 /**	135 /**

128 * Return type of [decodeUtf8AsIterable] and variants. The Iterable type	136 * Return type of [decodeUtf8AsIterable] and variants. The Iterable type

129 * provides an iterator on demand and the iterator will only translate bytes	137 * provides an iterator on demand and the iterator will only translate bytes

130 * as requested by the user of the iterator. (Note: results are not cached.)	138 * as requested by the user of the iterator. (Note: results are not cached.)

131 */	139 */

132 // TODO(floitsch): Consider removing the extend and switch to implements since	140 // TODO(floitsch): Consider removing the extend and switch to implements since

133 // that's cheaper to allocate.	141 // that's cheaper to allocate.

134 class IterableUtf8Decoder extends IterableBase<int> {	142 class IterableUtf8Decoder extends IterableBase<int> {

135 final List<int> bytes;	143 final List<int> bytes;

136 final int offset;	144 final int offset;

137 final int length;	145 final int length;

138 final int replacementCodepoint;	146 final int replacementCodepoint;

139	147

140 IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null,	148 IterableUtf8Decoder(this.bytes,

	149 [this.offset = 0,

	150 this.length = null,

141 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);	151 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

142	152

143 Utf8Decoder get iterator =>	153 Utf8Decoder get iterator =>

144 new Utf8Decoder(bytes, offset, length, replacementCodepoint);	154 new Utf8Decoder(bytes, offset, length, replacementCodepoint);

145 }	155 }

146	156

147 /**	157 /**

148 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The	158 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The

149 * parameters can set an offset into a list of bytes (as int), limit the length	159 * parameters can set an offset into a list of bytes (as int), limit the length

150 * of the values to be decoded, and override the default Unicode replacement	160 * of the values to be decoded, and override the default Unicode replacement

151 * character. Set the replacementCodepoint to null to throw an	161 * character. Set the replacementCodepoint to null to throw an

152 * ArgumentError rather than replace the bad value. The return value	162 * ArgumentError rather than replace the bad value. The return value

153 * from this method can be used as an Iterable (e.g. in a for-loop).	163 * from this method can be used as an Iterable (e.g. in a for-loop).

154 */	164 */

155 class Utf8Decoder implements Iterator<int> {	165 class Utf8Decoder implements Iterator<int> {

156 // TODO(kevmoo): should this field be private?	166 // TODO(kevmoo): should this field be private?

157 final ListRangeIterator utf8EncodedBytesIterator;	167 final ListRangeIterator utf8EncodedBytesIterator;

158 final int replacementCodepoint;	168 final int replacementCodepoint;

159 int _current = null;	169 int _current = null;

160	170

161 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length,	171 Utf8Decoder(List<int> utf8EncodedBytes,

162 this.replacementCodepoint =	172 [int offset = 0,

163 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :	173 int length,

164 utf8EncodedBytesIterator =	174 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])

165 (new ListRange(utf8EncodedBytes, offset, length)).iterator;	175 : utf8EncodedBytesIterator =

	176 (new ListRange(utf8EncodedBytes, offset, length)).iterator;

166	177

167	178 Utf8Decoder._fromListRangeIterator(ListRange source,

168 Utf8Decoder._fromListRangeIterator(ListRange source, [	179 [this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])

169 this.replacementCodepoint =	180 : utf8EncodedBytesIterator = source.iterator;

170 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :

171 utf8EncodedBytesIterator = source.iterator;

172	181

173 /** Decode the remainder of the characters in this decoder	182 /** Decode the remainder of the characters in this decoder

174 * into a [List<int>].	183 * into a [List<int>].

175 */	184 */

176 List<int> decodeRest() {	185 List<int> decodeRest() {

177 List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining);	186 List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining);

178 int i = 0;	187 int i = 0;

179 while (moveNext()) {	188 while (moveNext()) {

180 codepoints[i++] = current;	189 codepoints[i++] = current;

181 }	190 }

(...skipping 65 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
247 value = ((value << 6) \| (nextValue & _UTF8_LO_SIX_BIT_MASK));	256 value = ((value << 6) \| (nextValue & _UTF8_LO_SIX_BIT_MASK));

248 } else {	257 } else {

249 // if sequence-starting code unit, reposition cursor to start here	258 // if sequence-starting code unit, reposition cursor to start here

250 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) {	259 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) {

251 utf8EncodedBytesIterator.backup();	260 utf8EncodedBytesIterator.backup();

252 }	261 }

253 break;	262 break;

254 }	263 }

255 j++;	264 j++;

256 }	265 }

257 bool validSequence = (j == additionalBytes && (	266 bool validSequence = (j == additionalBytes &&

258 value < UNICODE_UTF16_RESERVED_LO \|\|	267 (value < UNICODE_UTF16_RESERVED_LO \|\|

259 value > UNICODE_UTF16_RESERVED_HI));	268 value > UNICODE_UTF16_RESERVED_HI));

260 bool nonOverlong =	269 bool nonOverlong = (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) \|\|

261 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) \|\|

262 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) \|\|	270 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) \|\|

263 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);	271 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);

264 bool inRange = value <= UNICODE_VALID_RANGE_MAX;	272 bool inRange = value <= UNICODE_VALID_RANGE_MAX;

265 if (validSequence && nonOverlong && inRange) {	273 if (validSequence && nonOverlong && inRange) {

266 _current = value;	274 _current = value;

267 return true;	275 return true;

268 } else if (replacementCodepoint != null) {	276 } else if (replacementCodepoint != null) {

269 _current = replacementCodepoint;	277 _current = replacementCodepoint;

270 return true;	278 return true;

271 } else {	279 } else {

272 throw new ArgumentError(	280 throw new ArgumentError(

273 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}");	281 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}");

274 }	282 }

275 }	283 }

276 }	284 }

OLD	NEW

« no previous file with comments | « packages/utf/lib/src/utf32.dart ('k') | packages/utf/lib/src/utf_16_code_unit_decoder.dart » ('j') | no next file with comments »