Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(214)

Side by Side Diff: packages/utf/lib/src/utf8.dart

Issue 2989763002: Update charted to 0.4.8 and roll (Closed)
Patch Set: Removed Cutch from list of reviewers Created 3 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « packages/utf/lib/src/utf32.dart ('k') | packages/utf/lib/src/utf_16_code_unit_decoder.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 part of utf; 5 library utf.utf8;
6
7 import "dart:collection";
8
9 import 'constants.dart';
10 import 'list_range.dart';
11 import 'shared.dart';
6 12
7 const int _UTF8_ONE_BYTE_MAX = 0x7f; 13 const int _UTF8_ONE_BYTE_MAX = 0x7f;
8 const int _UTF8_TWO_BYTE_MAX = 0x7ff; 14 const int _UTF8_TWO_BYTE_MAX = 0x7ff;
9 const int _UTF8_THREE_BYTE_MAX = 0xffff; 15 const int _UTF8_THREE_BYTE_MAX = 0xffff;
10 16
11 const int _UTF8_LO_SIX_BIT_MASK = 0x3f; 17 const int _UTF8_LO_SIX_BIT_MASK = 0x3f;
12 18
13 const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xc0; 19 const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xc0;
14 const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xe0; 20 const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xe0;
15 const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xf0; 21 const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xf0;
16 const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xf8; 22 const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xf8;
17 const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xfc; 23 const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xfc;
18 24
19 const int _UTF8_FIRST_BYTE_OF_TWO_MASK = 0x1f; 25 const int _UTF8_FIRST_BYTE_OF_TWO_MASK = 0x1f;
20 const int _UTF8_FIRST_BYTE_OF_THREE_MASK = 0xf; 26 const int _UTF8_FIRST_BYTE_OF_THREE_MASK = 0xf;
21 const int _UTF8_FIRST_BYTE_OF_FOUR_MASK = 0x7; 27 const int _UTF8_FIRST_BYTE_OF_FOUR_MASK = 0x7;
22 28
23 const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xfe; 29 const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xfe;
24 const int _UTF8_SUBSEQUENT_BYTE_BASE = 0x80; 30 const int _UTF8_SUBSEQUENT_BYTE_BASE = 0x80;
25 31
26 /** 32 /**
27 * Decodes the UTF-8 bytes as an iterable. Thus, the consumer can convert only 33 * Decodes the UTF-8 bytes as an iterable. Thus, the consumer can convert only
28 * as much of the input as needed. Set replacementCodepoint to null to 34 * as much of the input as needed. Set replacementCodepoint to null to
29 * throw an ArgumentError rather than replace the bad value. 35 * throw an ArgumentError rather than replace the bad value.
30 */ 36 */
31 IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes, [int offset = 0, 37 IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes,
38 [int offset = 0,
32 int length, 39 int length,
33 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { 40 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
34 return new IterableUtf8Decoder(bytes, offset, length, replacementCodepoint); 41 return new IterableUtf8Decoder(bytes, offset, length, replacementCodepoint);
35 } 42 }
36 43
37 /** 44 /**
38 * Produce a String from a List of UTF-8 encoded bytes. The parameters 45 * Produce a String from a List of UTF-8 encoded bytes. The parameters
39 * can set an offset into a list of bytes (as int), limit the length of the 46 * can set an offset into a list of bytes (as int), limit the length of the
40 * values to be decoded, and override the default Unicode replacement character. 47 * values to be decoded, and override the default Unicode replacement character.
41 * Set replacementCodepoint to null to throw an ArgumentError 48 * Set replacementCodepoint to null to throw an ArgumentError
42 * rather than replace the bad value. 49 * rather than replace the bad value.
43 */ 50 */
44 String decodeUtf8(List<int> bytes, [int offset = 0, int length, 51 String decodeUtf8(List<int> bytes,
52 [int offset = 0,
53 int length,
45 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { 54 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
46 return new String.fromCharCodes( 55 return new String.fromCharCodes(
47 (new Utf8Decoder(bytes, offset, length, replacementCodepoint)) 56 (new Utf8Decoder(bytes, offset, length, replacementCodepoint))
48 .decodeRest()); 57 .decodeRest());
49 } 58 }
50 59
51 /** 60 /**
52 * Produce a sequence of UTF-8 encoded bytes from the provided string. 61 * Produce a sequence of UTF-8 encoded bytes from the provided string.
53 */ 62 */
54 List<int> encodeUtf8(String str) => 63 List<int> encodeUtf8(String str) => codepointsToUtf8(stringToCodepoints(str));
55 codepointsToUtf8(stringToCodepoints(str));
56 64
57 int _addToEncoding(int offset, int bytes, int value, List<int> buffer) { 65 int _addToEncoding(int offset, int bytes, int value, List<int> buffer) {
58 while (bytes > 0) { 66 while (bytes > 0) {
59 buffer[offset + bytes] = _UTF8_SUBSEQUENT_BYTE_BASE | 67 buffer[offset + bytes] =
60 (value & _UTF8_LO_SIX_BIT_MASK); 68 _UTF8_SUBSEQUENT_BYTE_BASE | (value & _UTF8_LO_SIX_BIT_MASK);
61 value = value >> 6; 69 value = value >> 6;
62 bytes--; 70 bytes--;
63 } 71 }
64 return value; 72 return value;
65 } 73 }
66 74
67 /** 75 /**
68 * Encode code points as UTF-8 code units. 76 * Encode code points as UTF-8 code units.
69 */ 77 */
70 List<int> codepointsToUtf8( 78 List<int> codepointsToUtf8(List<int> codepoints, [int offset = 0, int length]) {
71 List<int> codepoints, [int offset = 0, int length]) {
72 ListRange source = new ListRange(codepoints, offset, length); 79 ListRange source = new ListRange(codepoints, offset, length);
73 80
74 int encodedLength = 0; 81 int encodedLength = 0;
75 for (int value in source) { 82 for (int value in source) {
76 if (value < 0 || value > UNICODE_VALID_RANGE_MAX) { 83 if (value < 0 || value > UNICODE_VALID_RANGE_MAX) {
77 encodedLength += 3; 84 encodedLength += 3;
78 } else if (value <= _UTF8_ONE_BYTE_MAX) { 85 } else if (value <= _UTF8_ONE_BYTE_MAX) {
79 encodedLength++; 86 encodedLength++;
80 } else if (value <= _UTF8_TWO_BYTE_MAX) { 87 } else if (value <= _UTF8_TWO_BYTE_MAX) {
81 encodedLength += 2; 88 encodedLength += 2;
82 } else if (value <= _UTF8_THREE_BYTE_MAX) { 89 } else if (value <= _UTF8_THREE_BYTE_MAX) {
83 encodedLength += 3; 90 encodedLength += 3;
84 } else if (value <= UNICODE_VALID_RANGE_MAX) { 91 } else if (value <= UNICODE_VALID_RANGE_MAX) {
85 encodedLength += 4; 92 encodedLength += 4;
86 } 93 }
87 } 94 }
88 95
89 List<int> encoded = new List<int>(encodedLength); 96 List<int> encoded = new List<int>(encodedLength);
90 int insertAt = 0; 97 int insertAt = 0;
91 for (int value in source) { 98 for (int value in source) {
92 if (value < 0 || value > UNICODE_VALID_RANGE_MAX) { 99 if (value < 0 || value > UNICODE_VALID_RANGE_MAX) {
93 encoded.setRange(insertAt, insertAt + 3, [0xef, 0xbf, 0xbd]); 100 encoded.setRange(insertAt, insertAt + 3, [0xef, 0xbf, 0xbd]);
94 insertAt += 3; 101 insertAt += 3;
95 } else if (value <= _UTF8_ONE_BYTE_MAX) { 102 } else if (value <= _UTF8_ONE_BYTE_MAX) {
96 encoded[insertAt] = value; 103 encoded[insertAt] = value;
97 insertAt++; 104 insertAt++;
98 } else if (value <= _UTF8_TWO_BYTE_MAX) { 105 } else if (value <= _UTF8_TWO_BYTE_MAX) {
99 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_TWO_BASE | ( 106 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_TWO_BASE |
100 _UTF8_FIRST_BYTE_OF_TWO_MASK & 107 (_UTF8_FIRST_BYTE_OF_TWO_MASK &
101 _addToEncoding(insertAt, 1, value, encoded)); 108 _addToEncoding(insertAt, 1, value, encoded));
102 insertAt += 2; 109 insertAt += 2;
103 } else if (value <= _UTF8_THREE_BYTE_MAX) { 110 } else if (value <= _UTF8_THREE_BYTE_MAX) {
104 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_THREE_BASE | ( 111 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_THREE_BASE |
105 _UTF8_FIRST_BYTE_OF_THREE_MASK & 112 (_UTF8_FIRST_BYTE_OF_THREE_MASK &
106 _addToEncoding(insertAt, 2, value, encoded)); 113 _addToEncoding(insertAt, 2, value, encoded));
107 insertAt += 3; 114 insertAt += 3;
108 } else if (value <= UNICODE_VALID_RANGE_MAX) { 115 } else if (value <= UNICODE_VALID_RANGE_MAX) {
109 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_FOUR_BASE | ( 116 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_FOUR_BASE |
110 _UTF8_FIRST_BYTE_OF_FOUR_MASK & 117 (_UTF8_FIRST_BYTE_OF_FOUR_MASK &
111 _addToEncoding(insertAt, 3, value, encoded)); 118 _addToEncoding(insertAt, 3, value, encoded));
112 insertAt += 4; 119 insertAt += 4;
113 } 120 }
114 } 121 }
115 return encoded; 122 return encoded;
116 } 123 }
117 124
118 // Because UTF-8 specifies byte order, we do not have to follow the pattern 125 // Because UTF-8 specifies byte order, we do not have to follow the pattern
119 // used by UTF-16 & UTF-32 regarding byte order. 126 // used by UTF-16 & UTF-32 regarding byte order.
120 List<int> utf8ToCodepoints( 127 List<int> utf8ToCodepoints(List<int> utf8EncodedBytes,
121 List<int> utf8EncodedBytes, [int offset = 0, int length, 128 [int offset = 0,
129 int length,
122 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { 130 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
123 return new Utf8Decoder(utf8EncodedBytes, offset, length, 131 return new Utf8Decoder(utf8EncodedBytes, offset, length, replacementCodepoint)
124 replacementCodepoint).decodeRest(); 132 .decodeRest();
125 } 133 }
126 134
127 /** 135 /**
128 * Return type of [decodeUtf8AsIterable] and variants. The Iterable type 136 * Return type of [decodeUtf8AsIterable] and variants. The Iterable type
129 * provides an iterator on demand and the iterator will only translate bytes 137 * provides an iterator on demand and the iterator will only translate bytes
130 * as requested by the user of the iterator. (Note: results are not cached.) 138 * as requested by the user of the iterator. (Note: results are not cached.)
131 */ 139 */
132 // TODO(floitsch): Consider removing the extend and switch to implements since 140 // TODO(floitsch): Consider removing the extend and switch to implements since
133 // that's cheaper to allocate. 141 // that's cheaper to allocate.
134 class IterableUtf8Decoder extends IterableBase<int> { 142 class IterableUtf8Decoder extends IterableBase<int> {
135 final List<int> bytes; 143 final List<int> bytes;
136 final int offset; 144 final int offset;
137 final int length; 145 final int length;
138 final int replacementCodepoint; 146 final int replacementCodepoint;
139 147
140 IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null, 148 IterableUtf8Decoder(this.bytes,
149 [this.offset = 0,
150 this.length = null,
141 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]); 151 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);
142 152
143 Utf8Decoder get iterator => 153 Utf8Decoder get iterator =>
144 new Utf8Decoder(bytes, offset, length, replacementCodepoint); 154 new Utf8Decoder(bytes, offset, length, replacementCodepoint);
145 } 155 }
146 156
147 /** 157 /**
148 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The 158 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The
149 * parameters can set an offset into a list of bytes (as int), limit the length 159 * parameters can set an offset into a list of bytes (as int), limit the length
150 * of the values to be decoded, and override the default Unicode replacement 160 * of the values to be decoded, and override the default Unicode replacement
151 * character. Set replacementCodepoint to null to throw an 161 * character. Set replacementCodepoint to null to throw an
152 * ArgumentError rather than replace the bad value. The return value 162 * ArgumentError rather than replace the bad value. The return value
153 * from this method can be used as an Iterable (e.g. in a for-loop). 163 * from this method can be used as an Iterable (e.g. in a for-loop).
154 */ 164 */
155 class Utf8Decoder implements Iterator<int> { 165 class Utf8Decoder implements Iterator<int> {
156 // TODO(kevmoo): should this field be private? 166 // TODO(kevmoo): should this field be private?
157 final ListRangeIterator utf8EncodedBytesIterator; 167 final ListRangeIterator utf8EncodedBytesIterator;
158 final int replacementCodepoint; 168 final int replacementCodepoint;
159 int _current = null; 169 int _current = null;
160 170
161 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, 171 Utf8Decoder(List<int> utf8EncodedBytes,
162 this.replacementCodepoint = 172 [int offset = 0,
163 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : 173 int length,
164 utf8EncodedBytesIterator = 174 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
165 (new ListRange(utf8EncodedBytes, offset, length)).iterator; 175 : utf8EncodedBytesIterator =
176 (new ListRange(utf8EncodedBytes, offset, length)).iterator;
166 177
167 178 Utf8Decoder._fromListRangeIterator(ListRange source,
168 Utf8Decoder._fromListRangeIterator(ListRange source, [ 179 [this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
169 this.replacementCodepoint = 180 : utf8EncodedBytesIterator = source.iterator;
170 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
171 utf8EncodedBytesIterator = source.iterator;
172 181
173 /** Decode the remainder of the characters in this decoder 182 /** Decode the remainder of the characters in this decoder
174 * into a [List<int>]. 183 * into a [List<int>].
175 */ 184 */
176 List<int> decodeRest() { 185 List<int> decodeRest() {
177 List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining); 186 List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining);
178 int i = 0; 187 int i = 0;
179 while (moveNext()) { 188 while (moveNext()) {
180 codepoints[i++] = current; 189 codepoints[i++] = current;
181 } 190 }
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after
247 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK)); 256 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK));
248 } else { 257 } else {
249 // if sequence-starting code unit, reposition cursor to start here 258 // if sequence-starting code unit, reposition cursor to start here
250 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) { 259 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) {
251 utf8EncodedBytesIterator.backup(); 260 utf8EncodedBytesIterator.backup();
252 } 261 }
253 break; 262 break;
254 } 263 }
255 j++; 264 j++;
256 } 265 }
257 bool validSequence = (j == additionalBytes && ( 266 bool validSequence = (j == additionalBytes &&
258 value < UNICODE_UTF16_RESERVED_LO || 267 (value < UNICODE_UTF16_RESERVED_LO ||
259 value > UNICODE_UTF16_RESERVED_HI)); 268 value > UNICODE_UTF16_RESERVED_HI));
260 bool nonOverlong = 269 bool nonOverlong = (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||
261 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) ||
262 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) || 270 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) ||
263 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX); 271 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX);
264 bool inRange = value <= UNICODE_VALID_RANGE_MAX; 272 bool inRange = value <= UNICODE_VALID_RANGE_MAX;
265 if (validSequence && nonOverlong && inRange) { 273 if (validSequence && nonOverlong && inRange) {
266 _current = value; 274 _current = value;
267 return true; 275 return true;
268 } else if (replacementCodepoint != null) { 276 } else if (replacementCodepoint != null) {
269 _current = replacementCodepoint; 277 _current = replacementCodepoint;
270 return true; 278 return true;
271 } else { 279 } else {
272 throw new ArgumentError( 280 throw new ArgumentError(
273 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}"); 281 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}");
274 } 282 }
275 } 283 }
276 } 284 }
OLDNEW
« no previous file with comments | « packages/utf/lib/src/utf32.dart ('k') | packages/utf/lib/src/utf_16_code_unit_decoder.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698