OLD | NEW |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 part of dart.utf; | 5 part of dart.utf; |
6 | 6 |
7 const int _UTF8_ONE_BYTE_MAX = 0x7f; | 7 const int _UTF8_ONE_BYTE_MAX = 0x7f; |
8 const int _UTF8_TWO_BYTE_MAX = 0x7ff; | 8 const int _UTF8_TWO_BYTE_MAX = 0x7ff; |
9 const int _UTF8_THREE_BYTE_MAX = 0xffff; | 9 const int _UTF8_THREE_BYTE_MAX = 0xffff; |
10 | 10 |
(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
79 encodedLength++; | 79 encodedLength++; |
80 } else if (value <= _UTF8_TWO_BYTE_MAX) { | 80 } else if (value <= _UTF8_TWO_BYTE_MAX) { |
81 encodedLength += 2; | 81 encodedLength += 2; |
82 } else if (value <= _UTF8_THREE_BYTE_MAX) { | 82 } else if (value <= _UTF8_THREE_BYTE_MAX) { |
83 encodedLength += 3; | 83 encodedLength += 3; |
84 } else if (value <= UNICODE_VALID_RANGE_MAX) { | 84 } else if (value <= UNICODE_VALID_RANGE_MAX) { |
85 encodedLength += 4; | 85 encodedLength += 4; |
86 } | 86 } |
87 } | 87 } |
88 | 88 |
89 List<int> encoded = new List<int>(encodedLength); | 89 List<int> encoded = new List<int>.fixedLength(encodedLength); |
90 int insertAt = 0; | 90 int insertAt = 0; |
91 for (int value in source) { | 91 for (int value in source) { |
92 if (value < 0 || value > UNICODE_VALID_RANGE_MAX) { | 92 if (value < 0 || value > UNICODE_VALID_RANGE_MAX) { |
93 encoded.setRange(insertAt, 3, [0xef, 0xbf, 0xbd]); | 93 encoded.setRange(insertAt, 3, [0xef, 0xbf, 0xbd]); |
94 insertAt += 3; | 94 insertAt += 3; |
95 } else if (value <= _UTF8_ONE_BYTE_MAX) { | 95 } else if (value <= _UTF8_ONE_BYTE_MAX) { |
96 encoded[insertAt] = value; | 96 encoded[insertAt] = value; |
97 insertAt++; | 97 insertAt++; |
98 } else if (value <= _UTF8_TWO_BYTE_MAX) { | 98 } else if (value <= _UTF8_TWO_BYTE_MAX) { |
99 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_TWO_BASE | ( | 99 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_TWO_BASE | ( |
(...skipping 22 matching lines...) Expand all Loading... |
122 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 122 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
123 return new Utf8Decoder(utf8EncodedBytes, offset, length, | 123 return new Utf8Decoder(utf8EncodedBytes, offset, length, |
124 replacementCodepoint).decodeRest(); | 124 replacementCodepoint).decodeRest(); |
125 } | 125 } |
126 | 126 |
127 /** | 127 /** |
128 * Return type of [decodeUtf8AsIterable] and variants. The Iterable type | 128 * Return type of [decodeUtf8AsIterable] and variants. The Iterable type |
129 * provides an iterator on demand and the iterator will only translate bytes | 129 * provides an iterator on demand and the iterator will only translate bytes |
130 * as requested by the user of the iterator. (Note: results are not cached.) | 130 * as requested by the user of the iterator. (Note: results are not cached.) |
131 */ | 131 */ |
132 class IterableUtf8Decoder implements Iterable<int> { | 132 // TODO(floitsch): Consider removing the extend and switch to implements since |
| 133 // that's cheaper to allocate. |
| 134 class IterableUtf8Decoder extends Iterable<int> { |
133 final List<int> bytes; | 135 final List<int> bytes; |
134 final int offset; | 136 final int offset; |
135 final int length; | 137 final int length; |
136 final int replacementCodepoint; | 138 final int replacementCodepoint; |
137 | 139 |
138 IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null, | 140 IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null, |
139 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]); | 141 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]); |
140 | 142 |
141 Utf8Decoder iterator() => new Utf8Decoder(bytes, offset, length, | 143 Utf8Decoder get iterator => |
142 replacementCodepoint); | 144 new Utf8Decoder(bytes, offset, length, replacementCodepoint); |
143 } | 145 } |
144 | 146 |
145 /** | 147 /** |
146 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The | 148 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The |
147 * parameters can set an offset into a list of bytes (as int), limit the length | 149 * parameters can set an offset into a list of bytes (as int), limit the length |
148 * of the values to be decoded, and override the default Unicode replacement | 150 * of the values to be decoded, and override the default Unicode replacement |
149 * character. Set the replacementCodepoint to null to throw an | 151 * character. Set the replacementCodepoint to null to throw an |
150 * ArgumentError rather than replace the bad value. The return value | 152 * ArgumentError rather than replace the bad value. The return value |
151 * from this method can be used as an Iterable (e.g. in a for-loop). | 153 * from this method can be used as an Iterable (e.g. in a for-loop). |
152 */ | 154 */ |
153 class Utf8Decoder implements Iterator<int> { | 155 class Utf8Decoder implements Iterator<int> { |
154 final _ListRangeIterator utf8EncodedBytesIterator; | 156 final _ListRangeIterator utf8EncodedBytesIterator; |
155 final int replacementCodepoint; | 157 final int replacementCodepoint; |
| 158 int _current = null; |
156 | 159 |
157 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, | 160 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, |
158 this.replacementCodepoint = | 161 this.replacementCodepoint = |
159 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : | 162 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
160 utf8EncodedBytesIterator = (new _ListRange(utf8EncodedBytes, offset, | 163 utf8EncodedBytesIterator = |
161 length)).iterator(); | 164 (new _ListRange(utf8EncodedBytes, offset, length)).iterator; |
162 | 165 |
163 | 166 |
164 Utf8Decoder._fromListRangeIterator(_ListRange source, [ | 167 Utf8Decoder._fromListRangeIterator(_ListRange source, [ |
165 this.replacementCodepoint = | 168 this.replacementCodepoint = |
166 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : | 169 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
167 utf8EncodedBytesIterator = source.iterator(); | 170 utf8EncodedBytesIterator = source.iterator; |
168 | 171 |
169 /** Decode the remainder of the characters in this decoder | 172 /** Decode the remainder of the characters in this decoder |
170 * into a [List<int>]. | 173 * into a [List<int>]. |
171 */ | 174 */ |
172 List<int> decodeRest() { | 175 List<int> decodeRest() { |
173 List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining); | 176 List<int> codepoints = new List<int>.fixedLength(utf8EncodedBytesIterator.remaining); |
174 int i = 0; | 177 int i = 0; |
175 while (hasNext) { | 178 while (moveNext()) { |
176 codepoints[i++] = next(); | 179 codepoints[i++] = current; |
177 } | 180 } |
178 if (i == codepoints.length) { | 181 if (i == codepoints.length) { |
179 return codepoints; | 182 return codepoints; |
180 } else { | 183 } else { |
181 List<int> truncCodepoints = new List<int>(i); | 184 List<int> truncCodepoints = new List<int>.fixedLength(i); |
182 truncCodepoints.setRange(0, i, codepoints); | 185 truncCodepoints.setRange(0, i, codepoints); |
183 return truncCodepoints; | 186 return truncCodepoints; |
184 } | 187 } |
185 } | 188 } |
186 | 189 |
187 bool get hasNext => utf8EncodedBytesIterator.hasNext; | 190 int get current => _current; |
188 | 191 |
189 int next() { | 192 bool moveNext() { |
190 int value = utf8EncodedBytesIterator.next(); | 193 _current = null; |
| 194 |
| 195 if (!utf8EncodedBytesIterator.moveNext()) return false; |
| 196 |
| 197 int value = utf8EncodedBytesIterator.current; |
191 int additionalBytes = 0; | 198 int additionalBytes = 0; |
192 | 199 |
193 if (value < 0) { | 200 if (value < 0) { |
194 if (replacementCodepoint != null) { | 201 if (replacementCodepoint != null) { |
195 return replacementCodepoint; | 202 _current = replacementCodepoint; |
| 203 return true; |
196 } else { | 204 } else { |
197 throw new ArgumentError( | 205 throw new ArgumentError( |
198 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); | 206 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); |
199 } | 207 } |
200 } else if (value <= _UTF8_ONE_BYTE_MAX) { | 208 } else if (value <= _UTF8_ONE_BYTE_MAX) { |
201 return value; | 209 _current = value; |
| 210 return true; |
202 } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) { | 211 } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) { |
203 if (replacementCodepoint != null) { | 212 if (replacementCodepoint != null) { |
204 return replacementCodepoint; | 213 _current = replacementCodepoint; |
| 214 return true; |
205 } else { | 215 } else { |
206 throw new ArgumentError( | 216 throw new ArgumentError( |
207 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); | 217 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); |
208 } | 218 } |
209 } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) { | 219 } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) { |
210 value -= _UTF8_FIRST_BYTE_OF_TWO_BASE; | 220 value -= _UTF8_FIRST_BYTE_OF_TWO_BASE; |
211 additionalBytes = 1; | 221 additionalBytes = 1; |
212 } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) { | 222 } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) { |
213 value -= _UTF8_FIRST_BYTE_OF_THREE_BASE; | 223 value -= _UTF8_FIRST_BYTE_OF_THREE_BASE; |
214 additionalBytes = 2; | 224 additionalBytes = 2; |
215 } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) { | 225 } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) { |
216 value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE; | 226 value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE; |
217 additionalBytes = 3; | 227 additionalBytes = 3; |
218 } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) { | 228 } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) { |
219 value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE; | 229 value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE; |
220 additionalBytes = 4; | 230 additionalBytes = 4; |
221 } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) { | 231 } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) { |
222 value -= _UTF8_FIRST_BYTE_OF_SIX_BASE; | 232 value -= _UTF8_FIRST_BYTE_OF_SIX_BASE; |
223 additionalBytes = 5; | 233 additionalBytes = 5; |
224 } else if (replacementCodepoint != null) { | 234 } else if (replacementCodepoint != null) { |
225 return replacementCodepoint; | 235 _current = replacementCodepoint; |
| 236 return true; |
226 } else { | 237 } else { |
227 throw new ArgumentError( | 238 throw new ArgumentError( |
228 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); | 239 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); |
229 } | 240 } |
230 int j = 0; | 241 int j = 0; |
231 while (j < additionalBytes && utf8EncodedBytesIterator.hasNext) { | 242 while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) { |
232 int nextValue = utf8EncodedBytesIterator.next(); | 243 int nextValue = utf8EncodedBytesIterator.current; |
233 if (nextValue > _UTF8_ONE_BYTE_MAX && | 244 if (nextValue > _UTF8_ONE_BYTE_MAX && |
234 nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) { | 245 nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) { |
235 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK)); | 246 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK)); |
236 } else { | 247 } else { |
237 // if sequence-starting code unit, reposition cursor to start here | 248 // if sequence-starting code unit, reposition cursor to start here |
238 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) { | 249 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) { |
239 utf8EncodedBytesIterator.backup(); | 250 utf8EncodedBytesIterator.backup(); |
240 } | 251 } |
241 break; | 252 break; |
242 } | 253 } |
243 j++; | 254 j++; |
244 } | 255 } |
245 bool validSequence = (j == additionalBytes && ( | 256 bool validSequence = (j == additionalBytes && ( |
246 value < UNICODE_UTF16_RESERVED_LO || | 257 value < UNICODE_UTF16_RESERVED_LO || |
247 value > UNICODE_UTF16_RESERVED_HI)); | 258 value > UNICODE_UTF16_RESERVED_HI)); |
248 bool nonOverlong = | 259 bool nonOverlong = |
249 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) || | 260 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) || |
250 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) || | 261 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) || |
251 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX); | 262 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX); |
252 bool inRange = value <= UNICODE_VALID_RANGE_MAX; | 263 bool inRange = value <= UNICODE_VALID_RANGE_MAX; |
253 if (validSequence && nonOverlong && inRange) { | 264 if (validSequence && nonOverlong && inRange) { |
254 return value; | 265 _current = value; |
| 266 return true; |
255 } else if (replacementCodepoint != null) { | 267 } else if (replacementCodepoint != null) { |
256 return replacementCodepoint; | 268 _current = replacementCodepoint; |
| 269 return true; |
257 } else { | 270 } else { |
258 throw new ArgumentError( | 271 throw new ArgumentError( |
259 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}"); | 272 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}"); |
260 } | 273 } |
261 } | 274 } |
262 } | 275 } |
OLD | NEW |