| OLD | NEW |
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 part of dart.utf; | 5 part of dart.utf; |
| 6 | 6 |
| 7 const int _UTF8_ONE_BYTE_MAX = 0x7f; | 7 const int _UTF8_ONE_BYTE_MAX = 0x7f; |
| 8 const int _UTF8_TWO_BYTE_MAX = 0x7ff; | 8 const int _UTF8_TWO_BYTE_MAX = 0x7ff; |
| 9 const int _UTF8_THREE_BYTE_MAX = 0xffff; | 9 const int _UTF8_THREE_BYTE_MAX = 0xffff; |
| 10 | 10 |
| (...skipping 68 matching lines...) | (...skipping 68 matching lines...) |
| 79 encodedLength++; | 79 encodedLength++; |
| 80 } else if (value <= _UTF8_TWO_BYTE_MAX) { | 80 } else if (value <= _UTF8_TWO_BYTE_MAX) { |
| 81 encodedLength += 2; | 81 encodedLength += 2; |
| 82 } else if (value <= _UTF8_THREE_BYTE_MAX) { | 82 } else if (value <= _UTF8_THREE_BYTE_MAX) { |
| 83 encodedLength += 3; | 83 encodedLength += 3; |
| 84 } else if (value <= UNICODE_VALID_RANGE_MAX) { | 84 } else if (value <= UNICODE_VALID_RANGE_MAX) { |
| 85 encodedLength += 4; | 85 encodedLength += 4; |
| 86 } | 86 } |
| 87 } | 87 } |
| 88 | 88 |
| 89 List<int> encoded = new List<int>(encodedLength); | 89 List<int> encoded = new List<int>.fixedLength(encodedLength); |
| 90 int insertAt = 0; | 90 int insertAt = 0; |
| 91 for (int value in source) { | 91 for (int value in source) { |
| 92 if (value < 0 || value > UNICODE_VALID_RANGE_MAX) { | 92 if (value < 0 || value > UNICODE_VALID_RANGE_MAX) { |
| 93 encoded.setRange(insertAt, 3, [0xef, 0xbf, 0xbd]); | 93 encoded.setRange(insertAt, 3, [0xef, 0xbf, 0xbd]); |
| 94 insertAt += 3; | 94 insertAt += 3; |
| 95 } else if (value <= _UTF8_ONE_BYTE_MAX) { | 95 } else if (value <= _UTF8_ONE_BYTE_MAX) { |
| 96 encoded[insertAt] = value; | 96 encoded[insertAt] = value; |
| 97 insertAt++; | 97 insertAt++; |
| 98 } else if (value <= _UTF8_TWO_BYTE_MAX) { | 98 } else if (value <= _UTF8_TWO_BYTE_MAX) { |
| 99 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_TWO_BASE | ( | 99 encoded[insertAt] = _UTF8_FIRST_BYTE_OF_TWO_BASE | ( |
| (...skipping 22 matching lines...) | (...skipping 22 matching lines...) |
| 122 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { | 122 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
| 123 return new Utf8Decoder(utf8EncodedBytes, offset, length, | 123 return new Utf8Decoder(utf8EncodedBytes, offset, length, |
| 124 replacementCodepoint).decodeRest(); | 124 replacementCodepoint).decodeRest(); |
| 125 } | 125 } |
| 126 | 126 |
| 127 /** | 127 /** |
| 128 * Return type of [decodeUtf8AsIterable] and variants. The Iterable type | 128 * Return type of [decodeUtf8AsIterable] and variants. The Iterable type |
| 129 * provides an iterator on demand and the iterator will only translate bytes | 129 * provides an iterator on demand and the iterator will only translate bytes |
| 130 * as requested by the user of the iterator. (Note: results are not cached.) | 130 * as requested by the user of the iterator. (Note: results are not cached.) |
| 131 */ | 131 */ |
| 132 class IterableUtf8Decoder implements Iterable<int> { | 132 // TODO(floitsch): Consider removing the extend and switch to implements since |
| 133 // that's cheaper to allocate. |
| 134 class IterableUtf8Decoder extends Iterable<int> { |
| 133 final List<int> bytes; | 135 final List<int> bytes; |
| 134 final int offset; | 136 final int offset; |
| 135 final int length; | 137 final int length; |
| 136 final int replacementCodepoint; | 138 final int replacementCodepoint; |
| 137 | 139 |
| 138 IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null, | 140 IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null, |
| 139 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]); | 141 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]); |
| 140 | 142 |
| 141 Utf8Decoder iterator() => new Utf8Decoder(bytes, offset, length, | 143 Utf8Decoder get iterator => |
| 142 replacementCodepoint); | 144 new Utf8Decoder(bytes, offset, length, replacementCodepoint); |
| 143 } | 145 } |
| 144 | 146 |
| 145 /** | 147 /** |
| 146 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The | 148 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The |
| 147 * parameters can set an offset into a list of bytes (as int), limit the length | 149 * parameters can set an offset into a list of bytes (as int), limit the length |
| 148 * of the values to be decoded, and override the default Unicode replacement | 150 * of the values to be decoded, and override the default Unicode replacement |
| 149 * character. Set the replacementCharacter to null to throw an | 151 * character. Set the replacementCharacter to null to throw an |
| 150 * ArgumentError rather than replace the bad value. The return value | 152 * ArgumentError rather than replace the bad value. The return value |
| 151 * from this method can be used as an Iterable (e.g. in a for-loop). | 153 * from this method can be used as an Iterable (e.g. in a for-loop). |
| 152 */ | 154 */ |
| 153 class Utf8Decoder implements Iterator<int> { | 155 class Utf8Decoder implements Iterator<int> { |
| 154 final _ListRangeIterator utf8EncodedBytesIterator; | 156 final _ListRangeIterator utf8EncodedBytesIterator; |
| 155 final int replacementCodepoint; | 157 final int replacementCodepoint; |
| 158 int _current = null; |
| 156 | 159 |
| 157 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, | 160 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, |
| 158 this.replacementCodepoint = | 161 this.replacementCodepoint = |
| 159 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : | 162 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
| 160 utf8EncodedBytesIterator = (new _ListRange(utf8EncodedBytes, offset, | 163 utf8EncodedBytesIterator = |
| 161 length)).iterator(); | 164 (new _ListRange(utf8EncodedBytes, offset, length)).iterator; |
| 162 | 165 |
| 163 | 166 |
| 164 Utf8Decoder._fromListRangeIterator(_ListRange source, [ | 167 Utf8Decoder._fromListRangeIterator(_ListRange source, [ |
| 165 this.replacementCodepoint = | 168 this.replacementCodepoint = |
| 166 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : | 169 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
| 167 utf8EncodedBytesIterator = source.iterator(); | 170 utf8EncodedBytesIterator = source.iterator; |
| 168 | 171 |
| 169 /** Decode the remainder of the characters in this decoder | 172 /** Decode the remainder of the characters in this decoder |
| 170 * into a [List<int>]. | 173 * into a [List<int>]. |
| 171 */ | 174 */ |
| 172 List<int> decodeRest() { | 175 List<int> decodeRest() { |
| 173 List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining); | 176 List<int> codepoints = new List<int>.fixedLength(utf8EncodedBytesIterator.remaining); |
| 174 int i = 0; | 177 int i = 0; |
| 175 while (hasNext) { | 178 while (moveNext()) { |
| 176 codepoints[i++] = next(); | 179 codepoints[i++] = current; |
| 177 } | 180 } |
| 178 if (i == codepoints.length) { | 181 if (i == codepoints.length) { |
| 179 return codepoints; | 182 return codepoints; |
| 180 } else { | 183 } else { |
| 181 List<int> truncCodepoints = new List<int>(i); | 184 List<int> truncCodepoints = new List<int>.fixedLength(i); |
| 182 truncCodepoints.setRange(0, i, codepoints); | 185 truncCodepoints.setRange(0, i, codepoints); |
| 183 return truncCodepoints; | 186 return truncCodepoints; |
| 184 } | 187 } |
| 185 } | 188 } |
| 186 | 189 |
| 187 bool get hasNext => utf8EncodedBytesIterator.hasNext; | 190 int get current => _current; |
| 188 | 191 |
| 189 int next() { | 192 bool moveNext() { |
| 190 int value = utf8EncodedBytesIterator.next(); | 193 _current = null; |
| 194 |
| 195 if (!utf8EncodedBytesIterator.moveNext()) return false; |
| 196 |
| 197 int value = utf8EncodedBytesIterator.current; |
| 191 int additionalBytes = 0; | 198 int additionalBytes = 0; |
| 192 | 199 |
| 193 if (value < 0) { | 200 if (value < 0) { |
| 194 if (replacementCodepoint != null) { | 201 if (replacementCodepoint != null) { |
| 195 return replacementCodepoint; | 202 _current = replacementCodepoint; |
| 203 return true; |
| 196 } else { | 204 } else { |
| 197 throw new ArgumentError( | 205 throw new ArgumentError( |
| 198 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); | 206 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); |
| 199 } | 207 } |
| 200 } else if (value <= _UTF8_ONE_BYTE_MAX) { | 208 } else if (value <= _UTF8_ONE_BYTE_MAX) { |
| 201 return value; | 209 _current = value; |
| 210 return true; |
| 202 } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) { | 211 } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) { |
| 203 if (replacementCodepoint != null) { | 212 if (replacementCodepoint != null) { |
| 204 return replacementCodepoint; | 213 _current = replacementCodepoint; |
| 214 return true; |
| 205 } else { | 215 } else { |
| 206 throw new ArgumentError( | 216 throw new ArgumentError( |
| 207 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); | 217 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); |
| 208 } | 218 } |
| 209 } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) { | 219 } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) { |
| 210 value -= _UTF8_FIRST_BYTE_OF_TWO_BASE; | 220 value -= _UTF8_FIRST_BYTE_OF_TWO_BASE; |
| 211 additionalBytes = 1; | 221 additionalBytes = 1; |
| 212 } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) { | 222 } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) { |
| 213 value -= _UTF8_FIRST_BYTE_OF_THREE_BASE; | 223 value -= _UTF8_FIRST_BYTE_OF_THREE_BASE; |
| 214 additionalBytes = 2; | 224 additionalBytes = 2; |
| 215 } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) { | 225 } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) { |
| 216 value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE; | 226 value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE; |
| 217 additionalBytes = 3; | 227 additionalBytes = 3; |
| 218 } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) { | 228 } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) { |
| 219 value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE; | 229 value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE; |
| 220 additionalBytes = 4; | 230 additionalBytes = 4; |
| 221 } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) { | 231 } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) { |
| 222 value -= _UTF8_FIRST_BYTE_OF_SIX_BASE; | 232 value -= _UTF8_FIRST_BYTE_OF_SIX_BASE; |
| 223 additionalBytes = 5; | 233 additionalBytes = 5; |
| 224 } else if (replacementCodepoint != null) { | 234 } else if (replacementCodepoint != null) { |
| 225 return replacementCodepoint; | 235 _current = replacementCodepoint; |
| 236 return true; |
| 226 } else { | 237 } else { |
| 227 throw new ArgumentError( | 238 throw new ArgumentError( |
| 228 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); | 239 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); |
| 229 } | 240 } |
| 230 int j = 0; | 241 int j = 0; |
| 231 while (j < additionalBytes && utf8EncodedBytesIterator.hasNext) { | 242 while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) { |
| 232 int nextValue = utf8EncodedBytesIterator.next(); | 243 int nextValue = utf8EncodedBytesIterator.current; |
| 233 if (nextValue > _UTF8_ONE_BYTE_MAX && | 244 if (nextValue > _UTF8_ONE_BYTE_MAX && |
| 234 nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) { | 245 nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) { |
| 235 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK)); | 246 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK)); |
| 236 } else { | 247 } else { |
| 237 // if sequence-starting code unit, reposition cursor to start here | 248 // if sequence-starting code unit, reposition cursor to start here |
| 238 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) { | 249 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) { |
| 239 utf8EncodedBytesIterator.backup(); | 250 utf8EncodedBytesIterator.backup(); |
| 240 } | 251 } |
| 241 break; | 252 break; |
| 242 } | 253 } |
| 243 j++; | 254 j++; |
| 244 } | 255 } |
| 245 bool validSequence = (j == additionalBytes && ( | 256 bool validSequence = (j == additionalBytes && ( |
| 246 value < UNICODE_UTF16_RESERVED_LO || | 257 value < UNICODE_UTF16_RESERVED_LO || |
| 247 value > UNICODE_UTF16_RESERVED_HI)); | 258 value > UNICODE_UTF16_RESERVED_HI)); |
| 248 bool nonOverlong = | 259 bool nonOverlong = |
| 249 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) || | 260 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) || |
| 250 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) || | 261 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) || |
| 251 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX); | 262 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX); |
| 252 bool inRange = value <= UNICODE_VALID_RANGE_MAX; | 263 bool inRange = value <= UNICODE_VALID_RANGE_MAX; |
| 253 if (validSequence && nonOverlong && inRange) { | 264 if (validSequence && nonOverlong && inRange) { |
| 254 return value; | 265 _current = value; |
| 266 return true; |
| 255 } else if (replacementCodepoint != null) { | 267 } else if (replacementCodepoint != null) { |
| 256 return replacementCodepoint; | 268 _current = replacementCodepoint; |
| 269 return true; |
| 257 } else { | 270 } else { |
| 258 throw new ArgumentError( | 271 throw new ArgumentError( |
| 259 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}"); | 272 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}"); |
| 260 } | 273 } |
| 261 } | 274 } |
| 262 } | 275 } |
| OLD | NEW |