| OLD | NEW |
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 const int _UTF8_ONE_BYTE_MAX = 0x7f; | 5 const int _UTF8_ONE_BYTE_MAX = 0x7f; |
| 6 const int _UTF8_TWO_BYTE_MAX = 0x7ff; | 6 const int _UTF8_TWO_BYTE_MAX = 0x7ff; |
| 7 const int _UTF8_THREE_BYTE_MAX = 0xffff; | 7 const int _UTF8_THREE_BYTE_MAX = 0xffff; |
| 8 | 8 |
| 9 const int _UTF8_LO_SIX_BIT_MASK = 0x3f; | 9 const int _UTF8_LO_SIX_BIT_MASK = 0x3f; |
| 10 | 10 |
| (...skipping 118 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 129 */ | 129 */ |
| 130 class IterableUtf8Decoder extends Iterable<int> { | 130 class IterableUtf8Decoder extends Iterable<int> { |
| 131 final List<int> bytes; | 131 final List<int> bytes; |
| 132 final int offset; | 132 final int offset; |
| 133 final int length; | 133 final int length; |
| 134 final int replacementCodepoint; | 134 final int replacementCodepoint; |
| 135 | 135 |
| 136 IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null, | 136 IterableUtf8Decoder(this.bytes, [this.offset = 0, this.length = null, |
| 137 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]); | 137 this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]); |
| 138 | 138 |
| 139 Utf8Decoder iterator() => new Utf8Decoder(bytes, offset, length, | 139 Utf8Decoder get iterator => |
| 140 replacementCodepoint); | 140 new Utf8Decoder(bytes, offset, length, replacementCodepoint); |
| 141 } | 141 } |
| 142 | 142 |
| 143 /** | 143 /** |
| 144 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The | 144 * Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The |
| 145 * parameters can set an offset into a list of bytes (as int), limit the length | 145 * parameters can set an offset into a list of bytes (as int), limit the length |
| 146 * of the values to be decoded, and override the default Unicode replacement | 146 * of the values to be decoded, and override the default Unicode replacement |
| 147 * character. Set the replacementCodepoint to null to throw an | 147 * character. Set the replacementCodepoint to null to throw an |
| 148 * ArgumentError rather than replace the bad value. The return value | 148 * ArgumentError rather than replace the bad value. The return value |
| 149 * from this method can be used as an Iterable (e.g. in a for-loop). | 149 * from this method can be used as an Iterable (e.g. in a for-loop). |
| 150 */ | 150 */ |
| 151 class Utf8Decoder implements Iterator<int> { | 151 class Utf8Decoder implements Iterator<int> { |
| 152 final _ListRangeIterator utf8EncodedBytesIterator; | 152 final _ListRangeIterator utf8EncodedBytesIterator; |
| 153 final int replacementCodepoint; | 153 final int replacementCodepoint; |
| 154 int _current = -1; |
| 154 | 155 |
| 155 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, | 156 Utf8Decoder(List<int> utf8EncodedBytes, [int offset = 0, int length, |
| 156 this.replacementCodepoint = | 157 this.replacementCodepoint = |
| 157 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : | 158 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
| 158 utf8EncodedBytesIterator = (new _ListRange(utf8EncodedBytes, offset, | 159 utf8EncodedBytesIterator = |
| 159 length)).iterator(); | 160 (new _ListRange(utf8EncodedBytes, offset, length)).iterator; |
| 160 | 161 |
| 161 | 162 |
| 162 Utf8Decoder._fromListRangeIterator(_ListRange source, [ | 163 Utf8Decoder._fromListRangeIterator(_ListRange source, [ |
| 163 this.replacementCodepoint = | 164 this.replacementCodepoint = |
| 164 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : | 165 UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
| 165 utf8EncodedBytesIterator = source.iterator(); | 166 utf8EncodedBytesIterator = source.iterator; |
| 166 | 167 |
| 167 /** Decode the remainder of the characters in this decoder | 168 /** Decode the remainder of the characters in this decoder |
| 168 * into a [List<int>]. | 169 * into a [List<int>]. |
| 169 */ | 170 */ |
| 170 List<int> decodeRest() { | 171 List<int> decodeRest() { |
| 171 List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining); | 172 List<int> codepoints = new List<int>(utf8EncodedBytesIterator.remaining); |
| 172 int i = 0; | 173 int i = 0; |
| 173 while (hasNext) { | 174 while (moveNext()) { |
| 174 codepoints[i++] = next(); | 175 codepoints[i++] = current; |
| 175 } | 176 } |
| 176 if (i == codepoints.length) { | 177 if (i == codepoints.length) { |
| 177 return codepoints; | 178 return codepoints; |
| 178 } else { | 179 } else { |
| 179 List<int> truncCodepoints = new List<int>(i); | 180 List<int> truncCodepoints = new List<int>(i); |
| 180 truncCodepoints.setRange(0, i, codepoints); | 181 truncCodepoints.setRange(0, i, codepoints); |
| 181 return truncCodepoints; | 182 return truncCodepoints; |
| 182 } | 183 } |
| 183 } | 184 } |
| 184 | 185 |
| 185 bool get hasNext => utf8EncodedBytesIterator.hasNext; | 186 int get current { |
| 187 if (_current == -1) { |
| 188 // TODO(floitsch): bad error message. |
| 189 throw new StateError("No more elements"); |
| 190 } |
| 191 return _current; |
| 192 } |
| 186 | 193 |
| 187 int next() { | 194 bool moveNext() { |
| 188 int value = utf8EncodedBytesIterator.next(); | 195 _current = -1; |
| 196 |
| 197 if (!utf8EncodedBytesIterator.moveNext()) return false; |
| 198 |
| 199 int value = utf8EncodedBytesIterator.current; |
| 189 int additionalBytes = 0; | 200 int additionalBytes = 0; |
| 190 | 201 |
| 191 if (value < 0) { | 202 if (value < 0) { |
| 192 if (replacementCodepoint != null) { | 203 if (replacementCodepoint != null) { |
| 193 return replacementCodepoint; | 204 _current = replacementCodepoint; |
| 205 return true; |
| 194 } else { | 206 } else { |
| 195 throw new ArgumentError( | 207 throw new ArgumentError( |
| 196 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); | 208 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); |
| 197 } | 209 } |
| 198 } else if (value <= _UTF8_ONE_BYTE_MAX) { | 210 } else if (value <= _UTF8_ONE_BYTE_MAX) { |
| 199 return value; | 211 _current = value; |
| 212 return true; |
| 200 } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) { | 213 } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) { |
| 201 if (replacementCodepoint != null) { | 214 if (replacementCodepoint != null) { |
| 202 return replacementCodepoint; | 215 _current = replacementCodepoint; |
| 216 return true; |
| 203 } else { | 217 } else { |
| 204 throw new ArgumentError( | 218 throw new ArgumentError( |
| 205 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); | 219 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); |
| 206 } | 220 } |
| 207 } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) { | 221 } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) { |
| 208 value -= _UTF8_FIRST_BYTE_OF_TWO_BASE; | 222 value -= _UTF8_FIRST_BYTE_OF_TWO_BASE; |
| 209 additionalBytes = 1; | 223 additionalBytes = 1; |
| 210 } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) { | 224 } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) { |
| 211 value -= _UTF8_FIRST_BYTE_OF_THREE_BASE; | 225 value -= _UTF8_FIRST_BYTE_OF_THREE_BASE; |
| 212 additionalBytes = 2; | 226 additionalBytes = 2; |
| 213 } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) { | 227 } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) { |
| 214 value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE; | 228 value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE; |
| 215 additionalBytes = 3; | 229 additionalBytes = 3; |
| 216 } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) { | 230 } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) { |
| 217 value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE; | 231 value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE; |
| 218 additionalBytes = 4; | 232 additionalBytes = 4; |
| 219 } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) { | 233 } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) { |
| 220 value -= _UTF8_FIRST_BYTE_OF_SIX_BASE; | 234 value -= _UTF8_FIRST_BYTE_OF_SIX_BASE; |
| 221 additionalBytes = 5; | 235 additionalBytes = 5; |
| 222 } else if (replacementCodepoint != null) { | 236 } else if (replacementCodepoint != null) { |
| 223 return replacementCodepoint; | 237 _current = replacementCodepoint; |
| 238 return true; |
| 224 } else { | 239 } else { |
| 225 throw new ArgumentError( | 240 throw new ArgumentError( |
| 226 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); | 241 "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); |
| 227 } | 242 } |
| 228 int j = 0; | 243 int j = 0; |
| 229 while (j < additionalBytes && utf8EncodedBytesIterator.hasNext) { | 244 while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) { |
| 230 int nextValue = utf8EncodedBytesIterator.next(); | 245 int nextValue = utf8EncodedBytesIterator.current; |
| 231 if (nextValue > _UTF8_ONE_BYTE_MAX && | 246 if (nextValue > _UTF8_ONE_BYTE_MAX && |
| 232 nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) { | 247 nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) { |
| 233 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK)); | 248 value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK)); |
| 234 } else { | 249 } else { |
| 235 // if sequence-starting code unit, reposition cursor to start here | 250 // if sequence-starting code unit, reposition cursor to start here |
| 236 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) { | 251 if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) { |
| 237 utf8EncodedBytesIterator.backup(); | 252 utf8EncodedBytesIterator.backup(); |
| 238 } | 253 } |
| 239 break; | 254 break; |
| 240 } | 255 } |
| 241 j++; | 256 j++; |
| 242 } | 257 } |
| 243 bool validSequence = (j == additionalBytes && ( | 258 bool validSequence = (j == additionalBytes && ( |
| 244 value < UNICODE_UTF16_RESERVED_LO || | 259 value < UNICODE_UTF16_RESERVED_LO || |
| 245 value > UNICODE_UTF16_RESERVED_HI)); | 260 value > UNICODE_UTF16_RESERVED_HI)); |
| 246 bool nonOverlong = | 261 bool nonOverlong = |
| 247 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) || | 262 (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) || |
| 248 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) || | 263 (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) || |
| 249 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX); | 264 (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX); |
| 250 bool inRange = value <= UNICODE_VALID_RANGE_MAX; | 265 bool inRange = value <= UNICODE_VALID_RANGE_MAX; |
| 251 if (validSequence && nonOverlong && inRange) { | 266 if (validSequence && nonOverlong && inRange) { |
| 252 return value; | 267 _current = value; |
| 268 return true; |
| 253 } else if (replacementCodepoint != null) { | 269 } else if (replacementCodepoint != null) { |
| 254 return replacementCodepoint; | 270 _current = replacementCodepoint; |
| 271 return true; |
| 255 } else { | 272 } else { |
| 256 throw new ArgumentError( | 273 throw new ArgumentError( |
| 257 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}"); | 274 "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}"); |
| 258 } | 275 } |
| 259 } | 276 } |
| 260 } | 277 } |
| OLD | NEW |