| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | |
| 2 // for details. All rights reserved. Use of this source code is governed by a | |
| 3 // BSD-style license that can be found in the LICENSE file. | |
| 4 | |
| 5 part of utf; | |
| 6 | |
/**
 * Lazily decodes UTF-32 [bytes] into codepoints. Because the result is an
 * iterable, the consumer decodes only as much of the input as it iterates
 * over. Every iteration gets a fresh [Utf32BytesDecoder], which determines the
 * byte order from a leading BOM (and strips it) or falls back to big-endian.
 *
 * [offset] and [length] select the range of [bytes] to decode.
 * Set [replacementCodepoint] to null to throw an ArgumentError
 * rather than replace a bad value.
 */
IterableUtf32Decoder decodeUtf32AsIterable(List<int> bytes, [
    int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Invoked once per `iterator` request, so results are never shared.
  Utf32BytesDecoder createDecoder() =>
      new Utf32BytesDecoder(bytes, offset, length, replacementCodepoint);
  return new IterableUtf32Decoder._(createDecoder);
}
| 20 | |
/**
 * Lazily decodes UTF-32BE [bytes] into codepoints. Because the result is an
 * iterable, the consumer decodes only as much of the input as it iterates
 * over. A leading big-endian BOM is stripped by default; pass false for
 * [stripBom] to keep it.
 *
 * [offset] and [length] select the range of [bytes] to decode.
 * Set [replacementCodepoint] to null to throw an ArgumentError
 * rather than replace a bad value.
 */
IterableUtf32Decoder decodeUtf32beAsIterable(List<int> bytes, [
    int offset = 0, int length, bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Invoked once per `iterator` request, so results are never shared.
  Utf32BytesDecoder createDecoder() => new Utf32beBytesDecoder(
      bytes, offset, length, stripBom, replacementCodepoint);
  return new IterableUtf32Decoder._(createDecoder);
}
| 35 | |
/**
 * Lazily decodes UTF-32LE [bytes] into codepoints. Because the result is an
 * iterable, the consumer decodes only as much of the input as it iterates
 * over. A leading little-endian BOM is stripped by default; pass false for
 * [stripBom] to keep it.
 *
 * [offset] and [length] select the range of [bytes] to decode.
 * Set [replacementCodepoint] to null to throw an ArgumentError
 * rather than replace a bad value.
 */
IterableUtf32Decoder decodeUtf32leAsIterable(List<int> bytes, [
    int offset = 0, int length, bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Invoked once per `iterator` request, so results are never shared.
  Utf32BytesDecoder createDecoder() => new Utf32leBytesDecoder(
      bytes, offset, length, stripBom, replacementCodepoint);
  return new IterableUtf32Decoder._(createDecoder);
}
| 50 | |
/**
 * Produces a String from a sequence of UTF-32 encoded bytes. The optional
 * parameters allow starting at an [offset] into the list of bytes, limiting
 * the [length] of the range to be decoded, and overriding the default Unicode
 * replacement character. Set [replacementCodepoint] to null to throw an
 * ArgumentError rather than replace a bad value.
 */
String decodeUtf32(List<int> bytes, [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf32BytesDecoder decoder =
      new Utf32BytesDecoder(bytes, offset, length, replacementCodepoint);
  List<int> codepoints = decoder.decodeRest();
  return new String.fromCharCodes(codepoints);
}
/**
 * Produces a String from a sequence of UTF-32BE encoded bytes. The optional
 * parameters allow starting at an [offset] into the list of bytes, limiting
 * the [length] of the range to be decoded, keeping a leading BOM (pass false
 * for [stripBom]), and overriding the default Unicode replacement character.
 * Set [replacementCodepoint] to null to throw an ArgumentError rather than
 * replace a bad value.
 */
String decodeUtf32be(
    List<int> bytes, [int offset = 0, int length, bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf32beBytesDecoder decoder = new Utf32beBytesDecoder(
      bytes, offset, length, stripBom, replacementCodepoint);
  List<int> codepoints = decoder.decodeRest();
  return new String.fromCharCodes(codepoints);
}
| 75 | |
/**
 * Produces a String from a sequence of UTF-32LE encoded bytes. The optional
 * parameters allow starting at an [offset] into the list of bytes, limiting
 * the [length] of the range to be decoded, keeping a leading BOM (pass false
 * for [stripBom]), and overriding the default Unicode replacement character.
 * Set [replacementCodepoint] to null to throw an ArgumentError rather than
 * replace a bad value.
 */
String decodeUtf32le(
    List<int> bytes, [int offset = 0, int length, bool stripBom = true,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  Utf32leBytesDecoder decoder = new Utf32leBytesDecoder(
      bytes, offset, length, stripBom, replacementCodepoint);
  List<int> codepoints = decoder.decodeRest();
  return new String.fromCharCodes(codepoints);
}
| 88 | |
/**
 * Produces a list of UTF-32 encoded bytes for [str]. The result is big-endian
 * and is prefixed with a big-endian byte-order marker, because this simply
 * delegates to [encodeUtf32be] with BOM writing enabled.
 */
List<int> encodeUtf32(String str) {
  return encodeUtf32be(str, true);
}
| 95 | |
/**
 * Produces a list of UTF-32BE encoded bytes for [str]. By default, this method
 * produces UTF-32BE bytes with no BOM; pass true for [writeBOM] to prefix the
 * result with the four-byte big-endian BOM.
 *
 * The returned list is fixed-length: four bytes per codepoint, plus four more
 * when the BOM is written.
 */
List<int> encodeUtf32be(String str, [bool writeBOM = false]) {
  List<int> codepoints = stringToCodepoints(str);
  List<int> bytes = new List<int>(4 * codepoints.length +
      (writeBOM ? 4 : 0));
  // Index of the next byte to be written.
  int cursor = 0;
  if (writeBOM) {
    bytes[0] = 0;
    bytes[1] = 0;
    bytes[2] = UNICODE_UTF_BOM_HI;
    bytes[3] = UNICODE_UTF_BOM_LO;
    cursor = 4;
  }
  for (int codepoint in codepoints) {
    // Most significant byte first.
    bytes[cursor] = (codepoint >> 24) & UNICODE_BYTE_ZERO_MASK;
    bytes[cursor + 1] = (codepoint >> 16) & UNICODE_BYTE_ZERO_MASK;
    bytes[cursor + 2] = (codepoint >> 8) & UNICODE_BYTE_ZERO_MASK;
    bytes[cursor + 3] = codepoint & UNICODE_BYTE_ZERO_MASK;
    cursor += 4;
  }
  return bytes;
}
| 119 | |
/**
 * Produces a list of UTF-32LE encoded bytes for [str]. By default, this method
 * produces UTF-32LE bytes with no BOM; pass true for [writeBOM] to prefix the
 * result with the four-byte little-endian BOM.
 *
 * The returned list is fixed-length: four bytes per codepoint, plus four more
 * when the BOM is written.
 */
List<int> encodeUtf32le(String str, [bool writeBOM = false]) {
  List<int> codepoints = stringToCodepoints(str);
  List<int> bytes = new List<int>(4 * codepoints.length +
      (writeBOM ? 4 : 0));
  // Index of the next byte to be written.
  int cursor = 0;
  if (writeBOM) {
    bytes[0] = UNICODE_UTF_BOM_LO;
    bytes[1] = UNICODE_UTF_BOM_HI;
    bytes[2] = 0;
    bytes[3] = 0;
    cursor = 4;
  }
  for (int codepoint in codepoints) {
    // Least significant byte first.
    bytes[cursor] = codepoint & UNICODE_BYTE_ZERO_MASK;
    bytes[cursor + 1] = (codepoint >> 8) & UNICODE_BYTE_ZERO_MASK;
    bytes[cursor + 2] = (codepoint >> 16) & UNICODE_BYTE_ZERO_MASK;
    bytes[cursor + 3] = (codepoint >> 24) & UNICODE_BYTE_ZERO_MASK;
    cursor += 4;
  }
  return bytes;
}
| 143 | |
/**
 * Identifies whether the range of [utf32EncodedBytes] that starts at [offset]
 * (and optionally spans [length] bytes) begins with a UTF-32 byte-order marker
 * (BOM) of either endianness. The big-endian check runs first.
 */
bool hasUtf32Bom(
    List<int> utf32EncodedBytes, [int offset = 0, int length]) {
  if (hasUtf32beBom(utf32EncodedBytes, offset, length)) {
    return true;
  }
  return hasUtf32leBom(utf32EncodedBytes, offset, length);
}
| 153 | |
/**
 * Identifies whether the range of [utf32EncodedBytes] that starts at [offset]
 * (and optionally spans [length] bytes) begins with a big-endian byte-order
 * marker (BOM).
 *
 * Returns false, rather than throwing a RangeError, when the requested range
 * does not contain four readable bytes, including when [offset] is negative
 * or when `offset + length` reaches past the end of the list.
 */
bool hasUtf32beBom(List<int> utf32EncodedBytes, [int offset = 0, int length]) {
  int end = length != null ? offset + length : utf32EncodedBytes.length;
  // Never read past the list, even when the caller's range claims more bytes
  // than exist. Without this clamp the outcome depended on the content:
  // a mismatching first byte returned false, a matching prefix threw.
  if (end > utf32EncodedBytes.length) {
    end = utf32EncodedBytes.length;
  }
  return offset >= 0 && (offset + 4) <= end &&
      utf32EncodedBytes[offset] == 0 && utf32EncodedBytes[offset + 1] == 0 &&
      utf32EncodedBytes[offset + 2] == UNICODE_UTF_BOM_HI &&
      utf32EncodedBytes[offset + 3] == UNICODE_UTF_BOM_LO;
}
| 165 | |
/**
 * Identifies whether the range of [utf32EncodedBytes] that starts at [offset]
 * (and optionally spans [length] bytes) begins with a little-endian byte-order
 * marker (BOM).
 *
 * Returns false, rather than throwing a RangeError, when the requested range
 * does not contain four readable bytes, including when [offset] is negative
 * or when `offset + length` reaches past the end of the list.
 */
bool hasUtf32leBom(List<int> utf32EncodedBytes, [int offset = 0, int length]) {
  int end = length != null ? offset + length : utf32EncodedBytes.length;
  // Never read past the list, even when the caller's range claims more bytes
  // than exist. Without this clamp the outcome depended on the content:
  // a mismatching first byte returned false, a matching prefix threw.
  if (end > utf32EncodedBytes.length) {
    end = utf32EncodedBytes.length;
  }
  return offset >= 0 && (offset + 4) <= end &&
      utf32EncodedBytes[offset] == UNICODE_UTF_BOM_LO &&
      utf32EncodedBytes[offset + 1] == UNICODE_UTF_BOM_HI &&
      utf32EncodedBytes[offset + 2] == 0 && utf32EncodedBytes[offset + 3] == 0;
}
| 177 | |
/**
 * Signature of a zero-argument factory that returns a [Utf32BytesDecoder].
 */
typedef Utf32BytesDecoder Utf32BytesDecoderProvider();

/**
 * Return type of [decodeUtf32AsIterable] and variants. The iterable asks its
 * provider for a new decoder every time an iterator is requested, and that
 * iterator only translates bytes as requested by its user.
 * (Note: results are not cached.)
 */
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class IterableUtf32Decoder extends IterableBase<int> {
  /** Factory invoked for each iterator request. */
  final Utf32BytesDecoderProvider codeunitsProvider;

  IterableUtf32Decoder._(this.codeunitsProvider);

  Utf32BytesDecoder get iterator {
    return codeunitsProvider();
  }
}
| 194 | |
/**
 * Abstract parent class that converts encoded bytes to codepoints.
 *
 * It wraps a byte-level [_ListRangeIterator] and exposes the same interface in
 * units of codepoints (four bytes each). Subclasses supply [decode], which
 * reads the next four bytes in their byte order.
 */
abstract class Utf32BytesDecoder implements _ListRangeIterator {
  /** Byte-level iterator over the range being decoded. */
  final _ListRangeIterator utf32EncodedBytesIterator;

  /** Substitute for undecodable input; null means throw instead. */
  final int replacementCodepoint;
  int _current = null;

  Utf32BytesDecoder._fromListRangeIterator(
      this.utf32EncodedBytesIterator, this.replacementCodepoint);

  /**
   * Picks the concrete decoder from a leading BOM: a big-endian or
   * little-endian BOM selects the matching decoder and is excluded from the
   * decoded range; without a BOM the bytes are treated as big-endian.
   */
  factory Utf32BytesDecoder(List<int> utf32EncodedBytes, [
      int offset = 0, int length,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
    if (length == null) {
      length = utf32EncodedBytes.length - offset;
    }
    if (hasUtf32beBom(utf32EncodedBytes, offset, length)) {
      // Step over the four BOM bytes; nothing is left for the decoder to strip.
      return new Utf32beBytesDecoder(utf32EncodedBytes, offset + 4, length - 4,
          false, replacementCodepoint);
    }
    if (hasUtf32leBom(utf32EncodedBytes, offset, length)) {
      return new Utf32leBytesDecoder(utf32EncodedBytes, offset + 4, length - 4,
          false, replacementCodepoint);
    }
    return new Utf32beBytesDecoder(utf32EncodedBytes, offset, length, false,
        replacementCodepoint);
  }

  /**
   * Decodes everything that has not been consumed yet and returns it as a
   * fixed-length list with one entry per codepoint.
   */
  List<int> decodeRest() {
    List<int> codepoints = new List<int>(remaining);
    for (int index = 0; moveNext(); index++) {
      codepoints[index] = current;
    }
    return codepoints;
  }

  int get current => _current;

  /**
   * Advances to the next codepoint. Returns false once no bytes are left.
   * A trailing group of fewer than four bytes, or a decoded value rejected by
   * [_validCodepoint], yields [replacementCodepoint]; if that is null an
   * ArgumentError is thrown instead.
   */
  bool moveNext() {
    _current = null;
    int bytesLeft = utf32EncodedBytesIterator.remaining;
    if (bytesLeft == 0) {
      return false;
    }
    if (bytesLeft < 4) {
      // Incomplete trailing group: consume it so iteration can terminate.
      utf32EncodedBytesIterator.skip(utf32EncodedBytesIterator.remaining);
      if (replacementCodepoint == null) {
        throw new ArgumentError(
            "Invalid UTF32 at ${utf32EncodedBytesIterator.position}");
      }
      _current = replacementCodepoint;
      return true;
    }
    int codepoint = decode();
    if (_validCodepoint(codepoint)) {
      _current = codepoint;
      return true;
    }
    if (replacementCodepoint == null) {
      throw new ArgumentError(
          "Invalid UTF32 at ${utf32EncodedBytesIterator.position}");
    }
    _current = replacementCodepoint;
    return true;
  }

  /** Position in codepoint units: the byte position divided by four. */
  int get position {
    return utf32EncodedBytesIterator.position ~/ 4;
  }

  /** Moves back [by] codepoints, i.e. four bytes each. */
  void backup([int by = 1]) {
    utf32EncodedBytesIterator.backup(4 * by);
  }

  /** Codepoints left; a partial trailing group counts as one. */
  int get remaining {
    return (utf32EncodedBytesIterator.remaining + 3) ~/ 4;
  }

  /** Skips [count] codepoints, i.e. four bytes each. */
  void skip([int count = 1]) {
    utf32EncodedBytesIterator.skip(4 * count);
  }

  /** Reads the next four bytes and combines them into one value. */
  int decode();
}
| 279 | |
/**
 * Converts UTF-32BE encoded bytes to codepoints by grouping 4 bytes,
 * most significant byte first, to produce the unicode codepoint.
 */
class Utf32beBytesDecoder extends Utf32BytesDecoder {
  /**
   * Decodes the range of [utf32EncodedBytes] given by [offset] and [length].
   * When [stripBom] is true and the range starts with a big-endian BOM, the
   * BOM is skipped up front.
   */
  Utf32beBytesDecoder(List<int> utf32EncodedBytes, [int offset = 0,
      int length, bool stripBom = true,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      super._fromListRangeIterator(
          (new _ListRange(utf32EncodedBytes, offset, length)).iterator,
          replacementCodepoint) {
    bool startsWithBom =
        stripBom && hasUtf32beBom(utf32EncodedBytes, offset, length);
    if (startsWithBom) {
      skip();
    }
  }

  int decode() {
    utf32EncodedBytesIterator.moveNext();
    int codepoint = utf32EncodedBytesIterator.current;
    // Shift in the three remaining, progressively less significant bytes.
    for (int byteIndex = 1; byteIndex < 4; byteIndex++) {
      utf32EncodedBytesIterator.moveNext();
      codepoint = (codepoint << 8) + utf32EncodedBytesIterator.current;
    }
    return codepoint;
  }
}
| 308 | |
/**
 * Converts UTF-32LE encoded bytes to codepoints by grouping 4 bytes,
 * least significant byte first, to produce the unicode codepoint.
 */
class Utf32leBytesDecoder extends Utf32BytesDecoder {
  /**
   * Decodes the range of [utf32EncodedBytes] given by [offset] and [length].
   * When [stripBom] is true and the range starts with a little-endian BOM,
   * the BOM is skipped up front.
   */
  Utf32leBytesDecoder(List<int> utf32EncodedBytes, [int offset = 0,
      int length, bool stripBom = true,
      int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) :
      super._fromListRangeIterator(
          (new _ListRange(utf32EncodedBytes, offset, length)).iterator,
          replacementCodepoint) {
    bool startsWithBom =
        stripBom && hasUtf32leBom(utf32EncodedBytes, offset, length);
    if (startsWithBom) {
      skip();
    }
  }

  int decode() {
    utf32EncodedBytesIterator.moveNext();
    int codepoint = utf32EncodedBytesIterator.current;
    // Each following byte is more significant than the previous one.
    for (int shift = 8; shift <= 24; shift += 8) {
      utf32EncodedBytesIterator.moveNext();
      codepoint += (utf32EncodedBytesIterator.current << shift);
    }
    return codepoint;
  }
}
| 337 | |
/**
 * Returns true when [codepoint] is non-negative and below
 * UNICODE_UTF16_RESERVED_LO, or lies strictly between
 * UNICODE_UTF16_RESERVED_HI and UNICODE_VALID_RANGE_MAX.
 */
// NOTE(review): the upper bound is exclusive. If UNICODE_VALID_RANGE_MAX is
// itself the largest valid code point, that value is rejected here - confirm
// against the constant's definition.
bool _validCodepoint(int codepoint) {
  bool belowReservedRange =
      codepoint >= 0 && codepoint < UNICODE_UTF16_RESERVED_LO;
  if (belowReservedRange) {
    return true;
  }
  return codepoint > UNICODE_UTF16_RESERVED_HI &&
      codepoint < UNICODE_VALID_RANGE_MAX;
}
| OLD | NEW |