OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. |
| 4 |
| 5 part of utf; |
| 6 |
| 7 /** |
| 8 * Decodes UTF-32 [bytes] lazily: every iterator of the returned iterable is a |
| 9 * fresh [Utf32BytesDecoder], so only as much input is converted as is consumed. |
| 10 * Byte order is taken from a leading BOM, which is always stripped; without a |
| 11 * BOM the input is read as big-endian. Set [replacementCodepoint] to null to |
| 12 * throw an ArgumentError rather than replace the bad value. |
| 13 */ |
| 14 IterableUtf32Decoder decodeUtf32AsIterable(List<int> bytes, [ |
| 15 int offset = 0, int length, |
| 16 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
| 17 return new IterableUtf32Decoder._( |
| 18 () => new Utf32BytesDecoder(bytes, offset, length, replacementCodepoint)); |
| 19 } |
| 20 |
| 21 /** |
| 22 * Decodes UTF-32BE [bytes] lazily: every iterator of the returned iterable is a |
| 23 * fresh [Utf32beBytesDecoder], so only as much input is converted as is consumed. |
| 24 * A leading big-endian BOM is skipped unless [stripBom] is set to false. |
| 25 * Set [replacementCodepoint] to null to throw an ArgumentError |
| 26 * rather than replace the bad value. |
| 27 */ |
| 28 IterableUtf32Decoder decodeUtf32beAsIterable(List<int> bytes, [ |
| 29 int offset = 0, int length, bool stripBom = true, |
| 30 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
| 31 return new IterableUtf32Decoder._( |
| 32 () => new Utf32beBytesDecoder(bytes, offset, length, stripBom, |
| 33 replacementCodepoint)); |
| 34 } |
| 35 |
| 36 /** |
| 37 * Decodes UTF-32LE [bytes] lazily: every iterator of the returned iterable is a |
| 38 * fresh [Utf32leBytesDecoder], so only as much input is converted as is consumed. |
| 39 * A leading little-endian BOM is skipped unless [stripBom] is set to false. |
| 40 * Set [replacementCodepoint] to null to throw an ArgumentError |
| 41 * rather than replace the bad value. |
| 42 */ |
| 43 IterableUtf32Decoder decodeUtf32leAsIterable(List<int> bytes, [ |
| 44 int offset = 0, int length, bool stripBom = true, |
| 45 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
| 46 return new IterableUtf32Decoder._( |
| 47 () => new Utf32leBytesDecoder(bytes, offset, length, stripBom, |
| 48 replacementCodepoint)); |
| 49 } |
| 50 |
| 51 /** |
| 52 * Produce a String from a sequence of UTF-32 encoded bytes. The parameters |
| 53 * allow an offset into a list of bytes (as int), limiting the length of the |
| 54 * values to be decoded and the ability to override the default Unicode |
| 55 * replacement character. Set [replacementCodepoint] to null to throw an |
| 56 * ArgumentError rather than replace the bad value. |
| 57 */ |
| 58 String decodeUtf32(List<int> bytes, [int offset = 0, int length, |
| 59 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
| 60 return new String.fromCharCodes((new Utf32BytesDecoder(bytes, offset, length, |
| 61 replacementCodepoint)).decodeRest()); |
| 62 } |
| 63 /** |
| 64 * Produce a String from a sequence of UTF-32BE encoded bytes. The parameters |
| 65 * allow an offset into a list of bytes (as int), limiting the length of the |
| 66 * values to be decoded, keeping a leading BOM ([stripBom] set to false) and |
| 67 * overriding the default Unicode replacement character. Set |
| 68 * [replacementCodepoint] to null to throw an ArgumentError rather than replace the bad value. |
| 69 */ |
| 70 String decodeUtf32be( |
| 71 List<int> bytes, [int offset = 0, int length, bool stripBom = true, |
| 72 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) => |
| 73 new String.fromCharCodes((new Utf32beBytesDecoder(bytes, offset, length, |
| 74 stripBom, replacementCodepoint)).decodeRest()); |
| 75 |
| 76 /** |
| 77 * Produce a String from a sequence of UTF-32LE encoded bytes. The parameters |
| 78 * allow an offset into a list of bytes (as int), limiting the length of the |
| 79 * values to be decoded, keeping a leading BOM ([stripBom] set to false) and |
| 80 * overriding the default Unicode replacement character. Set |
| 81 * [replacementCodepoint] to null to throw an ArgumentError rather than replace the bad value. |
| 82 */ |
| 83 String decodeUtf32le( |
| 84 List<int> bytes, [int offset = 0, int length, bool stripBom = true, |
| 85 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) => |
| 86 new String.fromCharCodes((new Utf32leBytesDecoder(bytes, offset, length, |
| 87 stripBom, replacementCodepoint)).decodeRest()); |
| 88 |
| 89 /** |
| 90 * Produce a list of UTF-32 encoded bytes for [str]. Delegates to [encodeUtf32be] |
| 91 * with writeBOM set, so the result starts with a big-endian byte-order marker. |
| 92 */ |
| 93 List<int> encodeUtf32(String str) => |
| 94 encodeUtf32be(str, true); |
| 95 |
| 96 /** |
| 97 * Produce a list of UTF-32BE encoded bytes: four bytes per value returned by |
| 98 * stringToCodepoints, most significant first. No BOM unless [writeBOM] is true. |
| 99 */ |
| 100 List<int> encodeUtf32be(String str, [bool writeBOM = false]) { |
| 101 List<int> utf32CodeUnits = stringToCodepoints(str); |
| 102 List<int> encoding = new List<int>(4 * utf32CodeUnits.length + |
| 103 (writeBOM ? 4 : 0)); |
| 104 int i = 0; |
| 105 if (writeBOM) { |
| 106 encoding[i++] = 0; |
| 107 encoding[i++] = 0; |
| 108 encoding[i++] = UNICODE_UTF_BOM_HI; |
| 109 encoding[i++] = UNICODE_UTF_BOM_LO; |
| 110 } |
| 111 for (int unit in utf32CodeUnits) { |
| 112 encoding[i++] = (unit >> 24) & UNICODE_BYTE_ZERO_MASK; |
| 113 encoding[i++] = (unit >> 16) & UNICODE_BYTE_ZERO_MASK; |
| 114 encoding[i++] = (unit >> 8) & UNICODE_BYTE_ZERO_MASK; |
| 115 encoding[i++] = unit & UNICODE_BYTE_ZERO_MASK; |
| 116 } |
| 117 return encoding; |
| 118 } |
| 119 |
| 120 /** |
| 121 * Produce a list of UTF-32LE encoded bytes: four bytes per value returned by |
| 122 * stringToCodepoints, least significant first. No BOM unless [writeBOM] is true. |
| 123 */ |
| 124 List<int> encodeUtf32le(String str, [bool writeBOM = false]) { |
| 125 List<int> utf32CodeUnits = stringToCodepoints(str); |
| 126 List<int> encoding = new List<int>(4 * utf32CodeUnits.length + |
| 127 (writeBOM ? 4 : 0)); |
| 128 int i = 0; |
| 129 if (writeBOM) { |
| 130 encoding[i++] = UNICODE_UTF_BOM_LO; |
| 131 encoding[i++] = UNICODE_UTF_BOM_HI; |
| 132 encoding[i++] = 0; |
| 133 encoding[i++] = 0; |
| 134 } |
| 135 for (int unit in utf32CodeUnits) { |
| 136 encoding[i++] = unit & UNICODE_BYTE_ZERO_MASK; |
| 137 encoding[i++] = (unit >> 8) & UNICODE_BYTE_ZERO_MASK; |
| 138 encoding[i++] = (unit >> 16) & UNICODE_BYTE_ZERO_MASK; |
| 139 encoding[i++] = (unit >> 24) & UNICODE_BYTE_ZERO_MASK; |
| 140 } |
| 141 return encoding; |
| 142 } |
| 143 |
| 144 /** |
| 145 * Identifies whether a List of bytes starts (based on [offset]) with a UTF-32 |
| 146 * byte-order marker (BOM), checking big-endian first and then little-endian. |
| 147 */ |
| 148 bool hasUtf32Bom( |
| 149 List<int> utf32EncodedBytes, [int offset = 0, int length]) { |
| 150 return hasUtf32beBom(utf32EncodedBytes, offset, length) || |
| 151 hasUtf32leBom(utf32EncodedBytes, offset, length); |
| 152 } |
| 153 |
| 154 /** |
| 155 * Identifies whether a List of bytes starts (based on [offset]) with a |
| 156 * big-endian byte-order marker (BOM). False if fewer than 4 bytes are in range. |
| 157 */ |
| 158 bool hasUtf32beBom(List<int> utf32EncodedBytes, [int offset = 0, int length]) { |
| 159 int end = length != null ? offset + length : utf32EncodedBytes.length; |
| 160 return (offset + 4) <= end && |
| 161 utf32EncodedBytes[offset] == 0 && utf32EncodedBytes[offset + 1] == 0 && |
| 162 utf32EncodedBytes[offset + 2] == UNICODE_UTF_BOM_HI && |
| 163 utf32EncodedBytes[offset + 3] == UNICODE_UTF_BOM_LO; |
| 164 } |
| 165 |
| 166 /** |
| 167 * Identifies whether a List of bytes starts (based on [offset]) with a |
| 168 * little-endian byte-order marker (BOM). False if fewer than 4 bytes are in range. |
| 169 */ |
| 170 bool hasUtf32leBom(List<int> utf32EncodedBytes, [int offset = 0, int length]) { |
| 171 int end = length != null ? offset + length : utf32EncodedBytes.length; |
| 172 return (offset + 4) <= end && |
| 173 utf32EncodedBytes[offset] == UNICODE_UTF_BOM_LO && |
| 174 utf32EncodedBytes[offset + 1] == UNICODE_UTF_BOM_HI && |
| 175 utf32EncodedBytes[offset + 2] == 0 && utf32EncodedBytes[offset + 3] == 0; |
| 176 } |
| 177 |
| 178 typedef Utf32BytesDecoder Utf32BytesDecoderProvider(); |
| 179 |
| 180 /** |
| 181 * Return type of [decodeUtf32AsIterable] and variants. Each read of [iterator] |
| 182 * calls the stored [Utf32BytesDecoderProvider], and the decoder it returns only |
| 183 * translates bytes as requested by its user. (Note: results are not cached.) |
| 184 */ |
| 185 // TODO(floitsch): Consider removing the extend and switch to implements since |
| 186 // that's cheaper to allocate. |
| 187 class IterableUtf32Decoder extends IterableBase<int> { |
| 188 final Utf32BytesDecoderProvider codeunitsProvider; |
| 189 |
| 190 IterableUtf32Decoder._(this.codeunitsProvider); |
| 191 |
| 192 Utf32BytesDecoder get iterator => codeunitsProvider(); |
| 193 } |
| 194 |
| 195 /** |
| 196 * Abstract parent class that converts encoded bytes to codepoints; subclasses supply [decode] for one byte order. |
| 197 */ |
| 198 abstract class Utf32BytesDecoder implements ListRangeIterator { |
| 199 // TODO(kevmoo): should this field be private? |
| 200 final ListRangeIterator utf32EncodedBytesIterator; |
| 201 final int replacementCodepoint; |
| 202 int _current = null; |
| 203 |
| 204 Utf32BytesDecoder._fromListRangeIterator( |
| 205 this.utf32EncodedBytesIterator, this.replacementCodepoint); |
| 206 |
| 207 factory Utf32BytesDecoder(List<int> utf32EncodedBytes, [ |
| 208 int offset = 0, int length, |
| 209 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) { |
| 210 if (length == null) { |
| 211 length = utf32EncodedBytes.length - offset; |
| 212 } |
| 213 if (hasUtf32beBom(utf32EncodedBytes, offset, length)) { /* BOM is stepped over here, hence stripBom false below */ |
| 214 return new Utf32beBytesDecoder(utf32EncodedBytes, offset + 4, length - 4, |
| 215 false, replacementCodepoint); |
| 216 } else if (hasUtf32leBom(utf32EncodedBytes, offset, length)) { |
| 217 return new Utf32leBytesDecoder(utf32EncodedBytes, offset + 4, length - 4, |
| 218 false, replacementCodepoint); |
| 219 } else { /* no BOM: fall back to big-endian */ |
| 220 return new Utf32beBytesDecoder(utf32EncodedBytes, offset, length, false, |
| 221 replacementCodepoint); |
| 222 } |
| 223 } |
| 224 |
| 225 List<int> decodeRest() { |
| 226 List<int> codeunits = new List<int>(remaining); /* one slot per code unit still to decode */ |
| 227 int i = 0; |
| 228 while (moveNext()) { |
| 229 codeunits[i++] = current; |
| 230 } |
| 231 return codeunits; |
| 232 } |
| 233 |
| 234 int get current => _current; |
| 235 |
| 236 bool moveNext() { |
| 237 _current = null; |
| 238 int remaining = utf32EncodedBytesIterator.remaining; |
| 239 if (remaining == 0) { |
| 240 _current = null; |
| 241 return false; |
| 242 } |
| 243 if (remaining < 4) { /* truncated trailing unit: consume it, then replace or throw */ |
| 244 utf32EncodedBytesIterator.skip(utf32EncodedBytesIterator.remaining); |
| 245 if (replacementCodepoint != null) { |
| 246 _current = replacementCodepoint; |
| 247 return true; |
| 248 } else { |
| 249 throw new ArgumentError( |
| 250 "Invalid UTF32 at ${utf32EncodedBytesIterator.position}"); |
| 251 } |
| 252 } |
| 253 int codepoint = decode(); |
| 254 if (_validCodepoint(codepoint)) { |
| 255 _current = codepoint; |
| 256 return true; |
| 257 } else if (replacementCodepoint != null) { |
| 258 _current = replacementCodepoint; |
| 259 return true; |
| 260 } else { |
| 261 throw new ArgumentError( |
| 262 "Invalid UTF32 at ${utf32EncodedBytesIterator.position}"); |
| 263 } |
| 264 } |
| 265 |
| 266 int get position => utf32EncodedBytesIterator.position ~/ 4; |
| 267 |
| 268 void backup([int by = 1]) { |
| 269 utf32EncodedBytesIterator.backup(4 * by); |
| 270 } |
| 271 |
| 272 int get remaining => (utf32EncodedBytesIterator.remaining + 3) ~/ 4; /* rounds up so a partial trailing unit counts */ |
| 273 |
| 274 void skip([int count = 1]) { |
| 275 utf32EncodedBytesIterator.skip(4 * count); |
| 276 } |
| 277 |
| 278 int decode(); |
| 279 } |
| 280 |
| 281 /** |
| 282 * Convert UTF-32BE encoded bytes to codepoints by grouping 4 bytes, most |
| 283 * significant byte first, to produce the unicode codepoint. |
| 284 */ |
| 285 class Utf32beBytesDecoder extends Utf32BytesDecoder { |
| 286 Utf32beBytesDecoder(List<int> utf32EncodedBytes, [int offset = 0, |
| 287 int length, bool stripBom = true, |
| 288 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
| 289 super._fromListRangeIterator( |
| 290 (new ListRange(utf32EncodedBytes, offset, length)).iterator, |
| 291 replacementCodepoint) { |
| 292 if (stripBom && hasUtf32beBom(utf32EncodedBytes, offset, length)) { |
| 293 skip(); |
| 294 } |
| 295 } |
| 296 |
| 297 int decode() { |
| 298 utf32EncodedBytesIterator.moveNext(); |
| 299 int value = utf32EncodedBytesIterator.current; |
| 300 utf32EncodedBytesIterator.moveNext(); |
| 301 value = (value << 8) + utf32EncodedBytesIterator.current; |
| 302 utf32EncodedBytesIterator.moveNext(); |
| 303 value = (value << 8) + utf32EncodedBytesIterator.current; |
| 304 utf32EncodedBytesIterator.moveNext(); |
| 305 value = (value << 8) + utf32EncodedBytesIterator.current; |
| 306 return value; |
| 307 } |
| 308 } |
| 309 |
| 310 /** |
| 311 * Convert UTF-32LE encoded bytes to codepoints by grouping 4 bytes, least |
| 312 * significant byte first, to produce the unicode codepoint. |
| 313 */ |
| 314 class Utf32leBytesDecoder extends Utf32BytesDecoder { |
| 315 Utf32leBytesDecoder(List<int> utf32EncodedBytes, [int offset = 0, |
| 316 int length, bool stripBom = true, |
| 317 int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) : |
| 318 super._fromListRangeIterator( |
| 319 (new ListRange(utf32EncodedBytes, offset, length)).iterator, |
| 320 replacementCodepoint) { |
| 321 if (stripBom && hasUtf32leBom(utf32EncodedBytes, offset, length)) { |
| 322 skip(); |
| 323 } |
| 324 } |
| 325 |
| 326 int decode() { |
| 327 utf32EncodedBytesIterator.moveNext(); |
| 328 int value = utf32EncodedBytesIterator.current; |
| 329 utf32EncodedBytesIterator.moveNext(); |
| 330 value += (utf32EncodedBytesIterator.current << 8); |
| 331 utf32EncodedBytesIterator.moveNext(); |
| 332 value += (utf32EncodedBytesIterator.current << 16); |
| 333 utf32EncodedBytesIterator.moveNext(); |
| 334 value += (utf32EncodedBytesIterator.current << 24); |
| 335 return value; |
| 336 } |
| 337 } |
| 338 |
| 339 bool _validCodepoint(int codepoint) { /* true when non-negative, outside the UTF-16 reserved range and below the range max */ |
| 340 return (codepoint >= 0 && codepoint < UNICODE_UTF16_RESERVED_LO) || |
| 341 (codepoint > UNICODE_UTF16_RESERVED_HI && |
| 342 codepoint < UNICODE_VALID_RANGE_MAX); /* NOTE(review): exclusive bound - confirm UNICODE_VALID_RANGE_MAX is one past the last valid codepoint */ |
| 343 } |
OLD | NEW |