Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(6)

Side by Side Diff: sdk/lib/convert/utf.dart

Issue 19187002: Replace old utf8 decoder with new one. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Address comments. Created 7 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « sdk/lib/codec/encoding.dart ('k') | tests/lib/convert/utf82_test.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 part of dart.convert; 5 part of dart.convert;
6 6
7 /** 7 /**
8 * A [Utf8Encoder] converts strings to their UTF-8 code units (a list of 8 * A [Utf8Encoder] converts strings to their UTF-8 code units (a list of
9 * unsigned 8-bit integers). 9 * unsigned 8-bit integers).
10 */ 10 */
11 class Utf8Encoder extends Converter<String, List<int>> { 11 class Utf8Encoder extends Converter<String, List<int>> {
12 /** 12 /**
13 * Converts [string] to its UTF-8 code units (a list of 13 * Converts [string] to its UTF-8 code units (a list of
14 * unsigned 8-bit integers). 14 * unsigned 8-bit integers).
15 */ 15 */
16 List<int> convert(String string) => OLD_UTF_LIB.encodeUtf8(string); 16 List<int> convert(String string) => OLD_UTF_LIB.encodeUtf8(string);
17 } 17 }
18 18
19 /** 19 /**
20 * A [Utf8Decoder] converts UTF-8 code units (lists of unsigned 8-bit integers) 20 * A [Utf8Decoder] converts UTF-8 code units (lists of unsigned 8-bit integers)
21 * to a string. 21 * to a string.
22 */ 22 */
23 class Utf8Decoder extends Converter<List<int>, String> { 23 class Utf8Decoder extends Converter<List<int>, String> {
24 final bool _allowMalformed;
25
26 /**
27 * Instantiates a new [Utf8Decoder].
28 *
29 * The optional [allowMalformed] argument defines how [convert] deals
30 * with invalid or unterminated character sequences.
31 *
32 * If it is `true`, [convert] replaces invalid (or unterminated) character
33 * sequences with the Unicode Replacement character `U+FFFD` (�). Otherwise,
34 * it throws a [FormatException].
35 */
36 Utf8Decoder({ bool allowMalformed: false })
37 : this._allowMalformed = allowMalformed;
38
24 /** 39 /**
25 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the 40 * Converts the UTF-8 [codeUnits] (a list of unsigned 8-bit integers) to the
26 * corresponding string. 41 * corresponding string.
27 */ 42 */
28 // TODO(floitsch): allow to configure the decoder (for example the replacement 43 String convert(List<int> codeUnits) {
29 // character). 44 StringBuffer buffer = new StringBuffer();
30 String convert(List<int> codeUnits) => OLD_UTF_LIB.decodeUtf8(codeUnits); 45 _Utf8Decoder decoder = new _Utf8Decoder(_allowMalformed);
46 decoder.convert(codeUnits, 0, codeUnits.length, buffer);
47 decoder.close(buffer);
48 return buffer.toString();
49 }
31 } 50 }
51
52 // UTF-8 constants.
53 const int _ONE_BYTE_LIMIT = 0x7f; // 7 bits
54 const int _TWO_BYTE_LIMIT = 0x7ff; // 11 bits
55 const int _THREE_BYTE_LIMIT = 0xffff; // 16 bits
56 const int _FOUR_BYTE_LIMIT = 0x10ffff; // 21 bits, truncated to Unicode max.
57
58 // UTF-16 constants.
59 const int _SURROGATE_MASK = 0xF800;
60 const int _SURROGATE_TAG_MASK = 0xFC00;
61 const int _SURROGATE_VALUE_MASK = 0x3FF;
62 const int _LEAD_SURROGATE_MIN = 0xD800;
63 const int _TAIL_SURROGATE_MIN = 0xDC00;
64
65 const int _REPLACEMENT_CHARACTER = 0xFFFD;
66 const int _BOM_CHARACTER = 0xFEFF;
67
68 bool _isSurrogate(int codeUnit) =>
69 (codeUnit & _SURROGATE_MASK) == _LEAD_SURROGATE_MIN;
70 bool _isLeadSurrogate(int codeUnit) =>
71 (codeUnit & _SURROGATE_TAG_MASK) == _LEAD_SURROGATE_MIN;
72 bool _isTailSurrogate(int codeUnit) =>
73 (codeUnit & _SURROGATE_TAG_MASK) == _TAIL_SURROGATE_MIN;
74 int _combineSurrogatePair(int lead, int tail) =>
75 0x10000 | ((lead & _SURROGATE_VALUE_MASK) << 10)
76 | (tail & _SURROGATE_VALUE_MASK);
77
78
79 /**
80 * Decodes UTF-8.
81 *
82 * The decoder handles chunked input.
83 */
84 // TODO(floitsch): make this class public.
85 class _Utf8Decoder {
86 final bool _allowMalformed;
87 bool _isFirstCharacter = true;
88 int _value = 0;
89 int _expectedUnits = 0;
90 int _extraUnits = 0;
91
92 _Utf8Decoder(this._allowMalformed);
93
94 bool get hasPartialInput => _expectedUnits > 0;
95
96 // Limits of one through four byte encodings.
97 static const List<int> _LIMITS = const <int>[
98 _ONE_BYTE_LIMIT,
99 _TWO_BYTE_LIMIT,
100 _THREE_BYTE_LIMIT,
101 _FOUR_BYTE_LIMIT ];
102
103 void close(StringSink sink) {
104 if (hasPartialInput) {
105 if (!_allowMalformed) {
106 throw new FormatException("Unfinished UTF-8 octet sequence");
107 }
108 sink.writeCharCode(_REPLACEMENT_CHARACTER);
109 }
110 }
111
112 void convert(List<int> codeUnits, int startIndex, int endIndex,
113 StringSink sink) {
114 int value = _value;
115 int expectedUnits = _expectedUnits;
116 int extraUnits = _extraUnits;
117 _value = 0;
118 _expectedUnits = 0;
119 _extraUnits = 0;
120
121 int i = startIndex;
122 loop: while (true) {
123 multibyte: if (expectedUnits > 0) {
124 do {
125 if (i == endIndex) {
126 break loop;
127 }
128 int unit = codeUnits[i];
129 if ((unit & 0xC0) != 0x80) {
130 expectedUnits = 0;
131 if (!_allowMalformed) {
132 throw new FormatException(
133 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
134 }
135 _isFirstCharacter = false;
136 sink.writeCharCode(_REPLACEMENT_CHARACTER);
137 break multibyte;
138 } else {
139 value = (value << 6) | (unit & 0x3f);
140 expectedUnits--;
141 i++;
142 }
143 } while (expectedUnits > 0);
144 if (value <= _LIMITS[extraUnits - 1]) {
145 // Overly long encoding. The value could be encoded with a shorter
146 // encoding.
147 if (!_allowMalformed) {
148 throw new FormatException(
149 "Overlong encoding of 0x${value.toRadixString(16)}");
150 }
151 expectedUnits = extraUnits = 0;
152 value = _REPLACEMENT_CHARACTER;
153 }
154 if (value > _FOUR_BYTE_LIMIT) {
155 if (!_allowMalformed) {
156 throw new FormatException("Character outside valid Unicode range: "
157 "0x${value.toRadixString(16)}");
158 }
159 value = _REPLACEMENT_CHARACTER;
160 }
161 if (!_isFirstCharacter || value != _BOM_CHARACTER) {
162 sink.writeCharCode(value);
163 }
164 _isFirstCharacter = false;
165 }
166
167 while (i < endIndex) {
168 int unit = codeUnits[i++];
169 if (unit <= _ONE_BYTE_LIMIT) {
170 _isFirstCharacter = false;
171 sink.writeCharCode(unit);
172 } else {
173 if ((unit & 0xE0) == 0xC0) {
174 value = unit & 0x1F;
175 expectedUnits = extraUnits = 1;
176 continue loop;
177 }
178 if ((unit & 0xF0) == 0xE0) {
179 value = unit & 0x0F;
180 expectedUnits = extraUnits = 2;
181 continue loop;
182 }
183 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.
184 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {
185 value = unit & 0x07;
186 expectedUnits = extraUnits = 3;
187 continue loop;
188 }
189 if (!_allowMalformed) {
190 throw new FormatException(
191 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
192 }
193 value = _REPLACEMENT_CHARACTER;
194 expectedUnits = extraUnits = 0;
195 _isFirstCharacter = false;
196 sink.writeCharCode(value);
197 }
198 }
199 break loop;
200 }
201 if (expectedUnits > 0) {
202 _value = value;
203 _expectedUnits = expectedUnits;
204 _extraUnits = extraUnits;
205 }
206 }
207 }
OLDNEW
« no previous file with comments | « sdk/lib/codec/encoding.dart ('k') | tests/lib/convert/utf82_test.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698