pkg/third_party/html5lib/lib/src/char_encodings.dart - Issue 814113004: Pull args, intl, logging, shelf, and source_maps out of the SDK.

Side by Side Diff: pkg/third_party/html5lib/lib/src/char_encodings.dart

Issue 814113004: Pull args, intl, logging, shelf, and source_maps out of the SDK. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Also csslib. Created 6 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 /// Decodes bytes using a named character encoding. See [decodeBytes].

2 library char_encodings;

3

4 import 'dart:collection';

5 import 'package:utf/utf.dart';

6

// TODO(jmesserly): this function is conspicuously absent from dart:utf.
/// Reports whether [bytes] begins with the UTF-8 byte order mark
/// (`EF BB BF`) at [offset].
///
/// Only the range starting at [offset] and spanning [length] bytes is
/// examined; when [length] is omitted the range runs to the end of [bytes].
/// UTF-8 has no byte order, so "byte order mark" is somewhat of a misnomer,
/// but HTML uses this signature to detect UTF-8 encoded input.
bool hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
  final limit = length == null ? bytes.length : offset + length;

  // Fewer than three bytes in the range cannot hold the signature.
  if (limit - offset < 3) return false;

  const signature = const [0xEF, 0xBB, 0xBF];
  for (var i = 0; i < signature.length; i++) {
    if (bytes[offset + i] != signature[i]) return false;
  }
  return true;
}

18

// TODO(jmesserly): it's unfortunate that this has to be one-shot on the entire
// file, but dart:utf does not expose stream-based decoders yet.
/// Decodes [bytes] using the named [encoding] and returns an iterable of the
/// resulting code points.
///
/// Supported [encoding] names are `ascii`, `windows-1252` (alias `cp1252`),
/// `utf-8`, `utf-16`, `utf-16-be`, `utf-16-le`, `utf-32`, `utf-32-be` and
/// `utf-32-le`. The name is matched exactly as written.
///
/// Decoding starts at [offset] and covers [length] bytes; when [length] is
/// omitted it covers everything from [offset] to the end of [bytes]. For
/// `utf-8`, a leading byte order mark is skipped. [replacementCodepoint] is
/// forwarded to the windows-1252 and Unicode decoders; the `ascii` path does
/// not use it.
///
/// Throws a [FormatException] if `ascii` input contains a byte above 127, and
/// an [ArgumentError] if [encoding] is not one of the supported names.
Iterable<int> decodeBytes(String encoding, List<int> bytes,
    [int offset = 0, int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  // Keep the caller's value (possibly null) so the windows-1252 decoder can
  // apply its own default when no length was given.
  final requestedLength = length;

  // [length] counts bytes starting at [offset] (the BOM handling below moves
  // both together), so the default is what remains after [offset]. Defaulting
  // to `bytes.length` made `offset + length` overrun the list whenever a
  // non-zero offset was passed without a length.
  if (length == null) length = bytes.length - offset;

  final replace = replacementCodepoint;
  switch (encoding) {
    case 'ascii':
      bytes = bytes.sublist(offset, offset + length);
      // TODO(jmesserly): this was taken from runtime/bin/string_stream.dart
      for (int byte in bytes) {
        if (byte > 127) {
          // TODO(jmesserly): ideally this would be DecoderException, like the
          // one thrown in runtime/bin/string_stream.dart, but we don't want to
          // depend on dart:io.
          throw new FormatException("Illegal ASCII character $byte");
        }
      }
      return bytes;

    case 'windows-1252':
    case 'cp1252':
      return decodeWindows1252AsIterable(
          bytes, offset, requestedLength, replace);

    case 'utf-8':
      // NOTE: to match the behavior of the other decode functions, we eat the
      // utf-8 BOM here.
      if (hasUtf8Bom(bytes, offset, length)) {
        offset += 3;
        length -= 3;
      }
      return decodeUtf8AsIterable(bytes, offset, length, replace);

    case 'utf-16':
      return decodeUtf16AsIterable(bytes, offset, length, replace);
    case 'utf-16-be':
      return decodeUtf16beAsIterable(bytes, offset, length, true, replace);
    case 'utf-16-le':
      return decodeUtf16leAsIterable(bytes, offset, length, true, replace);

    case 'utf-32':
      return decodeUtf32AsIterable(bytes, offset, length, replace);
    case 'utf-32-be':
      return decodeUtf32beAsIterable(bytes, offset, length, true, replace);
    case 'utf-32-le':
      return decodeUtf32leAsIterable(bytes, offset, length, true, replace);

    default:
      throw new ArgumentError('Encoding $encoding not supported');
  }
}

74

75

/// Returns the Unicode code points of [input] as a new list.
///
/// Unlike [String.codeUnits], a valid UTF-16 surrogate pair is combined into
/// the single code point it encodes. An unpaired surrogate is returned
/// unchanged as its own entry.
List<int> toCodepoints(String input) => input.runes.toList();

97

98

/// Lazily decodes [windows-1252](http://en.wikipedia.org/wiki/Windows-1252)
/// encoded [bytes] into code points.
///
/// The result is an iterable, so a consumer converts only as much of the
/// input as it actually reads. [offset], [length] and [replacementCodepoint]
/// are handed unchanged to [IterableWindows1252Decoder]. Pass null for
/// [replacementCodepoint] to get an [ArgumentError] for an undefined byte
/// rather than a replacement value.
IterableWindows1252Decoder decodeWindows1252AsIterable(List<int> bytes,
        [int offset = 0, int length,
        int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
    new IterableWindows1252Decoder(
        bytes, offset, length, replacementCodepoint);

109

110

/// The lazy iterable returned by [decodeWindows1252AsIterable].
///
/// Every read of [iterator] creates a new [Windows1252Decoder] over the same
/// input, and bytes are translated only as that iterator is advanced. Nothing
/// is cached, so iterating twice decodes twice.
class IterableWindows1252Decoder extends IterableBase<int> {
  /// The windows-1252 encoded input.
  final List<int> bytes;

  /// Handed to [Windows1252Decoder] as the position in [bytes] to start at.
  final int offset;

  /// Handed to [Windows1252Decoder] as its length argument; may be null.
  final int length;

  /// Handed to [Windows1252Decoder] as the value to use for undefined bytes.
  final int replacementCodepoint;

  IterableWindows1252Decoder(this.bytes, [this.offset = 0, this.length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);

  Windows1252Decoder get iterator {
    return new Windows1252Decoder(
        bytes, offset, length, replacementCodepoint);
  }
}

127

128

/// An iterator of Unicode code points decoded from windows-1252 bytes.
///
/// Decoding starts at `offset` in the byte list and covers `length` bytes;
/// when `length` is omitted it runs to the end of the list. A range that
/// would extend past the end of the list is cut off at the end.
///
/// Bytes that windows-1252 leaves undefined (0x81, 0x8D, 0x8F, 0x90, 0x9D)
/// decode to [replacementCodepoint]. Set [replacementCodepoint] to null to
/// have them throw an [ArgumentError] instead.
class Windows1252Decoder implements Iterator<int> {
  /// Value produced for undefined bytes, or null to throw on them.
  final int replacementCodepoint;
  final List<int> _bytes;

  /// Index of the first byte to decode.
  final int _start;

  /// Index of the byte [current] refers to; `_start - 1` before the first
  /// call to [moveNext].
  int _offset;

  /// Index one past the last byte to decode.
  final int _end;

  Windows1252Decoder(List<int> bytes, [int offset = 0, int length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : _bytes = bytes,
        _start = offset,
        _offset = offset - 1,
        // `length` is a byte count from `offset`, not an absolute end index.
        // The end is clamped to the list so iteration never indexes past it.
        _end = (length == null || offset + length > bytes.length)
            ? bytes.length
            : offset + length;

  // Bounded below by the start offset so that [current] is null before the
  // first [moveNext] even when decoding starts part-way into the list.
  bool get _inRange => _offset >= _start && _offset < _end;

  /// The code point at the current position, or null when the iterator is
  /// positioned before the first or after the last byte of the range.
  int get current => _inRange ? _mapChar(_bytes[_offset]) : null;

  /// Advances to the next byte and returns whether one is available.
  bool moveNext() {
    _offset++;
    return _inRange;
  }

  /// Maps one windows-1252 byte to its Unicode code point.
  ///
  /// Values without a special mapping are returned unchanged.
  int _mapChar(int char) {
    // TODO(jmesserly): this is duplicating entitiesWindows1252 and
    // replacementCharacters from constants.dart
    switch (char) {
      case 0x80: return 0x20AC; // EURO SIGN
      case 0x82: return 0x201A; // SINGLE LOW-9 QUOTATION MARK
      case 0x83: return 0x0192; // LATIN SMALL LETTER F WITH HOOK
      case 0x84: return 0x201E; // DOUBLE LOW-9 QUOTATION MARK
      case 0x85: return 0x2026; // HORIZONTAL ELLIPSIS
      case 0x86: return 0x2020; // DAGGER
      case 0x87: return 0x2021; // DOUBLE DAGGER
      case 0x88: return 0x02C6; // MODIFIER LETTER CIRCUMFLEX ACCENT
      case 0x89: return 0x2030; // PER MILLE SIGN
      case 0x8A: return 0x0160; // LATIN CAPITAL LETTER S WITH CARON
      case 0x8B: return 0x2039; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
      case 0x8C: return 0x0152; // LATIN CAPITAL LIGATURE OE
      case 0x8E: return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON
      case 0x91: return 0x2018; // LEFT SINGLE QUOTATION MARK
      case 0x92: return 0x2019; // RIGHT SINGLE QUOTATION MARK
      case 0x93: return 0x201C; // LEFT DOUBLE QUOTATION MARK
      case 0x94: return 0x201D; // RIGHT DOUBLE QUOTATION MARK
      case 0x95: return 0x2022; // BULLET
      case 0x96: return 0x2013; // EN DASH
      case 0x97: return 0x2014; // EM DASH
      case 0x98: return 0x02DC; // SMALL TILDE
      case 0x99: return 0x2122; // TRADE MARK SIGN
      case 0x9A: return 0x0161; // LATIN SMALL LETTER S WITH CARON
      case 0x9B: return 0x203A; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
      case 0x9C: return 0x0153; // LATIN SMALL LIGATURE OE
      case 0x9E: return 0x017E; // LATIN SMALL LETTER Z WITH CARON
      case 0x9F: return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS

      // Bytes that windows-1252 leaves undefined.
      case 0x81:
      case 0x8D:
      case 0x8F:
      case 0x90:
      case 0x9D:
        if (replacementCodepoint == null) {
          throw new ArgumentError(
              "Invalid windows-1252 code point $char at $_offset");
        }
        return replacementCodepoint;
    }
    return char;
  }
}

OLD	NEW

« no previous file with comments | « pkg/third_party/html5lib/lib/parser_console.dart ('k') | pkg/third_party/html5lib/lib/src/constants.dart » ('j') | no next file with comments »