Source/core/html/parser/TextResourceDecoder.cpp - Issue 133273007: Revert "Moved text decoding to the parser thread"

Side by Side Diff: Source/core/html/parser/TextResourceDecoder.cpp

Issue 133273007: Revert "Moved text decoding to the parser thread" (Closed) Base URL: https://chromium.googlesource.com/chromium/blink.git@master

Patch Set: Created 6 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 /*

2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)

3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.

4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)

5

6 This library is free software; you can redistribute it and/or

7 modify it under the terms of the GNU Library General Public

8 License as published by the Free Software Foundation; either

9 version 2 of the License, or (at your option) any later version.

10

11 This library is distributed in the hope that it will be useful,

12 but WITHOUT ANY WARRANTY; without even the implied warranty of

13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

14 Library General Public License for more details.

15

16 You should have received a copy of the GNU Library General Public License

17 along with this library; see the file COPYING.LIB. If not, write to

18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,

19 Boston, MA 02110-1301, USA.

20 */

21

22

23 #include "config.h"

24 #include "core/html/parser/TextResourceDecoder.h"

25

26 #include "HTMLNames.h"

27 #include "core/dom/DOMImplementation.h"

28 #include "core/html/parser/HTMLMetaCharsetParser.h"

29 #include "platform/text/TextEncodingDetector.h"

30 #include "wtf/StringExtras.h"

31 #include "wtf/text/TextCodec.h"

32 #include "wtf/text/TextEncodingRegistry.h"

33

34 using namespace WTF;

35

36 namespace WebCore {

37

38 using namespace HTMLNames;

39

// Returns true when the five bytes starting at |p| are b0, b1, b2, b3, b4 in
// that order. No length is passed in, so |p| must point at enough readable
// bytes; the comparison stops at the first byte that differs.
static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4)
{
    const char expected[] = { b0, b1, b2, b3, b4 };
    for (size_t i = 0; i < sizeof(expected); ++i) {
        if (p[i] != expected[i])
            return false;
    }
    return true;
}

44

// Six-byte variant: returns true when the bytes at |p| are b0..b5 in order.
// No length is passed in, so |p| must point at enough readable bytes; the
// comparison stops at the first byte that differs.
static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5)
{
    const char expected[] = { b0, b1, b2, b3, b4, b5 };
    for (size_t i = 0; i < sizeof(expected); ++i) {
        if (p[i] != expected[i])
            return false;
    }
    return true;
}

49

// Eight-byte variant: returns true when the bytes at |p| are b0..b7 in order.
// No length is passed in, so |p| must point at enough readable bytes; the
// comparison stops at the first byte that differs.
static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7)
{
    const char expected[] = { b0, b1, b2, b3, b4, b5, b6, b7 };
    for (size_t i = 0; i < sizeof(expected); ++i) {
        if (p[i] != expected[i])
            return false;
    }
    return true;
}

54

// Ten-byte variant: returns true when the bytes at |p| are b0..b9 in order.
// No length is passed in, so |p| must point at enough readable bytes; the
// comparison stops at the first byte that differs.
static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9)
{
    const char expected[] = { b0, b1, b2, b3, b4, b5, b6, b7, b8, b9 };
    for (size_t i = 0; i < sizeof(expected); ++i) {
        if (p[i] != expected[i])
            return false;
    }
    return true;
}

59

// You might think we should put these find functions elsewhere, perhaps with the
// similar functions that operate on UChar, but arguably only the decoder has
// a reason to process strings of char rather than UChar.

// Searches the first |subjectLength| bytes of |subject| for the NUL-terminated
// string |target|. Returns the offset of the first occurrence, or -1 when
// there is none (including when |target| is longer than the subject). An
// empty |target| matches at offset 0.
static int find(const char* subject, size_t subjectLength, const char* target)
{
    const size_t targetLength = strlen(target);
    if (targetLength > subjectLength)
        return -1;

    // Last offset at which |target| still fits inside the subject.
    const size_t lastStart = subjectLength - targetLength;
    for (size_t start = 0; start <= lastStart; ++start) {
        if (!memcmp(subject + start, target, targetLength))
            return start;
    }
    return -1;
}

82

83 static WTF::TextEncoding findTextEncoding(const char* encodingName, int length)

84 {

85 Vector<char, 64> buffer(length + 1);

86 memcpy(buffer.data(), encodingName, length);

87 buffer[length] = '\0';

88 return buffer.data();

89 }

90

91 TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)

92 {

93 if (equalIgnoringCase(mimeType, "text/css"))

94 return CSSContent;

95 if (equalIgnoringCase(mimeType, "text/html"))

96 return HTMLContent;

97 if (DOMImplementation::isXMLMIMEType(mimeType))

98 return XMLContent;

99 return PlainTextContent;

100 }

101

102 const WTF::TextEncoding& TextResourceDecoder::defaultEncoding(ContentType conten tType, const WTF::TextEncoding& specifiedDefaultEncoding)

103 {

104 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII

105 // for text/xml. This matches Firefox.

106 if (contentType == XMLContent)

107 return UTF8Encoding();

108 if (!specifiedDefaultEncoding.isValid())

109 return Latin1Encoding();

110 return specifiedDefaultEncoding;

111 }

112

113 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::Text Encoding& specifiedDefaultEncoding, bool usesEncodingDetector)

114 : m_contentType(determineContentType(mimeType))

115 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))

116 , m_source(DefaultEncoding)

117 , m_hintEncoding(0)

118 , m_checkedForBOM(false)

119 , m_checkedForCSSCharset(false)

120 , m_checkedForXMLCharset(false)

121 , m_checkedForMetaCharset(false)

122 , m_useLenientXMLDecoding(false)

123 , m_sawError(false)

124 , m_usesEncodingDetector(usesEncodingDetector)

125 {

126 }

127

128 TextResourceDecoder::~TextResourceDecoder()

129 {

130 }

131

132 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, Encodin gSource source)

133 {

134 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).

135 if (!encoding.isValid())

136 return;

137

138 // When encoding comes from meta tag (i.e. it cannot be XML files sent via X HR),

139 // treat x-user-defined as windows-1252 (bug 18270)

140 if (source == EncodingFromMetaTag && !strcasecmp(encoding.name(), "x-user-de fined"))

141 m_encoding = "windows-1252";

142 else if (source == EncodingFromMetaTag \|\| source == EncodingFromXMLHeader \|\| source == EncodingFromCSSCharset)

143 m_encoding = encoding.closestByteBasedEquivalent();

144 else

145 m_encoding = encoding;

146

147 m_codec.clear();

148 m_source = source;

149 }

150

151 // Returns the position of the encoding string.

152 static int findXMLEncoding(const char* str, int len, int& encodingLength)

153 {

154 int pos = find(str, len, "encoding");

155 if (pos == -1)

156 return -1;

157 pos += 8;

158

159 // Skip spaces and stray control characters.

160 while (pos < len && str[pos] <= ' ')

161 ++pos;

162

163 // Skip equals sign.

164 if (pos >= len \|\| str[pos] != '=')

165 return -1;

166 ++pos;

167

168 // Skip spaces and stray control characters.

169 while (pos < len && str[pos] <= ' ')

170 ++pos;

171

172 // Skip quotation mark.

173 if (pos >= len)

174 return - 1;

175 char quoteMark = str[pos];

176 if (quoteMark != '"' && quoteMark != '\'')

177 return -1;

178 ++pos;

179

180 // Find the trailing quotation mark.

181 int end = pos;

182 while (end < len && str[end] != quoteMark)

183 ++end;

184 if (end >= len)

185 return -1;

186

187 encodingLength = end - pos;

188 return pos;

189 }

190

191 size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)

192 {

193 // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure s ign of a Unicode encoding.

194 // We let it override even a user-chosen encoding.

195 ASSERT(!m_checkedForBOM);

196

197 size_t lengthOfBOM = 0;

198

199 size_t bufferLength = m_buffer.size();

200

201 size_t buf1Len = bufferLength;

202 size_t buf2Len = len;

203 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer. data());

204 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);

205 unsigned char c1 = buf1Len ? (--buf1Len, buf1++) : buf2Len ? (--buf2Len, b uf2++) : 0;

206 unsigned char c2 = buf1Len ? (--buf1Len, buf1++) : buf2Len ? (--buf2Len, b uf2++) : 0;

207 unsigned char c3 = buf1Len ? (--buf1Len, buf1++) : buf2Len ? (--buf2Len, b uf2++) : 0;

208 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;

209

210 // Check for the BOM.

211 if (c1 == 0xFF && c2 == 0xFE) {

212 if (c3 \|\| c4) {

213 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);

214 lengthOfBOM = 2;

215 } else {

216 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);

217 lengthOfBOM = 4;

218 }

219 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {

220 setEncoding(UTF8Encoding(), AutoDetectedEncoding);

221 lengthOfBOM = 3;

222 } else if (c1 == 0xFE && c2 == 0xFF) {

223 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);

224 lengthOfBOM = 2;

225 } else if (!c1 && !c2 && c3 == 0xFE && c4 == 0xFF) {

226 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);

227 lengthOfBOM = 4;

228 }

229

230 if (lengthOfBOM \|\| bufferLength + len >= 4)

231 m_checkedForBOM = true;

232

233 return lengthOfBOM;

234 }

235

236 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)

237 {

238 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {

239 m_checkedForCSSCharset = true;

240 return true;

241 }

242

243 size_t oldSize = m_buffer.size();

244 m_buffer.grow(oldSize + len);

245 memcpy(m_buffer.data() + oldSize, data, len);

246

247 movedDataToBuffer = true;

248

249 if (m_buffer.size() <= 13) // strlen('@charset "x";') == 13

250 return false;

251

252 const char* dataStart = m_buffer.data();

253 const char* dataEnd = dataStart + m_buffer.size();

254

255 if (bytesEqual(dataStart, '@', 'c', 'h', 'a', 'r', 's', 'e', 't', ' ', '"')) {

256 dataStart += 10;

257 const char* pos = dataStart;

258

259 while (pos < dataEnd && *pos != '"')

260 ++pos;

261 if (pos == dataEnd)

262 return false;

263

264 int encodingNameLength = pos - dataStart;

265

266 ++pos;

267

268 if (*pos == ';')

269 setEncoding(findTextEncoding(dataStart, encodingNameLength), Encodin gFromCSSCharset);

270 }

271

272 m_checkedForCSSCharset = true;

273 return true;

274 }

275

276 bool TextResourceDecoder::checkForXMLCharset(const char* data, size_t len, bool& movedDataToBuffer)

277 {

278 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {

279 m_checkedForXMLCharset = true;

280 return true;

281 }

282

283 // This is not completely efficient, since the function might go

284 // through the HTML head several times.

285

286 size_t oldSize = m_buffer.size();

287 m_buffer.grow(oldSize + len);

288 memcpy(m_buffer.data() + oldSize, data, len);

289

290 movedDataToBuffer = true;

291

292 const char* ptr = m_buffer.data();

293 const char* pEnd = ptr + m_buffer.size();

294

295 // Is there enough data available to check for XML declaration?

296 if (m_buffer.size() < 8)

297 return false;

298

299 // Handle XML declaration, which can have encoding in it. This encoding is h onored even for HTML documents.

300 // It is an error for an XML declaration not to be at the start of an XML do cument, and it is ignored in HTML documents in such case.

301 if (bytesEqual(ptr, '<', '?', 'x', 'm', 'l')) {

302 const char* xmlDeclarationEnd = ptr;

303 while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')

304 ++xmlDeclarationEnd;

305 if (xmlDeclarationEnd == pEnd)

306 return false;

307 // No need for +1, because we have an extra "?" to lose at the end of XM L declaration.

308 int len = 0;

309 int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);

310 if (pos != -1)

311 setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader) ;

312 // continue looking for a charset - it may be specified in an HTTP-Equiv meta

313 } else if (bytesEqual(ptr, '<', 0, '?', 0, 'x', 0)) {

314 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);

315 } else if (bytesEqual(ptr, 0, '<', 0, '?', 0, 'x')) {

316 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);

317 } else if (bytesEqual(ptr, '<', 0, 0, 0, '?', 0, 0, 0)) {

318 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);

319 } else if (bytesEqual(ptr, 0, 0, 0, '<', 0, 0, 0, '?')) {

320 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);

321 }

322

323 m_checkedForXMLCharset = true;

324 return true;

325 }

326

327 void TextResourceDecoder::checkForMetaCharset(const char* data, size_t length)

328 {

329 if (m_source == UserChosenEncoding \|\| m_source == EncodingFromHTTPHeader \|\| m_source == AutoDetectedEncoding) {

330 m_checkedForMetaCharset = true;

331 return;

332 }

333

334 if (!m_charsetParser)

335 m_charsetParser = HTMLMetaCharsetParser::create();

336

337 if (!m_charsetParser->checkForMetaCharset(data, length))

338 return;

339

340 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);

341 m_charsetParser.clear();

342 m_checkedForMetaCharset = true;

343 return;

344 }

345

346 // We use the encoding detector in two cases:

347 // 1. Encoding detector is turned ON and no other encoding source is

348 // available (that is, it's DefaultEncoding).

349 // 2. Encoding detector is turned ON and the encoding is set to

350 // the encoding of the parent frame, which is also auto-detected.

351 // Note that condition #2 is NOT satisfied unless parent-child frame

352 // relationship is compliant to the same-origin policy. If they're from

353 // different domains, \|m_source\| would not be set to EncodingFromParentFrame

354 // in the first place.

355 bool TextResourceDecoder::shouldAutoDetect() const

356 {

357 // Just checking m_hintEncoding suffices here because it's only set

358 // in setHintEncoding when the source is AutoDetectedEncoding.

359 return m_usesEncodingDetector

360 && (m_source == DefaultEncoding \|\| (m_source == EncodingFromParentFrame && m_hintEncoding));

361 }

362

363 String TextResourceDecoder::decode(const char* data, size_t len)

364 {

365 size_t lengthOfBOM = 0;

366 if (!m_checkedForBOM)

367 lengthOfBOM = checkForBOM(data, len);

368

369 bool movedDataToBuffer = false;

370

371 if (m_contentType == CSSContent && !m_checkedForCSSCharset) {

372 if (!checkForCSSCharset(data, len, movedDataToBuffer))

373 return emptyString();

374 }

375

376 if ((m_contentType == HTMLContent \|\| m_contentType == XMLContent) && !m_chec kedForXMLCharset) {

377 if (!checkForXMLCharset(data, len, movedDataToBuffer))

378 return emptyString();

379 }

380

381 const char* dataForDecode = data + lengthOfBOM;

382 size_t lengthForDecode = len - lengthOfBOM;

383

384 if (!m_buffer.isEmpty()) {

385 if (!movedDataToBuffer) {

386 size_t oldSize = m_buffer.size();

387 m_buffer.grow(oldSize + len);

388 memcpy(m_buffer.data() + oldSize, data, len);

389 }

390

391 dataForDecode = m_buffer.data() + lengthOfBOM;

392 lengthForDecode = m_buffer.size() - lengthOfBOM;

393 }

394

395 if (m_contentType == HTMLContent && !m_checkedForMetaCharset)

396 checkForMetaCharset(dataForDecode, lengthForDecode);

397

398 if (shouldAutoDetect()) {

399 WTF::TextEncoding detectedEncoding;

400 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))

401 setEncoding(detectedEncoding, EncodingFromContentSniffing);

402 }

403

404 ASSERT(m_encoding.isValid());

405

406 if (!m_codec)

407 m_codec = newTextCodec(m_encoding);

408

409 String result = m_codec->decode(dataForDecode, lengthForDecode, false, m_con tentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);

410

411 m_buffer.clear();

412 return result;

413 }

414

415 String TextResourceDecoder::flush()

416 {

417 // If we can not identify the encoding even after a document is completely

418 // loaded, we need to detect the encoding if other conditions for

419 // autodetection is satisfied.

420 if (m_buffer.size() && shouldAutoDetect()

421 && ((!m_checkedForXMLCharset && (m_contentType == HTMLContent \|\| m_conte ntType == XMLContent)) \|\| (!m_checkedForCSSCharset && (m_contentType == CSSConte nt)))) {

422 WTF::TextEncoding detectedEncoding;

423 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))

424 setEncoding(detectedEncoding, EncodingFromContentSniffing);

425 }

426

427 if (!m_codec)

428 m_codec = newTextCodec(m_encoding);

429

430 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_co ntentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);

431 m_buffer.clear();

432 m_codec.clear();

433 m_checkedForBOM = false; // Skip BOM again when re-decoding.

434 return result;

435 }

436

437 }

OLD	NEW

« no previous file with comments | « Source/core/html/parser/TextResourceDecoder.h ('k') | Source/core/html/parser/XSSAuditor.h » ('j') | no next file with comments »