Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(139)

Side by Side Diff: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.h

Issue 1721373002: UTF-8 detector for pages missing encoding info (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) 2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3 Copyright (C) 2006 Alexey Proskuryakov (ap@nypop.com) 3 Copyright (C) 2006 Alexey Proskuryakov (ap@nypop.com)
4 Copyright (C) 2006, 2008 Apple Inc. All rights reserved. 4 Copyright (C) 2006, 2008 Apple Inc. All rights reserved.
5 5
6 This library is free software; you can redistribute it and/or 6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public 7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either 8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version. 9 version 2 of the License, or (at your option) any later version.
10 10
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after
73 { 73 {
74 m_hintEncoding = encoding.name(); 74 m_hintEncoding = encoding.name();
75 } 75 }
76 76
77 void useLenientXMLDecoding() { m_useLenientXMLDecoding = true; } 77 void useLenientXMLDecoding() { m_useLenientXMLDecoding = true; }
78 bool sawError() const { return m_sawError; } 78 bool sawError() const { return m_sawError; }
79 size_t checkForBOM(const char*, size_t); 79 size_t checkForBOM(const char*, size_t);
80 80
81 private: 81 private:
82 82
83 // TextResourceDecoder does three kind of encoding detection: 83 // TextResourceDecoder does four kinds of encoding detection:
84 // 1. By BOM, 84 // 1. By BOM,
85 // 2. By Content if |m_contentType| is not |PlainTextContent| 85 // 2. By Content if |m_contentType| is not |PlainTextContent|
86 // (e.g. <meta> tag for HTML), and 86 // (e.g. <meta> tag for HTML),
87 // 3. By detectTextEncoding(). 87 // 3. By isUTF8Encoded(), to detect whether the document
88 // is encoded in UTF-8, and
89 // 4. By detectTextEncodingUniversal().
88 enum EncodingDetectionOption { 90 enum EncodingDetectionOption {
89 // Use 1. + 2. + 3. 91 // Use 1. + 2. + 4.
90 UseAllAutoDetection, 92 UseAllAutoDetection,
91 93
92 // Use 1. + 2. 94 // Use 1. + 2. + 3.
93 UseContentAndBOMBasedDetection, 95 UseContentAndBOMBasedDetection,
94 96
95 // Use None of them. 97 // Use None of them.
96 // |m_contentType| must be |PlainTextContent| and 98 // |m_contentType| must be |PlainTextContent| and
97 // |m_encoding| must be UTF8Encoding. 99 // |m_encoding| must be UTF8Encoding.
98 // This doesn't change encoding based on BOMs, but still processes 100 // This doesn't change encoding based on BOMs, but still processes
99 // utf-8 BOMs so that utf-8 BOMs don't appear in the decoded result. 101 // utf-8 BOMs so that utf-8 BOMs don't appear in the decoded result.
100 AlwaysUseUTF8ForText 102 AlwaysUseUTF8ForText
101 }; 103 };
102 104
103 TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& defaultEncoding, EncodingDetectionOption); 105 TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& defaultEncoding, EncodingDetectionOption);
104 106
105 enum ContentType { PlainTextContent, HTMLContent, XMLContent, CSSContent }; // PlainText only checks for BOM. 107 enum ContentType { PlainTextContent, HTMLContent, XMLContent, CSSContent }; // PlainText only checks for BOM.
106 static ContentType determineContentType(const String& mimeType); 108 static ContentType determineContentType(const String& mimeType);
107 static const WTF::TextEncoding& defaultEncoding(ContentType, const WTF::TextEncoding& defaultEncoding); 109 static const WTF::TextEncoding& defaultEncoding(ContentType, const WTF::TextEncoding& defaultEncoding);
108 110
109 bool checkForCSSCharset(const char*, size_t, bool& movedDataToBuffer); 111 bool checkForCSSCharset(const char*, size_t, bool& movedDataToBuffer);
110 bool checkForXMLCharset(const char*, size_t, bool& movedDataToBuffer); 112 bool checkForXMLCharset(const char*, size_t, bool& movedDataToBuffer);
111 void checkForMetaCharset(const char*, size_t); 113 void checkForMetaCharset(const char*, size_t);
112 bool shouldAutoDetect() const; 114 bool shouldAutoDetect() const;
115 void detectTextEncoding(const char*, size_t);
113 116
114 ContentType m_contentType; 117 ContentType m_contentType;
115 WTF::TextEncoding m_encoding; 118 WTF::TextEncoding m_encoding;
116 OwnPtr<TextCodec> m_codec; 119 OwnPtr<TextCodec> m_codec;
117 EncodingSource m_source; 120 EncodingSource m_source;
118 const char* m_hintEncoding; 121 const char* m_hintEncoding;
119 Vector<char> m_buffer; 122 Vector<char> m_buffer;
120 bool m_checkedForBOM; 123 bool m_checkedForBOM;
121 bool m_checkedForCSSCharset; 124 bool m_checkedForCSSCharset;
122 bool m_checkedForXMLCharset; 125 bool m_checkedForXMLCharset;
123 bool m_checkedForMetaCharset; 126 bool m_checkedForMetaCharset;
124 bool m_useLenientXMLDecoding; // Don't stop on XML decoding errors. 127 bool m_useLenientXMLDecoding; // Don't stop on XML decoding errors.
125 bool m_sawError; 128 bool m_sawError;
126 EncodingDetectionOption m_encodingDetectionOption; 129 EncodingDetectionOption m_encodingDetectionOption;
127 130
128 OwnPtr<HTMLMetaCharsetParser> m_charsetParser; 131 OwnPtr<HTMLMetaCharsetParser> m_charsetParser;
129 }; 132 };
130 133
131 } // namespace blink 134 } // namespace blink
132 135
133 #endif 136 #endif
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698