OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright (C) 2007-2009 Torch Mobile, Inc. All rights reserved. | |
3 * Copyright (C) 2010-2012 Patrick Gansterer <paroga@paroga.com> | |
4 * | |
5 * Redistribution and use in source and binary forms, with or without | |
6 * modification, are permitted provided that the following conditions | |
7 * are met: | |
8 * 1. Redistributions of source code must retain the above copyright | |
9 * notice, this list of conditions and the following disclaimer. | |
10 * 2. Redistributions in binary form must reproduce the above copyright | |
11 * notice, this list of conditions and the following disclaimer in the | |
12 * documentation and/or other materials provided with the distribution. | |
13 * | |
14 * This library is distributed in the hope that it will be useful, | |
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 * Library General Public License for more details. | |
18 * | |
19 * You should have received a copy of the GNU Library General Public License | |
20 * along with this library; see the file COPYING.LIB. If not, write to | |
21 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, | |
22 * Boston, MA 02110-1301, USA. | |
23 */ | |
24 | |
25 #include "config.h" | |
26 #include "TextCodecWin.h" | |
27 | |
28 #include "COMPtr.h" | |
29 #include <mlang.h> | |
30 #include <windows.h> | |
31 #include <wtf/HashMap.h> | |
32 #include <wtf/HashSet.h> | |
33 #include <wtf/text/CString.h> | |
34 #include <wtf/text/StringHash.h> | |
35 #include <wtf/text/WTFString.h> | |
36 | |
37 namespace WebCore { | |
38 | |
39 struct CharsetInfo { | |
40 CString m_name; | |
41 String m_friendlyName; | |
42 UINT m_codePage; | |
43 Vector<CString> m_aliases; | |
44 }; | |
45 | |
class LanguageManager;

// Forward declaration with internal linkage. Without it, the friend
// declaration below would introduce languageManager() with external linkage,
// which conflicts with its later definition as a static function.
static LanguageManager& languageManager();

// Populates the charset lookup tables from the system's MLang code page
// enumeration. The constructor is private, so the only way to obtain an
// instance is through the languageManager() accessor.
class LanguageManager {
private:
    LanguageManager();

    friend LanguageManager& languageManager();
};
52 | |
53 // Usage: a lookup table used to get CharsetInfo with code page ID. | |
54 // Key: code page ID. Value: charset information. | |
55 static HashMap<UINT, CString>& codePageCharsets() | |
56 { | |
57 static HashMap<UINT, CString> cc; | |
58 return cc; | |
59 } | |
60 | |
61 static HashMap<String, CharsetInfo>& knownCharsets() | |
62 { | |
63 static HashMap<String, CharsetInfo> kc; | |
64 return kc; | |
65 } | |
66 | |
67 // Usage: a map that stores charsets that are supported by system. Sorted by nam
e. | |
68 // Key: charset. Value: code page ID. | |
69 typedef HashSet<String> CharsetSet; | |
70 static CharsetSet& supportedCharsets() | |
71 { | |
72 static CharsetSet sl; | |
73 return sl; | |
74 } | |
75 | |
76 static LanguageManager& languageManager() | |
77 { | |
78 static LanguageManager lm; | |
79 return lm; | |
80 } | |
81 | |
82 LanguageManager::LanguageManager() | |
83 { | |
84 COMPtr<IMultiLanguage> multiLanguage; | |
85 if (FAILED(::CoCreateInstance(CLSID_CMultiLanguage, 0, CLSCTX_INPROC_SERVER,
IID_IMultiLanguage, reinterpret_cast<LPVOID*>(&multiLanguage)))) | |
86 return; | |
87 | |
88 COMPtr<IEnumCodePage> enumInterface; | |
89 if (FAILED(multiLanguage->EnumCodePages(MIMECONTF_BROWSER, &enumInterface))) | |
90 return; | |
91 | |
92 MIMECPINFO cpInfo; | |
93 ULONG ccpInfo; | |
94 while (SUCCEEDED(enumInterface->Next(1, &cpInfo, &ccpInfo)) && ccpInfo) { | |
95 if (!IsValidCodePage(cpInfo.uiCodePage)) | |
96 continue; | |
97 | |
98 HashMap<UINT, CString>::iterator i = codePageCharsets().find(cpInfo.uiCo
dePage); | |
99 | |
100 CString name(String(cpInfo.wszWebCharset).latin1()); | |
101 if (i == codePageCharsets().end()) { | |
102 CharsetInfo info; | |
103 info.m_codePage = cpInfo.uiCodePage; | |
104 knownCharsets().set(name.data(), info); | |
105 i = codePageCharsets().set(cpInfo.uiCodePage, name).iterator; | |
106 } | |
107 if (i != codePageCharsets().end()) { | |
108 HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(Stri
ng(i->value.data(), i->value.length())); | |
109 ASSERT(j != knownCharsets().end()); | |
110 CharsetInfo& info = j->value; | |
111 info.m_name = i->value.data(); | |
112 info.m_friendlyName = cpInfo.wszDescription; | |
113 info.m_aliases.append(name); | |
114 info.m_aliases.append(String(cpInfo.wszHeaderCharset).latin1()); | |
115 info.m_aliases.append(String(cpInfo.wszBodyCharset).latin1()); | |
116 String cpName = "cp" + String::number(cpInfo.uiCodePage); | |
117 info.m_aliases.append(cpName.latin1()); | |
118 supportedCharsets().add(i->value.data()); | |
119 } | |
120 } | |
121 } | |
122 | |
123 static UINT getCodePage(const char* name) | |
124 { | |
125 // Explicitly use a "const" reference to fix the silly VS build error | |
126 // saying "==" is not found for const_iterator and iterator | |
127 const HashMap<String, CharsetInfo>& charsets = knownCharsets(); | |
128 HashMap<String, CharsetInfo>::const_iterator i = charsets.find(name); | |
129 return i == charsets.end() ? CP_ACP : i->value.m_codePage; | |
130 } | |
131 | |
132 static PassOwnPtr<TextCodec> newTextCodecWin(const TextEncoding& encoding, const
void*) | |
133 { | |
134 return adoptPtr(new TextCodecWin(getCodePage(encoding.name()))); | |
135 } | |
136 | |
137 TextCodecWin::TextCodecWin(UINT codePage) | |
138 : m_codePage(codePage) | |
139 { | |
140 } | |
141 | |
142 TextCodecWin::~TextCodecWin() | |
143 { | |
144 } | |
145 | |
146 void TextCodecWin::registerExtendedEncodingNames(EncodingNameRegistrar registrar
) | |
147 { | |
148 languageManager(); | |
149 for (CharsetSet::iterator i = supportedCharsets().begin(); i != supportedCha
rsets().end(); ++i) { | |
150 HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(*i); | |
151 if (j != knownCharsets().end()) { | |
152 registrar(j->value.m_name.data(), j->value.m_name.data()); | |
153 for (Vector<CString>::const_iterator alias = j->value.m_aliases.begi
n(); alias != j->value.m_aliases.end(); ++alias) | |
154 registrar(alias->data(), j->value.m_name.data()); | |
155 } | |
156 } | |
157 } | |
158 | |
159 void TextCodecWin::registerExtendedCodecs(TextCodecRegistrar registrar) | |
160 { | |
161 languageManager(); | |
162 for (CharsetSet::iterator i = supportedCharsets().begin(); i != supportedCha
rsets().end(); ++i) { | |
163 HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(*i); | |
164 if (j != knownCharsets().end()) | |
165 registrar(j->value.m_name.data(), newTextCodecWin, 0); | |
166 } | |
167 } | |
168 | |
169 static DWORD getCodePageFlags(UINT codePage) | |
170 { | |
171 if (codePage == 42) // Symbol | |
172 return 0; | |
173 | |
174 // Microsoft says the flag must be 0 for the following code pages | |
175 if (codePage > 50000) { | |
176 if ((codePage >= 50220 && codePage <= 50222) | |
177 || codePage == 50225 | |
178 || codePage == 50227 | |
179 || codePage == 50229 | |
180 || codePage == 52936 | |
181 || codePage == 54936 | |
182 || (codePage >= 57002 && codePage <= 57001) | |
183 || codePage == 65000 // UTF-7 | |
184 ) | |
185 return 0; | |
186 } | |
187 | |
188 return MB_PRECOMPOSED | MB_ERR_INVALID_CHARS; | |
189 } | |
190 | |
// Scans |length| bytes starting at |bytes| and returns a pointer to the first
// byte whose high bit (0x80) is set. If there is no such byte, the returned
// pointer is one past the end of the range (bytes + length).
static inline const char* findFirstNonAsciiCharacter(const char* bytes, size_t length)
{
    const char* const rangeEnd = bytes + length;
    const char* cursor = bytes;
    while (cursor < rangeEnd && !(*cursor & 0x80))
        ++cursor;
    return cursor;
}
199 | |
200 static void decodeInternal(Vector<UChar, 8192>& result, UINT codePage, const cha
r* bytes, size_t length, size_t* left) | |
201 { | |
202 *left = length; | |
203 if (!bytes || !length) | |
204 return; | |
205 | |
206 DWORD flags = getCodePageFlags(codePage); | |
207 | |
208 int testLength = length; | |
209 int untestedLength = length; | |
210 for (;;) { | |
211 int resultLength = MultiByteToWideChar(codePage, flags, bytes, testLengt
h, 0, 0); | |
212 | |
213 if (resultLength > 0) { | |
214 int oldSize = result.size(); | |
215 result.resize(oldSize + resultLength); | |
216 | |
217 MultiByteToWideChar(codePage, flags, bytes, testLength, result.data(
) + oldSize, resultLength); | |
218 | |
219 if (testLength == untestedLength) { | |
220 *left = length - testLength; | |
221 break; | |
222 } | |
223 untestedLength -= testLength; | |
224 length -= testLength; | |
225 bytes += testLength; | |
226 } else { | |
227 untestedLength = testLength - 1; | |
228 if (!untestedLength) { | |
229 *left = length; | |
230 break; | |
231 } | |
232 } | |
233 testLength = (untestedLength + 1) / 2; | |
234 } | |
235 } | |
236 | |
237 String TextCodecWin::decode(const char* bytes, size_t length, bool flush, bool s
topOnError, bool& sawError) | |
238 { | |
239 if (!m_decodeBuffer.isEmpty()) { | |
240 m_decodeBuffer.append(bytes, length); | |
241 bytes = m_decodeBuffer.data(); | |
242 length = m_decodeBuffer.size(); | |
243 } | |
244 | |
245 size_t left; | |
246 Vector<UChar, 8192> result; | |
247 for (;;) { | |
248 decodeInternal(result, m_codePage, bytes, length, &left); | |
249 if (!left) | |
250 break; | |
251 | |
252 if (!flush && left < 16) | |
253 break; | |
254 | |
255 result.append(L'?'); | |
256 sawError = true; | |
257 if (stopOnError) | |
258 return String::adopt(result); | |
259 | |
260 if (left == 1) | |
261 break; | |
262 | |
263 bytes += length - left + 1; | |
264 length = left - 1; | |
265 } | |
266 if (left && !flush) { | |
267 if (m_decodeBuffer.isEmpty()) | |
268 m_decodeBuffer.append(bytes + length - left, left); | |
269 else { | |
270 memmove(m_decodeBuffer.data(), bytes + length - left, left); | |
271 m_decodeBuffer.resize(left); | |
272 } | |
273 } else | |
274 m_decodeBuffer.clear(); | |
275 | |
276 return String::adopt(result); | |
277 } | |
278 | |
279 CString TextCodecWin::encode(const UChar* characters, size_t length, Unencodable
Handling) | |
280 { | |
281 if (!characters || !length) | |
282 return CString(); | |
283 | |
284 int resultLength = WideCharToMultiByte(m_codePage, WC_COMPOSITECHECK, charac
ters, length, 0, 0, 0, 0); | |
285 | |
286 // FIXME: We need to implement UnencodableHandling: QuestionMarksForUnencoda
bles, EntitiesForUnencodables, and URLEncodedEntitiesForUnencodables. | |
287 | |
288 if (resultLength <= 0) | |
289 return "?"; | |
290 | |
291 char* characterBuffer; | |
292 CString result = CString::newUninitialized(resultLength, characterBuffer); | |
293 | |
294 WideCharToMultiByte(m_codePage, WC_COMPOSITECHECK, characters, length, chara
cterBuffer, resultLength, 0, 0); | |
295 | |
296 return result; | |
297 } | |
298 | |
299 void TextCodecWin::enumerateSupportedEncodings(EncodingReceiver& receiver) | |
300 { | |
301 languageManager(); | |
302 for (CharsetSet::iterator i = supportedCharsets().begin(); i != supportedCha
rsets().end(); ++i) { | |
303 HashMap<String, CharsetInfo>::iterator j = knownCharsets().find(*i); | |
304 if (j != knownCharsets().end() && !receiver.receive(j->value.m_name.data
(), j->value.m_friendlyName.charactersWithNullTermination(), j->value.m_codePage
)) | |
305 break; | |
306 } | |
307 } | |
308 | |
309 } // namespace WebCore | |
OLD | NEW |