OLD | NEW |
| (Empty) |
1 # Copyright 2013 The Chromium Authors. All rights reserved. | |
2 # Use of this source code is governed by a BSD-style license that can be | |
3 # found in the LICENSE file. | |
4 | |
5 """Collection of functions and classes to fix various encoding problems on | |
6 multiple platforms with python. | |
7 """ | |
8 | |
9 import codecs | |
10 import locale | |
11 import os | |
12 import sys | |
13 | |
14 | |
# Set to True once fix_win_sys_argv() has replaced sys.argv, so that later
# calls return early instead of processing the command line a second time.
_SYS_ARGV_PROCESSED = False
17 | |
18 | |
def complain(message):
  """Reports a message on the original stderr, bypassing any wrapper.

  If any exception occurs in this file, we'll probably try to print it on
  stderr, which makes for frustrating debugging if stderr is directed to our
  wrapper. So be paranoid about reporting errors to sys.__stderr__, so that
  the user has a higher chance to see them.

  Args:
    message: What to report. A str is written verbatim (the empty string
      included); any other object is written as its repr().
  """
  # A conditional expression is used instead of `cond and a or b`: the latter
  # falls through to repr() whenever message is the empty string.
  text = message if isinstance(message, str) else repr(message)
  sys.__stderr__.write(text + '\n')
27 | |
28 | |
def fix_default_encoding():
  """Switches the interpreter default encoding and the locale to UTF-8.

  By default the python execution environment is lazy and defaults to ascii
  encoding.

  http://uucode.com/blog/2007/03/23/shut-up-you-dummy-7-bit-python/

  Returns:
    False if the default encoding already was utf-8 (nothing is touched),
    True once the default encoding and the locale categories were updated.
  """
  if sys.getdefaultencoding() == 'utf-8':
    return False

  # Reloading sys regenerates setdefaultencoding.
  reload(sys)
  # Module 'sys' has no 'setdefaultencoding' member
  # pylint: disable=E1101
  sys.setdefaultencoding('utf-8')

  for name in dir(locale):
    if not name.startswith('LC_'):
      continue
    category = getattr(locale, name)
    # Select the user's default setting for this category first.
    try:
      locale.setlocale(category, '')
    except locale.Error:
      continue
    try:
      language = locale.getlocale(category)[0]
    except (TypeError, ValueError):
      continue
    if not language:
      continue
    try:
      locale.setlocale(category, (language, 'UTF-8'))
    except locale.Error:
      # The UTF-8 variant could not be set directly; record it in the
      # environment variable of the same name instead.
      os.environ[name] = language + '.UTF-8'

  try:
    locale.setlocale(locale.LC_ALL, '')
  except locale.Error:
    pass
  return True
67 | |
68 | |
69 ############################### | |
70 # Windows specific | |
71 | |
72 | |
def fix_win_sys_argv(encoding):
  """Replaces sys.argv with the process command line encoded in 'encoding'.

  utf-8 is recommended.

  Works around <http://bugs.python.org/issue2128>.

  Args:
    encoding: Name of the codec used to encode every argument. Characters the
      codec cannot represent are replaced instead of raising.

  Returns:
    True if sys.argv was rewritten. False if an earlier call already did it,
    or if the command line could not be parsed (sys.argv is then untouched).
  """
  global _SYS_ARGV_PROCESSED
  if _SYS_ARGV_PROCESSED:
    return False

  # These types are available on linux but not Mac.
  # pylint: disable=E0611,F0401
  from ctypes import byref, c_int, POINTER, windll, WINFUNCTYPE
  from ctypes.wintypes import LPCWSTR, LPWSTR

  # <http://msdn.microsoft.com/en-us/library/ms683156.aspx>
  GetCommandLineW = WINFUNCTYPE(LPWSTR)(('GetCommandLineW', windll.kernel32))
  # <http://msdn.microsoft.com/en-us/library/bb776391.aspx>
  CommandLineToArgvW = WINFUNCTYPE(POINTER(LPWSTR), LPCWSTR, POINTER(c_int))(
      ('CommandLineToArgvW', windll.shell32))

  argc = c_int(0)
  argv_unicode = CommandLineToArgvW(GetCommandLineW(), byref(argc))
  if not argv_unicode:
    # CommandLineToArgvW returns NULL on failure. Keep the existing sys.argv
    # rather than replacing it with an empty list.
    complain('CommandLineToArgvW failed; sys.argv was left untouched')
    return False

  argv = [
      argv_unicode[i].encode(encoding, 'replace')
      for i in range(argc.value)]

  if not hasattr(sys, 'frozen'):
    # If this is an executable produced by py2exe or bbfreeze, then it
    # will have been invoked directly. Otherwise, argv[0] is the Python
    # interpreter, so skip that.
    argv = argv[1:]

    # Also skip option arguments to the Python interpreter. The arguments are
    # encoded byte strings at this point, so they are compared against plain
    # str literals: comparing against unicode literals would force an implicit
    # decode with the default encoding, which can fail on non-ASCII arguments.
    while argv:
      arg = argv[0]
      if not arg.startswith('-') or arg == '-':
        break
      argv = argv[1:]
      if arg == '-m':
        # sys.argv[0] should really be the absolute path of the
        # module source, but never mind.
        break
      if arg == '-c':
        # Mirror what the interpreter does: argv[0] is '-c' and the command
        # string itself is dropped. A str keeps sys.argv homogeneous.
        argv[0] = '-c'
        break
  sys.argv = argv
  _SYS_ARGV_PROCESSED = True
  return True
123 | |
124 | |
def fix_win_codec():
  """Makes the 'cp65001' codec name resolve to utf-8 if python lacks it.

  Works around <http://bugs.python.org/issue6058>.

  Returns:
    True if a codec search function had to be registered, False if looking up
    'cp65001' already succeeded.
  """
  def find_cp65001(codec_name):
    # Codec search function: answers only for cp65001, with the utf-8 codec.
    if codec_name == 'cp65001':
      return codecs.lookup('utf-8')
    return None

  # <http://msdn.microsoft.com/en-us/library/dd317756.aspx>
  try:
    codecs.lookup('cp65001')
  except LookupError:
    codecs.register(find_cp65001)
    return True
  return False
135 | |
136 | |
class WinUnicodeOutputBase(object):
  """File-like base for the sys.stdout / sys.stderr replacements on Windows.

  Subclasses have to provide flush() and write(). Setting encoding to utf-8
  is recommended.
  """
  def __init__(self, fileno, name, encoding):
    # Attributes mirroring those of a regular file object.
    self.name = name
    self.encoding = encoding
    self.mode = 'w'
    self.closed = False
    self.softspace = False

    # File number reported by fileno().
    self._fileno = fileno

  @staticmethod
  def isatty():
    """Always reports that the stream is not a tty."""
    return False

  def close(self):
    """Flags the stream as closed; the underlying handle stays open."""
    # Don't really close the handle, that would only cause problems.
    self.closed = True

  def fileno(self):
    """Returns the file number given at construction."""
    return self._fileno

  def flush(self):
    """Must be implemented by subclasses."""
    raise NotImplementedError()

  def write(self, text):
    """Must be implemented by subclasses."""
    raise NotImplementedError()

  def writelines(self, lines):
    """Passes every item of 'lines' to write(), in order.

    Any exception is reported through complain() and then re-raised.
    """
    try:
      for item in lines:
        self.write(item)
    except Exception as err:
      complain('%s.writelines: %r' % (self.name, err))
      raise
177 | |
178 | |
class WinUnicodeConsoleOutput(WinUnicodeOutputBase):
  """Stream writing straight to a Windows console.

  Text is sent through the win32 console API (WriteConsoleW) rather than
  through the regular file layer.
  """
  def __init__(self, console_handle, fileno, stream_name, encoding):
    display_name = '<Unicode console %s>' % stream_name
    super(WinUnicodeConsoleOutput, self).__init__(
        fileno, display_name, encoding)
    # Handle to use for WriteConsoleW
    self._console_handle = console_handle

    # Loads the necessary function.
    # These types are available on linux but not Mac.
    # pylint: disable=E0611,F0401
    from ctypes import byref, GetLastError, POINTER, windll, WINFUNCTYPE
    from ctypes.wintypes import BOOL, DWORD, HANDLE, LPWSTR
    from ctypes.wintypes import LPVOID  # pylint: disable=E0611

    # Kept on the instance so that write() does not need to import again.
    self._byref = byref
    self._DWORD = DWORD
    self._GetLastError = GetLastError

    # <http://msdn.microsoft.com/en-us/library/ms687401.aspx>
    prototype = WINFUNCTYPE(
        BOOL, HANDLE, LPWSTR, DWORD, POINTER(DWORD), LPVOID)
    self._WriteConsoleW = prototype(('WriteConsoleW', windll.kernel32))

  def flush(self):
    """Does nothing: console writes are immediate."""
    pass

  def write(self, text):
    """Writes 'text' to the console, in chunks of at most 10000 characters.

    Anything that is not unicode is converted with str() and decoded with
    self.encoding, replacing undecodable bytes. Failures are reported through
    complain() and re-raised.

    Raises:
      IOError: WriteConsoleW failed or wrote nothing.
    """
    try:
      if not isinstance(text, unicode):
        # Convert to unicode.
        text = str(text).decode(self.encoding, 'replace')
      while text:
        written = self._DWORD(0)
        # There is a shorter-than-documented limitation on the length of the
        # string passed to WriteConsoleW. See
        # <http://tahoe-lafs.org/trac/tahoe-lafs/ticket/1232>.
        ok = self._WriteConsoleW(
            self._console_handle, text,
            min(len(text), 10000),
            self._byref(written), None)
        if ok == 0 or written.value == 0:
          raise IOError(
              'WriteConsoleW returned %r, n.value = %r, last error = %r' % (
                  ok, written.value, self._GetLastError()))
        # Drop what the console accepted and go on with the rest.
        text = text[written.value:]
    except Exception as err:
      complain('%s.write: %r' % (self.name, err))
      raise
236 | |
237 | |
class WinUnicodeOutput(WinUnicodeOutputBase):
  """Output adaptor wrapping a redirected (non-console) stream on Windows.

  If the standard FileWrite function is used, the output is encoded in the
  current code page, whereas WriteConsoleW() permits writing any character.
  This adaptor therefore encodes unicode text itself, with the configured
  encoding, before handing it to the wrapped stream.
  """
  def __init__(self, stream, fileno, encoding):
    display_name = '<Unicode redirected %s>' % stream.name
    super(WinUnicodeOutput, self).__init__(fileno, display_name, encoding)
    # Output stream
    self._stream = stream

    # Flush right now.
    self.flush()

  def flush(self):
    """Flushes the wrapped stream; failures are reported then re-raised."""
    try:
      self._stream.flush()
    except Exception as err:
      complain('%s.flush: %r from %r' % (self.name, err, self._stream))
      raise

  def write(self, text):
    """Writes 'text' to the wrapped stream, encoding unicode text first.

    Failures are reported through complain() and re-raised.
    """
    try:
      payload = text
      if isinstance(payload, unicode):
        # Replace characters that cannot be printed instead of failing.
        payload = payload.encode(self.encoding, 'replace')
      self._stream.write(payload)
    except Exception as err:
      complain('%s.write: %r' % (self.name, err))
      raise
269 | |
270 | |
def win_handle_is_a_console(handle):
  """Returns True if a Windows file handle is a handle to a console.

  Args:
    handle: A value as returned by GetStdHandle: INVALID_HANDLE_VALUE, None
      (NULL) or a valid handle.

  Returns:
    True only if the handle is a character device for which GetConsoleMode
    succeeds; False otherwise.
  """
  # These types are available on linux but not Mac.
  # pylint: disable=E0611,F0401
  from ctypes import byref, POINTER, windll, WINFUNCTYPE
  from ctypes.wintypes import BOOL, DWORD, HANDLE

  FILE_TYPE_CHAR = 0x0002
  FILE_TYPE_REMOTE = 0x8000
  # Computed at pointer width: a HANDLE is wider than a DWORD on 64-bit
  # Windows, so a DWORD-sized -1 would never match an invalid handle there.
  INVALID_HANDLE_VALUE = HANDLE(-1).value

  # <http://msdn.microsoft.com/en-us/library/ms683167.aspx>
  GetConsoleMode = WINFUNCTYPE(BOOL, HANDLE, POINTER(DWORD))(
      ('GetConsoleMode', windll.kernel32))
  # <http://msdn.microsoft.com/en-us/library/aa364960.aspx>
  # The argument is declared as HANDLE so it is not truncated to 32 bits.
  GetFileType = WINFUNCTYPE(DWORD, HANDLE)(('GetFileType', windll.kernel32))

  # GetStdHandle returns INVALID_HANDLE_VALUE, NULL, or a valid handle.
  if handle is None or handle == INVALID_HANDLE_VALUE:
    return False
  # The remote bit is masked out before checking for a character device.
  if (GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR:
    return False
  # Character devices other than a console make GetConsoleMode fail.
  return bool(GetConsoleMode(handle, byref(DWORD())))
294 | |
295 | |
def win_get_unicode_stream(stream, excepted_fileno, output_handle, encoding):
  """Wraps 'stream' in a unicode-compatible replacement.

  A direct console-writing object is returned only if:
  - the file number of the stream is the expected console file number, and
  - the standard handle designated by 'output_handle' is in fact a handle to
    a console.
  In every other case an auto-encoding wrapper around 'stream' is returned.
  """
  get_fileno = getattr(stream, 'fileno', lambda: None)
  old_fileno = get_fileno()
  if old_fileno != excepted_fileno:
    # Not the expected descriptor. Create an auto-encoding stream.
    return WinUnicodeOutput(stream, old_fileno, encoding)

  # These types are available on linux but not Mac.
  # pylint: disable=E0611,F0401
  from ctypes import windll, WINFUNCTYPE
  from ctypes.wintypes import DWORD, HANDLE

  # <http://msdn.microsoft.com/en-us/library/ms683231.aspx>
  GetStdHandle = WINFUNCTYPE(HANDLE, DWORD)(('GetStdHandle', windll.kernel32))

  real_output_handle = GetStdHandle(DWORD(output_handle))
  if not win_handle_is_a_console(real_output_handle):
    # It's something else. Create an auto-encoding stream.
    return WinUnicodeOutput(stream, old_fileno, encoding)

  # It's a console.
  return WinUnicodeConsoleOutput(
      real_output_handle, old_fileno, stream.name, encoding)
322 | |
323 | |
def fix_win_console(encoding):
  """Makes Unicode console output work independently of the current code page.

  This also fixes <http://bugs.python.org/issue1602>.
  Credit to Michael Kaplan
  <http://blogs.msdn.com/b/michkap/archive/2010/04/07/9989346.aspx> and
  TZOmegaTZIOY
  <http://stackoverflow.com/questions/878972/windows-cmd-encoding-change-causes-python-crash/1432462#1432462>.

  Returns:
    False if sys.stdout or sys.stderr is already one of the wrappers defined
    in this file, True otherwise (even when installing the wrappers failed;
    the failure is reported through complain()).
  """
  already_wrapped = any(
      isinstance(stream, WinUnicodeOutputBase)
      for stream in (sys.stdout, sys.stderr))
  if already_wrapped:
    return False

  try:
    # SetConsoleCP and SetConsoleOutputCP could be used to change the code page
    # but it's not really useful since the code here is using WriteConsoleW().
    # Also, changing the code page is 'permanent' to the console and needs to be
    # reverted manually.
    # In practice one needs to set the console font to a TTF font to be able to
    # see all the characters but it failed for me in practice. In any case, it
    # won't throw any exception when printing, which is the important part.
    # -11 and -12 are the values handed to GetStdHandle for the standard output
    # and the standard error handles.
    sys.stdout = win_get_unicode_stream(sys.stdout, 1, -11, encoding)
    sys.stderr = win_get_unicode_stream(sys.stderr, 2, -12, encoding)
    # TODO(maruel): Do sys.stdin with ReadConsoleW(). Albeit the limitation is
    # "It doesn't appear to be possible to read Unicode characters in UTF-8
    # mode" and this appears to be a limitation of cmd.exe.
  except Exception as err:
    complain('exception %r while fixing up sys.stdout and sys.stderr' % err)
  return True
354 | |
355 | |
def fix_encoding():
  """Fixes various encoding problems on all platforms.

  Should be called at the very beginning of the process.

  Returns:
    True only if every fix that was attempted reported having changed
    something; False as soon as one of them reported it had nothing to do.
  """
  on_windows = sys.platform == 'win32'

  # Every fix runs regardless of what the previous ones returned.
  outcomes = []
  if on_windows:
    outcomes.append(fix_win_codec())

  outcomes.append(fix_default_encoding())

  if on_windows:
    encoding = sys.getdefaultencoding()
    outcomes.append(fix_win_sys_argv(encoding))
    outcomes.append(fix_win_console(encoding))
  return all(outcomes)
OLD | NEW |