OLD | NEW |
| (Empty) |
1 # Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
2 # Use of this source code is governed by a BSD-style license that can be | |
3 # found in the LICENSE file. | |
4 | |
5 """Collection of functions and classes to fix various encoding problems on | |
6 multiple platforms with python. | |
7 """ | |
8 | |
9 import codecs | |
10 import locale | |
11 import os | |
12 import sys | |
13 | |
14 | |
# Prevents initializing multiple times: fix_win_sys_argv() sets this to True
# once it has rebuilt sys.argv and returns early on any later call.
_SYS_ARGV_PROCESSED = False
17 | |
18 | |
def complain(message):
  """Reports a message on the interpreter's original stderr.

  If any exception occurs in this file, we'll probably try to print it on
  stderr, which makes for frustrating debugging if stderr is directed to our
  wrapper. So be paranoid about catching errors and reporting them to
  sys.__stderr__, so that the user has a higher chance to see them.

  Args:
    message: what to report. A str is written verbatim; any other object is
        written as its repr().
  """
  # A conditional expression replaces the 'cond and a or b' idiom, which fell
  # through to repr() for an empty string and printed '' instead of nothing.
  text = message if isinstance(message, str) else repr(message)
  sys.__stderr__.write(text + '\n')
27 | |
28 | |
def fix_default_encoding():
  """Forces utf8 solidly on all platforms.

  By default python execution environment is lazy and defaults to ascii
  encoding.

  http://uucode.com/blog/2007/03/23/shut-up-you-dummy-7-bit-python/

  After switching the default encoding, every LC_* category exposed by the
  locale module is moved to the UTF-8 variant of its current language. A
  category whose locale cannot be set or parsed is skipped instead of
  aborting the whole process.

  Returns:
    False if the default encoding was already utf-8 (nothing is touched),
    True otherwise.
  """
  if sys.getdefaultencoding() == 'utf-8':
    return False

  # Regenerate setdefaultencoding.
  reload(sys)
  # Module 'sys' has no 'setdefaultencoding' member
  # pylint: disable=E1101
  sys.setdefaultencoding('utf-8')
  for attr in dir(locale):
    if not attr.startswith('LC_'):
      continue
    aref = getattr(locale, attr)
    try:
      locale.setlocale(aref, '')
      lang, _ = locale.getlocale(aref)
    except (locale.Error, TypeError, ValueError):
      # The environment names a locale that is unsupported (locale.Error) or
      # that getlocale() cannot parse (TypeError/ValueError). Leave this
      # category alone rather than crashing at start-up.
      continue
    if lang is not None:
      try:
        locale.setlocale(aref, (lang, 'UTF-8'))
      except locale.Error:
        # The UTF-8 variant is not available right now; record the wish in
        # the environment so the final setlocale() below can pick it up.
        os.environ[attr] = lang + '.UTF-8'
  try:
    locale.setlocale(locale.LC_ALL, '')
  except locale.Error:
    # Best effort: keep whatever locale is currently active.
    pass
  return True
58 | |
59 | |
60 ############################### | |
61 # Windows specific | |
62 | |
63 | |
def fix_win_sys_argv(encoding):
  """Rebuilds sys.argv from the process' wide-character command line.

  Every argument is encoded with 'encoding' (undecodable characters are
  replaced); utf-8 is recommended.

  Works around <http://bugs.python.org/issue2128>.

  Returns:
    False if a previous call already processed sys.argv, True otherwise.
  """
  global _SYS_ARGV_PROCESSED
  if _SYS_ARGV_PROCESSED:
    return False

  from ctypes import byref, c_int, POINTER, windll, WINFUNCTYPE
  from ctypes.wintypes import LPCWSTR, LPWSTR

  # <http://msdn.microsoft.com/en-us/library/ms683156.aspx>
  get_command_line = WINFUNCTYPE(LPWSTR)(('GetCommandLineW', windll.kernel32))
  # <http://msdn.microsoft.com/en-us/library/bb776391.aspx>
  command_line_to_argv = WINFUNCTYPE(
      POINTER(LPWSTR), LPCWSTR, POINTER(c_int))(
          ('CommandLineToArgvW', windll.shell32))

  arg_count = c_int(0)
  wide_args = command_line_to_argv(get_command_line(), byref(arg_count))
  args = []
  for index in xrange(arg_count.value):
    args.append(wide_args[index].encode(encoding, 'replace'))

  if not hasattr(sys, 'frozen'):
    # An executable produced by py2exe or bbfreeze is invoked directly.
    # Otherwise the first argument is the Python interpreter itself, so
    # drop it.
    del args[:1]

    # Also drop the option arguments meant for the Python interpreter.
    while args:
      head = args[0]
      if not head.startswith(u'-') or head == u'-':
        break
      args = args[1:]
      if head == u'-m':
        # sys.argv[0] should really be the absolute path of the module
        # source, but never mind.
        break
      if head == u'-c':
        args[0] = u'-c'
        break
  sys.argv = args
  _SYS_ARGV_PROCESSED = True
  return True
112 | |
113 | |
def fix_win_codec():
  """Makes the 'cp65001' codec name resolve to utf-8 when it is unknown.

  Works around <http://bugs.python.org/issue6058>.

  Returns:
    False if 'cp65001' could already be looked up, True if a codec search
    function had to be registered for it.
  """
  # <http://msdn.microsoft.com/en-us/library/dd317756.aspx>
  def search_cp65001(name):
    # Codec search function: only answers for 'cp65001'.
    if name == 'cp65001':
      return codecs.lookup('utf-8')
    return None

  try:
    codecs.lookup('cp65001')
  except LookupError:
    codecs.register(search_cp65001)
    return True
  return False
124 | |
125 | |
class WinUnicodeOutputBase(object):
  """Common file-like skeleton for the sys.stdout / sys.stderr replacements
  used on Windows.

  Subclasses must provide flush() and write(). Setting encoding to utf-8 is
  recommended.
  """
  def __init__(self, fileno, name, encoding):
    # Public attributes expected from a file object.
    self.name = name
    self.encoding = encoding
    self.mode = 'w'
    self.closed = False
    self.softspace = False

    # File number of the stream being replaced.
    self._fileno = fileno

  @staticmethod
  def isatty():
    return False

  def close(self):
    # Only flag the object as closed; really closing the handle would only
    # cause problems.
    self.closed = True

  def fileno(self):
    return self._fileno

  def flush(self):
    raise NotImplementedError()

  def write(self, text):
    raise NotImplementedError()

  def writelines(self, lines):
    """Writes each item of 'lines' through write(); failures are reported
    with complain() and then re-raised."""
    try:
      for item in lines:
        self.write(item)
    except Exception as error:
      complain('%s.writelines: %r' % (self.name, error))
      raise
166 | |
167 | |
class WinUnicodeConsoleOutput(WinUnicodeOutputBase):
  """Stream replacement that writes straight to a Windows Console.

  Text is sent through the win32 WriteConsoleW() API.
  """
  def __init__(self, console_handle, fileno, stream_name, encoding):
    display_name = '<Unicode console %s>' % stream_name
    super(WinUnicodeConsoleOutput, self).__init__(
        fileno, display_name, encoding)
    # Handle to use for WriteConsoleW
    self._console_handle = console_handle

    # ctypes is imported lazily and the pieces needed by write() are kept on
    # the instance.
    from ctypes import byref, GetLastError, POINTER, windll, WINFUNCTYPE
    from ctypes.wintypes import BOOL, DWORD, HANDLE, LPVOID, LPWSTR

    self._byref = byref
    self._DWORD = DWORD
    self._GetLastError = GetLastError

    # <http://msdn.microsoft.com/en-us/library/ms687401.aspx>
    prototype = WINFUNCTYPE(
        BOOL, HANDLE, LPWSTR, DWORD, POINTER(DWORD), LPVOID)
    self._WriteConsoleW = prototype(('WriteConsoleW', windll.kernel32))

  def flush(self):
    # No need to flush the console since it's immediate.
    pass

  def write(self, text):
    """Writes 'text' to the console, in chunks of at most 10000 characters.

    Non-unicode input is converted with str() and decoded with
    self.encoding. Failures are reported with complain() and re-raised.
    """
    try:
      if not isinstance(text, unicode):
        # Convert to unicode.
        text = str(text).decode(self.encoding, 'replace')
      pending = len(text)
      while pending > 0:
        written = self._DWORD(0)
        # There is a shorter-than-documented limitation on the length of the
        # string passed to WriteConsoleW. See
        # <http://tahoe-lafs.org/trac/tahoe-lafs/ticket/1232>.
        chunk_size = min(pending, 10000)
        retval = self._WriteConsoleW(
            self._console_handle, text, chunk_size,
            self._byref(written), None)
        if retval == 0 or written.value == 0:
          raise IOError(
              'WriteConsoleW returned %r, n.value = %r, last error = %r' % (
                  retval, written.value, self._GetLastError()))
        pending -= written.value
        if pending:
          # Drop what the console accepted and go around again.
          text = text[written.value:]
    except Exception as error:
      complain('%s.write: %r' % (self.name, error))
      raise
222 | |
223 | |
class WinUnicodeOutput(WinUnicodeOutputBase):
  """Stream replacement wrapping a non-console output (e.g. a redirected
  file) on Windows.

  Unicode text is encoded with the configured encoding before it is handed
  to the wrapped stream; characters that cannot be encoded are replaced
  instead of raising.
  """
  def __init__(self, stream, fileno, encoding):
    display_name = '<Unicode redirected %s>' % stream.name
    super(WinUnicodeOutput, self).__init__(fileno, display_name, encoding)
    # Output stream
    self._stream = stream

    # Flush right now.
    self.flush()

  def flush(self):
    """Flushes the wrapped stream; failures are reported and re-raised."""
    try:
      self._stream.flush()
    except Exception as error:
      complain('%s.flush: %r from %r' % (self.name, error, self._stream))
      raise

  def write(self, text):
    """Writes 'text' to the wrapped stream, encoding unicode input first."""
    try:
      payload = text
      if isinstance(payload, unicode):
        # Replace characters that cannot be printed instead of failing.
        payload = payload.encode(self.encoding, 'replace')
      self._stream.write(payload)
    except Exception as error:
      complain('%s.write: %r' % (self.name, error))
      raise
255 | |
256 | |
def win_handle_is_a_console(handle):
  """Returns True if a Windows file handle is a handle to a console.

  Args:
    handle: a value as returned by GetStdHandle(): a valid handle, None
        (NULL) or INVALID_HANDLE_VALUE.

  Returns:
    True if the handle is a local character device for which GetConsoleMode()
    succeeds, False otherwise.
  """
  from ctypes import byref, POINTER, windll, WINFUNCTYPE
  from ctypes.wintypes import BOOL, DWORD, HANDLE

  FILE_TYPE_CHAR = 0x0002
  FILE_TYPE_REMOTE = 0x8000
  # Handles are pointer-sized, so the invalid value is derived from HANDLE to
  # match what a HANDLE-returning call yields on 64-bit Windows. The 32-bit
  # DWORD form is still accepted for callers that built it that way.
  invalid_handle_values = (HANDLE(-1).value, DWORD(-1).value)

  # <http://msdn.microsoft.com/en-us/library/ms683167.aspx>
  GetConsoleMode = WINFUNCTYPE(BOOL, HANDLE, POINTER(DWORD))(
      ('GetConsoleMode', windll.kernel32))
  # <http://msdn.microsoft.com/en-us/library/aa364960.aspx>
  # The parameter is a HANDLE; declaring it as DWORD would truncate it on
  # 64-bit Windows.
  GetFileType = WINFUNCTYPE(DWORD, HANDLE)(('GetFileType', windll.kernel32))

  # GetStdHandle returns INVALID_HANDLE_VALUE, NULL, or a valid handle.
  if handle is None or handle in invalid_handle_values:
    return False
  if (GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR:
    return False
  # GetConsoleMode() returns a BOOL (an int); normalize it to a real bool.
  return bool(GetConsoleMode(handle, byref(DWORD())))
278 | |
279 | |
def win_get_unicode_stream(stream, excepted_fileno, output_handle, encoding):
  """Wraps 'stream' in a unicode-compatible replacement.

  A direct console writer is returned only when all of these hold:
  - the stream's file number is the expected console file number,
  - the standard handle for 'output_handle' can be fetched,
  - that 'real' handle is in fact a handle to a console.

  In every other case an auto-encoding wrapper around 'stream' is returned.
  """
  old_fileno = getattr(stream, 'fileno', lambda: None)()
  if old_fileno != excepted_fileno:
    # It's something else. Create an auto-encoding stream.
    return WinUnicodeOutput(stream, old_fileno, encoding)

  from ctypes import windll, WINFUNCTYPE
  from ctypes.wintypes import DWORD, HANDLE

  # <http://msdn.microsoft.com/en-us/library/ms683231.aspx>
  get_std_handle = WINFUNCTYPE(HANDLE, DWORD)(
      ('GetStdHandle', windll.kernel32))

  real_output_handle = get_std_handle(DWORD(output_handle))
  if not win_handle_is_a_console(real_output_handle):
    # Redirected to a file or a pipe: create an auto-encoding stream.
    return WinUnicodeOutput(stream, old_fileno, encoding)

  # It's a console.
  return WinUnicodeConsoleOutput(
      real_output_handle, old_fileno, stream.name, encoding)
304 | |
305 | |
def fix_win_console(encoding):
  """Makes Unicode console output work independently of the current code page.

  This also fixes <http://bugs.python.org/issue1602>.
  Credit to Michael Kaplan
  <http://blogs.msdn.com/b/michkap/archive/2010/04/07/9989346.aspx> and
  TZOmegaTZIOY
  <http://stackoverflow.com/questions/878972/windows-cmd-encoding-change-causes-python-crash/1432462#1432462>.

  Returns:
    False if sys.stdout or sys.stderr is already one of our wrappers, True
    otherwise (even when the replacement itself failed and was only
    reported through complain()).
  """
  if any(isinstance(current, WinUnicodeOutputBase)
         for current in (sys.stdout, sys.stderr)):
    return False

  # (sys attribute, file number, standard handle id); -11 and -12 are defined
  # in stdio.h
  targets = (
      ('stdout', 1, -11),
      ('stderr', 2, -12),
  )
  try:
    # SetConsoleCP and SetConsoleOutputCP could be used to change the code page
    # but it's not really useful since the code here is using WriteConsoleW().
    # Also, changing the code page is 'permanent' to the console and needs to be
    # reverted manually.
    # In practice one needs to set the console font to a TTF font to be able to
    # see all the characters but it failed for me in practice. In any case, it
    # won't throw any exception when printing, which is the important part.
    for attr, fileno, handle_id in targets:
      replacement = win_get_unicode_stream(
          getattr(sys, attr), fileno, handle_id, encoding)
      setattr(sys, attr, replacement)
    # TODO(maruel): Do sys.stdin with ReadConsoleW(). Albeit the limitation is
    # "It doesn't appear to be possible to read Unicode characters in UTF-8
    # mode" and this appears to be a limitation of cmd.exe.
  except Exception as error:
    complain('exception %r while fixing up sys.stdout and sys.stderr' % error)
  return True
336 | |
337 | |
def fix_encoding():
  """Applies every encoding fix relevant to the current platform.

  Should be called at the very beginning of the process.

  Returns:
    True only if every fix that was attempted reported that it changed
    something; False as soon as one of them had nothing to do.
  """
  on_windows = sys.platform == 'win32'
  outcomes = []

  if on_windows:
    outcomes.append(fix_win_codec())

  outcomes.append(fix_default_encoding())

  if on_windows:
    # Read after fix_default_encoding() so the new default is picked up.
    encoding = sys.getdefaultencoding()
    outcomes.append(fix_win_sys_argv(encoding))
    outcomes.append(fix_win_console(encoding))
  return all(outcomes)
OLD | NEW |