OLD | NEW |
| (Empty) |
1 #!/usr/bin/env python | |
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
3 # Use of this source code is governed by a BSD-style license that can be | |
4 # found in the LICENSE file. | |
5 | |
6 '''Pseudo RTL, (aka Fake Bidi) support. It simply wraps each word with | |
7 Unicode RTL overrides. | |
8 More info at https://sites.google.com/a/chromium.org/dev/Home/fake-bidi | |
9 ''' | |
10 | |
11 import re | |
12 | |
13 from grit import lazy_re | |
14 from grit import tclib | |
15 | |
# Maps each character that can be accented to its accented replacement.
# Every key is a single ASCII character; every value is a single non-ASCII
# unicode character.
ACCENTED_STRINGS = {
  # Lowercase and uppercase vowels.
  'a': u"\u00e5",
  'e': u"\u00e9",
  'i': u"\u00ee",
  'o': u"\u00f6",
  'u': u"\u00fb",
  'A': u"\u00c5",
  'E': u"\u00c9",
  'I': u"\u00ce",
  'O': u"\u00d6",
  'U': u"\u00db",
  # Other letters, lowercase group followed by the uppercase group.
  'c': u"\u00e7",
  'd': u"\u00f0",
  'n': u"\u00f1",
  'p': u"\u00fe",
  'y': u"\u00fd",
  'C': u"\u00c7",
  'D': u"\u00d0",
  'N': u"\u00d1",
  'P': u"\u00de",
  'Y': u"\u00dd",
  # Remaining letters, listed as lowercase/uppercase neighbours.
  'f': u"\u0192",
  's': u"\u0161",
  'S': u"\u0160",
  'z': u"\u017e",
  'Z': u"\u017d",
  'g': u"\u011d",
  'G': u"\u011c",
  'h': u"\u0125",
  'H': u"\u0124",
  'j': u"\u0135",
  'J': u"\u0134",
  'k': u"\u0137",
  'K': u"\u0136",
  'l': u"\u013c",
  'L': u"\u013b",
  't': u"\u0163",
  'T': u"\u0162",
  'w': u"\u0175",
  'W': u"\u0174",
  # Symbols (and 'R', which maps to the registered-trademark sign).
  '$': u"\u20ac",
  '?': u"\u00bf",
  'R': u"\u00ae",
  '!': u"\u00a1",
}
29 | |
# Matches either a single character that is a key of ACCENTED_STRINGS, or a
# backslash followed by a lowercase ASCII letter or by another backslash.
# Characters in an escape sequence such as "\n" should not be accented, so a
# backslash is matched together with the character that follows it. An
# escaped backslash is consumed as a pair as well, which means that in "\\n"
# (a backslash and then the character "n") the "n" is matched on its own and
# will be accented.
TO_ACCENT = lazy_re.compile(
  r'[%s]|\\[a-z\\]' % ''.join(ACCENTED_STRINGS))
37 | |
# Lexer patterns that split text so html tokens and entities are left alone.
# The scheme handles all well formed tags and entities, html or xhtml. It
# does not handle comments, CDATA sections, or the unescaping tags: script,
# style, xmp or listing. If any of those appear in messages, something is
# wrong.
#
# Every pattern is anchored with '^' so it only matches at the beginning of
# the input. IGNORECASE is used because html tokens are case-insensitive, and
# DOTALL lets '.' match a newline too. The patterns are listed from most to
# least specific; the last one matches any single character.
TOKENS = [
  lazy_re.compile('^%s' % pattern, re.IGNORECASE | re.DOTALL)
  for pattern in (
    # a run of non html special characters
    r'[^<&]+',
    # a tag
    (r'</?[a-z]\w*'                              # beginning of tag
     r'(?:\s+\w+(?:\s*=\s*'                      # attribute start
     r'(?:[^\s"\'>]+|"[^\"]*"|\'[^\']*\'))?'     # attribute value
     r')*\s*/?>'),
    # an entity
    r'&(?:[a-z]\w+|#\d+|#x[\da-f]+);',
    # an html special character not part of a special sequence
    r'.',
  )
]
61 | |
# Unicode directional formatting characters: U+202E (RIGHT-TO-LEFT OVERRIDE)
# and U+202C (POP DIRECTIONAL FORMATTING), which ends the override.
RLO = u'\u202e'
PDF = u'\u202c'

# Matches one or more consecutive word characters that are neither digits
# nor underscores, i.e. a run of letters. The run is captured as group 1.
ALPHABETIC_RUN = lazy_re.compile(r'([^\W0-9_]+)')
66 | |
def PseudoRTLString(text):
  '''Returns a fake bidirectional version of the source string.

  The text is consumed from the front, one token at a time, using the first
  pattern in TOKENS that matches. Tokens that start with '<' or '&' (tags,
  entities, or a stray special character) are copied through unchanged. In
  every other token each alphabetic run is wrapped as RLO + run + PDF.

  Args:
    text: the string to convert

  Return:
    The converted string; an empty string if text is empty.
  '''
  def _WrapRun(run):
    # Surround one alphabetic run with the override / pop-format pair.
    return RLO + run.group() + PDF

  pieces = []
  remaining = text
  while remaining:
    # The last pattern in TOKENS matches any single character, so one of the
    # patterns always matches and the loop always makes progress.
    for pattern in TOKENS:
      found = pattern.search(remaining)
      if not found:
        continue
      chunk = found.group(0)
      remaining = remaining[len(chunk):]
      if chunk[0] not in ('<', '&'):
        # not a tag or entity, so wrap its alphabetic runs
        chunk = ALPHABETIC_RUN.sub(_WrapRun, chunk)
      pieces.append(chunk)
      break
  return ''.join(pieces)
85 | |
86 | |
def PseudoRTLMessage(message):
  '''Builds the pseudo-RTL (aka Fake-Bidi) translation of a message.

  Placeholders in the message content are appended to the translation
  unchanged; every other content part is passed through PseudoRTLString()
  and appended as text.

  Args:
    message: tclib.Message()

  Return:
    tclib.Translation()
  '''
  translation = tclib.Translation()
  for piece in message.GetContent():
    if not isinstance(piece, tclib.Placeholder):
      translation.AppendText(PseudoRTLString(piece))
      continue
    # Placeholders must survive as-is, so they are never rewritten.
    translation.AppendPlaceholder(piece)

  return translation
OLD | NEW |