| OLD | NEW | 
|---|
|  | (Empty) | 
| 1 #!/usr/bin/env python |  | 
| 2 # Copyright (c) 2012 The Chromium Authors. All rights reserved. |  | 
| 3 # Use of this source code is governed by a BSD-style license that can be |  | 
| 4 # found in the LICENSE file. |  | 
| 5 |  | 
| 6 '''A gatherer for the TotalRecall brand of HTML templates with replaceable |  | 
| 7 portions.  We wanted to reuse extern.tclib.api.handlers.html.TCHTMLParser |  | 
| 8 but this proved impossible due to the fact that the TotalRecall HTML templates |  | 
| 9 are in general quite far from parseable HTML and the TCHTMLParser derives |  | 
| 10 from HTMLParser.HTMLParser which requires relatively well-formed HTML.  Some |  | 
| 11 examples of "HTML" from the TotalRecall HTML templates that wouldn't be |  | 
| 12 parseable include things like: |  | 
| 13 |  | 
| 14   <a [PARAMS]>blabla</a>  (not parseable because attributes are invalid) |  | 
| 15 |  | 
| 16   <table><tr><td>[LOTSOFSTUFF]</tr></table> (not parseable because closing |  | 
| 17                                             </td> is in the HTML [LOTSOFSTUFF] |  | 
| 18                                             is replaced by) |  | 
| 19 |  | 
| 20 The other problem with using general parsers (such as TCHTMLParser) is that |  | 
| 21 we want to make sure we output the TotalRecall template with as little changes |  | 
| 22 as possible in terms of whitespace characters, layout etc.  With any parser |  | 
| 23 that generates a parse tree, and generates output by dumping the parse tree, |  | 
| 24 we would always have little inconsistencies which could cause bugs (the |  | 
| 25 TotalRecall template stuff is quite brittle and can break if e.g. a tab |  | 
| 26 character is replaced with spaces). |  | 
| 27 |  | 
| 28 The solution, which may be applicable to some other HTML-like template |  | 
| 29 languages floating around Google, is to create a parser with a simple state |  | 
| 30 machine that keeps track of what kind of tag it's inside, and whether it's in |  | 
| 31 a translateable section or not.  Translateable sections are: |  | 
| 32 |  | 
| 33 a) text (including [BINGO] replaceables) inside of tags that |  | 
| 34    can contain translateable text (which is all tags except |  | 
| 35    for a few) |  | 
| 36 |  | 
| 37 b) text inside of an 'alt' attribute in an <image> element, or |  | 
| 38    the 'value' attribute of a <submit>, <button> or <text> |  | 
| 39    element. |  | 
| 40 |  | 
| 41 The parser does not build up a parse tree but rather a "skeleton" which |  | 
| 42 is a list of nontranslateable strings intermingled with grit.clique.MessageClique |  | 
| 43 objects.  This simplifies the parser considerably compared to a regular HTML |  | 
| 44 parser.  To output a translated document, each item in the skeleton is |  | 
| 45 printed out, with the relevant Translation from each MessageCliques being used |  | 
| 46 for the requested language. |  | 
| 47 |  | 
| 48 This implementation borrows some code, constants and ideas from |  | 
| 49 extern.tclib.api.handlers.html.TCHTMLParser. |  | 
| 50 ''' |  | 
| 51 |  | 
| 52 |  | 
| 53 import re |  | 
| 54 import types |  | 
| 55 |  | 
| 56 from grit import clique |  | 
| 57 from grit import exception |  | 
| 58 from grit import lazy_re |  | 
| 59 from grit import util |  | 
| 60 from grit import tclib |  | 
| 61 |  | 
| 62 from grit.gather import interface |  | 
| 63 |  | 
| 64 |  | 
# HTML tags which break (separate) chunks.  Used for membership tests only,
# so the duplicate entries the list used to contain ('script', 'style',
# 'pre') were harmless but redundant; they have been removed.
_BLOCK_TAGS = ['script', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'br',
              'body', 'style', 'head', 'title', 'table', 'tr', 'td', 'th',
              'ul', 'ol', 'dl', 'nl', 'li', 'div', 'object', 'center',
              'html', 'link', 'form', 'select', 'textarea',
              'button', 'option', 'map', 'area', 'blockquote', 'pre',
              'meta', 'xmp', 'noscript', 'label', 'tbody', 'thead',
              'iframe', 'img', 'input', 'nowrap',
              'fieldset', 'legend']

# HTML tags which may appear within a chunk (i.e. inside a translateable
# message) without breaking it.
_INLINE_TAGS = ['b', 'i', 'u', 'tt', 'code', 'font', 'a', 'span', 'small',
               'key', 'nobr', 'url', 'em', 's', 'sup', 'strike',
               'strong']

# HTML tags within which linebreaks are significant (so whitespace must not
# be folded when chunking text inside them).
_PREFORMATTED_TAGS = ['textarea', 'xmp', 'pre']

# An array mapping some of the inline HTML tags to more meaningful
# names for those tags.  This will be used when generating placeholders
# representing these tags.
_HTML_PLACEHOLDER_NAMES = { 'a' : 'link', 'br' : 'break', 'b' : 'bold',
  'i' : 'italic', 'li' : 'item', 'ol' : 'ordered_list', 'p' : 'paragraph',
  'ul' : 'unordered_list', 'img' : 'image', 'em' : 'emphasis' }

# We append each of these characters in sequence to distinguish between
# different placeholders with basically the same name (e.g. BOLD1, BOLD2).
# Keep in mind that a placeholder name must not be a substring of any other
# placeholder name in the same message, so we can't simply count (BOLD_1
# would be a substring of BOLD_10).
_SUFFIXES = '123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
| 96 |  | 
# Matches whitespace in an HTML document.  Also matches HTML "desc" comments,
# which are treated as whitespace.  (The &nbsp; entity is treated as
# whitespace too; it is handled separately by _NBSP only when repeated.)
_WHITESPACE = lazy_re.compile(r'(\s|&nbsp;|\\n|\\r|<!--\s*desc\s*=.*?-->)+',
                              re.DOTALL)

# Matches whitespace sequences which can be folded into a single whitespace
# character.  This matches single characters so that non-spaces are replaced
# with spaces.
_FOLD_WHITESPACE = lazy_re.compile(r'\s+')

# Finds a non-whitespace character.
_NON_WHITESPACE = lazy_re.compile(r'\S')

# Matches two or more &nbsp; in a row (a single &nbsp; is not changed into
# placeholders because different languages require different numbers of spaces
# and placeholders must match exactly; more than one is probably a "special"
# whitespace sequence and should be turned into a placeholder).
_NBSP = lazy_re.compile(r'&nbsp;(&nbsp;)+')

# Matches nontranslateable chunks of the document.
_NONTRANSLATEABLES = lazy_re.compile(r'''
  <\s*script.+?<\s*/\s*script\s*>
  |
  <\s*style.+?<\s*/\s*style\s*>
  |
  <!--.+?-->
  |
  <\?IMPORT\s.+?>           # import tag
  |
  <\s*[a-zA-Z_]+:.+?>       # custom tag (open)
  |
  <\s*/\s*[a-zA-Z_]+:.+?>   # custom tag (close)
  |
  <!\s*[A-Z]+\s*([^>]+|"[^"]+"|'[^']+')*?>
  ''', re.MULTILINE | re.DOTALL | re.VERBOSE | re.IGNORECASE)

# Matches a tag and its attributes.
_ELEMENT = lazy_re.compile(r'''
  # Optional closing /, element name
  <\s*(?P<closing>/)?\s*(?P<element>[a-zA-Z0-9]+)\s*
  # Attributes and/or replaceables inside the tag, if any
  (?P<atts>(
    \s*([a-zA-Z_][-:.a-zA-Z_0-9]*) # Attribute name
    (\s*=\s*(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?
    |
    \s*\[(\$?\~)?([A-Z0-9-_]+?)(\~\$?)?\]
  )*)
  \s*(?P<empty>/)?\s*> # Optional empty-tag closing /, and tag close
  ''',
  re.MULTILINE | re.DOTALL | re.VERBOSE)

# Matches elements that may have translateable attributes.  The value of these
# special attributes is given by group 'value1' or 'value2'.  Note that this
# regexp demands that the attribute value be quoted; this is necessary because
# the non-tree-building nature of the parser means we don't know when we're
# writing out attributes, so we wouldn't know to escape spaces.
_SPECIAL_ELEMENT = lazy_re.compile(r'''
  <\s*(
    input[^>]+?value\s*=\s*(\'(?P<value3>[^\']*)\'|"(?P<value4>[^"]*)")
    [^>]+type\s*=\s*"?'?(button|reset|text|submit)'?"?
    |
    (
      table[^>]+?title\s*=
      |
      img[^>]+?alt\s*=
      |
      input[^>]+?type\s*=\s*"?'?(button|reset|text|submit)'?"?[^>]+?value\s*=
    )
    \s*(\'(?P<value1>[^\']*)\'|"(?P<value2>[^"]*)")
  )[^>]*?>
  ''', re.MULTILINE | re.DOTALL | re.VERBOSE | re.IGNORECASE)

# Matches stuff that is translateable if it occurs in the right context
# (between tags).  This includes all characters and character entities.
# Note that this also matches &nbsp; which needs to be handled as whitespace
# before this regexp is applied.
_CHARACTERS = lazy_re.compile(r'''
  (
    \w
    |
    [\!\@\#\$\%\^\*\(\)\-\=\_\+\[\]\{\}\\\|\;\:\'\"\,\.\/\?\`\~]
    |
    &(\#[0-9]+|\#x[0-9a-fA-F]+|[A-Za-z0-9]+);
  )+
  ''', re.MULTILINE | re.DOTALL | re.VERBOSE)

# Matches Total Recall's "replaceable" tags, which are just any text
# in capitals enclosed by delimiters like [] or [~~] or [$~~$] (e.g. [HELLO],
# [~HELLO~] and [$~HELLO~$]).
_REPLACEABLE = lazy_re.compile(r'\[(\$?\~)?(?P<name>[A-Z0-9-_]+?)(\~\$?)?\]',
                               re.MULTILINE)


# Matches the silly [!]-prefixed "header" that is used in some TotalRecall
# templates.
_SILLY_HEADER = lazy_re.compile(r'\[!\]\ntitle\t(?P<title>[^\n]+?)\n.+?\n\n',
                                re.MULTILINE | re.DOTALL)


# Matches a comment that provides a description for the message it occurs in.
_DESCRIPTION_COMMENT = lazy_re.compile(
  r'<!--\s*desc\s*=\s*(?P<description>.+?)\s*-->', re.DOTALL)

# Matches a comment which is used to break apart multiple messages.
_MESSAGE_BREAK_COMMENT = lazy_re.compile(r'<!--\s*message-break\s*-->',
                                         re.DOTALL)

# Matches a comment which is used to prevent block tags from splitting a
# message.  Uses lazy_re.compile for consistency with every other pattern in
# this module (it previously used re.compile directly).
_MESSAGE_NO_BREAK_COMMENT = lazy_re.compile(r'<!--\s*message-no-break\s*-->',
                                            re.DOTALL)
| 207 |  | 
| 208 |  | 
# Set to 1 (or any truthy value) to enable debug tracing of the chunker.
_DEBUG = 0
def _DebugPrint(text):
  # Emits parser trace output when _DEBUG is enabled; no-op otherwise.
  # Note: Python 2 print statement; output is UTF-8 encoded bytes.
  if _DEBUG:
    print text.encode('utf-8')
| 213 |  | 
| 214 |  | 
class HtmlChunks(object):
  '''A parser that knows how to break an HTML-like document into a list of
  chunks, where each chunk is either translateable or non-translateable.
  The chunks are unmodified sections of the original document, so concatenating
  the text of all chunks would result in the original document.'''

  def InTranslateable(self):
    '''Returns true if the parser is currently inside a translateable chunk.'''
    return self.last_translateable != -1

  def Rest(self):
    '''Returns the not-yet-consumed remainder of the document.'''
    return self.text_[self.current:]

  def StartTranslateable(self):
    '''Flushes the pending nontranslateable chunk (if any) and switches the
    parser into translateable mode starting at the current position.'''
    assert not self.InTranslateable()
    if self.current != 0:
      # Append a nontranslateable chunk
      chunk_text = self.text_[self.chunk_start : self.last_nontranslateable + 1]
      # Needed in the case where document starts with a translateable.
      if len(chunk_text) > 0:
        self.AddChunk(False, chunk_text)
    self.chunk_start = self.last_nontranslateable + 1
    self.last_translateable = self.current
    self.last_nontranslateable = -1

  def EndTranslateable(self):
    '''Flushes the pending translateable chunk and switches the parser into
    nontranslateable mode.'''
    assert self.InTranslateable()
    # Append a translateable chunk
    self.AddChunk(True,
                  self.text_[self.chunk_start : self.last_translateable + 1])
    self.chunk_start = self.last_translateable + 1
    self.last_translateable = -1
    self.last_nontranslateable = self.current

  def AdvancePast(self, match):
    '''Advances the current parse position past the given regexp match.'''
    self.current += match.end()

  def AddChunk(self, translateable, text):
    '''Adds a chunk to self, removing linebreaks and duplicate whitespace
    if appropriate.

    Args:
      translateable: Whether the chunk contains translateable text.
      text: The raw text of the chunk.
    '''
    m = _DESCRIPTION_COMMENT.search(text)
    if m:
      # Remember the description for the next translateable chunk, and
      # remove the description comment from the output text.
      self.last_description = m.group('description')
      text = _DESCRIPTION_COMMENT.sub('', text)

    m = _MESSAGE_BREAK_COMMENT.search(text)
    if m:
      # Remove the comment from the output text.  It should already effectively
      # break apart messages.
      text = _MESSAGE_BREAK_COMMENT.sub('', text)

    # Whitespace folding must be skipped inside preformatted elements
    # (textarea, xmp, pre) where linebreaks are significant.
    if translateable and self.last_element_ not in _PREFORMATTED_TAGS:
      if self.fold_whitespace_:
        # Fold whitespace sequences if appropriate.  This is optional because it
        # alters the output strings.
        text = _FOLD_WHITESPACE.sub(' ', text)
      else:
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')
        # This whitespace folding doesn't work in all cases, thus the
        # fold_whitespace flag to support backwards compatibility.
        text = text.replace('   ', ' ')
        text = text.replace('  ', ' ')

    if translateable:
      # A stored description applies only to the next translateable chunk.
      description = self.last_description
      self.last_description = ''
    else:
      description = ''

    if text != '':
      self.chunks_.append((translateable, text, description))

  def Parse(self, text, fold_whitespace):
    '''Parses 'text' into an intermediate format stored in self.chunks_
    which is translateable and nontranslateable chunks.  Also returns
    self.chunks_

    Args:
      text: The HTML for parsing.
      fold_whitespace: Whether whitespace sequences should be folded into a
        single space.

    Return:
      [(translateable, text, description), ...]  (a list of 3-tuples; the
      original docstring claimed "instances of class Chunk" but AddChunk
      appends plain tuples)
    '''
    #
    # Chunker state
    #

    self.text_ = text
    self.fold_whitespace_ = fold_whitespace

    # A list of tuples (is_translateable, text, description) which represents
    # the document after chunking.
    self.chunks_ = []

    # Start index of the last chunk, whether translateable or not
    self.chunk_start = 0

    # Index of the last for-sure translateable character if we are parsing
    # a translateable chunk, -1 to indicate we are not in a translateable chunk.
    # This is needed so that we don't include trailing whitespace in the
    # translateable chunk (whitespace is neutral).
    self.last_translateable = -1

    # Index of the last for-sure nontranslateable character if we are parsing
    # a nontranslateable chunk, -1 if we are not in a nontranslateable chunk.
    # This is needed to make sure we can group e.g. "<b>Hello</b> there"
    # together instead of just "Hello</b> there" which would be much worse
    # for translation.
    self.last_nontranslateable = -1

    # Index of the character we're currently looking at.
    self.current = 0

    # The name of the last block element parsed.
    self.last_element_ = ''

    # The last explicit description we found.
    self.last_description = ''

    # Whether no-break was the last chunk seen
    self.last_nobreak = False

    while self.current < len(self.text_):
      _DebugPrint('REST: %s' % self.text_[self.current:self.current+60])

      # A message-no-break comment suppresses the message break that the next
      # block tag would otherwise cause.
      m = _MESSAGE_NO_BREAK_COMMENT.match(self.Rest())
      if m:
        self.AdvancePast(m)
        self.last_nobreak = True
        continue

      # Try to match whitespace
      m = _WHITESPACE.match(self.Rest())
      if m:
        # Whitespace is neutral, it just advances 'current' and does not switch
        # between translateable/nontranslateable.  If we are in a
        # nontranslateable section that extends to the current point, we extend
        # it to include the whitespace.  If we are in a translateable section,
        # we do not extend it until we find
        # more translateable parts, because we never want a translateable chunk
        # to end with whitespace.
        if (not self.InTranslateable() and
            self.last_nontranslateable == self.current - 1):
          self.last_nontranslateable = self.current + m.end() - 1
        self.AdvancePast(m)
        continue

      # Then we try to match nontranslateables
      m = _NONTRANSLATEABLES.match(self.Rest())
      if m:
        if self.InTranslateable():
          self.EndTranslateable()
        self.last_nontranslateable = self.current + m.end() - 1
        self.AdvancePast(m)
        continue

      # Now match all other HTML element tags (opening, closing, or empty, we
      # don't care).
      m = _ELEMENT.match(self.Rest())
      if m:
        element_name = m.group('element').lower()
        if element_name in _BLOCK_TAGS:
          self.last_element_ = element_name
          if self.InTranslateable():
            if self.last_nobreak:
              self.last_nobreak = False
            else:
              self.EndTranslateable()

          # Check for "special" elements, i.e. ones that have a translateable
          # attribute, and handle them correctly.  Note that all of the
          # "special" elements are block tags, so no need to check for this
          # if the tag is not a block tag.
          sm = _SPECIAL_ELEMENT.match(self.Rest())
          if sm:
            # Get the appropriate group name (exactly one of the valueN
            # groups matched; the loop leaves 'group' bound to it).
            for group in sm.groupdict().keys():
              if sm.groupdict()[group]:
                break

            # First make a nontranslateable chunk up to and including the
            # quote before the translateable attribute value
            self.AddChunk(False, self.text_[
              self.chunk_start : self.current + sm.start(group)])
            # Then a translateable for the translateable bit
            self.AddChunk(True, self.Rest()[sm.start(group) : sm.end(group)])
            # Finally correct the data invariant for the parser
            self.chunk_start = self.current + sm.end(group)

          self.last_nontranslateable = self.current + m.end() - 1
        elif self.InTranslateable():
          # We're in a translateable and the tag is an inline tag, so we
          # need to include it in the translateable.
          self.last_translateable = self.current + m.end() - 1
        self.AdvancePast(m)
        continue

      # Anything else we find must be translateable, so we advance one character
      # at a time until one of the above matches.
      if not self.InTranslateable():
        self.StartTranslateable()
      else:
        self.last_translateable = self.current
      self.current += 1

    # Close the final chunk
    if self.InTranslateable():
      self.AddChunk(True, self.text_[self.chunk_start : ])
    else:
      self.AddChunk(False, self.text_[self.chunk_start : ])

    return self.chunks_
| 431 |  | 
| 432 |  | 
def HtmlToMessage(html, include_block_tags=False, description=''):
  '''Takes a bit of HTML, which must contain only "inline" HTML elements,
  and changes it into a tclib.Message.  This involves escaping any entities and
  replacing any HTML code with placeholders.

  If include_block_tags is true, no error will be given if block tags (e.g.
  <p> or <br>) are included in the HTML.

  Args:
    html: 'Hello <b>[USERNAME]</b>, how <i>are</i> you?'
    include_block_tags: False
    description: Description to attach to the resulting message.

  Return:
    tclib.Message('Hello START_BOLD1USERNAMEEND_BOLD, '
                  'howNBSPSTART_ITALICareEND_ITALIC you?',
                  [ Placeholder('START_BOLD', '<b>', ''),
                    Placeholder('USERNAME', '[USERNAME]', ''),
                    Placeholder('END_BOLD', '</b>', ''),
                    Placeholder('START_ITALIC', '<i>', ''),
                    Placeholder('END_ITALIC', '</i>', ''), ])

  Raises:
    grit.exception.BlockTagInTranslateableChunk: If a block tag is found and
      include_block_tags is false (unless suppressed by a message-no-break
      comment).
  '''
  # Approach is:
  # - first placeholderize, finding <elements>, [REPLACEABLES] and &nbsp;
  # - then escape all character entities in text in-between placeholders

  parts = []  # List of strings (for text chunks) and tuples (ID, original)
              # for placeholders

  count_names = {}  # Map of base names to number of times used
  end_names = {}  # Map of base names to stack of end tags (for correct nesting)

  def MakeNameClosure(base, type = ''):
    '''Returns a closure that can be called once all names have been allocated
    to return the final name of the placeholder.  This allows us to minimally
    number placeholders for non-overlap.

    Also ensures that END_XXX_Y placeholders have the same Y as the
    corresponding BEGIN_XXX_Y placeholder when we have nested tags of the same
    type.

    Args:
      base: 'phname'
      type: '' | 'begin' | 'end'

    Return:
      Closure()
    '''
    name = base.upper()
    if type != '':
      name = ('%s_%s' % (type, base)).upper()

    # Count how many placeholders share this name; the count decides whether
    # a numbering suffix is needed at final-name time.
    if name in count_names.keys():
      count_names[name] += 1
    else:
      count_names[name] = 1

    # 'name' and the current count are captured as default-argument values so
    # each closure remembers its own allocation index (avoids late binding).
    def MakeFinalName(name_ = name, index = count_names[name] - 1):
      if (type.lower() == 'end' and
          base in end_names.keys() and len(end_names[base])):
        return end_names[base].pop(-1)  # For correct nesting
      if count_names[name_] != 1:
        name_ = '%s_%s' % (name_, _SUFFIXES[index])
        # We need to use a stack to ensure that the end-tag suffixes match
        # the begin-tag suffixes.  Only needed when more than one tag of the
        # same type.
        if type == 'begin':
          end_name = ('END_%s_%s' % (base, _SUFFIXES[index])).upper()
          if base in end_names.keys():
            end_names[base].append(end_name)
          else:
            end_names[base] = [end_name]

      return name_

    return MakeFinalName

  current = 0
  last_nobreak = False

  # First pass: split 'html' into literal-text strings and
  # (name-closure, original-text) placeholder tuples.
  while current < len(html):
    m = _MESSAGE_NO_BREAK_COMMENT.match(html[current:])
    if m:
      last_nobreak = True
      current += m.end()
      continue

    # Two or more &nbsp; in a row become a SPACE placeholder.
    m = _NBSP.match(html[current:])
    if m:
      parts.append((MakeNameClosure('SPACE'), m.group()))
      current += m.end()
      continue

    m = _REPLACEABLE.match(html[current:])
    if m:
      # Replaceables allow - but placeholders don't, so replace - with _
      ph_name = MakeNameClosure('X_%s_X' % m.group('name').replace('-', '_'))
      parts.append((ph_name, m.group()))
      current += m.end()
      continue

    m = _SPECIAL_ELEMENT.match(html[current:])
    if m:
      if not include_block_tags:
        if last_nobreak:
          last_nobreak = False
        else:
          raise exception.BlockTagInTranslateableChunk(html)
      element_name = 'block'  # for simplification
      # Get the appropriate group name (the matched valueN group holds the
      # translateable attribute value; the loop leaves 'group' bound to it).
      for group in m.groupdict().keys():
        if m.groupdict()[group]:
          break
      # Everything before the attribute value is a begin placeholder, the
      # value itself stays translateable text, and the rest of the tag is an
      # end placeholder.
      parts.append((MakeNameClosure(element_name, 'begin'),
                    html[current : current + m.start(group)]))
      parts.append(m.group(group))
      parts.append((MakeNameClosure(element_name, 'end'),
                    html[current + m.end(group) : current + m.end()]))
      current += m.end()
      continue

    m = _ELEMENT.match(html[current:])
    if m:
      element_name = m.group('element').lower()
      if not include_block_tags and not element_name in _INLINE_TAGS:
        if last_nobreak:
          last_nobreak = False
        else:
          raise exception.BlockTagInTranslateableChunk(html[current:])
      if element_name in _HTML_PLACEHOLDER_NAMES:  # use meaningful names
        element_name = _HTML_PLACEHOLDER_NAMES[element_name]

      # Make a name for the placeholder
      type = ''
      if not m.group('empty'):
        if m.group('closing'):
          type = 'end'
        else:
          type = 'begin'
      parts.append((MakeNameClosure(element_name, type), m.group()))
      current += m.end()
      continue

    # Plain text: accumulate onto the previous string part if there is one.
    if len(parts) and isinstance(parts[-1], types.StringTypes):
      parts[-1] += html[current]
    else:
      parts.append(html[current])
    current += 1

  # Second pass: resolve the name closures (now that all allocations are
  # known) and assemble the message text plus placeholder list.
  msg_text = ''
  placeholders = []
  for part in parts:
    if isinstance(part, types.TupleType):
      final_name = part[0]()
      original = part[1]
      msg_text += final_name
      placeholders.append(tclib.Placeholder(final_name, original, '(HTML code)'))
    else:
      msg_text += part

  msg = tclib.Message(text=msg_text, placeholders=placeholders,
                      description=description)
  # Unescape HTML entities in the literal-text pieces of the message
  # (placeholders are left untouched; &nbsp; is deliberately not replaced).
  content = msg.GetContent()
  for ix in range(len(content)):
    if isinstance(content[ix], types.StringTypes):
      content[ix] = util.UnescapeHtml(content[ix], replace_nbsp=False)

  return msg
| 600 |  | 
| 601 |  | 
| 602 class TrHtml(interface.GathererBase): |  | 
| 603   '''Represents a document or message in the template format used by |  | 
| 604   Total Recall for HTML documents.''' |  | 
| 605 |  | 
| 606   def __init__(self, *args, **kwargs): |  | 
| 607     super(TrHtml, self).__init__(*args, **kwargs) |  | 
| 608     self.have_parsed_ = False |  | 
| 609     self.skeleton_ = []  # list of strings and MessageClique objects |  | 
| 610     self.fold_whitespace_ = False |  | 
| 611 |  | 
| 612   def SetAttributes(self, attrs): |  | 
| 613     '''Sets node attributes used by the gatherer. |  | 
| 614 |  | 
| 615     This checks the fold_whitespace attribute. |  | 
| 616 |  | 
| 617     Args: |  | 
| 618       attrs: The mapping of node attributes. |  | 
| 619     ''' |  | 
| 620     self.fold_whitespace_ = ('fold_whitespace' in attrs and |  | 
| 621                              attrs['fold_whitespace'] == 'true') |  | 
| 622 |  | 
  def GetText(self):
    '''Returns the original text of the HTML document, as stored by Parse().'''
    return self.text_
| 626 |  | 
  def GetTextualIds(self):
    '''Returns the single textual ID for this document (its external key).'''
    return [self.extkey]
| 629 |  | 
| 630   def GetCliques(self): |  | 
| 631     '''Returns the message cliques for each translateable message in the |  | 
| 632     document.''' |  | 
| 633     return [x for x in self.skeleton_ if isinstance(x, clique.MessageClique)] |  | 
| 634 |  | 
| 635   def Translate(self, lang, pseudo_if_not_available=True, |  | 
| 636                 skeleton_gatherer=None, fallback_to_english=False): |  | 
| 637     '''Returns this document with translateable messages filled with |  | 
| 638     the translation for language 'lang'. |  | 
| 639 |  | 
| 640     Args: |  | 
| 641       lang: 'en' |  | 
| 642       pseudo_if_not_available: True |  | 
| 643 |  | 
| 644     Return: |  | 
| 645       'ID_THIS_SECTION TYPE\n...BEGIN\n  "Translated message"\n......\nEND |  | 
| 646 |  | 
| 647     Raises: |  | 
| 648       grit.exception.NotReady() if used before Parse() has been successfully |  | 
| 649       called. |  | 
| 650       grit.exception.NoSuchTranslation() if 'pseudo_if_not_available' is false |  | 
| 651       and there is no translation for the requested language. |  | 
| 652     ''' |  | 
| 653     if len(self.skeleton_) == 0: |  | 
| 654       raise exception.NotReady() |  | 
| 655 |  | 
| 656     # TODO(joi) Implement support for skeleton gatherers here. |  | 
| 657 |  | 
| 658     out = [] |  | 
| 659     for item in self.skeleton_: |  | 
| 660       if isinstance(item, types.StringTypes): |  | 
| 661         out.append(item) |  | 
| 662       else: |  | 
| 663         msg = item.MessageForLanguage(lang, |  | 
| 664                                       pseudo_if_not_available, |  | 
| 665                                       fallback_to_english) |  | 
| 666         for content in msg.GetContent(): |  | 
| 667           if isinstance(content, tclib.Placeholder): |  | 
| 668             out.append(content.GetOriginal()) |  | 
| 669           else: |  | 
| 670             # We escape " characters to increase the chance that attributes |  | 
| 671             # will be properly escaped. |  | 
| 672             out.append(util.EscapeHtml(content, True)) |  | 
| 673 |  | 
| 674     return ''.join(out) |  | 
| 675 |  | 
  def Parse(self):
    '''Breaks the input document into self.skeleton_, a list of alternating
    nontranslateable strings and translateable MessageCliques.

    Idempotent: only the first call does any work; later calls return
    immediately.
    '''
    if self.have_parsed_:
      return
    self.have_parsed_ = True

    text = self._LoadInputFile()

    # Ignore the BOM character if the document starts with one.
    if text.startswith(u'\ufeff'):
      text = text[1:]

    # Keep the full original text around so GetText() can return it verbatim.
    self.text_ = text

    # Parsing is done in two phases:  First, we break the document into
    # translateable and nontranslateable chunks.  Second, we run through each
    # translateable chunk and insert placeholders for any HTML elements,
    # unescape escaped characters, etc.

    # First handle the silly little [!]-prefixed header because it's not
    # handled by our HTML parsers.
    m = _SILLY_HEADER.match(text)
    if m:
      # Emit the header as: pre-title string, translateable title clique,
      # post-title remainder of the header, then strip the header off 'text'.
      self.skeleton_.append(text[:m.start('title')])
      self.skeleton_.append(self.uberclique.MakeClique(
        tclib.Message(text=text[m.start('title'):m.end('title')])))
      self.skeleton_.append(text[m.end('title') : m.end()])
      text = text[m.end():]

    chunks = HtmlChunks().Parse(text, self.fold_whitespace_)

    # Each chunk is (translateable, text, description).
    for chunk in chunks:
      if chunk[0]:  # Chunk is translateable
        self.skeleton_.append(self.uberclique.MakeClique(
          HtmlToMessage(chunk[1], description=chunk[2])))
      else:
        self.skeleton_.append(chunk[1])

    # Go through the skeleton and change any messages that consist solely of
    # placeholders and whitespace into nontranslateable strings.
    for ix in range(len(self.skeleton_)):
      got_text = False
      if isinstance(self.skeleton_[ix], clique.MessageClique):
        msg = self.skeleton_[ix].GetMessage()
        for item in msg.GetContent():
          # A string part only counts as real text if it contains a
          # non-whitespace character and is not just a single space.
          if (isinstance(item, types.StringTypes) and _NON_WHITESPACE.search(item)
              and item != ' '):
            got_text = True
            break
        if not got_text:
          # No real text: demote the clique to its plain original content.
          self.skeleton_[ix] = msg.GetRealContent()
| 726 |  | 
| 727   def SubstituteMessages(self, substituter): |  | 
| 728     '''Applies substitutions to all messages in the tree. |  | 
| 729 |  | 
| 730     Goes through the skeleton and finds all MessageCliques. |  | 
| 731 |  | 
| 732     Args: |  | 
| 733       substituter: a grit.util.Substituter object. |  | 
| 734     ''' |  | 
| 735     new_skel = [] |  | 
| 736     for chunk in self.skeleton_: |  | 
| 737       if isinstance(chunk, clique.MessageClique): |  | 
| 738         old_message = chunk.GetMessage() |  | 
| 739         new_message = substituter.SubstituteMessage(old_message) |  | 
| 740         if new_message is not old_message: |  | 
| 741           new_skel.append(self.uberclique.MakeClique(new_message)) |  | 
| 742           continue |  | 
| 743       new_skel.append(chunk) |  | 
| 744     self.skeleton_ = new_skel |  | 
| 745 |  | 
| OLD | NEW | 
|---|