Chromium Code Reviews

Side by Side Diff: grit/gather/tr_html.py

Issue 7994004: Initial source commit to grit-i18n project. (Closed) Base URL: http://grit-i18n.googlecode.com/svn/trunk/
Patch Set: Created 9 years, 3 months ago
Property Changes:
Added: svn:eol-style
+ LF
1 #!/usr/bin/python2.4
2 # Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 '''A gatherer for the TotalRecall brand of HTML templates with replaceable
7 portions. We wanted to reuse extern.tclib.api.handlers.html.TCHTMLParser
8 but this proved impossible due to the fact that the TotalRecall HTML templates
9 are in general quite far from parseable HTML and the TCHTMLParser derives
10 from HTMLParser.HTMLParser which requires relatively well-formed HTML. Some
11 examples of "HTML" from the TotalRecall HTML templates that wouldn't be
12 parseable include things like:
13
14 <a [PARAMS]>blabla</a> (not parseable because attributes are invalid)
15
16 <table><tr><td>[LOTSOFSTUFF]</tr></table> (not parseable because the closing
17 </td> is inside the HTML that
18 [LOTSOFSTUFF] is replaced by)
19
20 The other problem with using general parsers (such as TCHTMLParser) is that
21 we want to make sure we output the TotalRecall template with as few changes
22 as possible in terms of whitespace characters, layout, etc. With any parser
23 that generates a parse tree, and generates output by dumping the parse tree,
24 we would always have small inconsistencies which could cause bugs (the
25 TotalRecall template stuff is quite brittle and can break if e.g. a tab
26 character is replaced with spaces).
27
28 The solution, which may be applicable to some other HTML-like template
29 languages floating around Google, is to create a parser with a simple state
30 machine that keeps track of what kind of tag it's inside, and whether it's in
31 a translateable section or not. Translateable sections are:
32
33 a) text (including [BINGO] replaceables) inside of tags that
34 can contain translateable text (which is all tags except
35 for a few)
36
37 b) text inside an 'alt' attribute of an <img> element, the 'value'
38 attribute of an <input> element of type 'button', 'reset', 'text'
39 or 'submit', or the 'title' attribute of a <table> element.
40
41 The parser does not build up a parse tree but rather a "skeleton" which
42 is a list of nontranslateable strings intermingled with grit.clique.MessageClique
43 objects. This simplifies the parser considerably compared to a regular HTML
44 parser. To output a translated document, each item in the skeleton is
45 printed out, with the relevant Translation from each MessageClique being used
46 for the requested language.
47
48 This implementation borrows some code, constants and ideas from
49 extern.tclib.api.handlers.html.TCHTMLParser.
50 '''
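The skeleton described above is simply a list that interleaves literal
(nontranslateable) strings with message objects. A minimal sketch of the
rendering idea, illustrative only and not part of this patch; FakeMessage is a
made-up stand-in for grit.clique.MessageClique:

  class FakeMessage(object):
    # Hypothetical stand-in for grit.clique.MessageClique: holds one
    # translation per language code.
    def __init__(self, translations):
      self.translations = translations

  def RenderSkeleton(skeleton, lang):
    # Strings are emitted verbatim; message objects contribute their
    # translation for the requested language.
    out = []
    for item in skeleton:
      if isinstance(item, FakeMessage):
        out.append(item.translations[lang])
      else:
        out.append(item)
    return ''.join(out)

  skeleton = ['<p>', FakeMessage({'en': 'Hello', 'is': 'Hallo'}), '</p>']
  assert RenderSkeleton(skeleton, 'is') == '<p>Hallo</p>'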
51
52
53 import re
54 import types
55
56 from grit import clique
57 from grit import exception
58 from grit import util
59 from grit import tclib
60
61 from grit.gather import interface
62
63
64 # HTML tags which break (separate) chunks.
65 _BLOCK_TAGS = ['script', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'br',
66 'body', 'style', 'head', 'title', 'table', 'tr', 'td', 'th',
67 'ul', 'ol', 'dl', 'nl', 'li', 'div', 'object', 'center',
68 'html', 'link', 'form', 'select', 'textarea',
69 'button', 'option', 'map', 'area', 'blockquote', 'pre',
70 'meta', 'xmp', 'noscript', 'label', 'tbody', 'thead',
71 'script', 'style', 'pre', 'iframe', 'img', 'input', 'nowrap']
72
73 # HTML tags which may appear within a chunk.
74 _INLINE_TAGS = ['b', 'i', 'u', 'tt', 'code', 'font', 'a', 'span', 'small',
75 'key', 'nobr', 'url', 'em', 's', 'sup', 'strike',
76 'strong']
77
78 # HTML tags within which linebreaks are significant.
79 _PREFORMATTED_TAGS = ['textarea', 'xmp', 'pre']
80
81 # A dictionary mapping some HTML tags to more meaningful
82 # names for those tags. This will be used when generating placeholders
83 # representing these tags.
84 _HTML_PLACEHOLDER_NAMES = { 'a' : 'link', 'br' : 'break', 'b' : 'bold',
85 'i' : 'italic', 'li' : 'item', 'ol' : 'ordered_list', 'p' : 'paragraph',
86 'ul' : 'unordered_list', 'img' : 'image', 'em' : 'emphasis' }
87
88 # We append each of these characters in sequence to distinguish between
89 # different placeholders with basically the same name (e.g. BOLD1, BOLD2).
90 # Keep in mind that a placeholder name must not be a substring of any other
91 # placeholder name in the same message, so we can't simply count (BOLD_1
92 # would be a substring of BOLD_10).
93 _SUFFIXES = '123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
94
95 # Matches whitespace in an HTML document. Also matches description comments
96 # (<!-- desc=... -->), which are treated as whitespace.
97 _WHITESPACE = re.compile(r'(\s|&nbsp;|\\n|\\r|<!--\s*desc\s*=.*?-->)+',
98 re.DOTALL)
99
100 # Finds a non-whitespace character
101 _NON_WHITESPACE = re.compile(r'\S')
102
103 # Matches two or more &nbsp; in a row (a single &nbsp; is not changed into
104 # a placeholder because different languages require different numbers of spaces
105 # and placeholders must match exactly; more than one is probably a "special"
106 # whitespace sequence and should be turned into a placeholder).
107 _NBSP = re.compile(r'&nbsp;(&nbsp;)+')
108
109 # Matches nontranslateable chunks of the document
110 _NONTRANSLATEABLES = re.compile(r'''
111 <\s*script.+?<\s*/\s*script\s*>
112 |
113 <\s*style.+?<\s*/\s*style\s*>
114 |
115 <!--.+?-->
116 |
117 <\?IMPORT\s.+?> # import tag
118 |
119 <\s*[a-zA-Z_]+:.+?> # custom tag (open)
120 |
121 <\s*/\s*[a-zA-Z_]+:.+?> # custom tag (close)
122 |
123 <!\s*[A-Z]+\s*([^>]+|"[^"]+"|'[^']+')*?>
124 ''', re.MULTILINE | re.DOTALL | re.VERBOSE | re.IGNORECASE)
125
126 # Matches a tag and its attributes
127 _ELEMENT = re.compile(r'''
128 # Optional closing /, element name
129 <\s*(?P<closing>/)?\s*(?P<element>[a-zA-Z0-9]+)\s*
130 # Attributes and/or replaceables inside the tag, if any
131 (?P<atts>(
132 \s*([a-zA-Z_][-:.a-zA-Z_0-9]*) # Attribute name
133 (\s*=\s*(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?
134 |
135 \s*\[(\$?\~)?([A-Z0-9-_]+?)(\~\$?)?\]
136 )*)
137 \s*(?P<empty>/)?\s*> # Optional empty-tag closing /, and tag close
138 ''',
139 re.MULTILINE | re.DOTALL | re.VERBOSE)
140
141 # Matches elements that may have translateable attributes. The value of these
142 # special attributes is given by one of the groups 'value1' through 'value4'. Note that this
143 # regexp demands that the attribute value be quoted; this is necessary because
144 # the non-tree-building nature of the parser means we don't know when we're
145 # writing out attributes, so we wouldn't know to escape spaces.
146 _SPECIAL_ELEMENT = re.compile(r'''
147 <\s*(
148 input[^>]+?value\s*=\s*(\'(?P<value3>[^\']*)\'|"(?P<value4>[^"]*)")
149 [^>]+type\s*=\s*"?'?(button|reset|text|submit)'?"?
150 |
151 (
152 table[^>]+?title\s*=
153 |
154 img[^>]+?alt\s*=
155 |
156 input[^>]+?type\s*=\s*"?'?(button|reset|text|submit)'?"?[^>]+?value\s*=
157 )
158 \s*(\'(?P<value1>[^\']*)\'|"(?P<value2>[^"]*)")
159 )[^>]*?>
160 ''', re.MULTILINE | re.DOTALL | re.VERBOSE | re.IGNORECASE)
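A quick illustration of the named groups (the example tags are made up; the
parser code further down simply picks whichever 'value' group is non-empty):

  m = _SPECIAL_ELEMENT.match('<input type="submit" value="Save changes">')
  assert m.group('value2') == 'Save changes'
  m = _SPECIAL_ELEMENT.match('<img src="logo.gif" alt="Company logo">')
  assert m.group('value2') == 'Company logo'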
161
162 # Matches stuff that is translateable if it occurs in the right context
163 # (between tags). This includes all characters and character entities.
164 # Note that this also matches &nbsp; which needs to be handled as whitespace
165 # before this regexp is applied.
166 _CHARACTERS = re.compile(r'''
167 (
168 \w
169 |
170 [\!\@\#\$\%\^\*\(\)\-\=\_\+\[\]\{\}\\\|\;\:\'\"\,\.\/\?\`\~]
171 |
172 &(\#[0-9]+|\#x[0-9a-fA-F]+|[A-Za-z0-9]+);
173 )+
174 ''', re.MULTILINE | re.DOTALL | re.VERBOSE)
175
176 # Matches Total Recall's "replaceable" tags, which are just any text
177 # in capitals enclosed by delimiters like [] or [~~] or [$~~$] (e.g. [HELLO],
178 # [~HELLO~] and [$~HELLO~$]).
179 _REPLACEABLE = re.compile(r'\[(\$?\~)?(?P<name>[A-Z0-9-_]+?)(\~\$?)?\]',
180 re.MULTILINE)
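The three delimiter forms mentioned above all yield the same 'name' group
(illustrative check only; the strings are made up):

  for s in ['[HELLO]', '[~HELLO~]', '[$~HELLO~$]']:
    m = _REPLACEABLE.match(s)
    assert m and m.group('name') == 'HELLO'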
181
182
183 # Matches the silly [!]-prefixed "header" that is used in some TotalRecall
184 # templates.
185 _SILLY_HEADER = re.compile(r'\[!\]\ntitle\t(?P<title>[^\n]+?)\n.+?\n\n',
186 re.MULTILINE | re.DOTALL)
187
188
189 # Matches a comment that provides a description for the message it occurs in.
190 _DESCRIPTION_COMMENT = re.compile(
191 r'<!--\s*desc\s*=\s*(?P<description>.+?)\s*-->', re.DOTALL)
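For illustration, the text after 'desc=' becomes the message description
(made-up example):

  m = _DESCRIPTION_COMMENT.search('Hi <!-- desc = Greeting on the front page -->')
  assert m.group('description') == 'Greeting on the front page'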
192
193
194 _DEBUG = 0
195 def _DebugPrint(text):
196 if _DEBUG:
197 print text.encode('utf-8')
198
199
200 class HtmlChunks(object):
201 '''A parser that knows how to break an HTML-like document into a list of
202 chunks, where each chunk is either translateable or non-translateable.
203 The chunks are sections of the original document; concatenating their text
204 reproduces it, modulo AddChunk's whitespace and description cleanup.'''
205
206 def InTranslateable(self):
207 return self.last_translateable != -1
208
209 def Rest(self):
210 return self.text_[self.current:]
211
212 def StartTranslateable(self):
213 assert not self.InTranslateable()
214 if self.current != 0:
215 # Append a nontranslateable chunk
216 chunk_text = self.text_[self.chunk_start : self.last_nontranslateable + 1]
217 # Needed in the case where document starts with a translateable.
218 if len(chunk_text) > 0:
219 self.AddChunk(False, chunk_text)
220 self.chunk_start = self.last_nontranslateable + 1
221 self.last_translateable = self.current
222 self.last_nontranslateable = -1
223
224 def EndTranslateable(self):
225 assert self.InTranslateable()
226 # Append a translateable chunk
227 self.AddChunk(True,
228 self.text_[self.chunk_start : self.last_translateable + 1])
229 self.chunk_start = self.last_translateable + 1
230 self.last_translateable = -1
231 self.last_nontranslateable = self.current
232
233 def AdvancePast(self, match):
234 self.current += match.end()
235
236 def AddChunk(self, translateable, text):
237 '''Adds a chunk to self, removing linebreaks and duplicate whitespace
238 if appropriate.
239 '''
240 if translateable and not self.last_element_ in _PREFORMATTED_TAGS:
241 text = text.replace('\n', ' ')
242 text = text.replace('\r', ' ')
244 text = text.replace('   ', ' ')
245 text = text.replace('  ', ' ')
245
246 m = _DESCRIPTION_COMMENT.search(text)
247 if m:
248 self.last_description = m.group('description')
249 # remove the description from the output text
250 text = _DESCRIPTION_COMMENT.sub('', text)
251
252 if translateable:
253 description = self.last_description
254 self.last_description = ''
255 else:
256 description = ''
257
258 if text != '':
259 self.chunks_.append((translateable, text, description))
260
261 def Parse(self, text):
262 '''Parses 'text' into an intermediate format stored in self.chunks_,
263 which is a list of translateable and nontranslateable chunks. Also
264 returns self.chunks_.
265
266 Return:
267 [(is_translateable, text, description), ...]
268 '''
269 #
270 # Chunker state
271 #
272
273 self.text_ = text
274
275 # A list of tuples (is_translateable, text, description) which represents
276 # the document after chunking.
277 self.chunks_ = []
278
279 # Start index of the last chunk, whether translateable or not
280 self.chunk_start = 0
281
282 # Index of the last for-sure translateable character if we are parsing
283 # a translateable chunk, -1 to indicate we are not in a translateable chunk.
284 # This is needed so that we don't include trailing whitespace in the
285 # translateable chunk (whitespace is neutral).
286 self.last_translateable = -1
287
288 # Index of the last for-sure nontranslateable character if we are parsing
289 # a nontranslateable chunk, -1 if we are not in a nontranslateable chunk.
290 # This is needed to make sure we can group e.g. "<b>Hello</b> there"
291 # together instead of just "Hello</b> there" which would be much worse
292 # for translation.
293 self.last_nontranslateable = -1
294
295 # Index of the character we're currently looking at.
296 self.current = 0
297
298 # The name of the last block element parsed.
299 self.last_element_ = ''
300
301 # The last explicit description we found.
302 self.last_description = ''
303
304 while self.current < len(self.text_):
305 _DebugPrint('REST: %s' % self.text_[self.current:self.current+60])
306
307 # First try to match whitespace
308 m = _WHITESPACE.match(self.Rest())
309 if m:
310 # Whitespace is neutral, it just advances 'current' and does not switch
311 # between translateable/nontranslateable. If we are in a
312 # nontranslateable section that extends to the current point, we extend
313 # it to include the whitespace. If we are in a translateable section,
314 # we do not extend it until we find
315 # more translateable parts, because we never want a translateable chunk
316 # to end with whitespace.
317 if (not self.InTranslateable() and
318 self.last_nontranslateable == self.current - 1):
319 self.last_nontranslateable = self.current + m.end() - 1
320 self.AdvancePast(m)
321 continue
322
323 # Then we try to match nontranslateables
324 m = _NONTRANSLATEABLES.match(self.Rest())
325 if m:
326 if self.InTranslateable():
327 self.EndTranslateable()
328 self.last_nontranslateable = self.current + m.end() - 1
329 self.AdvancePast(m)
330 continue
331
332 # Now match all other HTML element tags (opening, closing, or empty, we
333 # don't care).
334 m = _ELEMENT.match(self.Rest())
335 if m:
336 element_name = m.group('element').lower()
337 if element_name in _BLOCK_TAGS:
338 self.last_element_ = element_name
339 if self.InTranslateable():
340 self.EndTranslateable()
341
342 # Check for "special" elements, i.e. ones that have a translateable
343 # attribute, and handle them correctly. Note that all of the
344 # "special" elements are block tags, so no need to check for this
345 # if the tag is not a block tag.
346 sm = _SPECIAL_ELEMENT.match(self.Rest())
347 if sm:
348 # Get the appropriate group name
349 for group in sm.groupdict().keys():
350 if sm.groupdict()[group]:
351 break
352
353 # First make a nontranslateable chunk up to and including the
354 # quote before the translateable attribute value
355 self.AddChunk(False, self.text_[
356 self.chunk_start : self.current + sm.start(group)])
357 # Then a translateable for the translateable bit
358 self.AddChunk(True, self.Rest()[sm.start(group) : sm.end(group)])
359 # Finally correct the data invariant for the parser
360 self.chunk_start = self.current + sm.end(group)
361
362 self.last_nontranslateable = self.current + m.end() - 1
363 elif self.InTranslateable():
364 # We're in a translateable and the tag is an inline tag, so we
365 # need to include it in the translateable.
366 self.last_translateable = self.current + m.end() - 1
367 self.AdvancePast(m)
368 continue
369
370 # Anything else we find must be translateable, so we advance one character
371 # at a time until one of the above matches.
372 if not self.InTranslateable():
373 self.StartTranslateable()
374 else:
375 self.last_translateable = self.current
376 self.current += 1
377
378 # Close the final chunk
379 if self.InTranslateable():
380 self.AddChunk(True, self.text_[self.chunk_start : ])
381 else:
382 self.AddChunk(False, self.text_[self.chunk_start : ])
383
384 return self.chunks_
385
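To make the chunking above concrete, here is a small worked example
(illustrative only; the snippet is made up, and the expected value follows
from tracing the state machine above):

  chunks = HtmlChunks().Parse('<p>Hello <b>there</b>!</p>')
  # Block tags delimit the translateable chunk, inline tags stay inside it,
  # and for this input concatenating the chunk texts gives back the original:
  assert chunks == [(False, '<p>', ''),
                    (True, 'Hello <b>there</b>!', ''),
                    (False, '</p>', '')]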
386
387 def HtmlToMessage(html, include_block_tags=False, description=''):
388 '''Takes a bit of HTML, which must contain only "inline" HTML elements,
389 and changes it into a tclib.Message. This involves escaping any entities and
390 replacing any HTML code with placeholders.
391
392 If include_block_tags is true, no error will be given if block tags (e.g.
393 <p> or <br>) are included in the HTML.
394
395 Args:
396 html: 'Hello <b>[USERNAME]</b>, how&nbsp;<i>are</i> you?'
397 include_block_tags: False
398
399 Return:
400 tclib.Message('Hello START_BOLD1USERNAMEEND_BOLD, '
401 'howNBSPSTART_ITALICareEND_ITALIC you?',
402 [ Placeholder('START_BOLD', '<b>', ''),
403 Placeholder('USERNAME', '[USERNAME]', ''),
404 Placeholder('END_BOLD', '</b>', ''),
405 Placeholder('START_ITALIC', '<i>', ''),
406 Placeholder('END_ITALIC', '</i>', ''), ])
407 '''
408 # Approach is:
409 # - first placeholderize, finding <elements>, [REPLACEABLES] and &nbsp;
410 # - then escape all character entities in text in-between placeholders
411
412 parts = [] # List of strings (for text chunks) and tuples (ID, original)
413 # for placeholders
414
415 count_names = {} # Map of base names to number of times used
416 end_names = {} # Map of base names to stack of end tags (for correct nesting)
417
418 def MakeNameClosure(base, type = ''):
419 '''Returns a closure that can be called once all names have been allocated
420 to return the final name of the placeholder. This allows us to minimally
421 number placeholders for non-overlap.
422
423 Also ensures that END_XXX_Y placeholders have the same Y as the
424 corresponding BEGIN_XXX_Y placeholder when we have nested tags of the same
425 type.
426
427 Args:
428 base: 'phname'
429 type: '' | 'begin' | 'end'
430
431 Return:
432 Closure()
433 '''
434 name = base
435 if type != '':
436 name = ('%s_%s' % (type, base)).upper()
437
438 if name in count_names.keys():
439 count_names[name] += 1
440 else:
441 count_names[name] = 1
442
443 def MakeFinalName(name_ = name, index = count_names[name] - 1):
444 if (type.lower() == 'end' and
445 base in end_names.keys() and len(end_names[base])):
446 return end_names[base].pop(-1) # For correct nesting
447 if count_names[name_] != 1:
448 name_ = '%s_%s' % (name_, _SUFFIXES[index])
449 # We need to use a stack to ensure that the end-tag suffixes match
450 # the begin-tag suffixes. Only needed when more than one tag of the
451 # same type.
452 if type == 'begin':
453 end_name = ('END_%s_%s' % (base, _SUFFIXES[index])).upper()
454 if base in end_names.keys():
455 end_names[base].append(end_name)
456 else:
457 end_names[base] = [end_name]
458
459 return name_
460
461 return MakeFinalName
462
463 current = 0
464
465 while current < len(html):
466 m = _NBSP.match(html[current:])
467 if m:
468 parts.append((MakeNameClosure('SPACE'), m.group()))
469 current += m.end()
470 continue
471
472 m = _REPLACEABLE.match(html[current:])
473 if m:
474 # Replaceables allow - but placeholders don't, so replace - with _
475 ph_name = MakeNameClosure('X_%s_X' % m.group('name').replace('-', '_'))
476 parts.append((ph_name, m.group()))
477 current += m.end()
478 continue
479
480 m = _SPECIAL_ELEMENT.match(html[current:])
481 if m:
482 if not include_block_tags:
483 raise exception.BlockTagInTranslateableChunk(html)
484 element_name = 'block' # for simplification
485 # Get the appropriate group name
486 for group in m.groupdict().keys():
487 if m.groupdict()[group]:
488 break
489 parts.append((MakeNameClosure(element_name, 'begin'),
490 html[current : current + m.start(group)]))
491 parts.append(m.group(group))
492 parts.append((MakeNameClosure(element_name, 'end'),
493 html[current + m.end(group) : current + m.end()]))
494 current += m.end()
495 continue
496
497 m = _ELEMENT.match(html[current:])
498 if m:
499 element_name = m.group('element').lower()
500 if not include_block_tags and not element_name in _INLINE_TAGS:
501 raise exception.BlockTagInTranslateableChunk(html[current:])
502 if element_name in _HTML_PLACEHOLDER_NAMES: # use meaningful names
503 element_name = _HTML_PLACEHOLDER_NAMES[element_name]
504
505 # Make a name for the placeholder
506 type = ''
507 if not m.group('empty'):
508 if m.group('closing'):
509 type = 'end'
510 else:
511 type = 'begin'
512 parts.append((MakeNameClosure(element_name, type), m.group()))
513 current += m.end()
514 continue
515
516 if len(parts) and isinstance(parts[-1], types.StringTypes):
517 parts[-1] += html[current]
518 else:
519 parts.append(html[current])
520 current += 1
521
522 msg_text = ''
523 placeholders = []
524 for part in parts:
525 if isinstance(part, types.TupleType):
526 final_name = part[0]()
527 original = part[1]
528 msg_text += final_name
529 placeholders.append(tclib.Placeholder(final_name, original, '(HTML code)'))
530 else:
531 msg_text += part
532
533 msg = tclib.Message(text=msg_text, placeholders=placeholders,
534 description=description)
535 content = msg.GetContent()
536 for ix in range(len(content)):
537 if isinstance(content[ix], types.StringTypes):
538 content[ix] = util.UnescapeHtml(content[ix], replace_nbsp=False)
539
540 return msg
541
542
543 class TrHtml(interface.GathererBase):
544 '''Represents a document or message in the template format used by
545 Total Recall for HTML documents.'''
546
547 def __init__(self, text):
548 '''Creates a new object that represents 'text'.
549 Args:
550 text: '<html>...</html>'
551 '''
552 super(type(self), self).__init__()
553
554 self.text_ = text
555 self.have_parsed_ = False
556 self.skeleton_ = [] # list of strings and MessageClique objects
557
558 def GetText(self):
559 '''Returns the original text of the HTML document'''
560 return self.text_
561
562 def GetCliques(self):
563 '''Returns the message cliques for each translateable message in the
564 document.'''
565 return filter(lambda x: isinstance(x, clique.MessageClique), self.skeleton_)
566
567 def Translate(self, lang, pseudo_if_not_available=True,
568 skeleton_gatherer=None, fallback_to_english=False):
569 '''Returns this document with translateable messages filled with
570 the translation for language 'lang'.
571
572 Args:
573 lang: 'en'
574 pseudo_if_not_available: True
575
576 Return:
577 The HTML document as a string, with translateable messages replaced by their translations for 'lang'.
578
579 Raises:
580 grit.exception.NotReady() if used before Parse() has been successfully
581 called.
582 grit.exception.NoSuchTranslation() if 'pseudo_if_not_available' is false
583 and there is no translation for the requested language.
584 '''
585 if len(self.skeleton_) == 0:
586 raise exception.NotReady()
587
588 # TODO(joi) Implement support for skeleton gatherers here.
589
590 out = []
591 for item in self.skeleton_:
592 if isinstance(item, types.StringTypes):
593 out.append(item)
594 else:
595 msg = item.MessageForLanguage(lang,
596 pseudo_if_not_available,
597 fallback_to_english)
598 for content in msg.GetContent():
599 if isinstance(content, tclib.Placeholder):
600 out.append(content.GetOriginal())
601 else:
602 # We escape " characters to increase the chance that attributes
603 # will be properly escaped.
604 out.append(util.EscapeHtml(content, True))
605
606 return ''.join(out)
607
608
609 # Parsing is done in two phases: First, we break the document into
610 # translateable and nontranslateable chunks. Second, we run through each
611 # translateable chunk and insert placeholders for any HTML elements, unescape
612 # escaped characters, etc.
613 def Parse(self):
614 if self.have_parsed_:
615 return
616 self.have_parsed_ = True
617
618 text = self.text_
619
620 # First handle the silly little [!]-prefixed header because it's not
621 # handled by our HTML parsers.
622 m = _SILLY_HEADER.match(text)
623 if m:
624 self.skeleton_.append(text[:m.start('title')])
625 self.skeleton_.append(self.uberclique.MakeClique(
626 tclib.Message(text=text[m.start('title'):m.end('title')])))
627 self.skeleton_.append(text[m.end('title') : m.end()])
628 text = text[m.end():]
629
630 chunks = HtmlChunks().Parse(text)
631
632 for chunk in chunks:
633 if chunk[0]: # Chunk is translateable
634 self.skeleton_.append(self.uberclique.MakeClique(
635 HtmlToMessage(chunk[1], description=chunk[2])))
636 else:
637 self.skeleton_.append(chunk[1])
638
639 # Go through the skeleton and change any messages that consist solely of
640 # placeholders and whitespace into nontranslateable strings.
641 for ix in range(len(self.skeleton_)):
642 got_text = False
643 if isinstance(self.skeleton_[ix], clique.MessageClique):
644 msg = self.skeleton_[ix].GetMessage()
645 for item in msg.GetContent():
646 if (isinstance(item, types.StringTypes) and _NON_WHITESPACE.search(item)
647 and item != '&nbsp;'):
648 got_text = True
649 break
650 if not got_text:
651 self.skeleton_[ix] = msg.GetRealContent()
652
653
654 # Static method
655 def FromFile(html, extkey=None, encoding = 'utf-8'):
656 '''Creates a TrHtml object from the contents of 'html' which are decoded
657 using 'encoding'. Returns a new TrHtml object, upon which Parse() has not
658 been called.
659
660 Args:
661 html: file('') | 'filename.html'
662 extkey: ignored
663 encoding: 'utf-8' (note that encoding is ignored if 'html' is not a file
664 name but instead an open file or file-like object)
665
666 Return:
667 TrHtml(text_of_file)
668 '''
669 if isinstance(html, types.StringTypes):
670 html = util.WrapInputStream(file(html, 'r'), encoding)
671 doc = html.read()
672
673 # Ignore the BOM character if the document starts with one.
674 if len(doc) and doc[0] == u'\ufeff':
675 doc = doc[1:]
676
677 return TrHtml(doc)
678 FromFile = staticmethod(FromFile)
679
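Typical end-to-end use of the gatherer looks roughly like this (a sketch only;
'page.html' and the language code are made-up examples, and Parse() must be
called before Translate() as documented above):

  tr = TrHtml.FromFile('page.html')  # or TrHtml(text) for an in-memory string
  tr.Parse()                         # build the skeleton of strings and cliques
  cliques = tr.GetCliques()          # translateable messages for translation tools
  output = tr.Translate('en')        # the document with translations filled in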