Index: chrome/renderer/spellchecker/spellcheck_worditerator.cc |
=================================================================== |
--- chrome/renderer/spellchecker/spellcheck_worditerator.cc (revision 56338) |
+++ chrome/renderer/spellchecker/spellcheck_worditerator.cc (working copy) |
@@ -60,15 +60,19 @@ |
"$Extend = [\\p{Word_Break = Extend}];" |
"$Format = [\\p{Word_Break = Format}];" |
"$Katakana = [\\p{Word_Break = Katakana}];" |
+ // Not all the characters in a given script are ALetter. |
+ // For instance, U+05F4 is MidLetter. So, this may be |
+ // better, but it leads to an empty set error in Thai. |
+ // "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];" |
"$ALetter = [\\p{script=%s}];" |
"$MidNumLet = [\\p{Word_Break = MidNumLet}];" |
- "$MidLetter = [\\p{Word_Break = MidLetter}];" |
+ "$MidLetter = [\\p{Word_Break = MidLetter}%s];" |
"$MidNum = [\\p{Word_Break = MidNum}];" |
"$Numeric = [\\p{Word_Break = Numeric}];" |
"$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];" |
"$Control = [\\p{Grapheme_Cluster_Break = Control}]; " |
- "%s" |
+ "%s" // ALetterPlus |
"$KatakanaEx = $Katakana ($Extend | $Format)*;" |
"$ALetterEx = $ALetterPlus ($Extend | $Format)*;" |
@@ -88,7 +92,7 @@ |
"[^$CR $LF $Newline]? ($Extend | $Format)+;" |
"$ALetterEx {200};" |
"$ALetterEx $ALetterEx {200};" |
- "%s" |
+ "%s" // (Allow|Disallow) Contraction |
"!!reverse;" |
"$BackALetterEx = ($Format | $Extend)* $ALetterPlus;" |
@@ -150,6 +154,13 @@ |
if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI) |
aletter_plus = kWithDictionary; |
+ const char kMidLetterExtra[] = ""; |
+ // For Hebrew, treat single/double quoation marks as MidLetter. |
+ const char kMidLetterExtraHebrew[] = "\"'"; |
+ const char* midletter_extra = kMidLetterExtra; |
+ if (script_code_ == USCRIPT_HEBREW) |
+ midletter_extra = kMidLetterExtraHebrew; |
+ |
// Create two custom rule-sets: one allows contraction and the other does not. |
// We save these strings in UTF-16 so we can use it without conversions. (ICU |
// needs UTF-16 strings.) |
@@ -158,9 +169,9 @@ |
const char kDisallowContraction[] = ""; |
ruleset_allow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate, |
- aletter, aletter_plus, kAllowContraction)); |
+ aletter, midletter_extra, aletter_plus, kAllowContraction)); |
ruleset_disallow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate, |
- aletter, aletter_plus, kDisallowContraction)); |
+ aletter, midletter_extra, aletter_plus, kDisallowContraction)); |
} |
bool SpellcheckCharAttribute::OutputChar(UChar c, string16* output) const { |
@@ -245,7 +256,10 @@ |
// niqquds as misspelled. (Same as Arabic vowel marks, we need to check |
// niqquds manually and filter them out since their script codes are |
// USCRIPT_HEBREW.) |
- if (0x05D0 <= c && c <= 0x05EA) |
+ // Pass through ASCII single/double quotation marks and Hebrew Geresh and |
+ // Gershayim. |
+ if ((0x05D0 <= c && c <= 0x05EA) || c == 0x22 || c == 0x27 || |
+ c == 0x05F4 || c == 0x05F3) |
output->push_back(c); |
return true; |
} |