OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // Implements a custom word iterator used for our spellchecker. | 5 // Implements a custom word iterator used for our spellchecker. |
6 | 6 |
7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" | 7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" |
8 | 8 |
9 #include <map> | 9 #include <map> |
10 #include <string> | 10 #include <string> |
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
59 "$LF = [\\p{Word_Break = LF}];" | 59 "$LF = [\\p{Word_Break = LF}];" |
60 "$Newline = [\\p{Word_Break = Newline}];" | 60 "$Newline = [\\p{Word_Break = Newline}];" |
61 "$Extend = [\\p{Word_Break = Extend}];" | 61 "$Extend = [\\p{Word_Break = Extend}];" |
62 "$Format = [\\p{Word_Break = Format}];" | 62 "$Format = [\\p{Word_Break = Format}];" |
63 "$Katakana = [\\p{Word_Break = Katakana}];" | 63 "$Katakana = [\\p{Word_Break = Katakana}];" |
64 // Not all the characters in a given script are ALetter. | 64 // Not all the characters in a given script are ALetter. |
65 // For instance, U+05F4 is MidLetter. So, this may be | 65 // For instance, U+05F4 is MidLetter. So, this may be |
66 // better, but it leads to an empty set error in Thai. | 66 // better, but it leads to an empty set error in Thai. |
67 // "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];" | 67 // "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];" |
68 "$ALetter = [\\p{script=%s}%s];" | 68 "$ALetter = [\\p{script=%s}%s];" |
69 "$MidNumLet = [\\p{Word_Break = MidNumLet}];" | 69 // U+0027 (single quote/apostrophe) is not in MidNumLet any more |
| 70 // in UAX 29 rev 21 or later. For our purpose, U+0027 |
| 71 // has to be treated as MidNumLet. ( http://crbug.com/364072 ) |
| 72 "$MidNumLet = [\\p{Word_Break = MidNumLet} \\u0027];" |
70 "$MidLetter = [\\p{Word_Break = MidLetter}%s];" | 73 "$MidLetter = [\\p{Word_Break = MidLetter}%s];" |
71 "$MidNum = [\\p{Word_Break = MidNum}];" | 74 "$MidNum = [\\p{Word_Break = MidNum}];" |
72 "$Numeric = [\\p{Word_Break = Numeric}];" | 75 "$Numeric = [\\p{Word_Break = Numeric}];" |
73 "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];" | 76 "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];" |
74 | 77 |
75 "$Control = [\\p{Grapheme_Cluster_Break = Control}]; " | 78 "$Control = [\\p{Grapheme_Cluster_Break = Control}]; " |
76 "%s" // ALetterPlus | 79 "%s" // ALetterPlus |
77 | 80 |
78 "$KatakanaEx = $Katakana ($Extend | $Format)*;" | 81 "$KatakanaEx = $Katakana ($Extend | $Format)*;" |
79 "$ALetterEx = $ALetterPlus ($Extend | $Format)*;" | 82 "$ALetterEx = $ALetterPlus ($Extend | $Format)*;" |
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
145 // which don't need them. | 148 // which don't need them. |
146 const char* aletter = uscript_getName(script_code_); | 149 const char* aletter = uscript_getName(script_code_); |
147 if (!aletter) | 150 if (!aletter) |
148 aletter = "Latin"; | 151 aletter = "Latin"; |
149 | 152 |
150 const char kWithDictionary[] = | 153 const char kWithDictionary[] = |
151 "$dictionary = [:LineBreak = Complex_Context:];" | 154 "$dictionary = [:LineBreak = Complex_Context:];" |
152 "$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];"; | 155 "$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];"; |
153 const char kWithoutDictionary[] = "$ALetterPlus = $ALetter;"; | 156 const char kWithoutDictionary[] = "$ALetterPlus = $ALetter;"; |
154 const char* aletter_plus = kWithoutDictionary; | 157 const char* aletter_plus = kWithoutDictionary; |
155 if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI) | 158 if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI || |
| 159 script_code_ == USCRIPT_LAO || script_code_ == USCRIPT_KHMER) |
156 aletter_plus = kWithDictionary; | 160 aletter_plus = kWithDictionary; |
157 | 161 |
158 // Treat numbers as word characters except for Arabic and Hebrew. | 162 // Treat numbers as word characters except for Arabic and Hebrew. |
159 const char* aletter_extra = " [0123456789]"; | 163 const char* aletter_extra = " [0123456789]"; |
160 if (script_code_ == USCRIPT_HEBREW || script_code_ == USCRIPT_ARABIC) | 164 if (script_code_ == USCRIPT_HEBREW || script_code_ == USCRIPT_ARABIC) |
161 aletter_extra = ""; | 165 aletter_extra = ""; |
162 | 166 |
163 const char kMidLetterExtra[] = ""; | 167 const char kMidLetterExtra[] = ""; |
164 // For Hebrew, treat single/double quotation marks as MidLetter. | 168 // For Hebrew, treat single/double quotation marks as MidLetter. |
165 const char kMidLetterExtraHebrew[] = "\"'"; | 169 const char kMidLetterExtraHebrew[] = "\"'"; |
(...skipping 256 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
422 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) | 426 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) |
423 return false; | 427 return false; |
424 | 428 |
425 // Copy the normalized text to the output. | 429 // Copy the normalized text to the output. |
426 icu::StringCharacterIterator it(output); | 430 icu::StringCharacterIterator it(output); |
427 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) | 431 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) |
428 attribute_->OutputChar(c, output_string); | 432 attribute_->OutputChar(c, output_string); |
429 | 433 |
430 return !output_string->empty(); | 434 return !output_string->empty(); |
431 } | 435 } |
OLD | NEW |