OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // Implements a custom word iterator used for our spellchecker. | 5 // Implements a custom word iterator used for our spellchecker. |
6 | 6 |
7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" | 7 #include "chrome/renderer/spellchecker/spellcheck_worditerator.h" |
8 | 8 |
9 #include <map> | 9 #include <map> |
10 #include <string> | 10 #include <string> |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
60 "$LF = [\\p{Word_Break = LF}];" | 60 "$LF = [\\p{Word_Break = LF}];" |
61 "$Newline = [\\p{Word_Break = Newline}];" | 61 "$Newline = [\\p{Word_Break = Newline}];" |
62 "$Extend = [\\p{Word_Break = Extend}];" | 62 "$Extend = [\\p{Word_Break = Extend}];" |
63 "$Format = [\\p{Word_Break = Format}];" | 63 "$Format = [\\p{Word_Break = Format}];" |
64 "$Katakana = [\\p{Word_Break = Katakana}];" | 64 "$Katakana = [\\p{Word_Break = Katakana}];" |
65 // Not all the characters in a given script are ALetter. | 65 // Not all the characters in a given script are ALetter. |
66 // For instance, U+05F4 is MidLetter. So, this may be | 66 // For instance, U+05F4 is MidLetter. So, this may be |
67 // better, but it leads to an empty set error in Thai. | 67 // better, but it leads to an empty set error in Thai. |
68 // "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];" | 68 // "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];" |
69 "$ALetter = [\\p{script=%s}%s];" | 69 "$ALetter = [\\p{script=%s}%s];" |
70 "$MidNumLet = [\\p{Word_Break = MidNumLet}];" | 70 // U+0027 (single quote/apostrophe) is not in MidNumLet any more |
| 71 // in UAX 29 rev 21 or later. For our purpose, U+0027 |
| 72 // has to be treated as MidNumLet. ( http://crbug.com/364072 ) |
| 73 "$MidNumLet = [\\p{Word_Break = MidNumLet} \\u0027];" |
71 "$MidLetter = [\\p{Word_Break = MidLetter}%s];" | 74 "$MidLetter = [\\p{Word_Break = MidLetter}%s];" |
72 "$MidNum = [\\p{Word_Break = MidNum}];" | 75 "$MidNum = [\\p{Word_Break = MidNum}];" |
73 "$Numeric = [\\p{Word_Break = Numeric}];" | 76 "$Numeric = [\\p{Word_Break = Numeric}];" |
74 "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];" | 77 "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];" |
75 | 78 |
76 "$Control = [\\p{Grapheme_Cluster_Break = Control}]; " | 79 "$Control = [\\p{Grapheme_Cluster_Break = Control}]; " |
77 "%s" // ALetterPlus | 80 "%s" // ALetterPlus |
78 | 81 |
79 "$KatakanaEx = $Katakana ($Extend | $Format)*;" | 82 "$KatakanaEx = $Katakana ($Extend | $Format)*;" |
80 "$ALetterEx = $ALetterPlus ($Extend | $Format)*;" | 83 "$ALetterEx = $ALetterPlus ($Extend | $Format)*;" |
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
146 // which don't need them. | 149 // which don't need them. |
147 const char* aletter = uscript_getName(script_code_); | 150 const char* aletter = uscript_getName(script_code_); |
148 if (!aletter) | 151 if (!aletter) |
149 aletter = "Latin"; | 152 aletter = "Latin"; |
150 | 153 |
151 const char kWithDictionary[] = | 154 const char kWithDictionary[] = |
152 "$dictionary = [:LineBreak = Complex_Context:];" | 155 "$dictionary = [:LineBreak = Complex_Context:];" |
153 "$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];"; | 156 "$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];"; |
154 const char kWithoutDictionary[] = "$ALetterPlus = $ALetter;"; | 157 const char kWithoutDictionary[] = "$ALetterPlus = $ALetter;"; |
155 const char* aletter_plus = kWithoutDictionary; | 158 const char* aletter_plus = kWithoutDictionary; |
156 if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI) | 159 if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI || |
| 160 script_code_ == USCRIPT_LAO || script_code_ == USCRIPT_KHMER) |
157 aletter_plus = kWithDictionary; | 161 aletter_plus = kWithDictionary; |
158 | 162 |
159 // Treat numbers as word characters except for Arabic and Hebrew. | 163 // Treat numbers as word characters except for Arabic and Hebrew. |
160 const char* aletter_extra = " [0123456789]"; | 164 const char* aletter_extra = " [0123456789]"; |
161 if (script_code_ == USCRIPT_HEBREW || script_code_ == USCRIPT_ARABIC) | 165 if (script_code_ == USCRIPT_HEBREW || script_code_ == USCRIPT_ARABIC) |
162 aletter_extra = ""; | 166 aletter_extra = ""; |
163 | 167 |
164 const char kMidLetterExtra[] = ""; | 168 const char kMidLetterExtra[] = ""; |
165 // For Hebrew, treat single/double quotation marks as MidLetter. | 169 // For Hebrew, treat single/double quotation marks as MidLetter. |
166 const char kMidLetterExtraHebrew[] = "\"'"; | 170 const char kMidLetterExtraHebrew[] = "\"'"; |
(...skipping 243 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
410 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) | 414 if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) |
411 return false; | 415 return false; |
412 | 416 |
413 // Copy the normalized text to the output. | 417 // Copy the normalized text to the output. |
414 icu::StringCharacterIterator it(output); | 418 icu::StringCharacterIterator it(output); |
415 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) | 419 for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) |
416 attribute_->OutputChar(c, output_string); | 420 attribute_->OutputChar(c, output_string); |
417 | 421 |
418 return !output_string->empty(); | 422 return !output_string->empty(); |
419 } | 423 } |
OLD | NEW |