| OLD | NEW |
| 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/tools/convert_dict/aff_reader.h" | 5 #include "chrome/tools/convert_dict/aff_reader.h" |
| 6 | 6 |
| 7 #include <algorithm> | 7 #include <algorithm> |
| 8 | 8 |
| 9 #include "base/file_util.h" | 9 #include "base/file_util.h" |
| 10 #include "base/i18n/icu_string_conversions.h" | 10 #include "base/i18n/icu_string_conversions.h" |
| (...skipping 104 matching lines...) |
| 115 } | 115 } |
| 116 } else if (StringBeginsWith(line, "TRY ") || | 116 } else if (StringBeginsWith(line, "TRY ") || |
| 117 StringBeginsWith(line, "MAP ")) { | 117 StringBeginsWith(line, "MAP ")) { |
| 118 HandleEncodedCommand(line); | 118 HandleEncodedCommand(line); |
| 119 } else if (StringBeginsWith(line, "IGNORE ")) { | 119 } else if (StringBeginsWith(line, "IGNORE ")) { |
| 120 printf("We don't support the IGNORE command yet. This would change how " | 120 printf("We don't support the IGNORE command yet. This would change how " |
| 121 "we would insert things in our lookup table.\n"); | 121 "we would insert things in our lookup table.\n"); |
| 122 exit(1); | 122 exit(1); |
| 123 } else if (StringBeginsWith(line, "COMPLEXPREFIXES ")) { | 123 } else if (StringBeginsWith(line, "COMPLEXPREFIXES ")) { |
| 124 printf("We don't support the COMPLEXPREFIXES command yet. This would " | 124 printf("We don't support the COMPLEXPREFIXES command yet. This would " |
| 125 "mean we have to insert words backwords as well (I think)\n"); | 125 "mean we have to insert words backwards as well (I think)\n"); |
| 126 exit(1); | 126 exit(1); |
| 127 } else { | 127 } else { |
| 128 // All other commands get stored in the other commands list. | 128 // All other commands get stored in the other commands list. |
| 129 HandleRawCommand(line); | 129 HandleRawCommand(line); |
| 130 } | 130 } |
| 131 } | 131 } |
| 132 | 132 |
| 133 return true; | 133 return true; |
| 134 } | 134 } |
| 135 | 135 |
| (...skipping 98 matching lines...) |
| 234 size_t slash_index = part.find('/'); | 234 size_t slash_index = part.find('/'); |
| 235 if (slash_index != std::string::npos && !has_indexed_affixes()) { | 235 if (slash_index != std::string::npos && !has_indexed_affixes()) { |
| 236 // This can also have a rule string associated with it following a | 236 // This can also have a rule string associated with it following a |
| 237 // slash. For example: | 237 // slash. For example: |
| 238 // PFX P 0 foo/Y . | 238 // PFX P 0 foo/Y . |
| 239 // The "Y" is a flag. For example, the aff file might have a line: | 239 // The "Y" is a flag. For example, the aff file might have a line: |
| 240 // COMPOUNDFLAG Y | 240 // COMPOUNDFLAG Y |
| 241 // so that means that this prefix would be a compound one. | 241 // so that means that this prefix would be a compound one. |
| 242 // | 242 // |
| 243 // It expects these rules to use the same alias rules as the .dic | 243 // It expects these rules to use the same alias rules as the .dic |
| 244 // file. We've forced it to use aliases, which is a numberical index | 244 // file. We've forced it to use aliases, which is a numerical index |
| 245 // instead of these character flags, and this needs to be consistent. | 245 // instead of these character flags, and this needs to be consistent. |
| 246 | 246 |
| 247 std::string before_flags = part.substr(0, slash_index + 1); | 247 std::string before_flags = part.substr(0, slash_index + 1); |
| 248 | 248 |
| 249 // After the slash are both the flags, then whitespace, then the part | 249 // After the slash are both the flags, then whitespace, then the part |
| 250 // that tells us what to strip. | 250 // that tells us what to strip. |
| 251 std::vector<std::string> after_slash; | 251 std::vector<std::string> after_slash; |
| 252 base::SplitString(part.substr(slash_index + 1), ' ', &after_slash); | 252 base::SplitString(part.substr(slash_index + 1), ' ', &after_slash); |
| 253 if (after_slash.size() < 2) { | 253 if (after_slash.size() == 0) { |
| 254 // Note that we may get a third term here which is the | 254 printf("ERROR: Found 0 terms after slash in affix rule '%s', " |
| 255 // morphological description of this rule. This happens in the tests | 255 "but need at least 2.\n", |
| 256 // only, so we can just ignore it. | 256 part.c_str()); |
| 257 printf("ERROR: Didn't get enough after the slash\n"); | |
| 258 return; | 257 return; |
| 259 } | 258 } |
| 259 if (after_slash.size() == 1) { |
| 260 printf("WARNING: Found 1 term after slash in affix rule '%s', " |
| 261 "but expected at least 2. Adding '.'.\n", |
| 262 part.c_str()); |
| 263 after_slash.push_back("."); |
| 264 } |
| 265 // Note that we may get a third term here which is the morphological |
| 266 // description of this rule. This happens in the tests only, so we can |
| 267 // just ignore it. |
| 260 | 268 |
| 261 part = base::StringPrintf("%s%d %s", | 269 part = base::StringPrintf("%s%d %s", |
| 262 before_flags.c_str(), | 270 before_flags.c_str(), |
| 263 GetAFIndexForAFString(after_slash[0]), | 271 GetAFIndexForAFString(after_slash[0]), |
| 264 after_slash[1].c_str()); | 272 after_slash[1].c_str()); |
| 265 } | 273 } |
| 266 | 274 |
| 267 // Reencode from here | 275 // Reencode from here |
| 268 std::string reencoded; | 276 std::string reencoded; |
| 269 if (!EncodingToUTF8(part, &reencoded)) | 277 if (!EncodingToUTF8(part, &reencoded)) { |
| 278 printf("ERROR: Cannot encode affix rule part '%s' to utf8.\n", |
| 279 part.c_str()); |
| 270 break; | 280 break; |
| 281 } |
| 271 | 282 |
| 272 *rule = rule->substr(0, part_start) + reencoded; | 283 *rule = rule->substr(0, part_start) + reencoded; |
| 273 break; | 284 break; |
| 274 } | 285 } |
| 275 token.clear(); | 286 token.clear(); |
| 276 } else { | 287 } else { |
| 277 token.push_back((*rule)[i]); | 288 token.push_back((*rule)[i]); |
| 278 } | 289 } |
| 279 } | 290 } |
| 280 | 291 |
| 281 affix_rules_.push_back(*rule); | 292 affix_rules_.push_back(*rule); |
| 282 } | 293 } |
| 283 | 294 |
| 284 void AffReader::AddReplacement(std::string* rule) { | 295 void AffReader::AddReplacement(std::string* rule) { |
| 285 TrimLine(rule); | 296 TrimLine(rule); |
| 297 CollapseDuplicateSpaces(rule); |
| 286 | 298 |
| 287 std::string utf8rule; | 299 std::string utf8rule; |
| 288 if (!EncodingToUTF8(*rule, &utf8rule)) | 300 if (!EncodingToUTF8(*rule, &utf8rule)) { |
| 301 printf("ERROR: Cannot encode replacement rule '%s' to utf8.\n", |
| 302 rule->c_str()); |
| 289 return; | 303 return; |
| 304 } |
| 290 | 305 |
| 306 // The first space separates key and value. |
| 307 size_t space_index = utf8rule.find(' '); |
| 308 if (space_index == std::string::npos) { |
| 309 printf("ERROR: Did not find a space in '%s'.\n", utf8rule.c_str()); |
| 310 return; |
| 311 } |
| 291 std::vector<std::string> split; | 312 std::vector<std::string> split; |
| 292 base::SplitString(utf8rule, ' ', &split); | 313 split.push_back(utf8rule.substr(0, space_index)); |
| 314 split.push_back(utf8rule.substr(space_index + 1)); |
| 293 | 315 |
| 294 // There should be two parts. | 316 // Underscores are used to represent spaces in most aff files |
| 295 if (split.size() != 2) | |
| 296 return; | |
| 297 | |
| 298 // Underscores are used to represent spaces | |
| 299 // (since the line is parsed on spaces). | 317 // (since the line is parsed on spaces). |
| 300 std::replace(split[0].begin(), split[0].end(), '_', ' '); | 318 std::replace(split[0].begin(), split[0].end(), '_', ' '); |
| 301 std::replace(split[1].begin(), split[1].end(), '_', ' '); | 319 std::replace(split[1].begin(), split[1].end(), '_', ' '); |
| 302 | 320 |
| 303 replacements_.push_back(std::make_pair(split[0], split[1])); | 321 replacements_.push_back(std::make_pair(split[0], split[1])); |
| 304 } | 322 } |
| 305 | 323 |
| 306 void AffReader::HandleRawCommand(const std::string& line) { | 324 void AffReader::HandleRawCommand(const std::string& line) { |
| 307 other_commands_.push_back(line); | 325 other_commands_.push_back(line); |
| 308 } | 326 } |
| 309 | 327 |
| 310 void AffReader::HandleEncodedCommand(const std::string& line) { | 328 void AffReader::HandleEncodedCommand(const std::string& line) { |
| 311 std::string utf8; | 329 std::string utf8; |
| 312 if (EncodingToUTF8(line, &utf8)) | 330 if (!EncodingToUTF8(line, &utf8)) { |
| 313 other_commands_.push_back(utf8); | 331 printf("ERROR: Cannot encode command '%s' to utf8.\n", line.c_str()); |
| 332 return; |
| 333 } |
| 334 other_commands_.push_back(utf8); |
| 314 } | 335 } |
| 315 | 336 |
| 316 } // namespace convert_dict | 337 } // namespace convert_dict |
| OLD | NEW |