OLD | NEW |
1 /* | 1 /* |
2 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Apple Inc. All r
ights reserved. | 2 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Apple Inc. All |
| 3 * rights reserved. |
3 * Copyright (C) 2005 Alexey Proskuryakov. | 4 * Copyright (C) 2005 Alexey Proskuryakov. |
4 * | 5 * |
5 * Redistribution and use in source and binary forms, with or without | 6 * Redistribution and use in source and binary forms, with or without |
6 * modification, are permitted provided that the following conditions | 7 * modification, are permitted provided that the following conditions |
7 * are met: | 8 * are met: |
8 * 1. Redistributions of source code must retain the above copyright | 9 * 1. Redistributions of source code must retain the above copyright |
9 * notice, this list of conditions and the following disclaimer. | 10 * notice, this list of conditions and the following disclaimer. |
10 * 2. Redistributions in binary form must reproduce the above copyright | 11 * 2. Redistributions in binary form must reproduce the above copyright |
11 * notice, this list of conditions and the following disclaimer in the | 12 * notice, this list of conditions and the following disclaimer in the |
12 * documentation and/or other materials provided with the distribution. | 13 * documentation and/or other materials provided with the distribution. |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
45 switch (static_cast<UChar>(c)) { | 46 switch (static_cast<UChar>(c)) { |
46 case hebrewPunctuationGershayimCharacter: | 47 case hebrewPunctuationGershayimCharacter: |
47 case leftDoubleQuotationMarkCharacter: | 48 case leftDoubleQuotationMarkCharacter: |
48 case rightDoubleQuotationMarkCharacter: | 49 case rightDoubleQuotationMarkCharacter: |
49 return '"'; | 50 return '"'; |
50 case hebrewPunctuationGereshCharacter: | 51 case hebrewPunctuationGereshCharacter: |
51 case leftSingleQuotationMarkCharacter: | 52 case leftSingleQuotationMarkCharacter: |
52 case rightSingleQuotationMarkCharacter: | 53 case rightSingleQuotationMarkCharacter: |
53 return '\''; | 54 return '\''; |
54 case softHyphenCharacter: | 55 case softHyphenCharacter: |
55 // Replace soft hyphen with an ignorable character so that their presence
or absence will | 56 // Replace soft hyphen with an ignorable character so that their presence |
| 57 // or absence will |
56 // not affect string comparison. | 58 // not affect string comparison. |
57 return 0; | 59 return 0; |
58 default: | 60 default: |
59 return c; | 61 return c; |
60 } | 62 } |
61 } | 63 } |
62 | 64 |
63 void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length) { | 65 void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length) { |
64 for (size_t i = 0; i < length; ++i) | 66 for (size_t i = 0; i < length; ++i) |
65 data[i] = foldQuoteMarkOrSoftHyphen(data[i]); | 67 data[i] = foldQuoteMarkOrSoftHyphen(data[i]); |
66 } | 68 } |
67 | 69 |
68 void foldQuoteMarksAndSoftHyphens(String& s) { | 70 void foldQuoteMarksAndSoftHyphens(String& s) { |
69 s.replace(hebrewPunctuationGereshCharacter, '\''); | 71 s.replace(hebrewPunctuationGereshCharacter, '\''); |
70 s.replace(hebrewPunctuationGershayimCharacter, '"'); | 72 s.replace(hebrewPunctuationGershayimCharacter, '"'); |
71 s.replace(leftDoubleQuotationMarkCharacter, '"'); | 73 s.replace(leftDoubleQuotationMarkCharacter, '"'); |
72 s.replace(leftSingleQuotationMarkCharacter, '\''); | 74 s.replace(leftSingleQuotationMarkCharacter, '\''); |
73 s.replace(rightDoubleQuotationMarkCharacter, '"'); | 75 s.replace(rightDoubleQuotationMarkCharacter, '"'); |
74 s.replace(rightSingleQuotationMarkCharacter, '\''); | 76 s.replace(rightSingleQuotationMarkCharacter, '\''); |
75 // Replace soft hyphen with an ignorable character so that their presence or a
bsence will | 77 // Replace soft hyphen with an ignorable character so that their presence or |
| 78 // absence will |
76 // not affect string comparison. | 79 // not affect string comparison. |
77 s.replace(softHyphenCharacter, static_cast<UChar>('\0')); | 80 s.replace(softHyphenCharacter, static_cast<UChar>('\0')); |
78 } | 81 } |
79 | 82 |
80 static bool isNonLatin1Separator(UChar32 character) { | 83 static bool isNonLatin1Separator(UChar32 character) { |
81 DCHECK_GE(character, 256); | 84 DCHECK_GE(character, 256); |
82 return U_GET_GC_MASK(character) & | 85 return U_GET_GC_MASK(character) & |
83 (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK); | 86 (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK); |
84 } | 87 } |
85 | 88 |
86 bool isSeparator(UChar32 character) { | 89 bool isSeparator(UChar32 character) { |
87 static const bool | 90 // clang-format off |
88 latin1SeparatorTable[256] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 91 static const bool latin1SeparatorTable[256] = { |
89 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 92 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
90 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 93 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
91 1, 1, 1, // space ! " # $ % & ' ( ) * + , -
. / | 94 // space ! " # $ % & ' ( ) * + , - . / |
92 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, | 95 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
93 1, // : ; < = > ? | 96 // : ; < = > ? |
94 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 97 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, |
95 0, // @ | 98 // @ |
96 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, | 99 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
97 1, // [ \ ] ^ _ | 100 // [ \ ] ^ _ |
98 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, |
99 0, // ` | 102 // ` |
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, | 103 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
101 0, // { | } ~ | 104 // { | } ~ |
102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 105 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, |
103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 106 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
104 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, | 107 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
105 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, | 108 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, |
106 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 109 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, |
107 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, | 110 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
108 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 111 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, |
109 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, | 112 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
110 0, 0, 0, 0, 0, 0, 0, 0}; | 113 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 |
111 | 114 }; |
| 115 // clang-format on |
112 if (character < 256) | 116 if (character < 256) |
113 return latin1SeparatorTable[character]; | 117 return latin1SeparatorTable[character]; |
114 | 118 |
115 return isNonLatin1Separator(character); | 119 return isNonLatin1Separator(character); |
116 } | 120 } |
117 | 121 |
118 // ICU's search ignores the distinction between small kana letters and ones | 122 // ICU's search ignores the distinction between small kana letters and ones |
119 // that are not small, and also characters that differ only in the voicing | 123 // that are not small, and also characters that differ only in the voicing |
120 // marks when considering only primary collation strength differences. | 124 // marks when considering only primary collation strength differences. |
121 // This is not helpful for end users, since these differences make words | 125 // This is not helpful for end users, since these differences make words |
(...skipping 184 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
306 | 310 |
307 if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) | 311 if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) |
308 return; | 312 return; |
309 | 313 |
310 status = U_ZERO_ERROR; | 314 status = U_ZERO_ERROR; |
311 unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, | 315 unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, |
312 &status); | 316 &status); |
313 ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); | 317 ASSERT(status == U_STRING_NOT_TERMINATED_WARNING); |
314 } | 318 } |
315 | 319 |
316 // This function returns kNotFound if |first| and |second| contain different Kan
a letters. | 320 // This function returns kNotFound if |first| and |second| contain different |
317 // If |first| and |second| contain the same Kana letter | 321 // Kana letters. If |first| and |second| contain the same Kana letter then |
318 // then function returns offset in characters from |first|. | 322 // function returns offset in characters from |first|. |
319 // Pointers to both strings increase simultaneously so so it is possible to use
one offset value. | 323 // Pointers to both strings increase simultaneously so so it is possible to use |
| 324 // one offset value. |
320 static inline size_t compareKanaLetterAndComposedVoicedSoundMarks( | 325 static inline size_t compareKanaLetterAndComposedVoicedSoundMarks( |
321 const UChar* first, | 326 const UChar* first, |
322 const UChar* firstEnd, | 327 const UChar* firstEnd, |
323 const UChar* second, | 328 const UChar* second, |
324 const UChar* secondEnd) { | 329 const UChar* secondEnd) { |
325 const UChar* start = first; | 330 const UChar* start = first; |
326 // Check for differences in the kana letter character itself. | 331 // Check for differences in the kana letter character itself. |
327 if (isSmallKanaLetter(*first) != isSmallKanaLetter(*second)) | 332 if (isSmallKanaLetter(*first) != isSmallKanaLetter(*second)) |
328 return kNotFound; | 333 return kNotFound; |
329 if (composedVoicedSoundMark(*first) != composedVoicedSoundMark(*second)) | 334 if (composedVoicedSoundMark(*first) != composedVoicedSoundMark(*second)) |
330 return kNotFound; | 335 return kNotFound; |
331 ++first; | 336 ++first; |
332 ++second; | 337 ++second; |
333 | 338 |
334 // Check for differences in combining voiced sound marks found after the lette
r. | 339 // Check for differences in combining voiced sound marks found after the |
| 340 // letter. |
335 while (true) { | 341 while (true) { |
336 const bool secondIsNotSoundMark = | 342 const bool secondIsNotSoundMark = |
337 second == secondEnd || !isCombiningVoicedSoundMark(*second); | 343 second == secondEnd || !isCombiningVoicedSoundMark(*second); |
338 if (first == firstEnd || !isCombiningVoicedSoundMark(*first)) { | 344 if (first == firstEnd || !isCombiningVoicedSoundMark(*first)) { |
339 return secondIsNotSoundMark ? first - start : kNotFound; | 345 return secondIsNotSoundMark ? first - start : kNotFound; |
340 } | 346 } |
341 if (secondIsNotSoundMark) | 347 if (secondIsNotSoundMark) |
342 return kNotFound; | 348 return kNotFound; |
343 if (*first != *second) | 349 if (*first != *second) |
344 return kNotFound; | 350 return kNotFound; |
345 ++first; | 351 ++first; |
346 ++second; | 352 ++second; |
347 } | 353 } |
348 } | 354 } |
349 | 355 |
350 bool checkOnlyKanaLettersInStrings(const UChar* firstData, | 356 bool checkOnlyKanaLettersInStrings(const UChar* firstData, |
351 unsigned firstLength, | 357 unsigned firstLength, |
352 const UChar* secondData, | 358 const UChar* secondData, |
353 unsigned secondLength) { | 359 unsigned secondLength) { |
354 const UChar* a = firstData; | 360 const UChar* a = firstData; |
355 const UChar* aEnd = firstData + firstLength; | 361 const UChar* aEnd = firstData + firstLength; |
356 | 362 |
357 const UChar* b = secondData; | 363 const UChar* b = secondData; |
358 const UChar* bEnd = secondData + secondLength; | 364 const UChar* bEnd = secondData + secondLength; |
359 while (true) { | 365 while (true) { |
360 // Skip runs of non-kana-letter characters. This is necessary so we can | 366 // Skip runs of non-kana-letter characters. This is necessary so we can |
361 // correctly handle strings where the |firstData| and |secondData| have diff
erent-length | 367 // correctly handle strings where the |firstData| and |secondData| have |
362 // runs of characters that match, while still double checking the correctnes
s | 368 // different-length runs of characters that match, while still double |
363 // of matches of kana letters with other kana letters. | 369 // checking the correctness of matches of kana letters with other kana |
| 370 // letters. |
364 while (a != aEnd && !isKanaLetter(*a)) | 371 while (a != aEnd && !isKanaLetter(*a)) |
365 ++a; | 372 ++a; |
366 while (b != bEnd && !isKanaLetter(*b)) | 373 while (b != bEnd && !isKanaLetter(*b)) |
367 ++b; | 374 ++b; |
368 | 375 |
369 // If we reached the end of either the target or the match, we should have | 376 // If we reached the end of either the target or the match, we should have |
370 // reached the end of both; both should have the same number of kana letters
. | 377 // reached the end of both; both should have the same number of kana |
| 378 // letters. |
371 if (a == aEnd || b == bEnd) { | 379 if (a == aEnd || b == bEnd) { |
372 return a == aEnd && b == bEnd; | 380 return a == aEnd && b == bEnd; |
373 } | 381 } |
374 | 382 |
375 // Check that single Kana letters in |a| and |b| are the same. | 383 // Check that single Kana letters in |a| and |b| are the same. |
376 const size_t offset = | 384 const size_t offset = |
377 compareKanaLetterAndComposedVoicedSoundMarks(a, aEnd, b, bEnd); | 385 compareKanaLetterAndComposedVoicedSoundMarks(a, aEnd, b, bEnd); |
378 if (offset == kNotFound) | 386 if (offset == kNotFound) |
379 return false; | 387 return false; |
380 | 388 |
(...skipping 13 matching lines...) Expand all Loading... |
394 const UChar* b = secondData; | 402 const UChar* b = secondData; |
395 const UChar* bEnd = secondData + secondLength; | 403 const UChar* bEnd = secondData + secondLength; |
396 while (true) { | 404 while (true) { |
397 // Check for non-kana-letter characters. | 405 // Check for non-kana-letter characters. |
398 while (a != aEnd && !isKanaLetter(*a) && b != bEnd && !isKanaLetter(*b)) { | 406 while (a != aEnd && !isKanaLetter(*a) && b != bEnd && !isKanaLetter(*b)) { |
399 if (*a++ != *b++) | 407 if (*a++ != *b++) |
400 return false; | 408 return false; |
401 } | 409 } |
402 | 410 |
403 // If we reached the end of either the target or the match, we should have | 411 // If we reached the end of either the target or the match, we should have |
404 // reached the end of both; both should have the same number of kana letters
. | 412 // reached the end of both; both should have the same number of kana |
| 413 // letters. |
405 if (a == aEnd || b == bEnd) { | 414 if (a == aEnd || b == bEnd) { |
406 return a == aEnd && b == bEnd; | 415 return a == aEnd && b == bEnd; |
407 } | 416 } |
408 | 417 |
409 if (isKanaLetter(*a) != isKanaLetter(*b)) | 418 if (isKanaLetter(*a) != isKanaLetter(*b)) |
410 return false; | 419 return false; |
411 | 420 |
412 // Check that single Kana letters in |a| and |b| are the same. | 421 // Check that single Kana letters in |a| and |b| are the same. |
413 const size_t offset = | 422 const size_t offset = |
414 compareKanaLetterAndComposedVoicedSoundMarks(a, aEnd, b, bEnd); | 423 compareKanaLetterAndComposedVoicedSoundMarks(a, aEnd, b, bEnd); |
415 if (offset == kNotFound) | 424 if (offset == kNotFound) |
416 return false; | 425 return false; |
417 | 426 |
418 // Update values of |a| and |b| after comparing. | 427 // Update values of |a| and |b| after comparing. |
419 a += offset; | 428 a += offset; |
420 b += offset; | 429 b += offset; |
421 } | 430 } |
422 } | 431 } |
423 | 432 |
424 } // namespace blink | 433 } // namespace blink |
OLD | NEW |