| OLD | NEW |
| (Empty) | |
| 1 /* |
| 2 * Copyright (C) 2010 The Android Open Source Project |
| 3 * |
| 4 * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 * you may not use this file except in compliance with the License. |
| 6 * You may obtain a copy of the License at |
| 7 * |
| 8 * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 * |
| 10 * Unless required by applicable law or agreed to in writing, software |
| 11 * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 * See the License for the specific language governing permissions and |
| 14 * limitations under the License. |
| 15 */ |
| 16 |
| 17 #ifndef LATINIME_CHAR_UTILS_H |
| 18 #define LATINIME_CHAR_UTILS_H |
| 19 |
| 20 #include <cctype> |
| 21 #include <cstring> |
| 22 #include <vector> |
| 23 |
| 24 #include "third_party/prediction/defines.h" |
| 25 |
| 26 namespace latinime { |
| 27 |
| 28 class CharUtils { |
| 29 public: |
| 30 static AK_FORCE_INLINE bool isAsciiUpper(int c) { |
| 31 // Note: isupper(...) reports false positives for some Cyrillic characters, |
| 32 // causing them to |
| 33 // be incorrectly lower-cased using toAsciiLower(...) rather than |
| 34 // latin_tolower(...). |
| 35 return (c >= 'A' && c <= 'Z'); |
| 36 } |
| 37 |
| 38 static AK_FORCE_INLINE int toAsciiLower(int c) { return c - 'A' + 'a'; } |
| 39 |
| 40 static AK_FORCE_INLINE bool isAscii(int c) { return isascii(c) != 0; } |
| 41 |
| 42 static AK_FORCE_INLINE int toLowerCase(const int c) { |
| 43 if (isAsciiUpper(c)) { |
| 44 return toAsciiLower(c); |
| 45 } |
| 46 if (isAscii(c)) { |
| 47 return c; |
| 48 } |
| 49 return static_cast<int>(latin_tolower(static_cast<unsigned short>(c))); |
| 50 } |
| 51 |
| 52 static AK_FORCE_INLINE int toBaseLowerCase(const int c) { |
| 53 return toLowerCase(toBaseCodePoint(c)); |
| 54 } |
| 55 |
| 56 static AK_FORCE_INLINE bool isIntentionalOmissionCodePoint( |
| 57 const int codePoint) { |
| 58 // TODO: Do not hardcode here |
| 59 return codePoint == KEYCODE_SINGLE_QUOTE || |
| 60 codePoint == KEYCODE_HYPHEN_MINUS; |
| 61 } |
| 62 |
| 63 static AK_FORCE_INLINE int getCodePointCount(const int arraySize, |
| 64 const int* const codePoints) { |
| 65 int size = 0; |
| 66 for (; size < arraySize; ++size) { |
| 67 if (codePoints[size] == '\0') { |
| 68 break; |
| 69 } |
| 70 } |
| 71 return size; |
| 72 } |
| 73 |
| 74 static AK_FORCE_INLINE int toBaseCodePoint(int c) { |
| 75 if (c < BASE_CHARS_SIZE) { |
| 76 return static_cast<int>(BASE_CHARS[c]); |
| 77 } |
| 78 return c; |
| 79 } |
| 80 |
| 81 static AK_FORCE_INLINE int getSpaceCount(const int* const codePointBuffer, |
| 82 const int length) { |
| 83 int spaceCount = 0; |
| 84 for (int i = 0; i < length; ++i) { |
| 85 if (codePointBuffer[i] == KEYCODE_SPACE) { |
| 86 ++spaceCount; |
| 87 } |
| 88 } |
| 89 return spaceCount; |
| 90 } |
| 91 |
| 92 static AK_FORCE_INLINE int isInUnicodeSpace(const int codePoint) { |
| 93 return codePoint >= MIN_UNICODE_CODE_POINT && |
| 94 codePoint <= MAX_UNICODE_CODE_POINT; |
| 95 } |
| 96 |
| 97 static unsigned short latin_tolower(const unsigned short c); |
| 98 static const std::vector<int> EMPTY_STRING; |
| 99 |
| 100 // Returns updated code point count. Returns 0 when the code points cannot be |
| 101 // marked as a |
| 102 // Beginning-of-Sentence. |
| 103 static AK_FORCE_INLINE int attachBeginningOfSentenceMarker( |
| 104 int* const codePoints, |
| 105 const int codePointCount, |
| 106 const int maxCodePoint) { |
| 107 if (codePointCount > 0 && |
| 108 codePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) { |
| 109 // Marker has already been attached. |
| 110 return codePointCount; |
| 111 } |
| 112 if (codePointCount >= maxCodePoint) { |
| 113 // the code points cannot be marked as a Beginning-of-Sentence. |
| 114 return 0; |
| 115 } |
| 116 memmove(codePoints + 1, codePoints, sizeof(int) * codePointCount); |
| 117 codePoints[0] = CODE_POINT_BEGINNING_OF_SENTENCE; |
| 118 return codePointCount + 1; |
| 119 } |
| 120 |
| 121 private: |
| 122 DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils); |
| 123 |
| 124 static const int MIN_UNICODE_CODE_POINT; |
| 125 static const int MAX_UNICODE_CODE_POINT; |
| 126 |
| 127 /** |
| 128 * Table mapping most combined Latin, Greek, and Cyrillic characters |
| 129 * to their base characters. If c is in range, BASE_CHARS[c] == c |
| 130 * if c is not a combined character, or the base character if it |
| 131 * is combined. |
| 132 */ |
| 133 static const int BASE_CHARS_SIZE = 0x0500; |
| 134 static const unsigned short BASE_CHARS[BASE_CHARS_SIZE]; |
| 135 }; |
| 136 } // namespace latinime |
| 137 #endif // LATINIME_CHAR_UTILS_H |
| OLD | NEW |