OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright (C) 2013, The Android Open Source Project |
| 3 * |
| 4 * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 * you may not use this file except in compliance with the License. |
| 6 * You may obtain a copy of the License at |
| 7 * |
| 8 * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 * |
| 10 * Unless required by applicable law or agreed to in writing, software |
| 11 * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 * See the License for the specific language governing permissions and |
| 14 * limitations under the License. |
| 15 */ |
| 16 |
| 17 #ifndef LATINIME_HEADER_POLICY_H |
| 18 #define LATINIME_HEADER_POLICY_H |
| 19 |
| 20 #include <cstdint> |
| 21 |
| 22 #include "third_party/android_prediction/defines.h" |
| 23 #include "third_party/android_prediction/suggest/core/policy/dictionary_header_s
tructure_policy.h" |
| 24 #include "third_party/android_prediction/suggest/policyimpl/dictionary/header/he
ader_read_write_utils.h" |
| 25 #include "third_party/android_prediction/suggest/policyimpl/dictionary/utils/for
mat_utils.h" |
| 26 #include "third_party/android_prediction/utils/char_utils.h" |
| 27 #include "third_party/android_prediction/utils/time_keeper.h" |
| 28 |
| 29 namespace latinime { |
| 30 |
| 31 class HeaderPolicy : public DictionaryHeaderStructurePolicy { |
| 32 public: |
| 33 // Reads information from existing dictionary buffer. |
| 34 HeaderPolicy(const uint8_t *const dictBuf, const FormatUtils::FORMAT_VERSION
formatVersion) |
| 35 : mDictFormatVersion(formatVersion), |
| 36 mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)), |
| 37 mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)), |
| 38 mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)), |
| 39 mLocale(readLocale()), |
| 40 mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), |
| 41 mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing
()), |
| 42 mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAtt
ributeMap, |
| 43 IS_DECAYING_DICT_KEY, false /* defaultValue */)), |
| 44 mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, |
| 45 DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */
)), |
| 46 mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAtt
ributeMap, |
| 47 LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* de
faultValue */)), |
| 48 mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttrib
uteMap, |
| 49 UNIGRAM_COUNT_KEY, 0 /* defaultValue */)), |
| 50 mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttribu
teMap, |
| 51 BIGRAM_COUNT_KEY, 0 /* defaultValue */)), |
| 52 mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&m
AttributeMap, |
| 53 EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)), |
| 54 mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeV
alue( |
| 55 &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultV
alue */)), |
| 56 mForgettingCurveOccurrencesToLevelUp(HeaderReadWriteUtils::readInt
AttributeValue( |
| 57 &mAttributeMap, FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_K
EY, |
| 58 DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)), |
| 59 mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::rea
dIntAttributeValue( |
| 60 &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_
ID_KEY, |
| 61 DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), |
| 62 mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntA
ttributeValue( |
| 63 &mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN
_SECONDS_KEY, |
| 64 DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS
)), |
| 65 mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( |
| 66 &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM
_COUNT)), |
| 67 mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( |
| 68 &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_C
OUNT)) {} |
| 69 |
| 70 // Constructs header information using an attribute map. |
| 71 HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion, |
| 72 const std::vector<int> &locale, |
| 73 const DictionaryHeaderStructurePolicy::AttributeMap *const attribute
Map) |
| 74 : mDictFormatVersion(dictFormatVersion), |
| 75 mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlags
UsingAttributeMap( |
| 76 attributeMap)), mSize(0), mAttributeMap(*attributeMap), mL
ocale(locale), |
| 77 mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), |
| 78 mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing
()), |
| 79 mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAtt
ributeMap, |
| 80 IS_DECAYING_DICT_KEY, false /* defaultValue */)), |
| 81 mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, |
| 82 DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */
)), |
| 83 mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAtt
ributeMap, |
| 84 DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */
)), |
| 85 mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0), |
| 86 mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeV
alue( |
| 87 &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultV
alue */)), |
| 88 mForgettingCurveOccurrencesToLevelUp(HeaderReadWriteUtils::readInt
AttributeValue( |
| 89 &mAttributeMap, FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_K
EY, |
| 90 DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)), |
| 91 mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::rea
dIntAttributeValue( |
| 92 &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_
ID_KEY, |
| 93 DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), |
| 94 mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntA
ttributeValue( |
| 95 &mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN
_SECONDS_KEY, |
| 96 DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS
)), |
| 97 mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( |
| 98 &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM
_COUNT)), |
| 99 mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( |
| 100 &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_C
OUNT)) {} |
| 101 |
| 102 // Copy header information |
| 103 HeaderPolicy(const HeaderPolicy *const headerPolicy) |
| 104 : mDictFormatVersion(headerPolicy->mDictFormatVersion), |
| 105 mDictionaryFlags(headerPolicy->mDictionaryFlags), mSize(headerPoli
cy->mSize), |
| 106 mAttributeMap(headerPolicy->mAttributeMap), mLocale(headerPolicy->
mLocale), |
| 107 mMultiWordCostMultiplier(headerPolicy->mMultiWordCostMultiplier), |
| 108 mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlau
tProcessing), |
| 109 mIsDecayingDict(headerPolicy->mIsDecayingDict), |
| 110 mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDe
cayedTime), |
| 111 mUnigramCount(headerPolicy->mUnigramCount), mBigramCount(headerPol
icy->mBigramCount), |
| 112 mExtendedRegionSize(headerPolicy->mExtendedRegionSize), |
| 113 mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords)
, |
| 114 mForgettingCurveOccurrencesToLevelUp( |
| 115 headerPolicy->mForgettingCurveOccurrencesToLevelUp), |
| 116 mForgettingCurveProbabilityValuesTableId( |
| 117 headerPolicy->mForgettingCurveProbabilityValuesTableId), |
| 118 mForgettingCurveDurationToLevelDown( |
| 119 headerPolicy->mForgettingCurveDurationToLevelDown), |
| 120 mMaxUnigramCount(headerPolicy->mMaxUnigramCount), |
| 121 mMaxBigramCount(headerPolicy->mMaxBigramCount) {} |
| 122 |
| 123 // Temporary dummy header. |
| 124 HeaderPolicy() |
| 125 : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags
(0), mSize(0), |
| 126 mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostM
ultiplier(0.0f), |
| 127 mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false), |
| 128 mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0), |
| 129 mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false), |
| 130 mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabili
tyValuesTableId(0), |
| 131 mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxB
igramCount(0) {} |
| 132 |
| 133 ~HeaderPolicy() {} |
| 134 |
| 135 virtual int getFormatVersionNumber() const { |
| 136 // Conceptually this converts the symbolic value we use in the code into
the |
| 137 // hardcoded of the bytes in the file. But we want the constants to be t
he |
| 138 // same so we use them for both here. |
| 139 switch (mDictFormatVersion) { |
| 140 case FormatUtils::VERSION_2: |
| 141 return FormatUtils::VERSION_2; |
| 142 case FormatUtils::VERSION_4_ONLY_FOR_TESTING: |
| 143 return FormatUtils::VERSION_4_ONLY_FOR_TESTING; |
| 144 case FormatUtils::VERSION_4: |
| 145 return FormatUtils::VERSION_4; |
| 146 case FormatUtils::VERSION_4_DEV: |
| 147 return FormatUtils::VERSION_4_DEV; |
| 148 default: |
| 149 return FormatUtils::UNKNOWN_VERSION; |
| 150 } |
| 151 } |
| 152 |
| 153 AK_FORCE_INLINE bool isValid() const { |
| 154 // Decaying dictionary must have historical information. |
| 155 if (!mIsDecayingDict) { |
| 156 return true; |
| 157 } |
| 158 if (mHasHistoricalInfoOfWords) { |
| 159 return true; |
| 160 } else { |
| 161 return false; |
| 162 } |
| 163 } |
| 164 |
| 165 AK_FORCE_INLINE int getSize() const { |
| 166 return mSize; |
| 167 } |
| 168 |
| 169 AK_FORCE_INLINE float getMultiWordCostMultiplier() const { |
| 170 return mMultiWordCostMultiplier; |
| 171 } |
| 172 |
| 173 AK_FORCE_INLINE bool isDecayingDict() const { |
| 174 return mIsDecayingDict; |
| 175 } |
| 176 |
| 177 AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const { |
| 178 return mRequiresGermanUmlautProcessing; |
| 179 } |
| 180 |
| 181 AK_FORCE_INLINE int getDate() const { |
| 182 return mDate; |
| 183 } |
| 184 |
| 185 AK_FORCE_INLINE int getLastDecayedTime() const { |
| 186 return mLastDecayedTime; |
| 187 } |
| 188 |
| 189 AK_FORCE_INLINE int getUnigramCount() const { |
| 190 return mUnigramCount; |
| 191 } |
| 192 |
| 193 AK_FORCE_INLINE int getBigramCount() const { |
| 194 return mBigramCount; |
| 195 } |
| 196 |
| 197 AK_FORCE_INLINE int getExtendedRegionSize() const { |
| 198 return mExtendedRegionSize; |
| 199 } |
| 200 |
| 201 AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const { |
| 202 return mHasHistoricalInfoOfWords; |
| 203 } |
| 204 |
| 205 AK_FORCE_INLINE bool shouldBoostExactMatches() const { |
| 206 // TODO: Investigate better ways to handle exact matches for personalize
d dictionaries. |
| 207 return !isDecayingDict(); |
| 208 } |
| 209 |
| 210 const DictionaryHeaderStructurePolicy::AttributeMap *getAttributeMap() const
{ |
| 211 return &mAttributeMap; |
| 212 } |
| 213 |
| 214 AK_FORCE_INLINE int getForgettingCurveOccurrencesToLevelUp() const { |
| 215 return mForgettingCurveOccurrencesToLevelUp; |
| 216 } |
| 217 |
| 218 AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const { |
| 219 return mForgettingCurveProbabilityValuesTableId; |
| 220 } |
| 221 |
| 222 AK_FORCE_INLINE int getForgettingCurveDurationToLevelDown() const { |
| 223 return mForgettingCurveDurationToLevelDown; |
| 224 } |
| 225 |
| 226 AK_FORCE_INLINE int getMaxUnigramCount() const { |
| 227 return mMaxUnigramCount; |
| 228 } |
| 229 |
| 230 AK_FORCE_INLINE int getMaxBigramCount() const { |
| 231 return mMaxBigramCount; |
| 232 } |
| 233 |
| 234 void readHeaderValueOrQuestionMark(const char *const key, |
| 235 int *outValue, int outValueSize) const; |
| 236 |
| 237 bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime, |
| 238 const int unigramCount, const int bigramCount, |
| 239 const int extendedRegionSize, BufferWithExtendableBuffer *const outB
uffer) const; |
| 240 |
| 241 void fillInHeader(const bool updatesLastDecayedTime, |
| 242 const int unigramCount, const int bigramCount, const int extendedReg
ionSize, |
| 243 DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) cons
t; |
| 244 |
| 245 AK_FORCE_INLINE const std::vector<int> *getLocale() const { |
| 246 return &mLocale; |
| 247 } |
| 248 |
| 249 bool supportsBeginningOfSentence() const { |
| 250 return mDictFormatVersion >= FormatUtils::VERSION_4; |
| 251 } |
| 252 |
| 253 private: |
| 254 DISALLOW_COPY_AND_ASSIGN(HeaderPolicy); |
| 255 |
| 256 static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY; |
| 257 static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY; |
| 258 static const char *const IS_DECAYING_DICT_KEY; |
| 259 static const char *const DATE_KEY; |
| 260 static const char *const LAST_DECAYED_TIME_KEY; |
| 261 static const char *const UNIGRAM_COUNT_KEY; |
| 262 static const char *const BIGRAM_COUNT_KEY; |
| 263 static const char *const EXTENDED_REGION_SIZE_KEY; |
| 264 static const char *const HAS_HISTORICAL_INFO_KEY; |
| 265 static const char *const LOCALE_KEY; |
| 266 static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY; |
| 267 static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY; |
| 268 static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_
KEY; |
| 269 static const char *const MAX_UNIGRAM_COUNT_KEY; |
| 270 static const char *const MAX_BIGRAM_COUNT_KEY; |
| 271 static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; |
| 272 static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; |
| 273 static const int DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP; |
| 274 static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID; |
| 275 static const int DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS; |
| 276 static const int DEFAULT_MAX_UNIGRAM_COUNT; |
| 277 static const int DEFAULT_MAX_BIGRAM_COUNT; |
| 278 |
| 279 const FormatUtils::FORMAT_VERSION mDictFormatVersion; |
| 280 const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags; |
| 281 const int mSize; |
| 282 DictionaryHeaderStructurePolicy::AttributeMap mAttributeMap; |
| 283 const std::vector<int> mLocale; |
| 284 const float mMultiWordCostMultiplier; |
| 285 const bool mRequiresGermanUmlautProcessing; |
| 286 const bool mIsDecayingDict; |
| 287 const int mDate; |
| 288 const int mLastDecayedTime; |
| 289 const int mUnigramCount; |
| 290 const int mBigramCount; |
| 291 const int mExtendedRegionSize; |
| 292 const bool mHasHistoricalInfoOfWords; |
| 293 const int mForgettingCurveOccurrencesToLevelUp; |
| 294 const int mForgettingCurveProbabilityValuesTableId; |
| 295 const int mForgettingCurveDurationToLevelDown; |
| 296 const int mMaxUnigramCount; |
| 297 const int mMaxBigramCount; |
| 298 |
| 299 const std::vector<int> readLocale() const; |
| 300 float readMultipleWordCostMultiplier() const; |
| 301 bool readRequiresGermanUmlautProcessing() const; |
| 302 |
| 303 static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndRe
adAllAttributes( |
| 304 const uint8_t *const dictBuf); |
| 305 }; |
| 306 } // namespace latinime |
| 307 #endif /* LATINIME_HEADER_POLICY_H */ |
OLD | NEW |