OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright (C) 2013, The Android Open Source Project |
| 3 * |
| 4 * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 * you may not use this file except in compliance with the License. |
| 6 * You may obtain a copy of the License at |
| 7 * |
| 8 * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 * |
| 10 * Unless required by applicable law or agreed to in writing, software |
| 11 * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 * See the License for the specific language governing permissions and |
| 14 * limitations under the License. |
| 15 */ |
| 16 |
| 17 #ifndef LATINIME_HEADER_POLICY_H |
| 18 #define LATINIME_HEADER_POLICY_H |
| 19 |
| 20 #include <cstdint> |
| 21 |
| 22 #include "third_party/prediction/defines.h" |
| 23 #include "third_party/prediction/suggest/core/policy/dictionary_header_structure
_policy.h" |
| 24 #include "third_party/prediction/suggest/policyimpl/dictionary/header/header_rea
d_write_utils.h" |
| 25 #include "third_party/prediction/suggest/policyimpl/dictionary/utils/format_util
s.h" |
| 26 #include "third_party/prediction/utils/char_utils.h" |
| 27 #include "third_party/prediction/utils/time_keeper.h" |
| 28 |
| 29 namespace latinime { |
| 30 |
| 31 class HeaderPolicy : public DictionaryHeaderStructurePolicy { |
| 32 public: |
| 33 // Reads information from existing dictionary buffer. |
| 34 HeaderPolicy(const uint8_t* const dictBuf, |
| 35 const FormatUtils::FORMAT_VERSION formatVersion) |
| 36 : mDictFormatVersion(formatVersion), |
| 37 mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)), |
| 38 mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)), |
| 39 mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)), |
| 40 mLocale(readLocale()), |
| 41 mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), |
| 42 mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()), |
| 43 mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue( |
| 44 &mAttributeMap, |
| 45 IS_DECAYING_DICT_KEY, |
| 46 false /* defaultValue */)), |
| 47 mDate(HeaderReadWriteUtils::readIntAttributeValue( |
| 48 &mAttributeMap, |
| 49 DATE_KEY, |
| 50 TimeKeeper::peekCurrentTime() /* defaultValue */)), |
| 51 mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue( |
| 52 &mAttributeMap, |
| 53 LAST_DECAYED_TIME_KEY, |
| 54 TimeKeeper::peekCurrentTime() /* defaultValue */)), |
| 55 mUnigramCount( |
| 56 HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, |
| 57 UNIGRAM_COUNT_KEY, |
| 58 0 /* defaultValue */)), |
| 59 mBigramCount( |
| 60 HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, |
| 61 BIGRAM_COUNT_KEY, |
| 62 0 /* defaultValue */)), |
| 63 mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue( |
| 64 &mAttributeMap, |
| 65 EXTENDED_REGION_SIZE_KEY, |
| 66 0 /* defaultValue */)), |
| 67 mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( |
| 68 &mAttributeMap, |
| 69 HAS_HISTORICAL_INFO_KEY, |
| 70 false /* defaultValue */)), |
| 71 mForgettingCurveOccurrencesToLevelUp( |
| 72 HeaderReadWriteUtils::readIntAttributeValue( |
| 73 &mAttributeMap, |
| 74 FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY, |
| 75 DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)), |
| 76 mForgettingCurveProbabilityValuesTableId( |
| 77 HeaderReadWriteUtils::readIntAttributeValue( |
| 78 &mAttributeMap, |
| 79 FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, |
| 80 DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), |
| 81 mForgettingCurveDurationToLevelDown( |
| 82 HeaderReadWriteUtils::readIntAttributeValue( |
| 83 &mAttributeMap, |
| 84 FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY, |
| 85 DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)), |
| 86 mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( |
| 87 &mAttributeMap, |
| 88 MAX_UNIGRAM_COUNT_KEY, |
| 89 DEFAULT_MAX_UNIGRAM_COUNT)), |
| 90 mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( |
| 91 &mAttributeMap, |
| 92 MAX_BIGRAM_COUNT_KEY, |
| 93 DEFAULT_MAX_BIGRAM_COUNT)) {} |
| 94 |
| 95 // Constructs header information using an attribute map. |
| 96 HeaderPolicy( |
| 97 const FormatUtils::FORMAT_VERSION dictFormatVersion, |
| 98 const std::vector<int>& locale, |
| 99 const DictionaryHeaderStructurePolicy::AttributeMap* const attributeMap) |
| 100 : mDictFormatVersion(dictFormatVersion), |
| 101 mDictionaryFlags( |
| 102 HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap( |
| 103 attributeMap)), |
| 104 mSize(0), |
| 105 mAttributeMap(*attributeMap), |
| 106 mLocale(locale), |
| 107 mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), |
| 108 mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()), |
| 109 mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue( |
| 110 &mAttributeMap, |
| 111 IS_DECAYING_DICT_KEY, |
| 112 false /* defaultValue */)), |
| 113 mDate(HeaderReadWriteUtils::readIntAttributeValue( |
| 114 &mAttributeMap, |
| 115 DATE_KEY, |
| 116 TimeKeeper::peekCurrentTime() /* defaultValue */)), |
| 117 mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue( |
| 118 &mAttributeMap, |
| 119 DATE_KEY, |
| 120 TimeKeeper::peekCurrentTime() /* defaultValue */)), |
| 121 mUnigramCount(0), |
| 122 mBigramCount(0), |
| 123 mExtendedRegionSize(0), |
| 124 mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( |
| 125 &mAttributeMap, |
| 126 HAS_HISTORICAL_INFO_KEY, |
| 127 false /* defaultValue */)), |
| 128 mForgettingCurveOccurrencesToLevelUp( |
| 129 HeaderReadWriteUtils::readIntAttributeValue( |
| 130 &mAttributeMap, |
| 131 FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY, |
| 132 DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)), |
| 133 mForgettingCurveProbabilityValuesTableId( |
| 134 HeaderReadWriteUtils::readIntAttributeValue( |
| 135 &mAttributeMap, |
| 136 FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, |
| 137 DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), |
| 138 mForgettingCurveDurationToLevelDown( |
| 139 HeaderReadWriteUtils::readIntAttributeValue( |
| 140 &mAttributeMap, |
| 141 FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY, |
| 142 DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)), |
| 143 mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( |
| 144 &mAttributeMap, |
| 145 MAX_UNIGRAM_COUNT_KEY, |
| 146 DEFAULT_MAX_UNIGRAM_COUNT)), |
| 147 mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( |
| 148 &mAttributeMap, |
| 149 MAX_BIGRAM_COUNT_KEY, |
| 150 DEFAULT_MAX_BIGRAM_COUNT)) {} |
| 151 |
| 152 // Copy header information |
| 153 HeaderPolicy(const HeaderPolicy* const headerPolicy) |
| 154 : mDictFormatVersion(headerPolicy->mDictFormatVersion), |
| 155 mDictionaryFlags(headerPolicy->mDictionaryFlags), |
| 156 mSize(headerPolicy->mSize), |
| 157 mAttributeMap(headerPolicy->mAttributeMap), |
| 158 mLocale(headerPolicy->mLocale), |
| 159 mMultiWordCostMultiplier(headerPolicy->mMultiWordCostMultiplier), |
| 160 mRequiresGermanUmlautProcessing( |
| 161 headerPolicy->mRequiresGermanUmlautProcessing), |
| 162 mIsDecayingDict(headerPolicy->mIsDecayingDict), |
| 163 mDate(headerPolicy->mDate), |
| 164 mLastDecayedTime(headerPolicy->mLastDecayedTime), |
| 165 mUnigramCount(headerPolicy->mUnigramCount), |
| 166 mBigramCount(headerPolicy->mBigramCount), |
| 167 mExtendedRegionSize(headerPolicy->mExtendedRegionSize), |
| 168 mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords), |
| 169 mForgettingCurveOccurrencesToLevelUp( |
| 170 headerPolicy->mForgettingCurveOccurrencesToLevelUp), |
| 171 mForgettingCurveProbabilityValuesTableId( |
| 172 headerPolicy->mForgettingCurveProbabilityValuesTableId), |
| 173 mForgettingCurveDurationToLevelDown( |
| 174 headerPolicy->mForgettingCurveDurationToLevelDown), |
| 175 mMaxUnigramCount(headerPolicy->mMaxUnigramCount), |
| 176 mMaxBigramCount(headerPolicy->mMaxBigramCount) {} |
| 177 |
| 178 // Temporary dummy header. |
| 179 HeaderPolicy() |
| 180 : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), |
| 181 mDictionaryFlags(0), |
| 182 mSize(0), |
| 183 mAttributeMap(), |
| 184 mLocale(CharUtils::EMPTY_STRING), |
| 185 mMultiWordCostMultiplier(0.0f), |
| 186 mRequiresGermanUmlautProcessing(false), |
| 187 mIsDecayingDict(false), |
| 188 mDate(0), |
| 189 mLastDecayedTime(0), |
| 190 mUnigramCount(0), |
| 191 mBigramCount(0), |
| 192 mExtendedRegionSize(0), |
| 193 mHasHistoricalInfoOfWords(false), |
| 194 mForgettingCurveOccurrencesToLevelUp(0), |
| 195 mForgettingCurveProbabilityValuesTableId(0), |
| 196 mForgettingCurveDurationToLevelDown(0), |
| 197 mMaxUnigramCount(0), |
| 198 mMaxBigramCount(0) {} |
| 199 |
| 200 ~HeaderPolicy() {} |
| 201 |
| 202 virtual int getFormatVersionNumber() const { |
| 203 // Conceptually this converts the symbolic value we use in the code into the |
| 204 // hardcoded of the bytes in the file. But we want the constants to be the |
| 205 // same so we use them for both here. |
| 206 switch (mDictFormatVersion) { |
| 207 case FormatUtils::VERSION_2: |
| 208 return FormatUtils::VERSION_2; |
| 209 case FormatUtils::VERSION_4_ONLY_FOR_TESTING: |
| 210 return FormatUtils::VERSION_4_ONLY_FOR_TESTING; |
| 211 case FormatUtils::VERSION_4: |
| 212 return FormatUtils::VERSION_4; |
| 213 case FormatUtils::VERSION_4_DEV: |
| 214 return FormatUtils::VERSION_4_DEV; |
| 215 default: |
| 216 return FormatUtils::UNKNOWN_VERSION; |
| 217 } |
| 218 } |
| 219 |
| 220 AK_FORCE_INLINE bool isValid() const { |
| 221 // Decaying dictionary must have historical information. |
| 222 if (!mIsDecayingDict) { |
| 223 return true; |
| 224 } |
| 225 if (mHasHistoricalInfoOfWords) { |
| 226 return true; |
| 227 } else { |
| 228 return false; |
| 229 } |
| 230 } |
| 231 |
| 232 AK_FORCE_INLINE int getSize() const { return mSize; } |
| 233 |
| 234 AK_FORCE_INLINE float getMultiWordCostMultiplier() const { |
| 235 return mMultiWordCostMultiplier; |
| 236 } |
| 237 |
| 238 AK_FORCE_INLINE bool isDecayingDict() const { return mIsDecayingDict; } |
| 239 |
| 240 AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const { |
| 241 return mRequiresGermanUmlautProcessing; |
| 242 } |
| 243 |
| 244 AK_FORCE_INLINE int getDate() const { return mDate; } |
| 245 |
| 246 AK_FORCE_INLINE int getLastDecayedTime() const { return mLastDecayedTime; } |
| 247 |
| 248 AK_FORCE_INLINE int getUnigramCount() const { return mUnigramCount; } |
| 249 |
| 250 AK_FORCE_INLINE int getBigramCount() const { return mBigramCount; } |
| 251 |
| 252 AK_FORCE_INLINE int getExtendedRegionSize() const { |
| 253 return mExtendedRegionSize; |
| 254 } |
| 255 |
| 256 AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const { |
| 257 return mHasHistoricalInfoOfWords; |
| 258 } |
| 259 |
| 260 AK_FORCE_INLINE bool shouldBoostExactMatches() const { |
| 261 // TODO: Investigate better ways to handle exact matches for personalized |
| 262 // dictionaries. |
| 263 return !isDecayingDict(); |
| 264 } |
| 265 |
| 266 const DictionaryHeaderStructurePolicy::AttributeMap* getAttributeMap() const { |
| 267 return &mAttributeMap; |
| 268 } |
| 269 |
| 270 AK_FORCE_INLINE int getForgettingCurveOccurrencesToLevelUp() const { |
| 271 return mForgettingCurveOccurrencesToLevelUp; |
| 272 } |
| 273 |
| 274 AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const { |
| 275 return mForgettingCurveProbabilityValuesTableId; |
| 276 } |
| 277 |
| 278 AK_FORCE_INLINE int getForgettingCurveDurationToLevelDown() const { |
| 279 return mForgettingCurveDurationToLevelDown; |
| 280 } |
| 281 |
| 282 AK_FORCE_INLINE int getMaxUnigramCount() const { return mMaxUnigramCount; } |
| 283 |
| 284 AK_FORCE_INLINE int getMaxBigramCount() const { return mMaxBigramCount; } |
| 285 |
| 286 void readHeaderValueOrQuestionMark(const char* const key, |
| 287 int* outValue, |
| 288 int outValueSize) const; |
| 289 |
| 290 bool fillInAndWriteHeaderToBuffer( |
| 291 const bool updatesLastDecayedTime, |
| 292 const int unigramCount, |
| 293 const int bigramCount, |
| 294 const int extendedRegionSize, |
| 295 BufferWithExtendableBuffer* const outBuffer) const; |
| 296 |
| 297 void fillInHeader( |
| 298 const bool updatesLastDecayedTime, |
| 299 const int unigramCount, |
| 300 const int bigramCount, |
| 301 const int extendedRegionSize, |
| 302 DictionaryHeaderStructurePolicy::AttributeMap* outAttributeMap) const; |
| 303 |
| 304 AK_FORCE_INLINE const std::vector<int>* getLocale() const { return &mLocale; } |
| 305 |
| 306 bool supportsBeginningOfSentence() const { |
| 307 return mDictFormatVersion >= FormatUtils::VERSION_4; |
| 308 } |
| 309 |
| 310 private: |
| 311 DISALLOW_COPY_AND_ASSIGN(HeaderPolicy); |
| 312 |
| 313 static const char* const MULTIPLE_WORDS_DEMOTION_RATE_KEY; |
| 314 static const char* const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY; |
| 315 static const char* const IS_DECAYING_DICT_KEY; |
| 316 static const char* const DATE_KEY; |
| 317 static const char* const LAST_DECAYED_TIME_KEY; |
| 318 static const char* const UNIGRAM_COUNT_KEY; |
| 319 static const char* const BIGRAM_COUNT_KEY; |
| 320 static const char* const EXTENDED_REGION_SIZE_KEY; |
| 321 static const char* const HAS_HISTORICAL_INFO_KEY; |
| 322 static const char* const LOCALE_KEY; |
| 323 static const char* const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY; |
| 324 static const char* const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY; |
| 325 static const char* const |
| 326 FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY; |
| 327 static const char* const MAX_UNIGRAM_COUNT_KEY; |
| 328 static const char* const MAX_BIGRAM_COUNT_KEY; |
| 329 static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; |
| 330 static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; |
| 331 static const int DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP; |
| 332 static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID; |
| 333 static const int DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS; |
| 334 static const int DEFAULT_MAX_UNIGRAM_COUNT; |
| 335 static const int DEFAULT_MAX_BIGRAM_COUNT; |
| 336 |
| 337 const FormatUtils::FORMAT_VERSION mDictFormatVersion; |
| 338 const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags; |
| 339 const int mSize; |
| 340 DictionaryHeaderStructurePolicy::AttributeMap mAttributeMap; |
| 341 const std::vector<int> mLocale; |
| 342 const float mMultiWordCostMultiplier; |
| 343 const bool mRequiresGermanUmlautProcessing; |
| 344 const bool mIsDecayingDict; |
| 345 const int mDate; |
| 346 const int mLastDecayedTime; |
| 347 const int mUnigramCount; |
| 348 const int mBigramCount; |
| 349 const int mExtendedRegionSize; |
| 350 const bool mHasHistoricalInfoOfWords; |
| 351 const int mForgettingCurveOccurrencesToLevelUp; |
| 352 const int mForgettingCurveProbabilityValuesTableId; |
| 353 const int mForgettingCurveDurationToLevelDown; |
| 354 const int mMaxUnigramCount; |
| 355 const int mMaxBigramCount; |
| 356 |
| 357 const std::vector<int> readLocale() const; |
| 358 float readMultipleWordCostMultiplier() const; |
| 359 bool readRequiresGermanUmlautProcessing() const; |
| 360 |
| 361 static DictionaryHeaderStructurePolicy::AttributeMap |
| 362 createAttributeMapAndReadAllAttributes(const uint8_t* const dictBuf); |
| 363 }; |
| 364 } // namespace latinime |
| 365 #endif /* LATINIME_HEADER_POLICY_H */ |
OLD | NEW |