| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #ifndef CHROME_THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H__ | |
| 6 #define CHROME_THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H__ | |
| 7 | |
| 8 #include "base/basictypes.h" | |
| 9 #include "base/md5.h" | |
| 10 | |
| 11 // BDict (binary dictionary) format. All offsets are little endian. | |
| 12 // | |
| 13 // Header (32 bytes). | |
| 14 // "BDic" Signature (4 bytes) | |
| 15 // Version (little endian 4 bytes) | |
| 16 // Absolute offset in file of the aff info. (4 bytes) | |
| 17 // Absolute offset in file of the dic table. (4 bytes) | |
| 18 // (Added by v2.0) MD5 checksum of the aff info and the dic table. (16 bytes) | |
| 19 // | |
| 20 // Aff information: | |
| 21 // Absolute offset in file of the affix group table (4 bytes) | |
| 22 // Absolute offset in file of the affix rules table (4 bytes) | |
| 23 // Absolute offset in file of the replacements table (4 bytes) | |
| 24 // Absolute offset in file of the "other rules" table (4 bytes) | |
| 25 // | |
| 26 // The data between the aff header and the affix rules table is the comment | |
| 27 // from the beginning of the .aff file which often contains copyrights, etc. | |
| 28 // | |
| 29 // Affix group table: | |
| 30 // Array of NULL terminated strings. It will end in a double-NULL. | |
| 31 // | |
| 32 // Affix rules table: | |
| 33 // List of LF terminated lines. NULL terminated. | |
| 34 // | |
| 35 // Replacements table: | |
| 36 // List of pairs of NULL terminated words. The end is indicated by a | |
| 37 // double-NULL. The first word in the pair is the replacement source, the | |
| 38 // second is what to replace it with. Example: | |
| 39 // foo\0bar\0a\0b\0\0 | |
| 40 // for replacing ("foo" with "bar") and ("a" with "b"). | |
| 41 // | |
| 42 // Other rules table: | |
| 43 // List of LF terminated lines. NULL terminated. | |
| 44 // | |
| 45 // | |
| 46 // Dic table. This stores the .dic file which contains the words in the | |
| 47 // dictionary, and indices for each one that indicate a set of suffixes or | |
| 48 // prefixes that can be applied. We store it in a trie to save space. It | |
| 49 // replaces Hunspell's hash manager. | |
| 50 // | |
| 51 // 0abxxxxx xxxxxxxx (in binary) Leaf node: | |
| 52 // The number stored in the bits represented by x is the affix index. | |
| 53 // | |
| 54 // If bit <a> is set, the leaf node has an additional string. Following the | |
| 55 // 2 byte header is a NULL-terminated (possibly 0-length) string that should | |
| 56 // be appended to the node. This allows long unique endings to be handled | |
| 57 // efficiently. | |
| 58 // | |
| 59 // If bit <b> is set, the leaf node has a supplemental list of affix IDs | |
| 60 // following the ordinary data for the leaf node. These affix group IDs are | |
| 61 // additional rules for the same word. For example, two prefixes may go | |
| 62 // with distinct sets of suffixes. | |
| 63 // | |
| 64 // If the affix index is all 1's, then that means that there is only the | |
| 65 // supplemental list, and the 13 bits of affix built into the node don't | |
| 66 // count. This is used to represent numbers greater than 13 bits, since | |
| 67 // the supplemental list has 16 bits per entry. The node must have a | |
| 68 // supplemental list if this is set. | |
| 69 // | |
| 70 // This additional array is an array of 16-bit little-endian values, | |
| 71 // terminated by 0xFFFF (since 0 is an affix ID meaning "no affix ID"). | |
| 72 // | |
| 73 // 110000ab (in binary): Lookup node. | |
| 74 // When <a> is set, addresses are 32-bits relative to the beginning of the | |
| 75 // dictionary data. When unset, addresses are 16-bits relative to the | |
| 76 // beginning of this node. All values are little endian. | |
| 77 // | |
| 78 // When <b> is set, there is one additional entry before the table begins. | |
| 79 // This is the 0th character. 0 is a common addition (meaning no more data) | |
| 80 // and this prevents us from having to store entries for all the control | |
| 81 // characters. This magic element is not counted in the table size. | |
| 82 // | |
| 83 // The ID byte is followed by two bytes: | |
| 84 // XX: First character value in the lookup table. | |
| 85 // XX: Number of characters in the lookup table. | |
| 86 // | |
| 87 // This is followed optionally by the entry for 0, and then by a table of | |
| 88 // size indicated by the second character after the ID. | |
| 89 // | |
| 90 // 1110xxxx: List node with 8-bit addresses. | |
| 91 // The number of items (max 16) in the list is stored in the bits xxxx. | |
| 92 // Followed by N (character byte, 8-bit offset) pairs. These offsets are | |
| 93 // relative to the end of the list of pairs. | |
| 94 // 1111xxxx: List node with 16-bit addresses. Same as above but offsets are | |
| 95 // 2-bytes each. LITTLE ENDIAN! | |
| 96 | |
| 97 namespace hunspell { | |
| 98 | |
| 99 #pragma pack(push, 1) | |
| 100 | |
| 101 class BDict { | |
| 102 public: | |
| 103 // File header. SIGNATURE is the bytes "BDic" read as a little-endian uint32. | |
| 104 enum { SIGNATURE = 0x63694442 }; | |
| 105 enum { | |
| 106 MAJOR_VERSION = 2, | |
| 107 MINOR_VERSION = 0 | |
| 108 }; | |
| 109 struct Header { | |
| 110 uint32 signature; | |
| 111 | |
| 112 // Major versions are incompatible with other major versions. Minor versions | |
| 113 // should be readable by older programs expecting the same major version. | |
| 114 uint16 major_version; | |
| 115 uint16 minor_version; | |
| 116 | |
| 117 uint32 aff_offset; // Absolute offset in the file of the aff data. | |
| 118 uint32 dic_offset; // Absolute offset in the file of the dic data. | |
| 119 | |
| 120 // Added by version 2.0. | |
| 121 base::MD5Digest digest; // MD5 digest of the aff data and the dic data. | |
| 122 }; | |
| 123 | |
| 124 // AFF section =============================================================== | |
| 125 | |
| 126 struct AffHeader { | |
| 127 uint32 affix_group_offset; | |
| 128 uint32 affix_rule_offset; | |
| 129 uint32 rep_offset; // Replacements table. | |
| 130 uint32 other_offset; | |
| 131 }; | |
| 132 | |
| 133 // DIC section =============================================================== | |
| 134 | |
| 135 // Leaf ---------------------------------------------------------------------- | |
| 136 | |
| 137 // Leaf nodes have the high bit set to 0. | |
| 138 enum { LEAF_NODE_TYPE_MASK = 0x80 }; // 10000000 | |
| 139 enum { LEAF_NODE_TYPE_VALUE = 0 }; // 00000000 | |
| 140 | |
| 141 // Leaf nodes with additional strings have the next-to-high bit set to 1. | |
| 142 // This mask/value pair also includes the high bit set to 0 which is the leaf | |
| 143 // indicator. | |
| 144 enum { LEAF_NODE_ADDITIONAL_MASK = 0xC0 }; // 11000000 | |
| 145 enum { LEAF_NODE_ADDITIONAL_VALUE = 0x40 }; // 01000000 | |
| 146 | |
| 147 // Leaf nodes with an additional array of affix rules following it. | |
| 148 enum { LEAF_NODE_FOLLOWING_MASK = 0xA0 }; // 10100000 | |
| 149 enum { LEAF_NODE_FOLLOWING_VALUE = 0x20 }; // 00100000 | |
| 150 | |
| 151 // The low 5 bits of the leaf node ID byte are the first 5 bits of the affix | |
| 152 // ID. The following byte is used for the low bits of the affix ID (we don't | |
| 153 // specify a mask for that). | |
| 154 enum { LEAF_NODE_FIRST_BYTE_AFFIX_MASK = 0x1F }; // 00011111 | |
| 155 | |
| 156 // The maximum affix value that can be stored in the first entry (not in the | |
| 157 // following list). We reserve all 1's to be a magic value (see next entry) | |
| 158 // so we can store large numbers somewhere else. | |
| 159 enum { LEAF_NODE_MAX_FIRST_AFFIX_ID = 0x1FFE }; // 00011111 11111110 | |
| 160 | |
| 161 // When the affix built-in to the leaf node (the first one) has too many bits | |
| 162 // for the space reserved for it (13 bits), then we fill it with this value. | |
| 163 // This means that the affix doesn't count. The affix will instead be stored | |
| 164 // in the "following list" which allows up to 16 bits per entry. | |
| 165 enum { FIRST_AFFIX_IS_UNUSED = 0x1FFF }; // 00011111 11111111 | |
| 166 | |
| 167 // The maximum number of leaf nodes we'll read that have the same word and | |
| 168 // follow each other (the FOLLOWING bit is set). | |
| 169 enum { MAX_AFFIXES_PER_WORD = 32 }; | |
| 170 | |
| 171 // The terminator for the list of following affix group IDs. | |
| 172 enum { LEAF_NODE_FOLLOWING_LIST_TERMINATOR = 0xFFFF }; | |
| 173 | |
| 174 // Lookup -------------------------------------------------------------------- | |
| 175 | |
| 176 // Lookup nodes have the first 6 bits set to 110000. | |
| 177 enum { LOOKUP_NODE_TYPE_MASK = 0xFC }; // 11111100 | |
| 178 enum { LOOKUP_NODE_TYPE_VALUE = 0xC0 }; // 11000000 | |
| 179 | |
| 180 // Lookup nodes have the low bit meaning it has a 0th entry, and the | |
| 181 // next-to-lowest bit indicating whether the offsets are 32-bits. Included | |
| 182 // in these masks are the lookup ID above. | |
| 183 enum { LOOKUP_NODE_0TH_MASK = 0xFD }; // 11111101 | |
| 184 enum { LOOKUP_NODE_0TH_VALUE = 0xC1 }; // 11000001 | |
| 185 enum { LOOKUP_NODE_32BIT_MASK = 0xFE}; // 11111110 | |
| 186 enum { LOOKUP_NODE_32BIT_VALUE = 0xC2}; // 11000010 | |
| 187 | |
| 188 // List ---------------------------------------------------------------------- | |
| 189 | |
| 190 // List nodes have the first 3 bits set to 1. | |
| 191 enum { LIST_NODE_TYPE_MASK = 0xE0 }; // 11100000 | |
| 192 enum { LIST_NODE_TYPE_VALUE = 0xE0 }; // 11100000 | |
| 193 | |
| 194 // The 4th from highest bit indicates a 16 bit (as opposed to 8 bit) list. | |
| 195 // This mask/value also includes the list ID in the high 3 bits. | |
| 196 enum { LIST_NODE_16BIT_MASK = 0xF0 }; // 11110000 | |
| 197 enum { LIST_NODE_16BIT_VALUE = 0xF0 }; // 11110000 | |
| 198 | |
| 199 // The low 4 bits of the list ID byte are the count. | |
| 200 enum { LIST_NODE_COUNT_MASK = 0xF }; // 00001111 | |
| 201 | |
| 202 // Verifies the specified BDICT is sane. This function checks the BDICT header | |
| 203 // and compares the MD5 digest of the data with the one in the header. | |
| 204 static bool Verify(const char* bdict_data, size_t bdict_length); | |
| 205 }; | |
| 206 | |
| 207 #pragma pack(pop) | |
| 208 | |
| 209 } // namespace hunspell | |
| 210 | |
| 211 #endif // CHROME_THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H__ | |
| 210 | |
| 211 #endif // CHROME_THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H__ | |
| OLD | NEW |