OLD | NEW |
1 // Copyright 2008 Google Inc. All Rights Reserved. | 1 // Copyright 2008 Google Inc. All Rights Reserved. |
2 | 2 |
3 #ifndef CHROME_THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H__ | 3 #ifndef CHROME_THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H__ |
4 #define CHROME_THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H__ | 4 #define CHROME_THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H__ |
5 | 5 |
6 #include "base/basictypes.h" | 6 #include "base/basictypes.h" |
| 7 #include "base/md5.h" |
7 | 8 |
8 // BDict (binary dictionary) format. All offsets are little endian. | 9 // BDict (binary dictionary) format. All offsets are little endian. |
9 // | 10 // |
10 // Header (16 bytes). | 11 // Header (32 bytes). |
11 // "BDic" Signature (4 bytes) | 12 // "BDic" Signature (4 bytes) |
12 // Version (little endian 4 bytes) | 13 // Version (little endian 4 bytes) |
13 // Absolute offset in file of the aff info. (4 bytes) | 14 // Absolute offset in file of the aff info. (4 bytes) |
14 // Absolute offset in file of the dic table. (4 bytes) | 15 // Absolute offset in file of the dic table. (4 bytes) |
| 16 // (Added by v2.0) MD5 checksum of the aff info and the dic table. (16 bytes) |
15 // | 17 // |
16 // Aff information: | 18 // Aff information: |
17 // Absolute offset in file of the affix group table (4 bytes) | 19 // Absolute offset in file of the affix group table (4 bytes) |
18 // Absolute offset in file of the affix rules table (4 bytes) | 20 // Absolute offset in file of the affix rules table (4 bytes) |
19 // Absolute offset in file of the replacements table (4 bytes) | 21 // Absolute offset in file of the replacements table (4 bytes) |
20 // Absolute offset in file of the "other rules" table (4 bytes) | 22 // Absolute offset in file of the "other rules" table (4 bytes) |
21 // | 23 // |
22 // The data between the aff header and the affix rules table is the comment | 24 // The data between the aff header and the affix rules table is the comment |
23 // from the beginning of the .aff file which often contains copyrights, etc. | 25 // from the beginning of the .aff file which often contains copyrights, etc. |
24 // | 26 // |
25 // Affix group table: | 27 // Affix group table: |
26 // Array of NULL terminated strings. It will end in a double-NULL. | 28 // Array of NULL terminated strings. It will end in a double-NULL. |
27 // | 29 // |
28 // Affix rules table: | 30 // Affix rules table: |
29 // List of LF terminated lines. NULL terminated. | 31 // List of LF terminated lines. NULL terminated. |
30 // | 32 // |
31 // Replacements table: | 33 // Replacements table: |
32 // List of pairs of NULL terminated words. The end is indicated by a | 34 // List of pairs of NULL terminated words. The end is indicated by a |
33 // double-NULL. The first word in the pair is the replacement source, the | 35 // double-NULL. The first word in the pair is the replacement source, the |
34 // second is what to replace it with. Example: | 36 // second is what to replace it with. Example: |
35 // foo\0bar\0a\0b\0\0 | 37 // foo\0bar\0a\0b\0\0 |
36 // for replacing ("foo" with "bar") and ("a" with "b"). | 38 // for replacing ("foo" with "bar") and ("a" with "b"). |
37 // | 39 // |
38 // Other rules table: | 40 // Other rules table: |
39 // List of LF terminated lines. NULL terminated. | 41 // List of LF terminated lines. NULL terminated. |
40 // | 42 // |
41 // | 43 // |
42 // Dic table. This stores the .dic file which contains the words in the | 44 // Dic table. This stores the .dic file which contains the words in the |
43 // dictionary, and indices for each one that indicate a set of suffixes or | 45 // dictionary, and indices for each one that indicate a set of suffixes or |
44 // prefixes that can be applied. We store it in a trie to save space. It | 46 // prefixes that can be applied. We store it in a trie to save space. It |
45 // replaces Hunspell's hash manager. | 47 // replaces Hunspell's hash manager. |
46 // | 48 // |
47 // 0abxxxxx xxxxxxxx (in binary) Leaf node: | 49 // 0abxxxxx xxxxxxxx (in binary) Leaf node: |
48 // The number stored in the bits represented by x is the affix index. | 50 // The number stored in the bits represented by x is the affix index. |
49 // | 51 // |
50 // If bit <a> is set, the leaf node has an additional string. Following the | 52 // If bit <a> is set, the leaf node has an additional string. Following the |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
92 | 94 |
93 namespace hunspell { | 95 namespace hunspell { |
94 | 96 |
95 #pragma pack(push, 1) | 97 #pragma pack(push, 1) |
96 | 98 |
97 class BDict { | 99 class BDict { |
98 public: | 100 public: |
99 // File header. | 101 // File header. |
100 enum { SIGNATURE = 0x63694442 }; | 102 enum { SIGNATURE = 0x63694442 }; |
101 enum { | 103 enum { |
102 MAJOR_VERSION = 1, | 104 MAJOR_VERSION = 2, |
103 MINOR_VERSION = 1 | 105 MINOR_VERSION = 0 |
104 }; | 106 }; |
105 struct Header { | 107 struct Header { |
106 uint32 signature; | 108 uint32 signature; |
107 | 109 |
108 // Major versions are incompatible with other major versions. Minor versions | 110 // Major versions are incompatible with other major versions. Minor versions |
109 // should be readable by older programs expecting the same major version. | 111 // should be readable by older programs expecting the same major version. |
110 uint16 major_version; | 112 uint16 major_version; |
111 uint16 minor_version; | 113 uint16 minor_version; |
112 | 114 |
113 uint32 aff_offset; // Absolute offset in the file of the aff data. | 115 uint32 aff_offset; // Absolute offset in the file of the aff data. |
114 uint32 dic_offset; // Absolute offset in the file of the dic data. | 116 uint32 dic_offset; // Absolute offset in the file of the dic data. |
| 117 |
| 118 // Added by version 2.0; not present in the 1.x header layout. |
| 119 MD5Digest digest; // 16-byte MD5 digest of the aff data and the dic data. |
115 }; | 120 }; |
116 | 121 |
117 // AFF section =============================================================== | 122 // AFF section =============================================================== |
118 | 123 |
119 struct AffHeader { | 124 struct AffHeader { |
120 uint32 affix_group_offset; | 125 uint32 affix_group_offset; |
121 uint32 affix_rule_offset; | 126 uint32 affix_rule_offset; |
122 uint32 rep_offset; // Replacements table. | 127 uint32 rep_offset; // Replacements table. |
123 uint32 other_offset; | 128 uint32 other_offset; |
124 }; | 129 }; |
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
184 enum { LIST_NODE_TYPE_MASK = 0xE0 }; // 11100000 | 189 enum { LIST_NODE_TYPE_MASK = 0xE0 }; // 11100000 |
185 enum { LIST_NODE_TYPE_VALUE = 0xE0 }; // 11100000 | 190 enum { LIST_NODE_TYPE_VALUE = 0xE0 }; // 11100000 |
186 | 191 |
187 // The 4th from highest bit indicates a 16 bit (as opposed to 8 bit) list. | 192 // The 4th from highest bit indicates a 16 bit (as opposed to 8 bit) list. |
188 // This mask/value also includes the list ID in the high 3 bits. | 193 // This mask/value also includes the list ID in the high 3 bits. |
189 enum { LIST_NODE_16BIT_MASK = 0xF0 }; // 11110000 | 194 enum { LIST_NODE_16BIT_MASK = 0xF0 }; // 11110000 |
190 enum { LIST_NODE_16BIT_VALUE = 0xF0 }; // 11110000 | 195 enum { LIST_NODE_16BIT_VALUE = 0xF0 }; // 11110000 |
191 | 196 |
192 // The low 4 bits of the list ID byte are the count. | 197 // The low 4 bits of the list ID byte are the count. |
193 enum { LIST_NODE_COUNT_MASK = 0xF }; // 00001111 | 198 enum { LIST_NODE_COUNT_MASK = 0xF }; // 00001111 |
| 199 |
| 200 // Verifies the specified BDICT is sane. This function checks the BDICT header |
| 201 // and compares the MD5 digest of the data with the one in the header. |
| 202 static bool Verify(const char* bdict_data, size_t bdict_length); |
194 }; | 203 }; |
195 | 204 |
196 #pragma pack(pop) | 205 #pragma pack(pop) |
197 | 206 |
198 } // namespace hunspell | 207 } // namespace hunspell |
199 | 208 |
200 #endif // CHROME_THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H__ | 209 #endif // CHROME_THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H__ |
OLD | NEW |