third_party/hunspell_new/google/bdict.h - Issue 1135173004: Rename third_party/hunspell_new back to third_party/hunspell.

Side by Side Diff: third_party/hunspell_new/google/bdict.h

Issue 1135173004: Rename third_party/hunspell_new back to third_party/hunspell. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #ifndef CHROME_THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H__

6 #define CHROME_THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H__

7

8 #include "base/basictypes.h"

9 #include "base/md5.h"

10

11 // BDict (binary dictionary) format. All offsets are little endian.

12 //

13 // Header (32 bytes).

14 // "BDic" Signature (4 bytes)

15 // Version (little endian 4 bytes)

16 // Absolute offset in file of the aff info. (4 bytes)

17 // Absolute offset in file of the dic table. (4 bytes)

18 // (Added by v2.0) MD5 checksum of the aff info and the dic table. (16 bytes)

19 //

20 // Aff information:

21 // Absolute offset in file of the affix group table (4 bytes)

22 // Absolute offset in file of the affix rules table (4 bytes)

23 // Absolute offset in file of the replacements table (4 bytes)

24 // Absolute offset in file of the "other rules" table (4 bytes)

25 //

26 // The data between the aff header and the affix rules table is the comment

27 // from the beginning of the .aff file which often contains copyrights, etc.

28 //

29 // Affix group table:

30 // Array of NULL terminated strings. It will end in a double-NULL.

31 //

32 // Affix rules table:

33 // List of LF terminated lines. NULL terminated.

34 //

35 // Replacements table:

36 // List of pairs of NULL terminated words. The end is indicated by a

37 // double-NULL. The first word in the pair is the replacement source, the

38 // second is what to replace it with. Example:

39 // foo\0bar\0a\0b\0\0

40 // for replacing ("foo" with "bar") and ("a" with "b").

41 //

42 // Other rules table:

43 // List of LF terminated lines. NULL terminated.

44 //

45 //

46 // Dic table. This stores the .dic file which contains the words in the

47 // dictionary, and indices for each one that indicate a set of suffixes or

48 // prefixes that can be applied. We store it in a trie to save space. It

49 // replaces Hunspell's hash manager.

50 //

51 // 0abxxxxx xxxxxxxx (in binary) Leaf node:

52 // The number stored in the bits represented by x is the affix index.

53 //

54 // If bit <a> is set, the leaf node has an additional string. Following the

55 // 2 byte header is a NULL-terminated (possibly 0-length) string that should

56 // be appended to the node. This allows long unique endings to be handled

57 // efficiently.

58 //

59 // If bit <b> is set, the leaf node has a supplemental list of affix IDs

60 // following the ordinary data for the leaf node. These affix group IDs are

61 // additional rules for the same word. For example, two prefixes may go

62 // with distinct sets of suffixes.

63 //

64 // If the affix index is all 1's, then that means that there is only the

65 // supplemental list, and the 13 bits of affix built-in to the node don't

66 // count. This is used to represent numbers greater than 13 bits, since

67 // the supplementary list has 16 bits per entry. The node must have a

68 // supplemental list if this is set.

69 //

70 // This additional array is an array of 16-bit little-endian values,

71 // terminated by 0xFFFF (since 0 is an affix ID meaning "no affix ID").

72 //

73 // 110000ab (in binary): Lookup node.

74 // When <a> is set, addresses are 32-bits relative to the beginning of the

75 // dictionary data. When unset, addresses are 16-bits relative to the

76 // beginning of this node. All values are little endian.

77 //

78 // When <b> is set, there is one additional entry before the table begins.

79 // This is the 0th character. 0 is a common addition (meaning no more data)

80 // and this prevents us from having to store entries for all the control

81 // characters. This magic element is not counted in the table size.

82 //

83 // The ID byte is followed by two bytes:

84 // XX: First character value in the lookup table.

85 // XX: Number of characters in the lookup table.

86 //

87 // This is followed optionally by the entry for 0, and then by a table of

88 // size indicated by the second character after the ID.

89 //

90 // 1110xxxx: List node with 8-bit addresses.

91 // The number of items (max 16) in the list is stored in the bits xxxx.

92 // Followed by N (character byte, 8-bit offset) pairs. These offsets are

93 // relative to the end of the list of pairs.

94 // 1111xxxx: List node with 16-bit addresses. Same as above but offsets are

95 // 2-bytes each. LITTLE ENDIAN!

96

97 namespace hunspell {

98

99 #pragma pack(push, 1)

100

101 class BDict {

102 public:

103 // File header.

104 enum { SIGNATURE = 0x63694442 }; // The bytes "BDic" read as a little-endian 32-bit value.

105 enum {

106 MAJOR_VERSION = 2,

107 MINOR_VERSION = 0

108 };

109 struct Header {

110 uint32 signature;

111

112 // Major versions are incompatible with other major versions. Minor versions

113 // should be readable by older programs expecting the same major version.

114 uint16 major_version;

115 uint16 minor_version;

116

117 uint32 aff_offset; // Offset of the aff data.

118 uint32 dic_offset; // Offset of the dic data.

119

120 // Added by version 2.0.

121 base::MD5Digest digest; // MD5 digest of the aff data and the dic data.

122 };

123

124 // AFF section ===============================================================

125

126 struct AffHeader {

127 uint32 affix_group_offset;

128 uint32 affix_rule_offset;

129 uint32 rep_offset; // Replacements table.

130 uint32 other_offset;

131 };

132

133 // DIC section ===============================================================

134

135 // Leaf ----------------------------------------------------------------------

136

137 // Leaf nodes have the high bit set to 0.

138 enum { LEAF_NODE_TYPE_MASK = 0x80 }; // 10000000

139 enum { LEAF_NODE_TYPE_VALUE = 0 }; // 00000000

140

141 // Leaf nodes with additional strings have the next-to-high bit set to 1.

142 // This mask/value pair also includes the high bit set to 0 which is the leaf

143 // indicator.

144 enum { LEAF_NODE_ADDITIONAL_MASK = 0xC0 }; // 11000000

145 enum { LEAF_NODE_ADDITIONAL_VALUE = 0x40 }; // 01000000

146

147 // Leaf nodes with an additional array of affix rules following it.

148 enum { LEAF_NODE_FOLLOWING_MASK = 0xA0 }; // 10100000

149 enum { LEAF_NODE_FOLLOWING_VALUE = 0x20 }; // 00100000

150

151 // The low 5 bits of the leaf node ID byte are the first 5 bits of the affix

152 // ID. The following byte is used for the low bits of the affix ID (we don't

153 // specify a mask for that).

154 enum { LEAF_NODE_FIRST_BYTE_AFFIX_MASK = 0x1F }; // 00011111

155

156 // The maximum affix value that can be stored in the first entry (not in the

157 // following list). We reserve all 1's to be a magic value (see next entry)

158 // so we can store large numbers somewhere else.

159 enum { LEAF_NODE_MAX_FIRST_AFFIX_ID = 0x1FFE }; // 00011111 11111110

160

161 // When the affix built-in to the leaf node (the first one) has too many bits

162 // for the space reserved for it (13 bits), then we fill it with this value.

163 // This means that the affix doesn't count. The affix will instead be stored

164 // in the "following list" which allows up to 16 bits per entry.

165 enum { FIRST_AFFIX_IS_UNUSED = 0x1FFF }; // 00011111 11111111

166

167 // The maximum number of leaf nodes we'll read that have the same word and

168 // follow each other (the FOLLOWING bit is set).

169 enum { MAX_AFFIXES_PER_WORD = 32 };

170

171 // The terminator for the list of following affix group IDs.

172 enum { LEAF_NODE_FOLLOWING_LIST_TERMINATOR = 0xFFFF };

173

174 // Lookup --------------------------------------------------------------------

175

176 // Lookup nodes have the first 6 bits set to 110000.

177 enum { LOOKUP_NODE_TYPE_MASK = 0xFC }; // 11111100

178 enum { LOOKUP_NODE_TYPE_VALUE = 0xC0 }; // 11000000

179

180 // Lookup nodes have the low bit meaning it has a 0th entry, and the

181 // next-to-lowest bit indicating whether the offsets are 32-bits. Included

182 // in these masks are the lookup ID above.

183 enum { LOOKUP_NODE_0TH_MASK = 0xFD }; // 11111101

184 enum { LOOKUP_NODE_0TH_VALUE = 0xC1 }; // 11000001

185 enum { LOOKUP_NODE_32BIT_MASK = 0xFE}; // 11111110

186 enum { LOOKUP_NODE_32BIT_VALUE = 0xC2}; // 11000010

187

188 // List ----------------------------------------------------------------------

189

190 // List nodes have the first 3 bits set to 1.

191 enum { LIST_NODE_TYPE_MASK = 0xE0 }; // 11100000

192 enum { LIST_NODE_TYPE_VALUE = 0xE0 }; // 11100000

193

194 // The 4th from highest bit indicates a 16 bit (as opposed to 8 bit) list.

195 // This mask/value also includes the list ID in the high 3 bits.

196 enum { LIST_NODE_16BIT_MASK = 0xF0 }; // 11110000

197 enum { LIST_NODE_16BIT_VALUE = 0xF0 }; // 11110000

198

199 // The low 4 bits of the list ID byte are the count.

200 enum { LIST_NODE_COUNT_MASK = 0xF }; // 00001111

201

202 // Verifies the specified BDICT is sane. This function checks the BDICT header

203 // and compares the MD5 digest of the data with the one in the header.

204 static bool Verify(const char* bdict_data, size_t bdict_length);

205 };

206

207 #pragma pack(pop)

208

209 } // namespace hunspell

210

211 #endif // CHROME_THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H__

OLD	NEW

« no previous file with comments | « third_party/hunspell_new/google/DEPS ('k') | third_party/hunspell_new/google/bdict.cc » ('j') | no next file with comments »