Index: third_party/hunspell_new/google/bdict_writer.cc
diff --git a/third_party/hunspell_new/google/bdict_writer.cc b/third_party/hunspell_new/google/bdict_writer.cc
deleted file mode 100644
index 838a71dc152038348e69b7a6b1f0711b9e52ffb3..0000000000000000000000000000000000000000
--- a/third_party/hunspell_new/google/bdict_writer.cc
+++ /dev/null
@@ -1,512 +0,0 @@
-// Copyright (c) 2011 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "third_party/hunspell_new/google/bdict_writer.h"
-
-#include "base/logging.h"
-#include "base/strings/stringprintf.h"
-#include "third_party/hunspell_new/google/bdict.h"
-
-namespace hunspell {
-
-// Represents one node of the word trie in memory. This does not have to be
-// very efficient since it is only used when building.
-class DicNode {
- public:
-  enum StorageType {
-    UNDEFINED,  // Uninitialized storage type.
-    LEAF,       // Word with no additional string data.
-    LEAFMORE,   // Word with additional suffix following.
-    LIST16,     // List of sub-nodes with 16-bit relative offsets.
-    LIST8,      // List of sub-nodes with 8-bit relative offsets.
-    LOOKUP32,   // Lookup table with 32-bit absolute offsets.
-    LOOKUP16,   // Lookup table with 16-bit relative offsets.
-  };
-
-  DicNode() : addition(0), storage(UNDEFINED) {
-  }
-
-  ~DicNode() {
-    for (size_t i = 0; i < children.size(); i++)
-      delete children[i];
-  }
-
-  bool is_leaf() const { return children.empty(); }
-
-  // When non-zero, this character is the additional level that this
-  // node represents. This will be 0 for some leaf nodes when there is no
-  // addition and for the root node.
-  char addition;
-
-  std::vector<DicNode*> children;
-
-  // When there are no children, this is a leaf node and this "addition string"
-  // is appended to the result. When there are children, this will be empty.
-  std::string leaf_addition;
-
-  // For leaf nodes, these are the indices into the affix table.
-  std::vector<int> affix_indices;
-
-  // Initially uninitialized, ComputeStorage() will fill this in with the
-  // desired serialization method.
-  StorageType storage;
-};
-
-namespace {
-
-void SerializeTrie(const DicNode* node, std::string* output);
-
-// Returns true if the nth character in the given word is |ch|. Will return
-// false when there is no nth character. Note that this will also match an
-// implicit NULL at the end of the string.
-bool NthCharacterIs(const std::string& word, size_t n, char ch) {
-  if (word.length() < n)  // Want to allow n == length() to catch the NULL.
-    return false;
-  return word.c_str()[n] == ch;  // Use c_str() to get NULL terminator.
-}
-
-// Recursively build the trie data structure for the range in the |words| list
-// in [begin, end). It is assumed that all words in that range will have the
-// same |node_depth - 1| characters at the beginning. This node will key off of
-// the character at index |node_depth - 1|, with a special case for the root.
-//
-// |node_depth| is how deep this node is in the trie (and corresponds to how
-// many letters of the word we will skip). The root level will have
-// |node_depth| of 0.
-//
-// The given |node| will be filled with the data. The return value is the
-// index into the |words| vector of the next word to process. It will be
-// equal to |end| when all words have been consumed.
-size_t BuildTrie(const BDictWriter::WordList& words,
-                 size_t begin, size_t end,
-                 size_t node_depth, DicNode* node) {
-  // Find the prefix that this node represents.
-  const std::string& begin_str = words[begin].first;
-  if (begin_str.length() < node_depth) {
-    // Singleton.
-    node->addition = 0;
-    node->affix_indices = words[begin].second;
-    return begin + 1;
-  }
-
-  // Now find the range of words sharing this prefix.
-  size_t match_count;
-  if (node_depth == 0 && begin == 0) {
-    // Special case the root node.
-    match_count = end - begin;
-    node->addition = 0;
-  } else {
-    match_count = 0;
-    node->addition = begin_str[node_depth - 1];
-    // We know the strings should have [0, node_depth-1) characters at the
-    // beginning already matching, so we only need to check the new one.
-    while (begin + match_count < end &&
-           NthCharacterIs(words[begin + match_count].first,
-                          node_depth - 1, node->addition))
-      match_count++;
-  }
-
-  if (match_count == 1) {
-    // Just found a leaf node with no other words sharing its prefix. Save any
-    // remaining characters and we're done.
-    node->affix_indices = words[begin].second;
-    node->leaf_addition = begin_str.substr(node_depth);
-    return begin + 1;
-  }
-
-  // We have a range of words, add them as children of this node.
-  size_t i = begin;
-  while (i < begin + match_count) {
-    DicNode* cur = new DicNode;
-    i = BuildTrie(words, i, begin + match_count, node_depth + 1, cur);
-    node->children.push_back(cur);
-  }
-
-  return begin + match_count;
-}
-
-// Lookup tables are complicated. They can have a magic 0th entry not counted
-// in the table dimensions, and also have indices only for the used sub-range.
-// This function will compute the starting point and size of a lookup table,
-// in addition to whether it should have the magic 0th entry for the given
-// list of child nodes.
-void ComputeLookupStrategyDetails(const std::vector<DicNode*>& children,
-                                  bool* has_0th_entry,
-                                  int* first_item,
-                                  int* list_size) {
-  *has_0th_entry = false;
-  *first_item = 0;
-  *list_size = 0;
-  if (children.empty())
-    return;
-
-  size_t first_offset = 0;
-  if (children[0]->addition == 0) {
-    *has_0th_entry = true;
-    first_offset++;
-  }
-
-  if (children.size() == first_offset)
-    return;
-
-  *first_item = static_cast<unsigned char>(children[first_offset]->addition);
-  unsigned char last_item = children[children.size() - 1]->addition;
-  *list_size = last_item - *first_item + 1;
-}
-
-// Recursively fills in the storage strategy for this node and each of its
-// children. This must be done before actually serializing because the storage
-// mode will depend on the size of the children.
-size_t ComputeTrieStorage(DicNode* node) {
-  if (node->is_leaf()) {
-    // The additional affix list holds affixes when there is more than one. Each
-    // entry is two bytes, plus an additional FFFF terminator.
-    size_t supplimentary_size = 0;
-    if (node->affix_indices[0] > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) {
-      // We cannot store the first affix ID of the affix list into a leaf node.
-      // In this case, we have to store all the affix IDs and a terminator
-      // into a supplimentary list.
-      supplimentary_size = node->affix_indices.size() * 2 + 2;
-    } else if (node->affix_indices.size() > 1) {
-      // We can store the first affix ID of the affix list into a leaf node.
-      // In this case, we need to store the remaining affix IDs and a
-      // terminator into a supplimentary list.
-      supplimentary_size = node->affix_indices.size() * 2;
-    }
-
-    if (node->leaf_addition.empty()) {
-      node->storage = DicNode::LEAF;
-      return 2 + supplimentary_size;
-    }
-    node->storage = DicNode::LEAFMORE;
-    // Signature & affix (2) + null for leaf_addition (1) = 3
-    return 3 + node->leaf_addition.size() + supplimentary_size;
-  }
-
-  // Recursively compute the size of the children for non-leaf nodes.
-  size_t child_size = 0;
-  for (size_t i = 0; i < node->children.size(); i++)
-    child_size += ComputeTrieStorage(node->children[i]);
-
-  // Fixed size is only 1 byte which is the ID byte and the count combined.
-  static const int kListHeaderSize = 1;
-
-  // Lists can only store up to 16 items.
-  static const size_t kListThreshold = 16;
-  if (node->children.size() < kListThreshold && child_size <= 0xFF) {
-    node->storage = DicNode::LIST8;
-    return kListHeaderSize + node->children.size() * 2 + child_size;
-  }
-
-  if (node->children.size() < kListThreshold && child_size <= 0xFFFF) {
-    node->storage = DicNode::LIST16;
-    // Fixed size is one byte plus 3 for each table entry.
-    return kListHeaderSize + node->children.size() * 3 + child_size;
-  }
-
-  static const int kTableHeaderSize = 2;  // Type + table size.
-
-  bool has_0th_item;
-  int first_table_item, table_item_count;
-  ComputeLookupStrategyDetails(node->children, &has_0th_item,
-                               &first_table_item, &table_item_count);
-  if (child_size + kTableHeaderSize + (has_0th_item ? 2 : 0) +
-      table_item_count * 2 < 0xFFFF) {
-    // Use 16-bit addressing since the children will fit.
-    node->storage = DicNode::LOOKUP16;
-    return kTableHeaderSize + (has_0th_item ? 2 : 0) + table_item_count * 2 +
-        child_size;
-  }
-
-  // Use 32-bit addressing as a last resort.
-  node->storage = DicNode::LOOKUP32;
-  return kTableHeaderSize + (has_0th_item ? 4 : 0) + table_item_count * 4 +
-      child_size;
-}
-
-// Serializes the given node when it is DicNode::LEAF* to the output.
-void SerializeLeaf(const DicNode* node, std::string* output) {
-  // The low 6 bits of the ID byte are the high 6 bits of the first affix ID.
-  int first_affix = node->affix_indices.size() ? node->affix_indices[0] : 0;
-
-  // We may store the first value with the node or in the supplimentary list.
-  size_t first_affix_in_supplimentary_list = 1;
-  if (first_affix > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) {
-    // There are not enough bits for this value, move it to the supplimentary
-    // list where there are more bits per value.
-    first_affix_in_supplimentary_list = 0;
-    first_affix = BDict::FIRST_AFFIX_IS_UNUSED;
-  }
-
-  unsigned char id_byte = (first_affix >> 8) &
-      BDict::LEAF_NODE_FIRST_BYTE_AFFIX_MASK;
-
-  // The next two bits indicate an additional string and more affixes.
-  if (node->storage == DicNode::LEAFMORE)
-    id_byte |= BDict::LEAF_NODE_ADDITIONAL_VALUE;
-  if (node->affix_indices.size() > 1 || first_affix_in_supplimentary_list == 0)
-    id_byte |= BDict::LEAF_NODE_FOLLOWING_VALUE;
-  output->push_back(id_byte);
-
-  // Following is the low 8 bits of the affix index.
-  output->push_back(first_affix & 0xff);
-
-  // Handle the optional addition with NULL terminator.
-  if (node->storage == DicNode::LEAFMORE) {
-    for (size_t i = 0; i < node->leaf_addition.size() + 1; i++)
-      output->push_back(node->leaf_addition.c_str()[i]);
-  }
-
-  // Handle any following affixes. We already wrote the 0th one.
-  if (node->affix_indices.size() > first_affix_in_supplimentary_list) {
-    for (size_t i = first_affix_in_supplimentary_list;
-         i < node->affix_indices.size() && i < BDict::MAX_AFFIXES_PER_WORD;
-         i++) {
-      output->push_back(static_cast<char>(node->affix_indices[i] & 0xFF));
-      output->push_back(
-          static_cast<char>((node->affix_indices[i] >> 8) & 0xFF));
-    }
-
-    // Terminator for affix list. We use 0xFFFF.
-    output->push_back(static_cast<unsigned char>(0xFF));
-    output->push_back(static_cast<unsigned char>(0xFF));
-  }
-}
-
-// Serializes the given node when it is DicNode::LIST* to the output.
-void SerializeList(const DicNode* node, std::string* output) {
-  bool is_8_bit = node->storage == DicNode::LIST8;
-  unsigned char id_byte = BDict::LIST_NODE_TYPE_VALUE |
-      (is_8_bit ? 0 : BDict::LIST_NODE_16BIT_VALUE);
-  id_byte |= node->children.size();  // We assume the size fits in 4 bits.
-  output->push_back(id_byte);
-
-  // Reserve enough room for the lookup table (either 2 or 3 bytes per entry).
-  int bytes_per_entry = (is_8_bit ? 2 : 3);
-  size_t table_begin = output->size();
-  output->resize(output->size() + node->children.size() * bytes_per_entry);
-  size_t children_begin = output->size();
-
-  for (size_t i = 0; i < node->children.size(); i++) {
-    // First is the character this entry represents.
-    (*output)[table_begin + i * bytes_per_entry] = node->children[i]->addition;
-
-    // Next is the 8- or 16-bit offset.
-    size_t offset = output->size() - children_begin;
-    if (is_8_bit) {
-      DCHECK(offset <= 0xFF);
-      (*output)[table_begin + i * bytes_per_entry + 1] =
-          static_cast<char>(offset & 0xFF);
-    } else {
-      unsigned short* output16 = reinterpret_cast<unsigned short*>(
-          &(*output)[table_begin + i * bytes_per_entry + 1]);
-      *output16 = static_cast<unsigned short>(offset);
-    }
-
-    // Now append the children's data.
-    SerializeTrie(node->children[i], output);
-  }
-}
-
-// Serializes the given node when it is DicNode::LOOKUP* to the output.
-void SerializeLookup(const DicNode* node, std::string* output) {
-  unsigned char id_byte = BDict::LOOKUP_NODE_TYPE_VALUE;
-
-  bool has_0th_item;
-  int first_table_item, table_item_count;
-  ComputeLookupStrategyDetails(node->children, &has_0th_item,
-                               &first_table_item, &table_item_count);
-
-  // Set the extra bits in the ID byte.
-  bool is_32_bit = (node->storage == DicNode::LOOKUP32);
-  if (is_32_bit)
-    id_byte |= BDict::LOOKUP_NODE_32BIT_VALUE;
-  if (has_0th_item)
-    id_byte |= BDict::LOOKUP_NODE_0TH_VALUE;
-
-  size_t begin_offset = output->size();
-
-  output->push_back(id_byte);
-  output->push_back(static_cast<char>(first_table_item));
-  output->push_back(static_cast<char>(table_item_count));
-
-  // Save room for the lookup table and the optional 0th item.
-  int bytes_per_entry = (is_32_bit ? 4 : 2);
-  size_t zeroth_item_offset = output->size();
-  if (has_0th_item)
-    output->resize(output->size() + bytes_per_entry);
-  size_t table_begin = output->size();
-  output->resize(output->size() + table_item_count * bytes_per_entry);
-
-  // Append the children.
-  for (size_t i = 0; i < node->children.size(); i++) {
-    size_t offset = output->size();
-
-    // Compute the location at which we'll store the offset of the child data.
-    // We may be writing the magic 0th item.
-    size_t offset_offset;
-    if (i == 0 && has_0th_item) {
-      offset_offset = zeroth_item_offset;
-    } else {
-      int table_index = static_cast<unsigned char>(node->children[i]->addition) - first_table_item;
-      offset_offset = table_begin + table_index * bytes_per_entry;
-    }
-
-    // Write the offset.
-    if (is_32_bit) {
-      // Use 32-bit absolute offsets.
-      // FIXME(brettw) use bit cast.
-      unsigned* offset32 = reinterpret_cast<unsigned*>(&(*output)[offset_offset]);
-      *offset32 = static_cast<unsigned>(output->size());
-    } else {
-      // Use 16-bit relative offsets.
-      unsigned short* offset16 = reinterpret_cast<unsigned short*>(&(*output)[offset_offset]);
-      *offset16 = static_cast<unsigned short>(output->size() - begin_offset);
-    }
-
-    SerializeTrie(node->children[i], output);
-  }
-}
-
-// Recursively serializes this node and all of its children to the output.
-void SerializeTrie(const DicNode* node, std::string* output) {
-  if (node->storage == DicNode::LEAF ||
-      node->storage == DicNode::LEAFMORE) {
-    SerializeLeaf(node, output);
-  } else if (node->storage == DicNode::LIST8 ||
-             node->storage == DicNode::LIST16) {
-    SerializeList(node, output);
-  } else if (node->storage == DicNode::LOOKUP16 ||
-             node->storage == DicNode::LOOKUP32) {
-    SerializeLookup(node, output);
-  }
-}
-/*
-void SerializeStringList(const std::vector<std::string>& list,
-                         std::string* output) {
-  for (size_t i = 0; i < list.size(); i++) {
-    if (i != 0)
-      output->push_back('\n');
-    output->append(list[i]);
-  }
-  output->push_back(0);
-}
-*/
-
-// Appends the given uint32 to the given string.
-void AppendUint32(uint32 a, std::string* output) {
-  size_t offset = output->size();
-  output->resize(offset + 4);
-  memcpy(&(*output)[offset], &a, sizeof(uint32));
-}
-
-// Serializes the given list of strings with 0 bytes separating them. The end
-// will be marked by a double-0.
-void SerializeStringListNullTerm(const std::vector<std::string>& strings,
-                                 std::string* output) {
-  for (size_t i = 0; i < strings.size(); i++) {
-    // Can't tolerate empty strings since they'll mark the end.
-    if (strings[i].empty())
-      output->push_back(' ');
-    else
-      output->append(strings[i]);
-    output->push_back(0);
-  }
-  output->push_back(0);
-}
-
-void SerializeReplacements(
-    const std::vector< std::pair<std::string, std::string> >& repl,
-    std::string* output) {
-  for (size_t i = 0; i < repl.size(); i++) {
-    output->append(repl[i].first);
-    output->push_back(0);
-    output->append(repl[i].second);
-    output->push_back(0);
-  }
-  output->push_back(0);
-}
-
-}  // namespace
-
-BDictWriter::BDictWriter() : trie_root_(NULL) {
-}
-
-BDictWriter::~BDictWriter() {
-  delete trie_root_;
-}
-
-void BDictWriter::SetWords(const WordList& words) {
-  trie_root_ = new DicNode;
-  BuildTrie(words, 0, words.size(), 0, trie_root_);
-}
-
-std::string BDictWriter::GetBDict() const {
-  std::string ret;
-
-  // Save room for the header. This will be populated at the end.
-  ret.resize(sizeof(hunspell::BDict::Header));
-
-  // Serialize the affix portion.
-  size_t aff_offset = ret.size();
-  SerializeAff(&ret);
-
-  // Serialize the dictionary words.
-  size_t dic_offset = ret.size();
-  ret.reserve(ret.size() + ComputeTrieStorage(trie_root_));
-  SerializeTrie(trie_root_, &ret);
-
-  // Fill the header last, now that we have the data.
-  hunspell::BDict::Header* header =
-      reinterpret_cast<hunspell::BDict::Header*>(&ret[0]);
-  header->signature = hunspell::BDict::SIGNATURE;
-  header->major_version = hunspell::BDict::MAJOR_VERSION;
-  header->minor_version = hunspell::BDict::MINOR_VERSION;
-  header->aff_offset = static_cast<uint32>(aff_offset);
-  header->dic_offset = static_cast<uint32>(dic_offset);
-
-  // Write the MD5 digest of the affix information and the dictionary words at
-  // the end of the BDic header.
-  if (header->major_version >= 2)
-    base::MD5Sum(&ret[aff_offset], ret.size() - aff_offset, &header->digest);
-
-  return ret;
-}
-
-void BDictWriter::SerializeAff(std::string* output) const {
-  // Reserve enough room for the header.
-  size_t header_offset = output->size();
-  output->resize(output->size() + sizeof(hunspell::BDict::AffHeader));
-
-  // Write the comment.
-  output->push_back('\n');
-  output->append(comment_);
-  output->push_back('\n');
-
-  // We need a magic first AF line that lists the number of following ones.
-  size_t affix_group_offset = output->size();
-  output->append(base::StringPrintf("AF %d",
-                                    static_cast<int>(affix_groups_.size())));
-  output->push_back(0);
-  SerializeStringListNullTerm(affix_groups_, output);
-
-  size_t affix_rule_offset = output->size();
-  SerializeStringListNullTerm(affix_rules_, output);
-
-  size_t rep_offset = output->size();
-  SerializeReplacements(replacements_, output);
-
-  size_t other_offset = output->size();
-  SerializeStringListNullTerm(other_commands_, output);
-
-  // Add the header now that we know the offsets.
-  hunspell::BDict::AffHeader* header =
-      reinterpret_cast<hunspell::BDict::AffHeader*>(&(*output)[header_offset]);
-  header->affix_group_offset = static_cast<uint32>(affix_group_offset);
-  header->affix_rule_offset = static_cast<uint32>(affix_rule_offset);
-  header->rep_offset = static_cast<uint32>(rep_offset);
-  header->other_offset = static_cast<uint32>(other_offset);
-}
-
-}  // namespace hunspell
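
Note on the deleted API: the public surface visible in this file is small. Callers construct a BDictWriter, hand it the word list, and call GetBDict() to get the serialized BDIC blob (header, affix section, then the trie). A minimal usage sketch follows; it assumes WordList is the vector-of-(word, affix-group-ID list) pairs typedef declared in bdict_writer.h and that the list is already sorted the way BuildTrie expects. The words and affix IDs are purely illustrative, and the affix/replacement setters on the writer are not shown here.

    #include <string>
    #include <utility>
    #include <vector>

    #include "third_party/hunspell_new/google/bdict_writer.h"

    // Builds a BDIC blob for two example words.
    std::string BuildExampleBDict() {
      hunspell::BDictWriter writer;
      hunspell::BDictWriter::WordList words;
      words.push_back(std::make_pair(std::string("cat"), std::vector<int>(1, 1)));
      words.push_back(std::make_pair(std::string("cats"), std::vector<int>(1, 2)));
      writer.SetWords(words);    // Builds the in-memory trie.
      return writer.GetBDict();  // Header + affix data + serialized trie.
    }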