Source/core/html/parser/create-html-entity-table - Issue 199103002: Better storage of HTML entities.

Unified Diff: Source/core/html/parser/create-html-entity-table

Issue 199103002: Better storage of HTML entities. (Closed) Base URL: https://chromium.googlesource.com/chromium/blink.git@master

Patch Set: Created 6 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: Source/core/html/parser/create-html-entity-table

diff --git a/Source/core/html/parser/create-html-entity-table b/Source/core/html/parser/create-html-entity-table

index 3b9ca45c58ce8b87b3234bb788702b7076cfb429..587e8a8f5d11d027abb3826782953a3f8d58814f 100755

--- a/Source/core/html/parser/create-html-entity-table

+++ b/Source/core/html/parser/create-html-entity-table

@@ -27,6 +27,11 @@

# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+"""This python script creates the raw data that is our entity

+database. The representation is one string database containing all

+strings we could need, and then a mapping from offset+length -> entity

+data. That is compact, easy to use and efficient."""

import csv

import os.path

import string

@@ -35,13 +40,6 @@ import sys

ENTITY = 0

VALUE = 1

-def convert_entity_to_cpp_name(entity):

- postfix = "EntityName"

- if entity[-1] == ";":

- return "%sSemicolon%s" % (entity[:-1], postfix)

- return "%s%s" % (entity, postfix)

def convert_value_to_int(value):

if not value:

return "0";

@@ -63,9 +61,8 @@ if len(sys.argv) < 4 or sys.argv[1] != "-o":

output_path = sys.argv[2]

input_path = sys.argv[3]

-html_entity_names_file = open(input_path)

-entries = list(csv.reader(html_entity_names_file))

-html_entity_names_file.close()

+with open(input_path) as html_entity_names_file:

+ entries = list(csv.reader(html_entity_names_file))

entries.sort(key = lambda entry: entry[ENTITY])

entity_count = len(entries)

@@ -97,7 +94,7 @@ output_file.write("""/*

* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-// THIS FILE IS GENERATED BY WebCore/html/parser/create-html-entity-table

+// THIS FILE IS GENERATED BY core/html/parser/create-html-entity-table

// DO NOT EDIT (unless you are a ninja)!

#include "config.h"

@@ -108,60 +105,126 @@ namespace WebCore {

namespace {

""")

+assert len(entries) > 0, "Code assumes a non-empty entity array."

+def check_ascii(entity_string):

+ for c in entity_string:

+ code = ord(c)

+ assert 0 <= code <= 127, (c + " is not ASCII. Need to change type " +

+ "of storage from LChar to UChar to support " +

+ "this entity.")

+output_file.write("static const LChar staticEntityStringStorage[] = {\n")

+output_file.write("'")

+all_data = ""

+entity_offset = 0

+first_output = True

+saved_by_reusing = 0

for entry in entries:

- output_file.write("static const LChar %s[] = \"%s\";\n" % (convert_entity_to_cpp_name(entry[ENTITY]), entry[ENTITY]))

+ check_ascii(entry[ENTITY])

+ # Reuse substrings from earlier entries. This saves 1-2000

+ # characters, but it's O(n^2) and not very smart. The optimal

+ # solution has to solve the "Shortest Common Superstring" problem

+ # and that is NP-Complete or worse.

[Review comment] abarth-chromium (2014/03/17 17:37:27): I bet most of the hits are just stripping the trai… [comment truncated in this view]

+ #

+ # This would be even more efficient if we didn't store the

+ # semi-colon in the array but as a bit in the entry.

+ entity = entry[ENTITY]

+ already_existing_offset = all_data.find(entity)

+ if already_existing_offset != -1:

+ # Reusing space.

+ this_offset = already_existing_offset

+ saved_by_reusing += len(entity)

+ else:

+ if not first_output:

+ output_file.write(",\n'")

+ first_output = False

+ # Try the end of the string and see if we can reuse that to

+ # fit the start of the new entity.

+ data_to_add = entity

+ this_offset = entity_offset

+ for truncated_len in range(len(entity) - 1, 0, -1):

+ if all_data.endswith(entity[:truncated_len]):

+ data_to_add = entity[truncated_len:]

+ this_offset = entity_offset - truncated_len

+ saved_by_reusing += truncated_len

+ break

+ output_file.write("', '".join(data_to_add))

+ all_data += data_to_add

+ output_file.write("'")

+ entity_offset += len(data_to_add)

+ assert len(entry) == 2, "We will use slot [2] in the list for the offset."

+ assert this_offset < 32768 # Stored in a 16 bit short.

+ entry.append(this_offset)

+output_file.write("};\n")

+index = {}

+for offset, entry in enumerate(entries):

+ starting_letter = entry[ENTITY][0]

+ if starting_letter not in index:

+ index[starting_letter] = offset

output_file.write("""

static const HTMLEntityTableEntry staticEntityTable[%s] = {\n""" % entity_count)

-index = {}

-offset = 0

for entry in entries:

- letter = entry[ENTITY][0]

- if letter not in index:

- index[letter] = offset

values = entry[VALUE].split(' ')

assert len(values) <= 2, values

- output_file.write(' { %s, %s, %s, %s },\n' % (

- convert_entity_to_cpp_name(entry[ENTITY]),

- len(entry[ENTITY]),

+ output_file.write(' { %s, %s, %s, %s }, // &%s\n' % (

convert_value_to_int(values[0]),

- convert_value_to_int(values[1] if len(values) >= 2 else "")))

- offset += 1

+ convert_value_to_int(values[1] if len(values) >= 2 else ""),

+ entry[2],

+ len(entry[ENTITY]),

+ entry[ENTITY],

+ ))

output_file.write("""};

""")

-output_file.write("static const HTMLEntityTableEntry* uppercaseOffset[] = {\n")

+output_file.write("""

+""")

+output_file.write("static const short uppercaseOffset[] = {\n")

for letter in string.ascii_uppercase:

- output_file.write("%s\n" % offset_table_entry(index[letter]))

-output_file.write("%s\n" % offset_table_entry(index['a']))

+ output_file.write("%d,\n" % index[letter])

+output_file.write("%d\n" % index['a'])

output_file.write("""};

-static const HTMLEntityTableEntry* lowercaseOffset[] = {\n""")

+static const short lowercaseOffset[] = {\n""")

for letter in string.ascii_lowercase:

- output_file.write("%s\n" % offset_table_entry(index[letter]))

-output_file.write("%s\n" % offset_table_entry(entity_count))

+ output_file.write("%d,\n" % index[letter])

+output_file.write("%d\n" % entity_count)

output_file.write("""};

+const LChar* HTMLEntityTable::entityString(const HTMLEntityTableEntry& entry)

+ return staticEntityStringStorage + entry.entityOffset;

+LChar HTMLEntityTableEntry::lastCharacter() const

+ return HTMLEntityTable::entityString(*this)[length - 1];

}

const HTMLEntityTableEntry* HTMLEntityTable::firstEntryStartingWith(UChar c)

{

if (c >= 'A' && c <= 'Z')

- return uppercaseOffset[c - 'A'];

+ return &staticEntityTable[uppercaseOffset[c - 'A']];

if (c >= 'a' && c <= 'z')

- return lowercaseOffset[c - 'a'];

+ return &staticEntityTable[lowercaseOffset[c - 'a']];

return 0;

}

const HTMLEntityTableEntry* HTMLEntityTable::lastEntryStartingWith(UChar c)

{

if (c >= 'A' && c <= 'Z')

- return uppercaseOffset[c - 'A' + 1] - 1;

+ return &staticEntityTable[uppercaseOffset[c - 'A' + 1]] - 1;

if (c >= 'a' && c <= 'z')

- return lowercaseOffset[c - 'a' + 1] - 1;

+ return &staticEntityTable[lowercaseOffset[c - 'a' + 1]] - 1;

return 0;

}

« no previous file with comments | « Source/core/html/parser/HTMLEntityTable.h ('k') | no next file » | no next file with comments »