Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1048)

Unified Diff: Source/core/html/parser/create-html-entity-table

Issue 199103002: Better storage of HTML entities. (Closed) Base URL: https://chromium.googlesource.com/chromium/blink.git@master
Patch Set: Created 6 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « Source/core/html/parser/HTMLEntityTable.h ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: Source/core/html/parser/create-html-entity-table
diff --git a/Source/core/html/parser/create-html-entity-table b/Source/core/html/parser/create-html-entity-table
index 3b9ca45c58ce8b87b3234bb788702b7076cfb429..587e8a8f5d11d027abb3826782953a3f8d58814f 100755
--- a/Source/core/html/parser/create-html-entity-table
+++ b/Source/core/html/parser/create-html-entity-table
@@ -27,6 +27,11 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""This Python script creates the raw data that is our entity
+database. The representation is one string database containing all
+strings we could need, and then a mapping from offset+length -> entity
+data. That is compact, easy to use and efficient."""
+
import csv
import os.path
import string
@@ -35,13 +40,6 @@ import sys
ENTITY = 0
VALUE = 1
-def convert_entity_to_cpp_name(entity):
- postfix = "EntityName"
- if entity[-1] == ";":
- return "%sSemicolon%s" % (entity[:-1], postfix)
- return "%s%s" % (entity, postfix)
-
-
def convert_value_to_int(value):
if not value:
return "0";
@@ -63,9 +61,8 @@ if len(sys.argv) < 4 or sys.argv[1] != "-o":
output_path = sys.argv[2]
input_path = sys.argv[3]
-html_entity_names_file = open(input_path)
-entries = list(csv.reader(html_entity_names_file))
-html_entity_names_file.close()
+with open(input_path) as html_entity_names_file:
+ entries = list(csv.reader(html_entity_names_file))
entries.sort(key = lambda entry: entry[ENTITY])
entity_count = len(entries)
@@ -97,7 +94,7 @@ output_file.write("""/*
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-// THIS FILE IS GENERATED BY WebCore/html/parser/create-html-entity-table
+// THIS FILE IS GENERATED BY core/html/parser/create-html-entity-table
// DO NOT EDIT (unless you are a ninja)!
#include "config.h"
@@ -108,60 +105,126 @@ namespace WebCore {
namespace {
""")
+assert len(entries) > 0, "Code assumes a non-empty entity array."
+def check_ascii(entity_string):
+ for c in entity_string:
+ code = ord(c)
+ assert 0 <= code <= 127, (c + " is not ASCII. Need to change type " +
+ "of storage from LChar to UChar to support " +
+ "this entity.")
+
+output_file.write("static const LChar staticEntityStringStorage[] = {\n")
+output_file.write("'")
+all_data = ""
+entity_offset = 0
+first_output = True
+saved_by_reusing = 0
for entry in entries:
- output_file.write("static const LChar %s[] = \"%s\";\n" % (convert_entity_to_cpp_name(entry[ENTITY]), entry[ENTITY]))
+ check_ascii(entry[ENTITY])
+ # Reuse substrings from earlier entries. This saves 1-2000
+ # characters, but it's O(n^2) and not very smart. The optimal
+ # solution has to solve the "Shortest Common Superstring" problem,
+ # which is NP-hard.
abarth-chromium 2014/03/17 17:37:27 I bet most of the hits are just stripping the trai
+ #
+ # This would be even more efficient if we didn't store the
+ # semi-colon in the array but as a bit in the entry.
+ entity = entry[ENTITY]
+ already_existing_offset = all_data.find(entity)
+ if already_existing_offset != -1:
+ # Reusing space.
+ this_offset = already_existing_offset
+ saved_by_reusing += len(entity)
+ else:
+ if not first_output:
+ output_file.write(",\n'")
+ first_output = False
+
+ # Try the end of the string and see if we can reuse that to
+ # fit the start of the new entity.
+ data_to_add = entity
+ this_offset = entity_offset
+ for truncated_len in range(len(entity) - 1, 0, -1):
+ if all_data.endswith(entity[:truncated_len]):
+ data_to_add = entity[truncated_len:]
+ this_offset = entity_offset - truncated_len
+ saved_by_reusing += truncated_len
+ break
+
+ output_file.write("', '".join(data_to_add))
+ all_data += data_to_add
+ output_file.write("'")
+ entity_offset += len(data_to_add)
+ assert len(entry) == 2, "We will use slot [2] in the list for the offset."
+ assert this_offset < 32768 # Stored in a 16 bit short.
+ entry.append(this_offset)
+
+output_file.write("};\n")
+
+index = {}
+for offset, entry in enumerate(entries):
+ starting_letter = entry[ENTITY][0]
+ if starting_letter not in index:
+ index[starting_letter] = offset
output_file.write("""
static const HTMLEntityTableEntry staticEntityTable[%s] = {\n""" % entity_count)
-index = {}
-offset = 0
for entry in entries:
- letter = entry[ENTITY][0]
- if letter not in index:
- index[letter] = offset
values = entry[VALUE].split(' ')
assert len(values) <= 2, values
- output_file.write(' { %s, %s, %s, %s },\n' % (
- convert_entity_to_cpp_name(entry[ENTITY]),
- len(entry[ENTITY]),
+ output_file.write(' { %s, %s, %s, %s }, // &%s\n' % (
convert_value_to_int(values[0]),
- convert_value_to_int(values[1] if len(values) >= 2 else "")))
- offset += 1
+ convert_value_to_int(values[1] if len(values) >= 2 else ""),
+ entry[2],
+ len(entry[ENTITY]),
+ entry[ENTITY],
+ ))
output_file.write("""};
""")
-output_file.write("static const HTMLEntityTableEntry* uppercaseOffset[] = {\n")
+output_file.write("""
+}
+""")
+
+output_file.write("static const short uppercaseOffset[] = {\n")
for letter in string.ascii_uppercase:
- output_file.write("%s\n" % offset_table_entry(index[letter]))
-output_file.write("%s\n" % offset_table_entry(index['a']))
+ output_file.write("%d,\n" % index[letter])
+output_file.write("%d\n" % index['a'])
output_file.write("""};
-static const HTMLEntityTableEntry* lowercaseOffset[] = {\n""")
+static const short lowercaseOffset[] = {\n""")
for letter in string.ascii_lowercase:
- output_file.write("%s\n" % offset_table_entry(index[letter]))
-output_file.write("%s\n" % offset_table_entry(entity_count))
+ output_file.write("%d,\n" % index[letter])
+output_file.write("%d\n" % entity_count)
output_file.write("""};
+const LChar* HTMLEntityTable::entityString(const HTMLEntityTableEntry& entry)
+{
+ return staticEntityStringStorage + entry.entityOffset;
+}
+
+LChar HTMLEntityTableEntry::lastCharacter() const
+{
+ return HTMLEntityTable::entityString(*this)[length - 1];
}
const HTMLEntityTableEntry* HTMLEntityTable::firstEntryStartingWith(UChar c)
{
if (c >= 'A' && c <= 'Z')
- return uppercaseOffset[c - 'A'];
+ return &staticEntityTable[uppercaseOffset[c - 'A']];
if (c >= 'a' && c <= 'z')
- return lowercaseOffset[c - 'a'];
+ return &staticEntityTable[lowercaseOffset[c - 'a']];
return 0;
}
const HTMLEntityTableEntry* HTMLEntityTable::lastEntryStartingWith(UChar c)
{
if (c >= 'A' && c <= 'Z')
- return uppercaseOffset[c - 'A' + 1] - 1;
+ return &staticEntityTable[uppercaseOffset[c - 'A' + 1]] - 1;
if (c >= 'a' && c <= 'z')
- return lowercaseOffset[c - 'a' + 1] - 1;
+ return &staticEntityTable[lowercaseOffset[c - 'a' + 1]] - 1;
return 0;
}
« no previous file with comments | « Source/core/html/parser/HTMLEntityTable.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698