Index: Source/core/html/parser/create-html-entity-table |
diff --git a/Source/core/html/parser/create-html-entity-table b/Source/core/html/parser/create-html-entity-table |
index 3b9ca45c58ce8b87b3234bb788702b7076cfb429..587e8a8f5d11d027abb3826782953a3f8d58814f 100755 |
--- a/Source/core/html/parser/create-html-entity-table |
+++ b/Source/core/html/parser/create-html-entity-table |
@@ -27,6 +27,11 @@ |
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
+"""This python script creates the raw data that is our entity |
+database. The representation is a single string table containing every |
+entity name we need, plus a mapping from offset+length -> entity |
+data. That is compact, easy to use and efficient.""" |
+ |
import csv |
import os.path |
import string |
@@ -35,13 +40,6 @@ import sys |
ENTITY = 0 |
VALUE = 1 |
-def convert_entity_to_cpp_name(entity): |
- postfix = "EntityName" |
- if entity[-1] == ";": |
- return "%sSemicolon%s" % (entity[:-1], postfix) |
- return "%s%s" % (entity, postfix) |
- |
- |
def convert_value_to_int(value): |
if not value: |
return "0"; |
@@ -63,9 +61,8 @@ if len(sys.argv) < 4 or sys.argv[1] != "-o": |
output_path = sys.argv[2] |
input_path = sys.argv[3] |
-html_entity_names_file = open(input_path) |
-entries = list(csv.reader(html_entity_names_file)) |
-html_entity_names_file.close() |
+with open(input_path) as html_entity_names_file: |
+ entries = list(csv.reader(html_entity_names_file)) |
entries.sort(key = lambda entry: entry[ENTITY]) |
entity_count = len(entries) |
@@ -97,7 +94,7 @@ output_file.write("""/* |
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
*/ |
-// THIS FILE IS GENERATED BY WebCore/html/parser/create-html-entity-table |
+// THIS FILE IS GENERATED BY core/html/parser/create-html-entity-table |
// DO NOT EDIT (unless you are a ninja)! |
#include "config.h" |
@@ -108,60 +105,126 @@ namespace WebCore { |
namespace { |
""") |
+assert len(entries) > 0, "Code assumes a non-empty entity array." |
+def check_ascii(entity_string): |
+ for c in entity_string: |
+ code = ord(c) |
+ assert 0 <= code <= 127, (c + " is not ASCII. Need to change type " + |
+ "of storage from LChar to UChar to support " + |
+ "this entity.") |
+ |
+output_file.write("static const LChar staticEntityStringStorage[] = {\n") |
+output_file.write("'") |
+all_data = "" |
+entity_offset = 0 |
+first_output = True |
+saved_by_reusing = 0 |
for entry in entries: |
- output_file.write("static const LChar %s[] = \"%s\";\n" % (convert_entity_to_cpp_name(entry[ENTITY]), entry[ENTITY])) |
+ check_ascii(entry[ENTITY]) |
+ # Reuse substrings from earlier entries. This saves 1-2000 |
+    # characters, but it's O(n^2) and not very smart. An optimal |
+    # packing would have to solve the "Shortest Common Superstring" |
+    # problem, which is NP-hard. |
abarth-chromium
2014/03/17 17:37:27
I bet most of the hits are just stripping the trai
|
+ # |
+    # This would be even more efficient if we stored the trailing |
+    # semicolon as a bit in the entry instead of in the array. |
+ entity = entry[ENTITY] |
+ already_existing_offset = all_data.find(entity) |
+ if already_existing_offset != -1: |
+ # Reusing space. |
+ this_offset = already_existing_offset |
+ saved_by_reusing += len(entity) |
+ else: |
+ if not first_output: |
+ output_file.write(",\n'") |
+ first_output = False |
+ |
+ # Try the end of the string and see if we can reuse that to |
+ # fit the start of the new entity. |
+ data_to_add = entity |
+ this_offset = entity_offset |
+ for truncated_len in range(len(entity) - 1, 0, -1): |
+ if all_data.endswith(entity[:truncated_len]): |
+ data_to_add = entity[truncated_len:] |
+ this_offset = entity_offset - truncated_len |
+ saved_by_reusing += truncated_len |
+ break |
+ |
+ output_file.write("', '".join(data_to_add)) |
+ all_data += data_to_add |
+ output_file.write("'") |
+ entity_offset += len(data_to_add) |
+ assert len(entry) == 2, "We will use slot [2] in the list for the offset." |
+ assert this_offset < 32768 # Stored in a 16 bit short. |
+ entry.append(this_offset) |
+ |
+output_file.write("};\n") |
+ |
+index = {} |
+for offset, entry in enumerate(entries): |
+ starting_letter = entry[ENTITY][0] |
+ if starting_letter not in index: |
+ index[starting_letter] = offset |
output_file.write(""" |
static const HTMLEntityTableEntry staticEntityTable[%s] = {\n""" % entity_count) |
-index = {} |
-offset = 0 |
for entry in entries: |
- letter = entry[ENTITY][0] |
- if letter not in index: |
- index[letter] = offset |
values = entry[VALUE].split(' ') |
assert len(values) <= 2, values |
- output_file.write(' { %s, %s, %s, %s },\n' % ( |
- convert_entity_to_cpp_name(entry[ENTITY]), |
- len(entry[ENTITY]), |
+ output_file.write(' { %s, %s, %s, %s }, // &%s\n' % ( |
convert_value_to_int(values[0]), |
- convert_value_to_int(values[1] if len(values) >= 2 else ""))) |
- offset += 1 |
+ convert_value_to_int(values[1] if len(values) >= 2 else ""), |
+ entry[2], |
+ len(entry[ENTITY]), |
+ entry[ENTITY], |
+ )) |
output_file.write("""}; |
""") |
-output_file.write("static const HTMLEntityTableEntry* uppercaseOffset[] = {\n") |
+output_file.write(""" |
+} |
+""") |
+ |
+output_file.write("static const short uppercaseOffset[] = {\n") |
for letter in string.ascii_uppercase: |
- output_file.write("%s\n" % offset_table_entry(index[letter])) |
-output_file.write("%s\n" % offset_table_entry(index['a'])) |
+ output_file.write("%d,\n" % index[letter]) |
+output_file.write("%d\n" % index['a']) |
output_file.write("""}; |
-static const HTMLEntityTableEntry* lowercaseOffset[] = {\n""") |
+static const short lowercaseOffset[] = {\n""") |
for letter in string.ascii_lowercase: |
- output_file.write("%s\n" % offset_table_entry(index[letter])) |
-output_file.write("%s\n" % offset_table_entry(entity_count)) |
+ output_file.write("%d,\n" % index[letter]) |
+output_file.write("%d\n" % entity_count) |
output_file.write("""}; |
+const LChar* HTMLEntityTable::entityString(const HTMLEntityTableEntry& entry) |
+{ |
+ return staticEntityStringStorage + entry.entityOffset; |
+} |
+ |
+LChar HTMLEntityTableEntry::lastCharacter() const |
+{ |
+ return HTMLEntityTable::entityString(*this)[length - 1]; |
} |
const HTMLEntityTableEntry* HTMLEntityTable::firstEntryStartingWith(UChar c) |
{ |
if (c >= 'A' && c <= 'Z') |
- return uppercaseOffset[c - 'A']; |
+ return &staticEntityTable[uppercaseOffset[c - 'A']]; |
if (c >= 'a' && c <= 'z') |
- return lowercaseOffset[c - 'a']; |
+ return &staticEntityTable[lowercaseOffset[c - 'a']]; |
return 0; |
} |
const HTMLEntityTableEntry* HTMLEntityTable::lastEntryStartingWith(UChar c) |
{ |
if (c >= 'A' && c <= 'Z') |
- return uppercaseOffset[c - 'A' + 1] - 1; |
+ return &staticEntityTable[uppercaseOffset[c - 'A' + 1]] - 1; |
if (c >= 'a' && c <= 'z') |
- return lowercaseOffset[c - 'a' + 1] - 1; |
+ return &staticEntityTable[lowercaseOffset[c - 'a' + 1]] - 1; |
return 0; |
} |