Chromium Code Reviews| Index: Source/core/html/parser/create-html-entity-table |
| diff --git a/Source/core/html/parser/create-html-entity-table b/Source/core/html/parser/create-html-entity-table |
| index 3b9ca45c58ce8b87b3234bb788702b7076cfb429..587e8a8f5d11d027abb3826782953a3f8d58814f 100755 |
| --- a/Source/core/html/parser/create-html-entity-table |
| +++ b/Source/core/html/parser/create-html-entity-table |
| @@ -27,6 +27,11 @@ |
| # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| +"""This python script creates the raw data that is our entity |
| +database. The representation is one string database containing all |
| +strings we could need, and then a mapping from offset+length -> entity |
| +data. That is compact, easy to use and efficient.""" |
| + |
| import csv |
| import os.path |
| import string |
| @@ -35,13 +40,6 @@ import sys |
| ENTITY = 0 |
| VALUE = 1 |
| -def convert_entity_to_cpp_name(entity): |
| - postfix = "EntityName" |
| - if entity[-1] == ";": |
| - return "%sSemicolon%s" % (entity[:-1], postfix) |
| - return "%s%s" % (entity, postfix) |
| - |
| - |
| def convert_value_to_int(value): |
| if not value: |
| return "0"; |
| @@ -63,9 +61,8 @@ if len(sys.argv) < 4 or sys.argv[1] != "-o": |
| output_path = sys.argv[2] |
| input_path = sys.argv[3] |
| -html_entity_names_file = open(input_path) |
| -entries = list(csv.reader(html_entity_names_file)) |
| -html_entity_names_file.close() |
| +with open(input_path) as html_entity_names_file: |
| + entries = list(csv.reader(html_entity_names_file)) |
| entries.sort(key = lambda entry: entry[ENTITY]) |
| entity_count = len(entries) |
| @@ -97,7 +94,7 @@ output_file.write("""/* |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| -// THIS FILE IS GENERATED BY WebCore/html/parser/create-html-entity-table |
| +// THIS FILE IS GENERATED BY core/html/parser/create-html-entity-table |
| // DO NOT EDIT (unless you are a ninja)! |
| #include "config.h" |
| @@ -108,60 +105,126 @@ namespace WebCore { |
| namespace { |
| """) |
| +assert len(entries) > 0, "Code assumes a non-empty entity array." |
| +def check_ascii(entity_string): |
| + for c in entity_string: |
| + code = ord(c) |
| + assert 0 <= code <= 127, (c + " is not ASCII. Need to change type " + |
| + "of storage from LChar to UChar to support " + |
| + "this entity.") |
| + |
| +output_file.write("static const LChar staticEntityStringStorage[] = {\n") |
| +output_file.write("'") |
| +all_data = "" |
| +entity_offset = 0 |
| +first_output = True |
| +saved_by_reusing = 0 |
| for entry in entries: |
| - output_file.write("static const LChar %s[] = \"%s\";\n" % (convert_entity_to_cpp_name(entry[ENTITY]), entry[ENTITY])) |
| + check_ascii(entry[ENTITY]) |
| + # Reuse substrings from earlier entries. This saves 1-2000 |
| + # characters, but it's O(n^2) and not very smart. The optimal |
| + # solution has to solve the "Shortest Common Superstring" problem |
| + # and that is NP-Complete or worse. |
|
abarth-chromium
2014/03/17 17:37:27
I bet most of the hits are just stripping the trai…
|
| + # |
| + # This would be even more efficient if we didn't store the |
| + # semi-colon in the array but as a bit in the entry. |
| + entity = entry[ENTITY] |
| + already_existing_offset = all_data.find(entity) |
| + if already_existing_offset != -1: |
| + # Reusing space. |
| + this_offset = already_existing_offset |
| + saved_by_reusing += len(entity) |
| + else: |
| + if not first_output: |
| + output_file.write(",\n'") |
| + first_output = False |
| + |
| + # Try the end of the string and see if we can reuse that to |
| + # fit the start of the new entity. |
| + data_to_add = entity |
| + this_offset = entity_offset |
| + for truncated_len in range(len(entity) - 1, 0, -1): |
| + if all_data.endswith(entity[:truncated_len]): |
| + data_to_add = entity[truncated_len:] |
| + this_offset = entity_offset - truncated_len |
| + saved_by_reusing += truncated_len |
| + break |
| + |
| + output_file.write("', '".join(data_to_add)) |
| + all_data += data_to_add |
| + output_file.write("'") |
| + entity_offset += len(data_to_add) |
| + assert len(entry) == 2, "We will use slot [2] in the list for the offset." |
| + assert this_offset < 32768 # Stored in a 16 bit short. |
| + entry.append(this_offset) |
| + |
| +output_file.write("};\n") |
| + |
| +index = {} |
| +for offset, entry in enumerate(entries): |
| + starting_letter = entry[ENTITY][0] |
| + if starting_letter not in index: |
| + index[starting_letter] = offset |
| output_file.write(""" |
| static const HTMLEntityTableEntry staticEntityTable[%s] = {\n""" % entity_count) |
| -index = {} |
| -offset = 0 |
| for entry in entries: |
| - letter = entry[ENTITY][0] |
| - if letter not in index: |
| - index[letter] = offset |
| values = entry[VALUE].split(' ') |
| assert len(values) <= 2, values |
| - output_file.write(' { %s, %s, %s, %s },\n' % ( |
| - convert_entity_to_cpp_name(entry[ENTITY]), |
| - len(entry[ENTITY]), |
| + output_file.write(' { %s, %s, %s, %s }, // &%s\n' % ( |
| convert_value_to_int(values[0]), |
| - convert_value_to_int(values[1] if len(values) >= 2 else ""))) |
| - offset += 1 |
| + convert_value_to_int(values[1] if len(values) >= 2 else ""), |
| + entry[2], |
| + len(entry[ENTITY]), |
| + entry[ENTITY], |
| + )) |
| output_file.write("""}; |
| """) |
| -output_file.write("static const HTMLEntityTableEntry* uppercaseOffset[] = {\n") |
| +output_file.write(""" |
| +} |
| +""") |
| + |
| +output_file.write("static const short uppercaseOffset[] = {\n") |
| for letter in string.ascii_uppercase: |
| - output_file.write("%s\n" % offset_table_entry(index[letter])) |
| -output_file.write("%s\n" % offset_table_entry(index['a'])) |
| + output_file.write("%d,\n" % index[letter]) |
| +output_file.write("%d\n" % index['a']) |
| output_file.write("""}; |
| -static const HTMLEntityTableEntry* lowercaseOffset[] = {\n""") |
| +static const short lowercaseOffset[] = {\n""") |
| for letter in string.ascii_lowercase: |
| - output_file.write("%s\n" % offset_table_entry(index[letter])) |
| -output_file.write("%s\n" % offset_table_entry(entity_count)) |
| + output_file.write("%d,\n" % index[letter]) |
| +output_file.write("%d\n" % entity_count) |
| output_file.write("""}; |
| +const LChar* HTMLEntityTable::entityString(const HTMLEntityTableEntry& entry) |
| +{ |
| + return staticEntityStringStorage + entry.entityOffset; |
| +} |
| + |
| +LChar HTMLEntityTableEntry::lastCharacter() const |
| +{ |
| + return HTMLEntityTable::entityString(*this)[length - 1]; |
| } |
| const HTMLEntityTableEntry* HTMLEntityTable::firstEntryStartingWith(UChar c) |
| { |
| if (c >= 'A' && c <= 'Z') |
| - return uppercaseOffset[c - 'A']; |
| + return &staticEntityTable[uppercaseOffset[c - 'A']]; |
| if (c >= 'a' && c <= 'z') |
| - return lowercaseOffset[c - 'a']; |
| + return &staticEntityTable[lowercaseOffset[c - 'a']]; |
| return 0; |
| } |
| const HTMLEntityTableEntry* HTMLEntityTable::lastEntryStartingWith(UChar c) |
| { |
| if (c >= 'A' && c <= 'Z') |
| - return uppercaseOffset[c - 'A' + 1] - 1; |
| + return &staticEntityTable[uppercaseOffset[c - 'A' + 1]] - 1; |
| if (c >= 'a' && c <= 'z') |
| - return lowercaseOffset[c - 'a' + 1] - 1; |
| + return &staticEntityTable[lowercaseOffset[c - 'a' + 1]] - 1; |
| return 0; |
| } |