Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 # Copyright (c) 2010 Google Inc. All rights reserved. | 2 # Copyright (c) 2010 Google Inc. All rights reserved. |
| 3 # | 3 # |
| 4 # Redistribution and use in source and binary forms, with or without | 4 # Redistribution and use in source and binary forms, with or without |
| 5 # modification, are permitted provided that the following conditions are | 5 # modification, are permitted provided that the following conditions are |
| 6 # met: | 6 # met: |
| 7 # | 7 # |
| 8 # * Redistributions of source code must retain the above copyright | 8 # * Redistributions of source code must retain the above copyright |
| 9 # notice, this list of conditions and the following disclaimer. | 9 # notice, this list of conditions and the following disclaimer. |
| 10 # * Redistributions in binary form must reproduce the above | 10 # * Redistributions in binary form must reproduce the above |
| 11 # copyright notice, this list of conditions and the following disclaimer | 11 # copyright notice, this list of conditions and the following disclaimer |
| 12 # in the documentation and/or other materials provided with the | 12 # in the documentation and/or other materials provided with the |
| 13 # distribution. | 13 # distribution. |
| 14 # * Neither the name of Google Inc. nor the names of its | 14 # * Neither the name of Google Inc. nor the names of its |
| 15 # contributors may be used to endorse or promote products derived from | 15 # contributors may be used to endorse or promote products derived from |
| 16 # this software without specific prior written permission. | 16 # this software without specific prior written permission. |
| 17 # | 17 # |
| 18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | 18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | 19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | 20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 21 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | 21 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 22 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | 22 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 23 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | 23 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 24 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | 24 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 25 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | 25 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 26 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 26 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 27 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 27 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 28 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 28 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 29 | 29 |
| 30 """This python script creates the raw data that is our entity | |
| 31 database. The representation is one string database containing all | |
| 32 strings we could need, and then a mapping from offset+length -> entity | |
| 33 data. That is compact, easy to use and efficient.""" | |
| 34 | |
| 30 import csv | 35 import csv |
| 31 import os.path | 36 import os.path |
| 32 import string | 37 import string |
| 33 import sys | 38 import sys |
| 34 | 39 |
| 35 ENTITY = 0 | 40 ENTITY = 0 |
| 36 VALUE = 1 | 41 VALUE = 1 |
| 37 | 42 |
| 38 def convert_entity_to_cpp_name(entity): | |
| 39 postfix = "EntityName" | |
| 40 if entity[-1] == ";": | |
| 41 return "%sSemicolon%s" % (entity[:-1], postfix) | |
| 42 return "%s%s" % (entity, postfix) | |
| 43 | |
| 44 | |
| 45 def convert_value_to_int(value): | 43 def convert_value_to_int(value): |
| 46 if not value: | 44 if not value: |
| 47 return "0"; | 45 return "0"; |
| 48 assert(value[0] == "U") | 46 assert(value[0] == "U") |
| 49 assert(value[1] == "+") | 47 assert(value[1] == "+") |
| 50 return "0x" + value[2:] | 48 return "0x" + value[2:] |
| 51 | 49 |
| 52 | 50 |
| 53 def offset_table_entry(offset): | 51 def offset_table_entry(offset): |
| 54 return " &staticEntityTable[%s]," % offset | 52 return " &staticEntityTable[%s]," % offset |
| 55 | 53 |
| 56 | 54 |
| 57 program_name = os.path.basename(__file__) | 55 program_name = os.path.basename(__file__) |
| 58 if len(sys.argv) < 4 or sys.argv[1] != "-o": | 56 if len(sys.argv) < 4 or sys.argv[1] != "-o": |
| 59 # Python 3, change to: print("Usage: %s -o OUTPUT_FILE INPUT_FILE" % program _name, file=sys.stderr) | 57 # Python 3, change to: print("Usage: %s -o OUTPUT_FILE INPUT_FILE" % program _name, file=sys.stderr) |
| 60 sys.stderr.write("Usage: %s -o OUTPUT_FILE INPUT_FILE\n" % program_name) | 58 sys.stderr.write("Usage: %s -o OUTPUT_FILE INPUT_FILE\n" % program_name) |
| 61 exit(1) | 59 exit(1) |
| 62 | 60 |
| 63 output_path = sys.argv[2] | 61 output_path = sys.argv[2] |
| 64 input_path = sys.argv[3] | 62 input_path = sys.argv[3] |
| 65 | 63 |
| 66 html_entity_names_file = open(input_path) | 64 with open(input_path) as html_entity_names_file: |
| 67 entries = list(csv.reader(html_entity_names_file)) | 65 entries = list(csv.reader(html_entity_names_file)) |
| 68 html_entity_names_file.close() | |
| 69 | 66 |
| 70 entries.sort(key = lambda entry: entry[ENTITY]) | 67 entries.sort(key = lambda entry: entry[ENTITY]) |
| 71 entity_count = len(entries) | 68 entity_count = len(entries) |
| 72 | 69 |
| 73 output_file = open(output_path, "w") | 70 output_file = open(output_path, "w") |
| 74 | 71 |
| 75 output_file.write("""/* | 72 output_file.write("""/* |
| 76 * Copyright (C) 2010 Google, Inc. All Rights Reserved. | 73 * Copyright (C) 2010 Google, Inc. All Rights Reserved. |
| 77 * | 74 * |
| 78 * Redistribution and use in source and binary forms, with or without | 75 * Redistribution and use in source and binary forms, with or without |
| (...skipping 11 matching lines...) Expand all Loading... | |
| 90 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | 87 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
| 91 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | 88 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 92 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | 89 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 93 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | 90 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 94 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | 91 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| 95 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 92 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 96 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 93 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 97 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 94 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 98 */ | 95 */ |
| 99 | 96 |
| 100 // THIS FILE IS GENERATED BY WebCore/html/parser/create-html-entity-table | 97 // THIS FILE IS GENERATED BY core/html/parser/create-html-entity-table |
| 101 // DO NOT EDIT (unless you are a ninja)! | 98 // DO NOT EDIT (unless you are a ninja)! |
| 102 | 99 |
| 103 #include "config.h" | 100 #include "config.h" |
| 104 #include "core/html/parser/HTMLEntityTable.h" | 101 #include "core/html/parser/HTMLEntityTable.h" |
| 105 | 102 |
| 106 namespace WebCore { | 103 namespace WebCore { |
| 107 | 104 |
| 108 namespace { | 105 namespace { |
| 109 """) | 106 """) |
| 110 | 107 |
| 108 assert len(entries) > 0, "Code assumes a non-empty entity array." | |
| 109 def check_ascii(entity_string): | |
| 110 for c in entity_string: | |
| 111 code = ord(c) | |
| 112 assert 0 <= code <= 127, (c + " is not ASCII. Need to change type " + | |
| 113 "of storage from LChar to UChar to support " + | |
| 114 "this entity.") | |
| 115 | |
| 116 output_file.write("static const LChar staticEntityStringStorage[] = {\n") | |
| 117 output_file.write("'") | |
| 118 all_data = "" | |
| 119 entity_offset = 0 | |
| 120 first_output = True | |
| 121 saved_by_reusing = 0 | |
| 111 for entry in entries: | 122 for entry in entries: |
| 112 output_file.write("static const LChar %s[] = \"%s\";\n" % (convert_entity_to _cpp_name(entry[ENTITY]), entry[ENTITY])) | 123 check_ascii(entry[ENTITY]) |
| 124 # Reuse substrings from earlier entries. This saves 1-2000 | |
| 125 # characters, but it's O(n^2) and not very smart. The optimal | |
| 126 # solution has to solve the "Shortest Common Superstring" problem | |
| 127 # and that is NP-Complete or worse. | |
|
abarth-chromium
2014/03/17 17:37:27
I bet most of the hits are just stripping the trai
| |
| 128 # | |
| 129 # This would be even more efficient if we didn't store the | |
| 130 # semi-colon in the array but as a bit in the entry. | |
| 131 entity = entry[ENTITY] | |
| 132 already_existing_offset = all_data.find(entity) | |
| 133 if already_existing_offset != -1: | |
| 134 # Reusing space. | |
| 135 this_offset = already_existing_offset | |
| 136 saved_by_reusing += len(entity) | |
| 137 else: | |
| 138 if not first_output: | |
| 139 output_file.write(",\n'") | |
| 140 first_output = False | |
| 141 | |
| 142 # Try the end of the string and see if we can reuse that to | |
| 143 # fit the start of the new entity. | |
| 144 data_to_add = entity | |
| 145 this_offset = entity_offset | |
| 146 for truncated_len in range(len(entity) - 1, 0, -1): | |
| 147 if all_data.endswith(entity[:truncated_len]): | |
| 148 data_to_add = entity[truncated_len:] | |
| 149 this_offset = entity_offset - truncated_len | |
| 150 saved_by_reusing += truncated_len | |
| 151 break | |
| 152 | |
| 153 output_file.write("', '".join(data_to_add)) | |
| 154 all_data += data_to_add | |
| 155 output_file.write("'") | |
| 156 entity_offset += len(data_to_add) | |
| 157 assert len(entry) == 2, "We will use slot [2] in the list for the offset." | |
| 158 assert this_offset < 32768 # Stored in a 16 bit short. | |
| 159 entry.append(this_offset) | |
| 160 | |
| 161 output_file.write("};\n") | |
| 162 | |
| 163 index = {} | |
| 164 for offset, entry in enumerate(entries): | |
| 165 starting_letter = entry[ENTITY][0] | |
| 166 if starting_letter not in index: | |
| 167 index[starting_letter] = offset | |
| 113 | 168 |
| 114 output_file.write(""" | 169 output_file.write(""" |
| 115 static const HTMLEntityTableEntry staticEntityTable[%s] = {\n""" % entity_count) | 170 static const HTMLEntityTableEntry staticEntityTable[%s] = {\n""" % entity_count) |
| 116 | 171 |
| 117 index = {} | |
| 118 offset = 0 | |
| 119 for entry in entries: | 172 for entry in entries: |
| 120 letter = entry[ENTITY][0] | |
| 121 if letter not in index: | |
| 122 index[letter] = offset | |
| 123 values = entry[VALUE].split(' ') | 173 values = entry[VALUE].split(' ') |
| 124 assert len(values) <= 2, values | 174 assert len(values) <= 2, values |
| 125 output_file.write(' { %s, %s, %s, %s },\n' % ( | 175 output_file.write(' { %s, %s, %s, %s }, // &%s\n' % ( |
| 126 convert_entity_to_cpp_name(entry[ENTITY]), | 176 convert_value_to_int(values[0]), |
| 177 convert_value_to_int(values[1] if len(values) >= 2 else ""), | |
| 178 entry[2], | |
| 127 len(entry[ENTITY]), | 179 len(entry[ENTITY]), |
| 128 convert_value_to_int(values[0]), | 180 entry[ENTITY], |
| 129 convert_value_to_int(values[1] if len(values) >= 2 else ""))) | 181 )) |
| 130 offset += 1 | |
| 131 | 182 |
| 132 output_file.write("""}; | 183 output_file.write("""}; |
| 133 | 184 |
| 134 """) | 185 """) |
| 135 | 186 |
| 136 output_file.write("static const HTMLEntityTableEntry* uppercaseOffset[] = {\n") | 187 output_file.write(""" |
| 188 } | |
| 189 """) | |
| 190 | |
| 191 output_file.write("static const short uppercaseOffset[] = {\n") | |
| 137 for letter in string.ascii_uppercase: | 192 for letter in string.ascii_uppercase: |
| 138 output_file.write("%s\n" % offset_table_entry(index[letter])) | 193 output_file.write("%d,\n" % index[letter]) |
| 139 output_file.write("%s\n" % offset_table_entry(index['a'])) | 194 output_file.write("%d\n" % index['a']) |
| 140 output_file.write("""}; | 195 output_file.write("""}; |
| 141 | 196 |
| 142 static const HTMLEntityTableEntry* lowercaseOffset[] = {\n""") | 197 static const short lowercaseOffset[] = {\n""") |
| 143 for letter in string.ascii_lowercase: | 198 for letter in string.ascii_lowercase: |
| 144 output_file.write("%s\n" % offset_table_entry(index[letter])) | 199 output_file.write("%d,\n" % index[letter]) |
| 145 output_file.write("%s\n" % offset_table_entry(entity_count)) | 200 output_file.write("%d\n" % entity_count) |
| 146 output_file.write("""}; | 201 output_file.write("""}; |
| 147 | 202 |
| 203 const LChar* HTMLEntityTable::entityString(const HTMLEntityTableEntry& entry) | |
| 204 { | |
| 205 return staticEntityStringStorage + entry.entityOffset; | |
| 206 } | |
| 207 | |
| 208 LChar HTMLEntityTableEntry::lastCharacter() const | |
| 209 { | |
| 210 return HTMLEntityTable::entityString(*this)[length - 1]; | |
| 148 } | 211 } |
| 149 | 212 |
| 150 const HTMLEntityTableEntry* HTMLEntityTable::firstEntryStartingWith(UChar c) | 213 const HTMLEntityTableEntry* HTMLEntityTable::firstEntryStartingWith(UChar c) |
| 151 { | 214 { |
| 152 if (c >= 'A' && c <= 'Z') | 215 if (c >= 'A' && c <= 'Z') |
| 153 return uppercaseOffset[c - 'A']; | 216 return &staticEntityTable[uppercaseOffset[c - 'A']]; |
| 154 if (c >= 'a' && c <= 'z') | 217 if (c >= 'a' && c <= 'z') |
| 155 return lowercaseOffset[c - 'a']; | 218 return &staticEntityTable[lowercaseOffset[c - 'a']]; |
| 156 return 0; | 219 return 0; |
| 157 } | 220 } |
| 158 | 221 |
| 159 const HTMLEntityTableEntry* HTMLEntityTable::lastEntryStartingWith(UChar c) | 222 const HTMLEntityTableEntry* HTMLEntityTable::lastEntryStartingWith(UChar c) |
| 160 { | 223 { |
| 161 if (c >= 'A' && c <= 'Z') | 224 if (c >= 'A' && c <= 'Z') |
| 162 return uppercaseOffset[c - 'A' + 1] - 1; | 225 return &staticEntityTable[uppercaseOffset[c - 'A' + 1]] - 1; |
| 163 if (c >= 'a' && c <= 'z') | 226 if (c >= 'a' && c <= 'z') |
| 164 return lowercaseOffset[c - 'a' + 1] - 1; | 227 return &staticEntityTable[lowercaseOffset[c - 'a' + 1]] - 1; |
| 165 return 0; | 228 return 0; |
| 166 } | 229 } |
| 167 | 230 |
| 168 const HTMLEntityTableEntry* HTMLEntityTable::firstEntry() | 231 const HTMLEntityTableEntry* HTMLEntityTable::firstEntry() |
| 169 { | 232 { |
| 170 return &staticEntityTable[0]; | 233 return &staticEntityTable[0]; |
| 171 } | 234 } |
| 172 | 235 |
| 173 const HTMLEntityTableEntry* HTMLEntityTable::lastEntry() | 236 const HTMLEntityTableEntry* HTMLEntityTable::lastEntry() |
| 174 { | 237 { |
| 175 return &staticEntityTable[%s - 1]; | 238 return &staticEntityTable[%s - 1]; |
| 176 } | 239 } |
| 177 | 240 |
| 178 } | 241 } |
| 179 """ % entity_count) | 242 """ % entity_count) |
| OLD | NEW |