OLD | NEW |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # Copyright (c) 2010 Google Inc. All rights reserved. | 2 # Copyright (c) 2010 Google Inc. All rights reserved. |
3 # | 3 # |
4 # Redistribution and use in source and binary forms, with or without | 4 # Redistribution and use in source and binary forms, with or without |
5 # modification, are permitted provided that the following conditions are | 5 # modification, are permitted provided that the following conditions are |
6 # met: | 6 # met: |
7 # | 7 # |
8 # * Redistributions of source code must retain the above copyright | 8 # * Redistributions of source code must retain the above copyright |
9 # notice, this list of conditions and the following disclaimer. | 9 # notice, this list of conditions and the following disclaimer. |
10 # * Redistributions in binary form must reproduce the above | 10 # * Redistributions in binary form must reproduce the above |
11 # copyright notice, this list of conditions and the following disclaimer | 11 # copyright notice, this list of conditions and the following disclaimer |
12 # in the documentation and/or other materials provided with the | 12 # in the documentation and/or other materials provided with the |
13 # distribution. | 13 # distribution. |
14 # * Neither the name of Google Inc. nor the names of its | 14 # * Neither the name of Google Inc. nor the names of its |
15 # contributors may be used to endorse or promote products derived from | 15 # contributors may be used to endorse or promote products derived from |
16 # this software without specific prior written permission. | 16 # this software without specific prior written permission. |
17 # | 17 # |
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | 18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | 19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | 20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
21 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | 21 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
22 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | 22 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
23 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | 23 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
24 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | 24 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
25 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | 25 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
26 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 26 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
27 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 27 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 28 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | 29 |
30 """This python script creates the raw data that is our entity | |
31 database. The representation is one string database containing all | |
32 strings we could need, and then a mapping from offset+length -> entity | |
33 data. That is compact, easy to use and efficient.""" | |
34 | |
30 import csv | 35 import csv |
31 import os.path | 36 import os.path |
32 import string | 37 import string |
33 import sys | 38 import sys |
34 | 39 |
35 ENTITY = 0 | 40 ENTITY = 0 |
36 VALUE = 1 | 41 VALUE = 1 |
37 | 42 |
38 def convert_entity_to_cpp_name(entity): | |
39 postfix = "EntityName" | |
40 if entity[-1] == ";": | |
41 return "%sSemicolon%s" % (entity[:-1], postfix) | |
42 return "%s%s" % (entity, postfix) | |
43 | |
44 | |
45 def convert_value_to_int(value): | 43 def convert_value_to_int(value): |
46 if not value: | 44 if not value: |
47 return "0"; | 45 return "0"; |
48 assert(value[0] == "U") | 46 assert(value[0] == "U") |
49 assert(value[1] == "+") | 47 assert(value[1] == "+") |
50 return "0x" + value[2:] | 48 return "0x" + value[2:] |
51 | 49 |
52 | 50 |
53 def offset_table_entry(offset): | 51 def offset_table_entry(offset): |
54 return " &staticEntityTable[%s]," % offset | 52 return " &staticEntityTable[%s]," % offset |
55 | 53 |
56 | 54 |
57 program_name = os.path.basename(__file__) | 55 program_name = os.path.basename(__file__) |
58 if len(sys.argv) < 4 or sys.argv[1] != "-o": | 56 if len(sys.argv) < 4 or sys.argv[1] != "-o": |
59 # Python 3, change to: print("Usage: %s -o OUTPUT_FILE INPUT_FILE" % program _name, file=sys.stderr) | 57 # Python 3, change to: print("Usage: %s -o OUTPUT_FILE INPUT_FILE" % program _name, file=sys.stderr) |
60 sys.stderr.write("Usage: %s -o OUTPUT_FILE INPUT_FILE\n" % program_name) | 58 sys.stderr.write("Usage: %s -o OUTPUT_FILE INPUT_FILE\n" % program_name) |
61 exit(1) | 59 exit(1) |
62 | 60 |
63 output_path = sys.argv[2] | 61 output_path = sys.argv[2] |
64 input_path = sys.argv[3] | 62 input_path = sys.argv[3] |
65 | 63 |
66 html_entity_names_file = open(input_path) | 64 with open(input_path) as html_entity_names_file: |
67 entries = list(csv.reader(html_entity_names_file)) | 65 entries = list(csv.reader(html_entity_names_file)) |
68 html_entity_names_file.close() | |
69 | 66 |
70 entries.sort(key = lambda entry: entry[ENTITY]) | 67 entries.sort(key = lambda entry: entry[ENTITY]) |
71 entity_count = len(entries) | 68 entity_count = len(entries) |
72 | 69 |
73 output_file = open(output_path, "w") | 70 output_file = open(output_path, "w") |
74 | 71 |
75 output_file.write("""/* | 72 output_file.write("""/* |
76 * Copyright (C) 2010 Google, Inc. All Rights Reserved. | 73 * Copyright (C) 2010 Google, Inc. All Rights Reserved. |
77 * | 74 * |
78 * Redistribution and use in source and binary forms, with or without | 75 * Redistribution and use in source and binary forms, with or without |
(...skipping 11 matching lines...) Expand all Loading... | |
90 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | 87 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
91 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | 88 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
92 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | 89 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
93 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | 90 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
94 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | 91 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
95 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 92 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
96 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 93 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
97 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 94 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
98 */ | 95 */ |
99 | 96 |
100 // THIS FILE IS GENERATED BY WebCore/html/parser/create-html-entity-table | 97 // THIS FILE IS GENERATED BY core/html/parser/create-html-entity-table |
101 // DO NOT EDIT (unless you are a ninja)! | 98 // DO NOT EDIT (unless you are a ninja)! |
102 | 99 |
103 #include "config.h" | 100 #include "config.h" |
104 #include "core/html/parser/HTMLEntityTable.h" | 101 #include "core/html/parser/HTMLEntityTable.h" |
105 | 102 |
106 namespace WebCore { | 103 namespace WebCore { |
107 | 104 |
108 namespace { | 105 namespace { |
109 """) | 106 """) |
110 | 107 |
108 assert len(entries) > 0, "Code assumes a non-empty entity array." | |
109 def check_ascii(entity_string): | |
110 for c in entity_string: | |
111 code = ord(c) | |
112 assert 0 <= code <= 127, (c + " is not ASCII. Need to change type " + | |
113 "of storage from LChar to UChar to support " + | |
114 "this entity.") | |
115 | |
116 output_file.write("static const LChar staticEntityStringStorage[] = {\n") | |
117 output_file.write("'") | |
118 all_data = "" | |
119 entity_offset = 0 | |
120 first_output = True | |
121 saved_by_reusing = 0 | |
111 for entry in entries: | 122 for entry in entries: |
112 output_file.write("static const LChar %s[] = \"%s\";\n" % (convert_entity_to _cpp_name(entry[ENTITY]), entry[ENTITY])) | 123 check_ascii(entry[ENTITY]) |
124 # Reuse substrings from earlier entries. This saves 1-2000 | |
125 # characters, but it's O(n^2) and not very smart. The optimal | |
126 # solution has to solve the "Shortest Common Superstring" problem | |
127 # and that is NP-Complete or worse. | |
abarth-chromium
2014/03/17 17:37:27
I bet most of the hits are just stripping the trai
| |
128 # | |
129 # This would be even more efficient if we didn't store the | |
130 # semi-colon in the array but as a bit in the entry. | |
131 entity = entry[ENTITY] | |
132 already_existing_offset = all_data.find(entity) | |
133 if already_existing_offset != -1: | |
134 # Reusing space. | |
135 this_offset = already_existing_offset | |
136 saved_by_reusing += len(entity) | |
137 else: | |
138 if not first_output: | |
139 output_file.write(",\n'") | |
140 first_output = False | |
141 | |
142 # Try the end of the string and see if we can reuse that to | |
143 # fit the start of the new entity. | |
144 data_to_add = entity | |
145 this_offset = entity_offset | |
146 for truncated_len in range(len(entity) - 1, 0, -1): | |
147 if all_data.endswith(entity[:truncated_len]): | |
148 data_to_add = entity[truncated_len:] | |
149 this_offset = entity_offset - truncated_len | |
150 saved_by_reusing += truncated_len | |
151 break | |
152 | |
153 output_file.write("', '".join(data_to_add)) | |
154 all_data += data_to_add | |
155 output_file.write("'") | |
156 entity_offset += len(data_to_add) | |
157 assert len(entry) == 2, "We will use slot [2] in the list for the offset." | |
158 assert this_offset < 32768 # Stored in a 16 bit short. | |
159 entry.append(this_offset) | |
160 | |
161 output_file.write("};\n") | |
162 | |
163 index = {} | |
164 for offset, entry in enumerate(entries): | |
165 starting_letter = entry[ENTITY][0] | |
166 if starting_letter not in index: | |
167 index[starting_letter] = offset | |
113 | 168 |
114 output_file.write(""" | 169 output_file.write(""" |
115 static const HTMLEntityTableEntry staticEntityTable[%s] = {\n""" % entity_count) | 170 static const HTMLEntityTableEntry staticEntityTable[%s] = {\n""" % entity_count) |
116 | 171 |
117 index = {} | |
118 offset = 0 | |
119 for entry in entries: | 172 for entry in entries: |
120 letter = entry[ENTITY][0] | |
121 if letter not in index: | |
122 index[letter] = offset | |
123 values = entry[VALUE].split(' ') | 173 values = entry[VALUE].split(' ') |
124 assert len(values) <= 2, values | 174 assert len(values) <= 2, values |
125 output_file.write(' { %s, %s, %s, %s },\n' % ( | 175 output_file.write(' { %s, %s, %s, %s }, // &%s\n' % ( |
126 convert_entity_to_cpp_name(entry[ENTITY]), | 176 convert_value_to_int(values[0]), |
177 convert_value_to_int(values[1] if len(values) >= 2 else ""), | |
178 entry[2], | |
127 len(entry[ENTITY]), | 179 len(entry[ENTITY]), |
128 convert_value_to_int(values[0]), | 180 entry[ENTITY], |
129 convert_value_to_int(values[1] if len(values) >= 2 else ""))) | 181 )) |
130 offset += 1 | |
131 | 182 |
132 output_file.write("""}; | 183 output_file.write("""}; |
133 | 184 |
134 """) | 185 """) |
135 | 186 |
136 output_file.write("static const HTMLEntityTableEntry* uppercaseOffset[] = {\n") | 187 output_file.write(""" |
188 } | |
189 """) | |
190 | |
191 output_file.write("static const short uppercaseOffset[] = {\n") | |
137 for letter in string.ascii_uppercase: | 192 for letter in string.ascii_uppercase: |
138 output_file.write("%s\n" % offset_table_entry(index[letter])) | 193 output_file.write("%d,\n" % index[letter]) |
139 output_file.write("%s\n" % offset_table_entry(index['a'])) | 194 output_file.write("%d\n" % index['a']) |
140 output_file.write("""}; | 195 output_file.write("""}; |
141 | 196 |
142 static const HTMLEntityTableEntry* lowercaseOffset[] = {\n""") | 197 static const short lowercaseOffset[] = {\n""") |
143 for letter in string.ascii_lowercase: | 198 for letter in string.ascii_lowercase: |
144 output_file.write("%s\n" % offset_table_entry(index[letter])) | 199 output_file.write("%d,\n" % index[letter]) |
145 output_file.write("%s\n" % offset_table_entry(entity_count)) | 200 output_file.write("%d\n" % entity_count) |
146 output_file.write("""}; | 201 output_file.write("""}; |
147 | 202 |
203 const LChar* HTMLEntityTable::entityString(const HTMLEntityTableEntry& entry) | |
204 { | |
205 return staticEntityStringStorage + entry.entityOffset; | |
206 } | |
207 | |
208 LChar HTMLEntityTableEntry::lastCharacter() const | |
209 { | |
210 return HTMLEntityTable::entityString(*this)[length - 1]; | |
148 } | 211 } |
149 | 212 |
150 const HTMLEntityTableEntry* HTMLEntityTable::firstEntryStartingWith(UChar c) | 213 const HTMLEntityTableEntry* HTMLEntityTable::firstEntryStartingWith(UChar c) |
151 { | 214 { |
152 if (c >= 'A' && c <= 'Z') | 215 if (c >= 'A' && c <= 'Z') |
153 return uppercaseOffset[c - 'A']; | 216 return &staticEntityTable[uppercaseOffset[c - 'A']]; |
154 if (c >= 'a' && c <= 'z') | 217 if (c >= 'a' && c <= 'z') |
155 return lowercaseOffset[c - 'a']; | 218 return &staticEntityTable[lowercaseOffset[c - 'a']]; |
156 return 0; | 219 return 0; |
157 } | 220 } |
158 | 221 |
159 const HTMLEntityTableEntry* HTMLEntityTable::lastEntryStartingWith(UChar c) | 222 const HTMLEntityTableEntry* HTMLEntityTable::lastEntryStartingWith(UChar c) |
160 { | 223 { |
161 if (c >= 'A' && c <= 'Z') | 224 if (c >= 'A' && c <= 'Z') |
162 return uppercaseOffset[c - 'A' + 1] - 1; | 225 return &staticEntityTable[uppercaseOffset[c - 'A' + 1]] - 1; |
163 if (c >= 'a' && c <= 'z') | 226 if (c >= 'a' && c <= 'z') |
164 return lowercaseOffset[c - 'a' + 1] - 1; | 227 return &staticEntityTable[lowercaseOffset[c - 'a' + 1]] - 1; |
165 return 0; | 228 return 0; |
166 } | 229 } |
167 | 230 |
168 const HTMLEntityTableEntry* HTMLEntityTable::firstEntry() | 231 const HTMLEntityTableEntry* HTMLEntityTable::firstEntry() |
169 { | 232 { |
170 return &staticEntityTable[0]; | 233 return &staticEntityTable[0]; |
171 } | 234 } |
172 | 235 |
173 const HTMLEntityTableEntry* HTMLEntityTable::lastEntry() | 236 const HTMLEntityTableEntry* HTMLEntityTable::lastEntry() |
174 { | 237 { |
175 return &staticEntityTable[%s - 1]; | 238 return &staticEntityTable[%s - 1]; |
176 } | 239 } |
177 | 240 |
178 } | 241 } |
179 """ % entity_count) | 242 """ % entity_count) |
OLD | NEW |