Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(586)

Side by Side Diff: Source/core/html/parser/create-html-entity-table

Issue 199103002: Better storage of HTML entities. (Closed) Base URL: https://chromium.googlesource.com/chromium/blink.git@master
Patch Set: Created 6 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « Source/core/html/parser/HTMLEntityTable.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # Copyright (c) 2010 Google Inc. All rights reserved. 2 # Copyright (c) 2010 Google Inc. All rights reserved.
3 # 3 #
4 # Redistribution and use in source and binary forms, with or without 4 # Redistribution and use in source and binary forms, with or without
5 # modification, are permitted provided that the following conditions are 5 # modification, are permitted provided that the following conditions are
6 # met: 6 # met:
7 # 7 #
8 # * Redistributions of source code must retain the above copyright 8 # * Redistributions of source code must retain the above copyright
9 # notice, this list of conditions and the following disclaimer. 9 # notice, this list of conditions and the following disclaimer.
10 # * Redistributions in binary form must reproduce the above 10 # * Redistributions in binary form must reproduce the above
11 # copyright notice, this list of conditions and the following disclaimer 11 # copyright notice, this list of conditions and the following disclaimer
12 # in the documentation and/or other materials provided with the 12 # in the documentation and/or other materials provided with the
13 # distribution. 13 # distribution.
14 # * Neither the name of Google Inc. nor the names of its 14 # * Neither the name of Google Inc. nor the names of its
15 # contributors may be used to endorse or promote products derived from 15 # contributors may be used to endorse or promote products derived from
16 # this software without specific prior written permission. 16 # this software without specific prior written permission.
17 # 17 #
18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 29
30 """This python script creates the raw data that is our entity
31 database. The representation is one string database containing all
32 strings we could need, and then a mapping from offset+length -> entity
33 data. That is compact, easy to use and efficient."""
34
30 import csv 35 import csv
31 import os.path 36 import os.path
32 import string 37 import string
33 import sys 38 import sys
34 39
35 ENTITY = 0 40 ENTITY = 0
36 VALUE = 1 41 VALUE = 1
37 42
38 def convert_entity_to_cpp_name(entity):
39 postfix = "EntityName"
40 if entity[-1] == ";":
41 return "%sSemicolon%s" % (entity[:-1], postfix)
42 return "%s%s" % (entity, postfix)
43
44
45 def convert_value_to_int(value): 43 def convert_value_to_int(value):
46 if not value: 44 if not value:
47 return "0"; 45 return "0";
48 assert(value[0] == "U") 46 assert(value[0] == "U")
49 assert(value[1] == "+") 47 assert(value[1] == "+")
50 return "0x" + value[2:] 48 return "0x" + value[2:]
51 49
52 50
53 def offset_table_entry(offset): 51 def offset_table_entry(offset):
54 return " &staticEntityTable[%s]," % offset 52 return " &staticEntityTable[%s]," % offset
55 53
56 54
57 program_name = os.path.basename(__file__) 55 program_name = os.path.basename(__file__)
58 if len(sys.argv) < 4 or sys.argv[1] != "-o": 56 if len(sys.argv) < 4 or sys.argv[1] != "-o":
59 # Python 3, change to: print("Usage: %s -o OUTPUT_FILE INPUT_FILE" % program _name, file=sys.stderr) 57 # Python 3, change to: print("Usage: %s -o OUTPUT_FILE INPUT_FILE" % program _name, file=sys.stderr)
60 sys.stderr.write("Usage: %s -o OUTPUT_FILE INPUT_FILE\n" % program_name) 58 sys.stderr.write("Usage: %s -o OUTPUT_FILE INPUT_FILE\n" % program_name)
61 exit(1) 59 exit(1)
62 60
63 output_path = sys.argv[2] 61 output_path = sys.argv[2]
64 input_path = sys.argv[3] 62 input_path = sys.argv[3]
65 63
66 html_entity_names_file = open(input_path) 64 with open(input_path) as html_entity_names_file:
67 entries = list(csv.reader(html_entity_names_file)) 65 entries = list(csv.reader(html_entity_names_file))
68 html_entity_names_file.close()
69 66
70 entries.sort(key = lambda entry: entry[ENTITY]) 67 entries.sort(key = lambda entry: entry[ENTITY])
71 entity_count = len(entries) 68 entity_count = len(entries)
72 69
73 output_file = open(output_path, "w") 70 output_file = open(output_path, "w")
74 71
75 output_file.write("""/* 72 output_file.write("""/*
76 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 73 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
77 * 74 *
78 * Redistribution and use in source and binary forms, with or without 75 * Redistribution and use in source and binary forms, with or without
(...skipping 11 matching lines...) Expand all
90 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 87 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
91 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 88 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
92 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 89 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
93 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 90 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
94 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 91 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
95 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 92 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
96 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 93 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
97 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 94 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
98 */ 95 */
99 96
100 // THIS FILE IS GENERATED BY WebCore/html/parser/create-html-entity-table 97 // THIS FILE IS GENERATED BY core/html/parser/create-html-entity-table
101 // DO NOT EDIT (unless you are a ninja)! 98 // DO NOT EDIT (unless you are a ninja)!
102 99
103 #include "config.h" 100 #include "config.h"
104 #include "core/html/parser/HTMLEntityTable.h" 101 #include "core/html/parser/HTMLEntityTable.h"
105 102
106 namespace WebCore { 103 namespace WebCore {
107 104
108 namespace { 105 namespace {
109 """) 106 """)
110 107
108 assert len(entries) > 0, "Code assumes a non-empty entity array."
109 def check_ascii(entity_string):
110 for c in entity_string:
111 code = ord(c)
112 assert 0 <= code <= 127, (c + " is not ASCII. Need to change type " +
113 "of storage from LChar to UChar to support " +
114 "this entity.")
115
116 output_file.write("static const LChar staticEntityStringStorage[] = {\n")
117 output_file.write("'")
118 all_data = ""
119 entity_offset = 0
120 first_output = True
121 saved_by_reusing = 0
111 for entry in entries: 122 for entry in entries:
112 output_file.write("static const LChar %s[] = \"%s\";\n" % (convert_entity_to _cpp_name(entry[ENTITY]), entry[ENTITY])) 123 check_ascii(entry[ENTITY])
124 # Reuse substrings from earlier entries. This saves 1-2000
125 # characters, but it's O(n^2) and not very smart. The optimal
126 # solution has to solve the "Shortest Common Superstring" problem
127 # and that is NP-Complete or worse.
abarth-chromium 2014/03/17 17:37:27 I bet most of the hits are just stripping the trai
128 #
129 # This would be even more efficient if we didn't store the
130 # semi-colon in the array but as a bit in the entry.
131 entity = entry[ENTITY]
132 already_existing_offset = all_data.find(entity)
133 if already_existing_offset != -1:
134 # Reusing space.
135 this_offset = already_existing_offset
136 saved_by_reusing += len(entity)
137 else:
138 if not first_output:
139 output_file.write(",\n'")
140 first_output = False
141
142 # Try the end of the string and see if we can reuse that to
143 # fit the start of the new entity.
144 data_to_add = entity
145 this_offset = entity_offset
146 for truncated_len in range(len(entity) - 1, 0, -1):
147 if all_data.endswith(entity[:truncated_len]):
148 data_to_add = entity[truncated_len:]
149 this_offset = entity_offset - truncated_len
150 saved_by_reusing += truncated_len
151 break
152
153 output_file.write("', '".join(data_to_add))
154 all_data += data_to_add
155 output_file.write("'")
156 entity_offset += len(data_to_add)
157 assert len(entry) == 2, "We will use slot [2] in the list for the offset."
158 assert this_offset < 32768 # Stored in a 16 bit short.
159 entry.append(this_offset)
160
161 output_file.write("};\n")
162
163 index = {}
164 for offset, entry in enumerate(entries):
165 starting_letter = entry[ENTITY][0]
166 if starting_letter not in index:
167 index[starting_letter] = offset
113 168
114 output_file.write(""" 169 output_file.write("""
115 static const HTMLEntityTableEntry staticEntityTable[%s] = {\n""" % entity_count) 170 static const HTMLEntityTableEntry staticEntityTable[%s] = {\n""" % entity_count)
116 171
117 index = {}
118 offset = 0
119 for entry in entries: 172 for entry in entries:
120 letter = entry[ENTITY][0]
121 if letter not in index:
122 index[letter] = offset
123 values = entry[VALUE].split(' ') 173 values = entry[VALUE].split(' ')
124 assert len(values) <= 2, values 174 assert len(values) <= 2, values
125 output_file.write(' { %s, %s, %s, %s },\n' % ( 175 output_file.write(' { %s, %s, %s, %s }, // &%s\n' % (
126 convert_entity_to_cpp_name(entry[ENTITY]), 176 convert_value_to_int(values[0]),
177 convert_value_to_int(values[1] if len(values) >= 2 else ""),
178 entry[2],
127 len(entry[ENTITY]), 179 len(entry[ENTITY]),
128 convert_value_to_int(values[0]), 180 entry[ENTITY],
129 convert_value_to_int(values[1] if len(values) >= 2 else ""))) 181 ))
130 offset += 1
131 182
132 output_file.write("""}; 183 output_file.write("""};
133 184
134 """) 185 """)
135 186
136 output_file.write("static const HTMLEntityTableEntry* uppercaseOffset[] = {\n") 187 output_file.write("""
188 }
189 """)
190
191 output_file.write("static const short uppercaseOffset[] = {\n")
137 for letter in string.ascii_uppercase: 192 for letter in string.ascii_uppercase:
138 output_file.write("%s\n" % offset_table_entry(index[letter])) 193 output_file.write("%d,\n" % index[letter])
139 output_file.write("%s\n" % offset_table_entry(index['a'])) 194 output_file.write("%d\n" % index['a'])
140 output_file.write("""}; 195 output_file.write("""};
141 196
142 static const HTMLEntityTableEntry* lowercaseOffset[] = {\n""") 197 static const short lowercaseOffset[] = {\n""")
143 for letter in string.ascii_lowercase: 198 for letter in string.ascii_lowercase:
144 output_file.write("%s\n" % offset_table_entry(index[letter])) 199 output_file.write("%d,\n" % index[letter])
145 output_file.write("%s\n" % offset_table_entry(entity_count)) 200 output_file.write("%d\n" % entity_count)
146 output_file.write("""}; 201 output_file.write("""};
147 202
203 const LChar* HTMLEntityTable::entityString(const HTMLEntityTableEntry& entry)
204 {
205 return staticEntityStringStorage + entry.entityOffset;
206 }
207
208 LChar HTMLEntityTableEntry::lastCharacter() const
209 {
210 return HTMLEntityTable::entityString(*this)[length - 1];
148 } 211 }
149 212
150 const HTMLEntityTableEntry* HTMLEntityTable::firstEntryStartingWith(UChar c) 213 const HTMLEntityTableEntry* HTMLEntityTable::firstEntryStartingWith(UChar c)
151 { 214 {
152 if (c >= 'A' && c <= 'Z') 215 if (c >= 'A' && c <= 'Z')
153 return uppercaseOffset[c - 'A']; 216 return &staticEntityTable[uppercaseOffset[c - 'A']];
154 if (c >= 'a' && c <= 'z') 217 if (c >= 'a' && c <= 'z')
155 return lowercaseOffset[c - 'a']; 218 return &staticEntityTable[lowercaseOffset[c - 'a']];
156 return 0; 219 return 0;
157 } 220 }
158 221
159 const HTMLEntityTableEntry* HTMLEntityTable::lastEntryStartingWith(UChar c) 222 const HTMLEntityTableEntry* HTMLEntityTable::lastEntryStartingWith(UChar c)
160 { 223 {
161 if (c >= 'A' && c <= 'Z') 224 if (c >= 'A' && c <= 'Z')
162 return uppercaseOffset[c - 'A' + 1] - 1; 225 return &staticEntityTable[uppercaseOffset[c - 'A' + 1]] - 1;
163 if (c >= 'a' && c <= 'z') 226 if (c >= 'a' && c <= 'z')
164 return lowercaseOffset[c - 'a' + 1] - 1; 227 return &staticEntityTable[lowercaseOffset[c - 'a' + 1]] - 1;
165 return 0; 228 return 0;
166 } 229 }
167 230
168 const HTMLEntityTableEntry* HTMLEntityTable::firstEntry() 231 const HTMLEntityTableEntry* HTMLEntityTable::firstEntry()
169 { 232 {
170 return &staticEntityTable[0]; 233 return &staticEntityTable[0];
171 } 234 }
172 235
173 const HTMLEntityTableEntry* HTMLEntityTable::lastEntry() 236 const HTMLEntityTableEntry* HTMLEntityTable::lastEntry()
174 { 237 {
175 return &staticEntityTable[%s - 1]; 238 return &staticEntityTable[%s - 1];
176 } 239 }
177 240
178 } 241 }
179 """ % entity_count) 242 """ % entity_count)
OLDNEW
« no previous file with comments | « Source/core/html/parser/HTMLEntityTable.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698