OLD | NEW |
| (Empty) |
1 /* | |
2 ********************************************************************** | |
3 * Copyright (C) 2014, International Business Machines | |
4 * Corporation and others. All Rights Reserved. | |
5 ********************************************************************** | |
6 * file name: bitset.cpp | |
7 * encoding: US-ASCII | |
8 * tab size: 8 (not used) | |
9 * indentation:4 | |
10 * | |
11 * created on: 2007jan15 | |
12 * created by: Markus Scherer | |
13 * | |
14 * Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet | |
15 * using a folded bit set consisting of a 1k-entry index table and a | |
16 * compacted array of 64-bit words. | |
17 * Uses a simple hash table for compaction. | |
18 * Uses the original set for supplementary code points. | |
19 */ | |
20 | |
21 #include "unicode/utypes.h" | |
22 #include "unicont.h" | |
23 #include "cmemory.h" // for UPRV_LENGTHOF | |
24 | |
25 /* | |
26 * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point. | |
27 * Hashes 64-bit words and maps them to 16-bit integers which are | |
28 * assigned in order of new incoming words for subsequent storage | |
29 * in a contiguous array. | |
30 */ | |
31 struct BMPBitHash : public UObject { | |
32 int64_t keys[0x800]; // 2k | |
33 uint16_t values[0x800]; | |
34 uint16_t reverse[0x400]; | |
35 uint16_t count; | |
36 const int32_t prime=1301; // Less than 2k. | |
37 | |
38 BMPBitHash() : count(0) { | |
39 // Fill values[] with 0xffff. | |
40 uprv_memset(values, 0xff, sizeof(values)); | |
41 } | |
42 | |
43 /* | |
44 * Map a key to an integer count. | |
45 * Map at most 1k=0x400 different keys with this data structure. | |
46 */ | |
47 uint16_t map(int64_t key) { | |
48 int32_t hash=(int32_t)(key>>55)&0x1ff; | |
49 hash^=(int32_t)(key>>44)&0x7ff; | |
50 hash^=(int32_t)(key>>33)&0x7ff; | |
51 hash^=(int32_t)(key>>22)&0x7ff; | |
52 hash^=(int32_t)(key>>11)&0x7ff; | |
53 hash^=(int32_t)key&0x7ff; | |
54 for(;;) { | |
55 if(values[hash]==0xffff) { | |
56 // Unused slot. | |
57 keys[hash]=key; | |
58 reverse[count]=hash; | |
59 return values[hash]=count++; | |
60 } else if(keys[hash]==key) { | |
61 // Found a slot with this key. | |
62 return values[hash]; | |
63 } else { | |
64 // Used slot with a different key, move to another slot. | |
65 hash=(hash+prime)&0x7ff; | |
66 } | |
67 } | |
68 } | |
69 | |
70 uint16_t countKeys() const { return count; } | |
71 | |
72 /* | |
73 * Invert the hash map: Fill an array of length countKeys() with the keys | |
74 * indexed by their mapped values. | |
75 */ | |
76 void invert(int64_t *k) const { | |
77 uint16_t i; | |
78 | |
79 for(i=0; i<count; ++i) { | |
80 k[i]=keys[reverse[i]]; | |
81 } | |
82 } | |
83 }; | |
84 | |
85 class BitSet : public UObject, public UnicodeContainable { | |
86 public: | |
87 BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), rest
Set(set.clone()) { | |
88 if(U_FAILURE(errorCode)) { | |
89 return; | |
90 } | |
91 BMPBitHash *bitHash=new BMPBitHash; | |
92 if(bitHash==NULL || restSet==NULL) { | |
93 errorCode=U_MEMORY_ALLOCATION_ERROR; | |
94 return; | |
95 } | |
96 | |
97 UnicodeSetIterator iter(set); | |
98 int64_t b; | |
99 UChar32 start, end; | |
100 int32_t prevIndex, i, j; | |
101 | |
102 b=0; // Not necessary but makes compilers happy. | |
103 prevIndex=-1; | |
104 for(;;) { | |
105 if(iter.nextRange() && !iter.isString()) { | |
106 start=iter.getCodepoint(); | |
107 end=iter.getCodepointEnd(); | |
108 } else { | |
109 start=0x10000; | |
110 } | |
111 i=start>>6; | |
112 if(prevIndex!=i) { | |
113 // Finish the end of the previous range. | |
114 if(prevIndex<0) { | |
115 prevIndex=0; | |
116 } else { | |
117 index[prevIndex++]=bitHash->map(b); | |
118 } | |
119 // Fill all-zero entries between ranges. | |
120 if(prevIndex<i) { | |
121 uint16_t zero=bitHash->map(0); | |
122 do { | |
123 index[prevIndex++]=zero; | |
124 } while(prevIndex<i); | |
125 } | |
126 b=0; | |
127 } | |
128 if(start>0xffff) { | |
129 break; | |
130 } | |
131 b|=~((INT64_C(1)<<(start&0x3f))-1); | |
132 j=end>>6; | |
133 if(i<j) { | |
134 // Set bits for the start of the range. | |
135 index[i++]=bitHash->map(b); | |
136 // Fill all-one entries inside the range. | |
137 if(i<j) { | |
138 uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff)); | |
139 do { | |
140 index[i++]=all; | |
141 } while(i<j); | |
142 } | |
143 b=INT64_C(0xffffffffffffffff); | |
144 } | |
145 /* i==j */ | |
146 b&=(INT64_C(1)<<(end&0x3f))-1; | |
147 prevIndex=j; | |
148 } | |
149 | |
150 if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) { | |
151 bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8); | |
152 } | |
153 if(bits!=NULL) { | |
154 bitHash->invert(bits); | |
155 } else { | |
156 bits=shortBits; | |
157 errorCode=U_MEMORY_ALLOCATION_ERROR; | |
158 return; | |
159 } | |
160 | |
161 latin1Set[0]=(uint32_t)bits[0]; | |
162 latin1Set[1]=(uint32_t)(bits[0]>>32); | |
163 latin1Set[2]=(uint32_t)bits[1]; | |
164 latin1Set[3]=(uint32_t)(bits[1]>>32); | |
165 latin1Set[4]=(uint32_t)bits[2]; | |
166 latin1Set[5]=(uint32_t)(bits[2]>>32); | |
167 latin1Set[6]=(uint32_t)bits[3]; | |
168 latin1Set[7]=(uint32_t)(bits[3]>>32); | |
169 | |
170 restSet.remove(0, 0xffff); | |
171 } | |
172 | |
173 ~BitSet() { | |
174 if(bits!=shortBits) { | |
175 uprv_free(bits); | |
176 } | |
177 delete restSet; | |
178 } | |
179 | |
180 UBool contains(UChar32 c) const { | |
181 if((uint32_t)c<=0xff) { | |
182 return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0); | |
183 } else if((uint32_t)c<0xffff) { | |
184 return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0); | |
185 } else { | |
186 return restSet->contains(c); | |
187 } | |
188 } | |
189 | |
190 private: | |
191 uint16_t index[0x400]; | |
192 int64_t shortBits[32]; | |
193 int64_t *bits; | |
194 | |
195 uint32_t latin1Bits[8]; | |
196 | |
197 UnicodeSet *restSet; | |
198 }; | |
OLD | NEW |