OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ********************************************************************** |
| 3 * Copyright (C) 2007, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ********************************************************************** |
| 6 * file name: bitset.cpp |
| 7 * encoding: US-ASCII |
| 8 * tab size: 8 (not used) |
| 9 * indentation:4 |
| 10 * |
| 11 * created on: 2007jan15 |
| 12 * created by: Markus Scherer |
| 13 * |
| 14 * Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet |
| 15 * using a folded bit set consisting of a 1k-entry index table and a |
| 16 * compacted array of 64-bit words. |
| 17 * Uses a simple hash table for compaction. |
| 18 * Uses the original set for supplementary code points. |
| 19 */ |
| 20 |
| 21 #include "unicode/utypes.h" |
| 22 #include "unicont.h" |
| 23 |
| 24 /* |
| 25 * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point. |
| 26 * Hashes 64-bit words and maps them to 16-bit integers which are |
| 27 * assigned in order of new incoming words for subsequent storage |
| 28 * in a contiguous array. |
| 29 */ |
| 30 struct BMPBitHash : public UObject { |
| 31 int64_t keys[0x800]; // 2k |
| 32 uint16_t values[0x800]; |
| 33 uint16_t reverse[0x400]; |
| 34 uint16_t count; |
| 35 const int32_t prime=1301; // Less than 2k. |
| 36 |
| 37 BMPBitHash() : count(0) { |
| 38 // Fill values[] with 0xffff. |
| 39 uprv_memset(values, 0xff, sizeof(values)); |
| 40 } |
| 41 |
| 42 /* |
| 43 * Map a key to an integer count. |
| 44 * Map at most 1k=0x400 different keys with this data structure. |
| 45 */ |
| 46 uint16_t map(int64_t key) { |
| 47 int32_t hash=(int32_t)(key>>55)&0x1ff; |
| 48 hash^=(int32_t)(key>>44)&0x7ff; |
| 49 hash^=(int32_t)(key>>33)&0x7ff; |
| 50 hash^=(int32_t)(key>>22)&0x7ff; |
| 51 hash^=(int32_t)(key>>11)&0x7ff; |
| 52 hash^=(int32_t)key&0x7ff; |
| 53 for(;;) { |
| 54 if(values[hash]==0xffff) { |
| 55 // Unused slot. |
| 56 keys[hash]=key; |
| 57 reverse[count]=hash; |
| 58 return values[hash]=count++; |
| 59 } else if(keys[hash]==key) { |
| 60 // Found a slot with this key. |
| 61 return values[hash]; |
| 62 } else { |
| 63 // Used slot with a different key, move to another slot. |
| 64 hash=(hash+prime)&0x7ff; |
| 65 } |
| 66 } |
| 67 } |
| 68 |
| 69 uint16_t countKeys() const { return count; } |
| 70 |
| 71 /* |
| 72 * Invert the hash map: Fill an array of length countKeys() with the keys |
| 73 * indexed by their mapped values. |
| 74 */ |
| 75 void invert(int64_t *k) const { |
| 76 uint16_t i; |
| 77 |
| 78 for(i=0; i<count; ++i) { |
| 79 k[i]=keys[reverse[i]]; |
| 80 } |
| 81 } |
| 82 }; |
| 83 |
| 84 class BitSet : public UObject, public UnicodeContainable { |
| 85 public: |
| 86 BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), rest
Set(set.clone()) { |
| 87 if(U_FAILURE(errorCode)) { |
| 88 return; |
| 89 } |
| 90 BMPBitHash *bitHash=new BMPBitHash; |
| 91 if(bitHash==NULL || restSet==NULL) { |
| 92 errorCode=U_MEMORY_ALLOCATION_ERROR; |
| 93 return; |
| 94 } |
| 95 |
| 96 UnicodeSetIterator iter(set); |
| 97 int64_t b; |
| 98 UChar32 start, end; |
| 99 int32_t prevIndex, i, j; |
| 100 |
| 101 b=0; // Not necessary but makes compilers happy. |
| 102 prevIndex=-1; |
| 103 for(;;) { |
| 104 if(iter.nextRange() && !iter.isString()) { |
| 105 start=iter.getCodepoint(); |
| 106 end=iter.getCodepointEnd(); |
| 107 } else { |
| 108 start=0x10000; |
| 109 } |
| 110 i=start>>6; |
| 111 if(prevIndex!=i) { |
| 112 // Finish the end of the previous range. |
| 113 if(prevIndex<0) { |
| 114 prevIndex=0; |
| 115 } else { |
| 116 index[prevIndex++]=bitHash->map(b); |
| 117 } |
| 118 // Fill all-zero entries between ranges. |
| 119 if(prevIndex<i) { |
| 120 uint16_t zero=bitHash->map(0); |
| 121 do { |
| 122 index[prevIndex++]=zero; |
| 123 } while(prevIndex<i); |
| 124 } |
| 125 b=0; |
| 126 } |
| 127 if(start>0xffff) { |
| 128 break; |
| 129 } |
| 130 b|=~((INT64_C(1)<<(start&0x3f))-1); |
| 131 j=end>>6; |
| 132 if(i<j) { |
| 133 // Set bits for the start of the range. |
| 134 index[i++]=bitHash->map(b); |
| 135 // Fill all-one entries inside the range. |
| 136 if(i<j) { |
| 137 uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff)); |
| 138 do { |
| 139 index[i++]=all; |
| 140 } while(i<j); |
| 141 } |
| 142 b=INT64_C(0xffffffffffffffff); |
| 143 } |
| 144 /* i==j */ |
| 145 b&=(INT64_C(1)<<(end&0x3f))-1; |
| 146 prevIndex=j; |
| 147 } |
| 148 |
| 149 if(bitHash->countKeys()>LENGTHOF(shortBits)) { |
| 150 bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8); |
| 151 } |
| 152 if(bits!=NULL) { |
| 153 bitHash->invert(bits); |
| 154 } else { |
| 155 bits=shortBits; |
| 156 errorCode=U_MEMORY_ALLOCATION_ERROR; |
| 157 return; |
| 158 } |
| 159 |
| 160 latin1Set[0]=(uint32_t)bits[0]; |
| 161 latin1Set[1]=(uint32_t)(bits[0]>>32); |
| 162 latin1Set[2]=(uint32_t)bits[1]; |
| 163 latin1Set[3]=(uint32_t)(bits[1]>>32); |
| 164 latin1Set[4]=(uint32_t)bits[2]; |
| 165 latin1Set[5]=(uint32_t)(bits[2]>>32); |
| 166 latin1Set[6]=(uint32_t)bits[3]; |
| 167 latin1Set[7]=(uint32_t)(bits[3]>>32); |
| 168 |
| 169 restSet.remove(0, 0xffff); |
| 170 } |
| 171 |
| 172 ~BitSet() { |
| 173 if(bits!=shortBits) { |
| 174 uprv_free(bits); |
| 175 } |
| 176 delete restSet; |
| 177 } |
| 178 |
| 179 UBool contains(UChar32 c) const { |
| 180 if((uint32_t)c<=0xff) { |
| 181 return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0); |
| 182 } else if((uint32_t)c<0xffff) { |
| 183 return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0); |
| 184 } else { |
| 185 return restSet->contains(c); |
| 186 } |
| 187 } |
| 188 |
| 189 private: |
| 190 uint16_t index[0x400]; |
| 191 int64_t shortBits[32]; |
| 192 int64_t *bits; |
| 193 |
| 194 uint32_t latin1Bits[8]; |
| 195 |
| 196 UnicodeSet *restSet; |
| 197 }; |
OLD | NEW |