| OLD | NEW |
| (Empty) |
| 1 /* | |
| 2 ********************************************************************** | |
| 3 * Copyright (C) 2014, International Business Machines | |
| 4 * Corporation and others. All Rights Reserved. | |
| 5 ********************************************************************** | |
| 6 * file name: bitset.cpp | |
| 7 * encoding: US-ASCII | |
| 8 * tab size: 8 (not used) | |
| 9 * indentation:4 | |
| 10 * | |
| 11 * created on: 2007jan15 | |
| 12 * created by: Markus Scherer | |
| 13 * | |
| 14 * Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet | |
| 15 * using a folded bit set consisting of a 1k-entry index table and a | |
| 16 * compacted array of 64-bit words. | |
| 17 * Uses a simple hash table for compaction. | |
| 18 * Uses the original set for supplementary code points. | |
| 19 */ | |
| 20 | |
| 21 #include "unicode/utypes.h" | |
| 22 #include "unicont.h" | |
| 23 #include "cmemory.h" // for UPRV_LENGTHOF | |
| 24 | |
| 25 /* | |
| 26 * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point. | |
| 27 * Hashes 64-bit words and maps them to 16-bit integers which are | |
| 28 * assigned in order of new incoming words for subsequent storage | |
| 29 * in a contiguous array. | |
| 30 */ | |
| 31 struct BMPBitHash : public UObject { | |
| 32 int64_t keys[0x800]; // 2k | |
| 33 uint16_t values[0x800]; | |
| 34 uint16_t reverse[0x400]; | |
| 35 uint16_t count; | |
| 36 const int32_t prime=1301; // Less than 2k. | |
| 37 | |
| 38 BMPBitHash() : count(0) { | |
| 39 // Fill values[] with 0xffff. | |
| 40 uprv_memset(values, 0xff, sizeof(values)); | |
| 41 } | |
| 42 | |
| 43 /* | |
| 44 * Map a key to an integer count. | |
| 45 * Map at most 1k=0x400 different keys with this data structure. | |
| 46 */ | |
| 47 uint16_t map(int64_t key) { | |
| 48 int32_t hash=(int32_t)(key>>55)&0x1ff; | |
| 49 hash^=(int32_t)(key>>44)&0x7ff; | |
| 50 hash^=(int32_t)(key>>33)&0x7ff; | |
| 51 hash^=(int32_t)(key>>22)&0x7ff; | |
| 52 hash^=(int32_t)(key>>11)&0x7ff; | |
| 53 hash^=(int32_t)key&0x7ff; | |
| 54 for(;;) { | |
| 55 if(values[hash]==0xffff) { | |
| 56 // Unused slot. | |
| 57 keys[hash]=key; | |
| 58 reverse[count]=hash; | |
| 59 return values[hash]=count++; | |
| 60 } else if(keys[hash]==key) { | |
| 61 // Found a slot with this key. | |
| 62 return values[hash]; | |
| 63 } else { | |
| 64 // Used slot with a different key, move to another slot. | |
| 65 hash=(hash+prime)&0x7ff; | |
| 66 } | |
| 67 } | |
| 68 } | |
| 69 | |
| 70 uint16_t countKeys() const { return count; } | |
| 71 | |
| 72 /* | |
| 73 * Invert the hash map: Fill an array of length countKeys() with the keys | |
| 74 * indexed by their mapped values. | |
| 75 */ | |
| 76 void invert(int64_t *k) const { | |
| 77 uint16_t i; | |
| 78 | |
| 79 for(i=0; i<count; ++i) { | |
| 80 k[i]=keys[reverse[i]]; | |
| 81 } | |
| 82 } | |
| 83 }; | |
| 84 | |
| 85 class BitSet : public UObject, public UnicodeContainable { | |
| 86 public: | |
| 87 BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), rest
Set(set.clone()) { | |
| 88 if(U_FAILURE(errorCode)) { | |
| 89 return; | |
| 90 } | |
| 91 BMPBitHash *bitHash=new BMPBitHash; | |
| 92 if(bitHash==NULL || restSet==NULL) { | |
| 93 errorCode=U_MEMORY_ALLOCATION_ERROR; | |
| 94 return; | |
| 95 } | |
| 96 | |
| 97 UnicodeSetIterator iter(set); | |
| 98 int64_t b; | |
| 99 UChar32 start, end; | |
| 100 int32_t prevIndex, i, j; | |
| 101 | |
| 102 b=0; // Not necessary but makes compilers happy. | |
| 103 prevIndex=-1; | |
| 104 for(;;) { | |
| 105 if(iter.nextRange() && !iter.isString()) { | |
| 106 start=iter.getCodepoint(); | |
| 107 end=iter.getCodepointEnd(); | |
| 108 } else { | |
| 109 start=0x10000; | |
| 110 } | |
| 111 i=start>>6; | |
| 112 if(prevIndex!=i) { | |
| 113 // Finish the end of the previous range. | |
| 114 if(prevIndex<0) { | |
| 115 prevIndex=0; | |
| 116 } else { | |
| 117 index[prevIndex++]=bitHash->map(b); | |
| 118 } | |
| 119 // Fill all-zero entries between ranges. | |
| 120 if(prevIndex<i) { | |
| 121 uint16_t zero=bitHash->map(0); | |
| 122 do { | |
| 123 index[prevIndex++]=zero; | |
| 124 } while(prevIndex<i); | |
| 125 } | |
| 126 b=0; | |
| 127 } | |
| 128 if(start>0xffff) { | |
| 129 break; | |
| 130 } | |
| 131 b|=~((INT64_C(1)<<(start&0x3f))-1); | |
| 132 j=end>>6; | |
| 133 if(i<j) { | |
| 134 // Set bits for the start of the range. | |
| 135 index[i++]=bitHash->map(b); | |
| 136 // Fill all-one entries inside the range. | |
| 137 if(i<j) { | |
| 138 uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff)); | |
| 139 do { | |
| 140 index[i++]=all; | |
| 141 } while(i<j); | |
| 142 } | |
| 143 b=INT64_C(0xffffffffffffffff); | |
| 144 } | |
| 145 /* i==j */ | |
| 146 b&=(INT64_C(1)<<(end&0x3f))-1; | |
| 147 prevIndex=j; | |
| 148 } | |
| 149 | |
| 150 if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) { | |
| 151 bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8); | |
| 152 } | |
| 153 if(bits!=NULL) { | |
| 154 bitHash->invert(bits); | |
| 155 } else { | |
| 156 bits=shortBits; | |
| 157 errorCode=U_MEMORY_ALLOCATION_ERROR; | |
| 158 return; | |
| 159 } | |
| 160 | |
| 161 latin1Set[0]=(uint32_t)bits[0]; | |
| 162 latin1Set[1]=(uint32_t)(bits[0]>>32); | |
| 163 latin1Set[2]=(uint32_t)bits[1]; | |
| 164 latin1Set[3]=(uint32_t)(bits[1]>>32); | |
| 165 latin1Set[4]=(uint32_t)bits[2]; | |
| 166 latin1Set[5]=(uint32_t)(bits[2]>>32); | |
| 167 latin1Set[6]=(uint32_t)bits[3]; | |
| 168 latin1Set[7]=(uint32_t)(bits[3]>>32); | |
| 169 | |
| 170 restSet.remove(0, 0xffff); | |
| 171 } | |
| 172 | |
| 173 ~BitSet() { | |
| 174 if(bits!=shortBits) { | |
| 175 uprv_free(bits); | |
| 176 } | |
| 177 delete restSet; | |
| 178 } | |
| 179 | |
| 180 UBool contains(UChar32 c) const { | |
| 181 if((uint32_t)c<=0xff) { | |
| 182 return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0); | |
| 183 } else if((uint32_t)c<0xffff) { | |
| 184 return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0); | |
| 185 } else { | |
| 186 return restSet->contains(c); | |
| 187 } | |
| 188 } | |
| 189 | |
| 190 private: | |
| 191 uint16_t index[0x400]; | |
| 192 int64_t shortBits[32]; | |
| 193 int64_t *bits; | |
| 194 | |
| 195 uint32_t latin1Bits[8]; | |
| 196 | |
| 197 UnicodeSet *restSet; | |
| 198 }; | |
| OLD | NEW |