Index: icu46/source/test/perf/unisetperf/draft/bitset.cpp |
=================================================================== |
--- icu46/source/test/perf/unisetperf/draft/bitset.cpp (revision 0) |
+++ icu46/source/test/perf/unisetperf/draft/bitset.cpp (revision 0) |
@@ -0,0 +1,197 @@ |
+/* |
+********************************************************************** |
+* Copyright (C) 2007, International Business Machines |
+* Corporation and others. All Rights Reserved. |
+********************************************************************** |
+* file name: bitset.cpp |
+* encoding: US-ASCII |
+* tab size: 8 (not used) |
+* indentation:4 |
+* |
+* created on: 2007jan15 |
+* created by: Markus Scherer |
+* |
+* Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet |
+* using a folded bit set consisting of a 1k-entry index table and a |
+* compacted array of 64-bit words. |
+* Uses a simple hash table for compaction. |
+* Uses the original set for supplementary code points. |
+*/ |
+ |
+#include "unicode/utypes.h" |
+#include "unicont.h" |
+ |
+/* |
+ * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point. |
+ * Hashes 64-bit words and maps them to 16-bit integers which are |
+ * assigned in order of new incoming words for subsequent storage |
+ * in a contiguous array. |
+ */ |
+struct BMPBitHash : public UObject { |
+ int64_t keys[0x800]; // 2k |
+ uint16_t values[0x800]; |
+ uint16_t reverse[0x400]; |
+ uint16_t count; |
+ const int32_t prime=1301; // Less than 2k. |
+ |
+ BMPBitHash() : count(0) { |
+ // Fill values[] with 0xffff. |
+ uprv_memset(values, 0xff, sizeof(values)); |
+ } |
+ |
+ /* |
+ * Map a key to an integer count. |
+ * Map at most 1k=0x400 different keys with this data structure. |
+ */ |
+ uint16_t map(int64_t key) { |
+ int32_t hash=(int32_t)(key>>55)&0x1ff; |
+ hash^=(int32_t)(key>>44)&0x7ff; |
+ hash^=(int32_t)(key>>33)&0x7ff; |
+ hash^=(int32_t)(key>>22)&0x7ff; |
+ hash^=(int32_t)(key>>11)&0x7ff; |
+ hash^=(int32_t)key&0x7ff; |
+ for(;;) { |
+ if(values[hash]==0xffff) { |
+ // Unused slot. |
+ keys[hash]=key; |
+ reverse[count]=hash; |
+ return values[hash]=count++; |
+ } else if(keys[hash]==key) { |
+ // Found a slot with this key. |
+ return values[hash]; |
+ } else { |
+ // Used slot with a different key, move to another slot. |
+ hash=(hash+prime)&0x7ff; |
+ } |
+ } |
+ } |
+ |
+ uint16_t countKeys() const { return count; } |
+ |
+ /* |
+ * Invert the hash map: Fill an array of length countKeys() with the keys |
+ * indexed by their mapped values. |
+ */ |
+ void invert(int64_t *k) const { |
+ uint16_t i; |
+ |
+ for(i=0; i<count; ++i) { |
+ k[i]=keys[reverse[i]]; |
+ } |
+ } |
+}; |
+ |
+class BitSet : public UObject, public UnicodeContainable { |
+public: |
+ BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) { |
+ if(U_FAILURE(errorCode)) { |
+ return; |
+ } |
+ BMPBitHash *bitHash=new BMPBitHash; |
+ if(bitHash==NULL || restSet==NULL) { |
+ errorCode=U_MEMORY_ALLOCATION_ERROR; |
+ return; |
+ } |
+ |
+ UnicodeSetIterator iter(set); |
+ int64_t b; |
+ UChar32 start, end; |
+ int32_t prevIndex, i, j; |
+ |
+ b=0; // Not necessary but makes compilers happy. |
+ prevIndex=-1; |
+ for(;;) { |
+ if(iter.nextRange() && !iter.isString()) { |
+ start=iter.getCodepoint(); |
+ end=iter.getCodepointEnd(); |
+ } else { |
+ start=0x10000; |
+ } |
+ i=start>>6; |
+ if(prevIndex!=i) { |
+ // Finish the end of the previous range. |
+ if(prevIndex<0) { |
+ prevIndex=0; |
+ } else { |
+ index[prevIndex++]=bitHash->map(b); |
+ } |
+ // Fill all-zero entries between ranges. |
+ if(prevIndex<i) { |
+ uint16_t zero=bitHash->map(0); |
+ do { |
+ index[prevIndex++]=zero; |
+ } while(prevIndex<i); |
+ } |
+ b=0; |
+ } |
+ if(start>0xffff) { |
+ break; |
+ } |
+ b|=~((INT64_C(1)<<(start&0x3f))-1); |
+ j=end>>6; |
+ if(i<j) { |
+ // Set bits for the start of the range. |
+ index[i++]=bitHash->map(b); |
+ // Fill all-one entries inside the range. |
+ if(i<j) { |
+ uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff)); |
+ do { |
+ index[i++]=all; |
+ } while(i<j); |
+ } |
+ b=INT64_C(0xffffffffffffffff); |
+ } |
+ /* i==j */ |
+ b&=(INT64_C(1)<<(end&0x3f))-1; |
+ prevIndex=j; |
+ } |
+ |
+ if(bitHash->countKeys()>LENGTHOF(shortBits)) { |
+ bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8); |
+ } |
+ if(bits!=NULL) { |
+ bitHash->invert(bits); |
+ } else { |
+ bits=shortBits; |
+ errorCode=U_MEMORY_ALLOCATION_ERROR; |
+ return; |
+ } |
+ |
+ latin1Set[0]=(uint32_t)bits[0]; |
+ latin1Set[1]=(uint32_t)(bits[0]>>32); |
+ latin1Set[2]=(uint32_t)bits[1]; |
+ latin1Set[3]=(uint32_t)(bits[1]>>32); |
+ latin1Set[4]=(uint32_t)bits[2]; |
+ latin1Set[5]=(uint32_t)(bits[2]>>32); |
+ latin1Set[6]=(uint32_t)bits[3]; |
+ latin1Set[7]=(uint32_t)(bits[3]>>32); |
+ |
+ restSet.remove(0, 0xffff); |
+ } |
+ |
+ ~BitSet() { |
+ if(bits!=shortBits) { |
+ uprv_free(bits); |
+ } |
+ delete restSet; |
+ } |
+ |
+ UBool contains(UChar32 c) const { |
+ if((uint32_t)c<=0xff) { |
+ return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0); |
+ } else if((uint32_t)c<0xffff) { |
+ return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0); |
+ } else { |
+ return restSet->contains(c); |
+ } |
+ } |
+ |
+private: |
+ uint16_t index[0x400]; |
+ int64_t shortBits[32]; |
+ int64_t *bits; |
+ |
+ uint32_t latin1Bits[8]; |
+ |
+ UnicodeSet *restSet; |
+}; |
Property changes on: icu46/source/test/perf/unisetperf/draft/bitset.cpp |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |