OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ********************************************************************** |
| 3 * Copyright (C) 2007, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ********************************************************************** |
| 6 * file name: trieset.cpp |
| 7 * encoding: US-ASCII |
| 8 * tab size: 8 (not used) |
| 9 * indentation:4 |
| 10 * |
| 11 * created on: 2007jan15 |
| 12 * created by: Markus Scherer |
| 13 * |
| 14 * Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet |
| 15 * using a UTrie with 8-bit (byte) results per code point. |
| 16 * Modifies the trie index to make the BMP linear, and uses the original set |
| 17 * for supplementary code points. |
| 18 */ |
| 19 |
| 20 #include "unicode/utypes.h" |
| 21 #include "unicont.h" |
| 22 |
| 23 #define UTRIE_GET8_LATIN1(trie) ((const uint8_t *)(trie)->data32+UTRIE_DATA_BLOC
K_LENGTH) |
| 24 |
| 25 #define UTRIE_GET8_FROM_LEAD(trie, c16) \ |
| 26 ((const uint8_t *)(trie)->data32)[ \ |
| 27 ((int32_t)((trie)->index[(c16)>>UTRIE_SHIFT])<<UTRIE_INDEX_SHIFT)+ \ |
| 28 ((c16)&UTRIE_MASK) \ |
| 29 ] |
| 30 |
| 31 class TrieSet : public UObject, public UnicodeContainable { |
| 32 public: |
| 33 TrieSet(const UnicodeSet &set, UErrorCode &errorCode) |
| 34 : trieData(NULL), latin1(NULL), restSet(set.clone()) { |
| 35 if(U_FAILURE(errorCode)) { |
| 36 return; |
| 37 } |
| 38 if(restSet==NULL) { |
| 39 errorCode=U_MEMORY_ALLOCATION_ERROR; |
| 40 return; |
| 41 } |
| 42 |
| 43 UNewTrie *newTrie=utrie_open(NULL, NULL, 0x11000, 0, 0, TRUE); |
| 44 UChar32 start, end; |
| 45 |
| 46 UnicodeSetIterator iter(set); |
| 47 |
| 48 while(iter.nextRange() && !iter.isString()) { |
| 49 start=iter.getCodepoint(); |
| 50 end=iter.getCodepointEnd(); |
| 51 if(start>0xffff) { |
| 52 break; |
| 53 } |
| 54 if(end>0xffff) { |
| 55 end=0xffff; |
| 56 } |
| 57 if(!utrie_setRange32(newTrie, start, end+1, TRUE, TRUE)) { |
| 58 errorCode=U_INTERNAL_PROGRAM_ERROR; |
| 59 return; |
| 60 } |
| 61 } |
| 62 |
| 63 // Preflight the trie length. |
| 64 int32_t length=utrie_serialize(newTrie, NULL, 0, NULL, 8, &errorCode); |
| 65 if(errorCode!=U_BUFFER_OVERFLOW_ERROR) { |
| 66 return; |
| 67 } |
| 68 |
| 69 trieData=(uint32_t *)uprv_malloc(length); |
| 70 if(trieData==NULL) { |
| 71 errorCode=U_MEMORY_ALLOCATION_ERROR; |
| 72 return; |
| 73 } |
| 74 |
| 75 errorCode=U_ZERO_ERROR; |
| 76 utrie_serialize(newTrie, trieData, length, NULL, 8, &errorCode); |
| 77 utrie_unserialize(&trie, trieData, length, &errorCode); // TODO: Implem
ent for 8-bit UTrie! |
| 78 |
| 79 if(U_SUCCESS(errorCode)) { |
| 80 // Copy the indexes for surrogate code points into the BMP range |
| 81 // for simple access across the entire BMP. |
| 82 uprv_memcpy((uint16_t *)trie.index+(0xd800>>UTRIE_SHIFT), |
| 83 trie.index+UTRIE_BMP_INDEX_LENGTH, |
| 84 (0x800>>UTRIE_SHIFT)*2); |
| 85 latin1=UTRIE_GET8_LATIN1(&trie); |
| 86 } |
| 87 |
| 88 restSet.remove(0, 0xffff); |
| 89 } |
| 90 |
| 91 ~TrieSet() { |
| 92 uprv_free(trieData); |
| 93 delete restSet; |
| 94 } |
| 95 |
| 96 UBool contains(UChar32 c) const { |
| 97 if((uint32_t)c<=0xff) { |
| 98 return (UBool)latin1[c]; |
| 99 } else if((uint32_t)c<0xffff) { |
| 100 return (UBool)UTRIE_GET8_FROM_LEAD(&trie, c); |
| 101 } else { |
| 102 return restSet->contains(c); |
| 103 } |
| 104 } |
| 105 |
| 106 private: |
| 107 uint32_t *trieData; |
| 108 const uint8_t *latin1; |
| 109 UTrie trie; |
| 110 UnicodeSet *restSet; |
| 111 }; |
OLD | NEW |