OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * Copyright (C) 2001-2003, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* |
| 6 * file name: bocsu.c |
| 7 * encoding: US-ASCII |
| 8 * tab size: 8 (not used) |
| 9 * indentation:4 |
| 10 * |
| 11 * Author: Markus W. Scherer |
| 12 * |
| 13 * Modification history: |
| 14 * 05/18/2001 weiv Made into separate module |
| 15 */ |
| 16 |
| 17 |
| 18 #include "unicode/utypes.h" |
| 19 |
| 20 #if !UCONFIG_NO_COLLATION |
| 21 |
| 22 #include "bocsu.h" |
| 23 |
| 24 /* |
| 25 * encode one difference value -0x10ffff..+0x10ffff in 1..3 bytes, |
| 26 * preserving lexical order |
| 27 */ |
| 28 U_CFUNC uint8_t * |
| 29 u_writeDiff(int32_t diff, uint8_t *p) { |
| 30 if(diff>=SLOPE_REACH_NEG_1) { |
| 31 if(diff<=SLOPE_REACH_POS_1) { |
| 32 *p++=(uint8_t)(SLOPE_MIDDLE+diff); |
| 33 } else if(diff<=SLOPE_REACH_POS_2) { |
| 34 *p++=(uint8_t)(SLOPE_START_POS_2+(diff/SLOPE_TAIL_COUNT)); |
| 35 *p++=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); |
| 36 } else if(diff<=SLOPE_REACH_POS_3) { |
| 37 p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); |
| 38 diff/=SLOPE_TAIL_COUNT; |
| 39 p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); |
| 40 *p=(uint8_t)(SLOPE_START_POS_3+(diff/SLOPE_TAIL_COUNT)); |
| 41 p+=3; |
| 42 } else { |
| 43 p[3]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); |
| 44 diff/=SLOPE_TAIL_COUNT; |
| 45 p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); |
| 46 diff/=SLOPE_TAIL_COUNT; |
| 47 p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); |
| 48 *p=SLOPE_MAX; |
| 49 p+=4; |
| 50 } |
| 51 } else { |
| 52 int32_t m; |
| 53 |
| 54 if(diff>=SLOPE_REACH_NEG_2) { |
| 55 NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); |
| 56 *p++=(uint8_t)(SLOPE_START_NEG_2+diff); |
| 57 *p++=(uint8_t)(SLOPE_MIN+m); |
| 58 } else if(diff>=SLOPE_REACH_NEG_3) { |
| 59 NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); |
| 60 p[2]=(uint8_t)(SLOPE_MIN+m); |
| 61 NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); |
| 62 p[1]=(uint8_t)(SLOPE_MIN+m); |
| 63 *p=(uint8_t)(SLOPE_START_NEG_3+diff); |
| 64 p+=3; |
| 65 } else { |
| 66 NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); |
| 67 p[3]=(uint8_t)(SLOPE_MIN+m); |
| 68 NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); |
| 69 p[2]=(uint8_t)(SLOPE_MIN+m); |
| 70 NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); |
| 71 p[1]=(uint8_t)(SLOPE_MIN+m); |
| 72 *p=SLOPE_MIN; |
| 73 p+=4; |
| 74 } |
| 75 } |
| 76 return p; |
| 77 } |
| 78 |
| 79 /* How many bytes would writeDiff() write? */ |
| 80 static int32_t |
| 81 lengthOfDiff(int32_t diff) { |
| 82 if(diff>=SLOPE_REACH_NEG_1) { |
| 83 if(diff<=SLOPE_REACH_POS_1) { |
| 84 return 1; |
| 85 } else if(diff<=SLOPE_REACH_POS_2) { |
| 86 return 2; |
| 87 } else if(diff<=SLOPE_REACH_POS_3) { |
| 88 return 3; |
| 89 } else { |
| 90 return 4; |
| 91 } |
| 92 } else { |
| 93 if(diff>=SLOPE_REACH_NEG_2) { |
| 94 return 2; |
| 95 } else if(diff>=SLOPE_REACH_NEG_3) { |
| 96 return 3; |
| 97 } else { |
| 98 return 4; |
| 99 } |
| 100 } |
| 101 } |
| 102 |
| 103 /* |
| 104 * Encode the code points of a string as |
| 105 * a sequence of byte-encoded differences (slope detection), |
| 106 * preserving lexical order. |
| 107 * |
| 108 * Optimize the difference-taking for runs of Unicode text within |
| 109 * small scripts: |
| 110 * |
| 111 * Most small scripts are allocated within aligned 128-blocks of Unicode |
| 112 * code points. Lexical order is preserved if "prev" is always moved |
| 113 * into the middle of such a block. |
| 114 * |
| 115 * Additionally, "prev" is moved from anywhere in the Unihan |
| 116 * area into the middle of that area. |
| 117 * Note that the identical-level run in a sort key is generated from |
| 118 * NFD text - there are never Hangul characters included. |
| 119 */ |
| 120 U_CFUNC int32_t |
| 121 u_writeIdenticalLevelRun(const UChar *s, int32_t length, uint8_t *p) { |
| 122 uint8_t *p0; |
| 123 int32_t c, prev; |
| 124 int32_t i; |
| 125 |
| 126 prev=0; |
| 127 p0=p; |
| 128 i=0; |
| 129 while(i<length) { |
| 130 if(prev<0x4e00 || prev>=0xa000) { |
| 131 prev=(prev&~0x7f)-SLOPE_REACH_NEG_1; |
| 132 } else { |
| 133 /* |
| 134 * Unihan U+4e00..U+9fa5: |
| 135 * double-bytes down from the upper end |
| 136 */ |
| 137 prev=0x9fff-SLOPE_REACH_POS_2; |
| 138 } |
| 139 |
| 140 UTF_NEXT_CHAR(s, i, length, c); |
| 141 p=u_writeDiff(c-prev, p); |
| 142 prev=c; |
| 143 } |
| 144 return (int32_t)(p-p0); |
| 145 } |
| 146 |
| 147 U_CFUNC int32_t |
| 148 u_writeIdenticalLevelRunTwoChars(UChar32 first, UChar32 second, uint8_t *p) { |
| 149 uint8_t *p0 = p; |
| 150 if(first<0x4e00 || first>=0xa000) { |
| 151 first=(first&~0x7f)-SLOPE_REACH_NEG_1; |
| 152 } else { |
| 153 /* |
| 154 * Unihan U+4e00..U+9fa5: |
| 155 * double-bytes down from the upper end |
| 156 */ |
| 157 first=0x9fff-SLOPE_REACH_POS_2; |
| 158 } |
| 159 |
| 160 p=u_writeDiff(second-first, p); |
| 161 return (int32_t)(p-p0); |
| 162 } |
| 163 |
| 164 /* How many bytes would writeIdenticalLevelRun() write? */ |
| 165 U_CFUNC int32_t |
| 166 u_lengthOfIdenticalLevelRun(const UChar *s, int32_t length) { |
| 167 int32_t c, prev; |
| 168 int32_t i, runLength; |
| 169 |
| 170 prev=0; |
| 171 runLength=0; |
| 172 i=0; |
| 173 while(i<length) { |
| 174 if(prev<0x4e00 || prev>=0xa000) { |
| 175 prev=(prev&~0x7f)-SLOPE_REACH_NEG_1; |
| 176 } else { |
| 177 /* |
| 178 * Unihan U+4e00..U+9fa5: |
| 179 * double-bytes down from the upper end |
| 180 */ |
| 181 prev=0x9fff-SLOPE_REACH_POS_2; |
| 182 } |
| 183 |
| 184 UTF_NEXT_CHAR(s, i, length, c); |
| 185 runLength+=lengthOfDiff(c-prev); |
| 186 prev=c; |
| 187 } |
| 188 return runLength; |
| 189 } |
| 190 |
| 191 #endif /* #if !UCONFIG_NO_COLLATION */ |
OLD | NEW |