OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ****************************************************************************** |
| 3 * |
| 4 * Copyright (C) 2000-2010, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ****************************************************************************** |
| 8 * file name: ushape.c |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2000jun29 |
| 14 * created by: Markus W. Scherer |
| 15 * |
| 16 * Arabic letter shaping implemented by Ayman Roshdy |
| 17 */ |
| 18 |
| 19 #include "unicode/utypes.h" |
| 20 #include "unicode/uchar.h" |
| 21 #include "unicode/ustring.h" |
| 22 #include "unicode/ushape.h" |
| 23 #include "cmemory.h" |
| 24 #include "putilimp.h" |
| 25 #include "ustr_imp.h" |
| 26 #include "ubidi_props.h" |
| 27 |
| 28 #if UTF_SIZE<16 |
| 29 /* |
| 30 * This implementation assumes that the internal encoding is UTF-16 |
| 31 * or UTF-32, not UTF-8. |
| 32 * The main assumption is that the Arabic characters and their |
| 33 * presentation forms each fit into a single UChar. |
| 34 * With UTF-8, they occupy 2 or 3 bytes, and more than the ASCII |
| 35 * characters. |
| 36 */ |
| 37 # error This implementation assumes UTF-16 or UTF-32 (check UTF_SIZE) |
| 38 #endif |
| 39 |
| 40 /* |
| 41 * ### TODO in general for letter shaping: |
| 42 * - the letter shaping code is UTF-16-unaware; needs update |
| 43 * + especially invertBuffer()?! |
| 44 * - needs to handle the "Arabic Tail" that is used in some legacy codepages |
| 45 * as a glyph fragment of wide-glyph letters |
| 46 * + IBM Unicode conversion tables map it to U+200B (ZWSP) |
| 47 * + IBM Egypt has proposed to encode the tail in Unicode among Arabic Presentation Forms |
| 48 */ |
| 49 |
| 50 /* definitions for Arabic letter shaping ------------------------------------ */ |
| 51 |
/*
 * Flag values, listed in ascending order.  The link tables further down are
 * written with these values as plain numbers (for example the araLink entry
 * for U+0644 is "1 + 2 + 16 + 256 * 0x6D").
 */
#define LINKR               1
#define LINKL               2
#define IRRELEVANT          4
#define APRESENT            8
#define LAMTYPE             16
#define ALEFTYPE            32
#define SHADDA              64
#define CSHADDA             128
#define COMBINE             (SHADDA+CSHADDA)

/* Individual code points (and substitute markers) used by the helpers. */
#define SPACE_CHAR          0x0020
#define HAMZA06_CHAR        0x0621
#define YEH_HAMZA_CHAR      0x0626
#define TATWEEL_CHAR        0x0640
#define LAM_CHAR            0x0644
#define OLD_TAIL_CHAR       0x200B
#define NEW_TAIL_CHAR       0xFE73
#define SHADDA_CHAR         0xFE7C
#define SHADDA_TATWEEL_CHAR 0xFE7D
#define HAMZAFE_CHAR        0xfe80
#define YEH_HAMZAFE_CHAR    0xFE89
#define TASHKEEL_SPACE_SUB  0xFFFE
#define LAMALEF_SPACE_SUB   0xFFFF

/* Mode selectors. */
#define SHAPE_MODE          0
#define DESHAPE_MODE        1
| 78 |
| 79 static UChar tailChar = OLD_TAIL_CHAR; |
| 80 static uint32_t uShapeLamalefBegin = U_SHAPE_LAMALEF_BEGIN; |
| 81 static uint32_t uShapeLamalefEnd = U_SHAPE_LAMALEF_END; |
| 82 static uint32_t uShapeTashkeelBegin = U_SHAPE_TASHKEEL_BEGIN; |
| 83 static uint32_t uShapeTashkeelEnd = U_SHAPE_TASHKEEL_END; |
| 84 static int spacesRelativeToTextBeginEnd = 0; |
| 85 |
/*
 * One flag per code point U+FEB1..U+FEBE; isSeenTailFamilyChar() returns the
 * entry at index (ch - 0xFEB1).  Each group of four code points is 1,1,0,0.
 */
static const uint8_t tailFamilyIsolatedFinal[] = {
    1, 1, 0, 0,     /* FEB1..FEB4 */
    1, 1, 0, 0,     /* FEB5..FEB8 */
    1, 1, 0, 0,     /* FEB9..FEBC */
    1, 1            /* FEBD..FEBE */
};
| 102 |
/*
 * One flag per code point U+FE70..U+FE7F, indexed by (ch - 0xFE70).
 * isTashkeelOnTatweelChar() returns the entry itself and
 * isIsolatedTashkeelChar() returns one minus the entry.
 */
static const uint8_t tashkeelMedial[] = {
    0, 1, 0, 0,     /* FE70..FE73 */
    0, 0, 0, 1,     /* FE74..FE77 */
    0, 1, 0, 1,     /* FE78..FE7B */
    0, 1, 0, 1      /* FE7C..FE7F */
};
| 121 |
| 122 static const UChar yehHamzaToYeh[] = |
| 123 { |
| 124 /* isolated*/ 0xFEEF, |
| 125 /* final */ 0xFEF0 |
| 126 }; |
| 127 |
/* The eight even values 0, 2, ... 14 (the original spells them in hex). */
static const uint8_t IrrelevantPos[] = {
    0, 2, 4, 6, 8, 10, 12, 14
};
| 132 |
| 133 |
| 134 static const UChar convertLamAlef[] = |
| 135 { |
| 136 /*FEF5*/ 0x0622, |
| 137 /*FEF6*/ 0x0622, |
| 138 /*FEF7*/ 0x0623, |
| 139 /*FEF8*/ 0x0623, |
| 140 /*FEF9*/ 0x0625, |
| 141 /*FEFA*/ 0x0625, |
| 142 /*FEFB*/ 0x0627, |
| 143 /*FEFC*/ 0x0627 |
| 144 }; |
| 145 |
/*
 * araLink: one entry per code point U+0622..U+06D3.  getLink() reads it as
 * araLink[ch - 0x0622]; the comment after each entry (or row) names the code
 * point(s) it belongs to.
 * The low byte of an entry is a sum of the flag values defined near the top
 * of the file (1, 2, 4, 8, 16, 32, 64, 128); the "256 * n" term stores a
 * second value n in the high byte.
 * NOTE(review): n is presumably the offset of the entry's presentation forms;
 * the code that consumes it is outside this chunk -- confirm before editing.
 * Where a code point range is split over two rows, both rows carry the same
 * range comment (e.g. 0x069A-0x06A3 is six entries followed by four).
 */
| 146 static const UChar araLink[178]= |
| 147 { |
| 148 1 + 32 + 256 * 0x11,/*0x0622*/ |
| 149 1 + 32 + 256 * 0x13,/*0x0623*/ |
| 150 1 + 256 * 0x15,/*0x0624*/ |
| 151 1 + 32 + 256 * 0x17,/*0x0625*/ |
| 152 1 + 2 + 256 * 0x19,/*0x0626*/ |
| 153 1 + 32 + 256 * 0x1D,/*0x0627*/ |
| 154 1 + 2 + 256 * 0x1F,/*0x0628*/ |
| 155 1 + 256 * 0x23,/*0x0629*/ |
| 156 1 + 2 + 256 * 0x25,/*0x062A*/ |
| 157 1 + 2 + 256 * 0x29,/*0x062B*/ |
| 158 1 + 2 + 256 * 0x2D,/*0x062C*/ |
| 159 1 + 2 + 256 * 0x31,/*0x062D*/ |
| 160 1 + 2 + 256 * 0x35,/*0x062E*/ |
| 161 1 + 256 * 0x39,/*0x062F*/ |
| 162 1 + 256 * 0x3B,/*0x0630*/ |
| 163 1 + 256 * 0x3D,/*0x0631*/ |
| 164 1 + 256 * 0x3F,/*0x0632*/ |
| 165 1 + 2 + 256 * 0x41,/*0x0633*/ |
| 166 1 + 2 + 256 * 0x45,/*0x0634*/ |
| 167 1 + 2 + 256 * 0x49,/*0x0635*/ |
| 168 1 + 2 + 256 * 0x4D,/*0x0636*/ |
| 169 1 + 2 + 256 * 0x51,/*0x0637*/ |
| 170 1 + 2 + 256 * 0x55,/*0x0638*/ |
| 171 1 + 2 + 256 * 0x59,/*0x0639*/ |
| 172 1 + 2 + 256 * 0x5D,/*0x063A*/ |
| 173 0, 0, 0, 0, 0, /*0x063B-0x063F*/ |
| 174 1 + 2, /*0x0640*/ |
| 175 1 + 2 + 256 * 0x61,/*0x0641*/ |
| 176 1 + 2 + 256 * 0x65,/*0x0642*/ |
| 177 1 + 2 + 256 * 0x69,/*0x0643*/ |
| 178 1 + 2 + 16 + 256 * 0x6D,/*0x0644*/ |
| 179 1 + 2 + 256 * 0x71,/*0x0645*/ |
| 180 1 + 2 + 256 * 0x75,/*0x0646*/ |
| 181 1 + 2 + 256 * 0x79,/*0x0647*/ |
| 182 1 + 256 * 0x7D,/*0x0648*/ |
| 183 1 + 256 * 0x7F,/*0x0649*/ |
| 184 1 + 2 + 256 * 0x81,/*0x064A*/ |
| 185 4 + 256 * 1, /*0x064B*/ |
| 186 4 + 128 + 256 * 1, /*0x064C*/ |
| 187 4 + 128 + 256 * 1, /*0x064D*/ |
| 188 4 + 128 + 256 * 1, /*0x064E*/ |
| 189 4 + 128 + 256 * 1, /*0x064F*/ |
| 190 4 + 128 + 256 * 1, /*0x0650*/ |
| 191 4 + 64 + 256 * 3, /*0x0651*/ |
| 192 4 + 256 * 1, /*0x0652*/ |
| 193 4 + 256 * 7, /*0x0653*/ |
| 194 4 + 256 * 8, /*0x0654*/ |
| 195 4 + 256 * 8, /*0x0655*/ |
| 196 4 + 256 * 1, /*0x0656*/ |
| 197 0, 0, 0, 0, 0, /*0x0657-0x065B*/ |
| 198 1 + 256 * 0x85,/*0x065C*/ |
| 199 1 + 256 * 0x87,/*0x065D*/ |
| 200 1 + 256 * 0x89,/*0x065E*/ |
| 201 1 + 256 * 0x8B,/*0x065F*/ |
| 202 0, 0, 0, 0, 0, /*0x0660-0x0664*/ |
| 203 0, 0, 0, 0, 0, /*0x0665-0x0669*/ |
| 204 0, 0, 0, 0, 0, 0, /*0x066A-0x066F*/ |
| 205 4 + 256 * 6, /*0x0670*/ |
| 206 1 + 8 + 256 * 0x00,/*0x0671*/ |
| 207 1 + 32, /*0x0672*/ |
| 208 1 + 32, /*0x0673*/ |
| 209 0, /*0x0674*/ |
| 210 1 + 32, /*0x0675*/ |
| 211 1, 1, /*0x0676-0x0677*/ |
| 212 1+2, 1+2, 1+2, 1+2, 1+2, 1+2, /*0x0678-0x067D*/ |
| 213 1+2+8+256 * 0x06, 1+2, 1+2, 1+2, 1+2, 1+2, /*0x067E-0x0683*/ |
| 214 1+2, 1+2, 1+2+8+256 * 0x2A, 1+2, /*0x0684-0x0687*/ |
| 215 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*0x0688-0x0691*/ |
| 216 1, 1, 1, 1, 1, 1, 1+8+256 * 0x3A, 1, /*0x0692-0x0699*/ |
| 217 1+2, 1+2, 1+2, 1+2, 1+2, 1+2, /*0x069A-0x06A3*/ |
| 218 1+2, 1+2, 1+2, 1+2, /*0x069A-0x06A3*/ |
| 219 1+2, 1+2, 1+2, 1+2, 1+2, 1+2+8+256 * 0x3E, /*0x06A4-0x06AD*/ |
| 220 1+2, 1+2, 1+2, 1+2, /*0x06A4-0x06AD*/ |
| 221 1+2, 1+2+8+256 * 0x42, 1+2, 1+2, 1+2, 1+2, /*0x06AE-0x06B7*/ |
| 222 1+2, 1+2, 1+2, 1+2, /*0x06AE-0x06B7*/ |
| 223 1+2, 1+2, 1+2, 1+2, 1+2, 1+2, /*0x06B8-0x06BF*/ |
| 224 1+2, 1+2, /*0x06B8-0x06BF*/ |
| 225 1, /*0x06C0*/ |
| 226 1+2, /*0x06C1*/ |
| 227 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*0x06C2-0x06CB*/ |
| 228 1+2+8+256 * 0xAC, /*0x06CC*/ |
| 229 1, /*0x06CD*/ |
| 230 1+2, 1+2, 1+2, 1+2, /*0x06CE-0x06D1*/ |
| 231 1, 1 /*0x06D2-0x06D3*/ |
| 232 }; |
| 233 |
/*
 * Link values for U+FB50..U+FC62; getLink() reads presALink[ch - 0xFB50].
 * One row per block of 16 code points (row label = code point without its
 * last hex digit); the final row holds only U+FC60..U+FC62.
 * Sums are written out: 3 stands for the original's "1 + 2".
 */
static const uint8_t presALink[] = {
/*        0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F */
/*FB5*/   0, 1, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0,
/*FB6*/   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*FB7*/   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0,
/*FB8*/   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
/*FB9*/   2, 3, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*FBA*/   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*FBB*/   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*FBC*/   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*FBD*/   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*FBE*/   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*FBF*/   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3,
/*FC0*/   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*FC1*/   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*FC2*/   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*FC3*/   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*FC4*/   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*FC5*/   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4,
/*FC6*/   4, 4, 4
};
| 255 |
/*
 * Link values for U+FE70..U+FEFF rows; getLink() reads presBLink[ch - 0xFE70]
 * for ch in U+FE70..U+FEFC.  One row per block of 16 code points.
 * Sums are written out: 3 stands for the original's "1 + 2".
 */
static const uint8_t presBLink[] = {
/*        0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F */
/*FE7*/   3, 3, 3, 0, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
/*FE8*/   0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 0, 1, 0,
/*FE9*/   1, 2, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2,
/*FEA*/   3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 0, 1, 0, 1, 0,
/*FEB*/   1, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2,
/*FEC*/   3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2,
/*FED*/   3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2,
/*FEE*/   3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 0,
/*FEF*/   1, 0, 1, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0
};
| 269 |
| 270 static const UChar convertFBto06[] = |
| 271 { |
| 272 /***********0******1******2******3******4******5******6******7******8******9****
**A******B******C******D******E******F***/ |
| 273 /*FB5*/ 0x671, 0x671, 0, 0, 0, 0, 0x07E, 0x07E, 0x07E, 0x07E,
0, 0, 0, 0, 0, 0, |
| 274 /*FB6*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, |
| 275 /*FB7*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x686, 0x686, 0x686, 0x686, 0, 0, |
| 276 /*FB8*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x698, 0x698, 0, 0, 0x6A9, 0x6A9, |
| 277 /*FB9*/ 0x6A9, 0x6A9, 0x6AF, 0x6AF, 0x6AF, 0x6AF, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, |
| 278 /*FBA*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, |
| 279 /*FBB*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, |
| 280 /*FBC*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, |
| 281 /*FBD*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, |
| 282 /*FBE*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, |
| 283 /*FBF*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0x6CC, 0x6CC, 0x6CC, 0x6CC |
| 284 }; |
| 285 |
/*
 * convertFEto06: the 06xx code point listed for each presentation form in
 * the rows FE7..FEF (row label = code point without its last hex digit,
 * column header = last hex digit).  The last row stops after column C, so
 * the table has one entry for each of U+FE70..U+FEFC.
 * The LamAlef forms U+FEF5..U+FEFC are listed as 0x65C..0x65F, the same
 * intermediate values changeLamAlef() returns.
 * NOTE(review): presumably read as convertFEto06[ch - 0xFE70] when
 * unshaping; the reader is outside this chunk -- confirm.
 */
| 286 static const UChar convertFEto06[] = |
| 287 { |
| 288 /***********0******1******2******3******4******5******6******7******8******9****
**A******B******C******D******E******F***/ |
| 289 /*FE7*/ 0x64B, 0x64B, 0x64C, 0x64C, 0x64D, 0x64D, 0x64E, 0x64E, 0x64F, 0x64F,
0x650, 0x650, 0x651, 0x651, 0x652, 0x652, |
| 290 /*FE8*/ 0x621, 0x622, 0x622, 0x623, 0x623, 0x624, 0x624, 0x625, 0x625, 0x626,
0x626, 0x626, 0x626, 0x627, 0x627, 0x628, |
| 291 /*FE9*/ 0x628, 0x628, 0x628, 0x629, 0x629, 0x62A, 0x62A, 0x62A, 0x62A, 0x62B,
0x62B, 0x62B, 0x62B, 0x62C, 0x62C, 0x62C, |
| 292 /*FEA*/ 0x62C, 0x62D, 0x62D, 0x62D, 0x62D, 0x62E, 0x62E, 0x62E, 0x62E, 0x62F,
0x62F, 0x630, 0x630, 0x631, 0x631, 0x632, |
| 293 /*FEB*/ 0x632, 0x633, 0x633, 0x633, 0x633, 0x634, 0x634, 0x634, 0x634, 0x635,
0x635, 0x635, 0x635, 0x636, 0x636, 0x636, |
| 294 /*FEC*/ 0x636, 0x637, 0x637, 0x637, 0x637, 0x638, 0x638, 0x638, 0x638, 0x639,
0x639, 0x639, 0x639, 0x63A, 0x63A, 0x63A, |
| 295 /*FED*/ 0x63A, 0x641, 0x641, 0x641, 0x641, 0x642, 0x642, 0x642, 0x642, 0x643,
0x643, 0x643, 0x643, 0x644, 0x644, 0x644, |
| 296 /*FEE*/ 0x644, 0x645, 0x645, 0x645, 0x645, 0x646, 0x646, 0x646, 0x646, 0x647,
0x647, 0x647, 0x647, 0x648, 0x648, 0x649, |
| 297 /*FEF*/ 0x649, 0x64A, 0x64A, 0x64A, 0x64A, 0x65C, 0x65C, 0x65D, 0x65D, 0x65E,
0x65E, 0x65F, 0x65F |
| 298 }; |
| 299 |
/*
 * 4 x 4 x 4 lookup table whose entries are all in 0..3.
 * NOTE(review): the code that indexes it is outside this chunk; the three
 * indices are presumably link values of neighbouring characters and the
 * result a shape number -- confirm.
 */
static const uint8_t shapeTable[4][4][4] = {
    /* [0] */
    {
        {0,0,0,0},
        {0,0,0,0},
        {0,1,0,3},
        {0,1,0,1}
    },
    /* [1] */
    {
        {0,0,2,2},
        {0,0,1,2},
        {0,1,1,2},
        {0,1,1,3}
    },
    /* [2] */
    {
        {0,0,0,0},
        {0,0,0,0},
        {0,1,0,3},
        {0,1,0,3}
    },
    /* [3] */
    {
        {0,0,1,2},
        {0,0,1,2},
        {0,1,1,2},
        {0,1,1,3}
    }
};
| 307 |
| 308 /* |
| 309 * This function shapes European digits to Arabic-Indic digits |
| 310 * in-place, writing over the input characters. |
| 311 * Since we know that we are only looking for BMP code points, |
| 312 * we can safely just work with code units (again, at least UTF-16). |
| 313 */ |
| 314 static void |
| 315 _shapeToArabicDigitsWithContext(UChar *s, int32_t length, |
| 316 UChar digitBase, |
| 317 UBool isLogical, UBool lastStrongWasAL) { |
| 318 const UBiDiProps *bdp; |
| 319 int32_t i; |
| 320 UChar c; |
| 321 |
| 322 bdp=ubidi_getSingleton(); |
| 323 digitBase-=0x30; |
| 324 |
| 325 /* the iteration direction depends on the type of input */ |
| 326 if(isLogical) { |
| 327 for(i=0; i<length; ++i) { |
| 328 c=s[i]; |
| 329 switch(ubidi_getClass(bdp, c)) { |
| 330 case U_LEFT_TO_RIGHT: /* L */ |
| 331 case U_RIGHT_TO_LEFT: /* R */ |
| 332 lastStrongWasAL=FALSE; |
| 333 break; |
| 334 case U_RIGHT_TO_LEFT_ARABIC: /* AL */ |
| 335 lastStrongWasAL=TRUE; |
| 336 break; |
| 337 case U_EUROPEAN_NUMBER: /* EN */ |
| 338 if(lastStrongWasAL && (uint32_t)(c-0x30)<10) { |
| 339 s[i]=(UChar)(digitBase+c); /* digitBase+(c-0x30) - digitBase
was modified above */ |
| 340 } |
| 341 break; |
| 342 default : |
| 343 break; |
| 344 } |
| 345 } |
| 346 } else { |
| 347 for(i=length; i>0; /* pre-decrement in the body */) { |
| 348 c=s[--i]; |
| 349 switch(ubidi_getClass(bdp, c)) { |
| 350 case U_LEFT_TO_RIGHT: /* L */ |
| 351 case U_RIGHT_TO_LEFT: /* R */ |
| 352 lastStrongWasAL=FALSE; |
| 353 break; |
| 354 case U_RIGHT_TO_LEFT_ARABIC: /* AL */ |
| 355 lastStrongWasAL=TRUE; |
| 356 break; |
| 357 case U_EUROPEAN_NUMBER: /* EN */ |
| 358 if(lastStrongWasAL && (uint32_t)(c-0x30)<10) { |
| 359 s[i]=(UChar)(digitBase+c); /* digitBase+(c-0x30) - digitBase
was modified above */ |
| 360 } |
| 361 break; |
| 362 default : |
| 363 break; |
| 364 } |
| 365 } |
| 366 } |
| 367 } |
| 368 |
| 369 /* |
| 370 *Name : invertBuffer |
| 371 *Function : This function inverts the buffer, it's used |
| 372 * in case the user specifies the buffer to be |
| 373 * U_SHAPE_TEXT_DIRECTION_LOGICAL |
| 374 */ |
| 375 static void |
| 376 invertBuffer(UChar *buffer,int32_t size,uint32_t options,int32_t lowlimit,int32_
t highlimit) { |
| 377 UChar temp; |
| 378 int32_t i=0,j=0; |
| 379 for(i=lowlimit,j=size-highlimit-1;i<j;i++,j--) { |
| 380 temp = buffer[i]; |
| 381 buffer[i] = buffer[j]; |
| 382 buffer[j] = temp; |
| 383 } |
| 384 } |
| 385 |
| 386 /* |
| 387 *Name : changeLamAlef |
| 388 *Function : Converts the Alef characters into an equivalent |
| 389 * LamAlef location in the 0x06xx Range, this is an |
| 390 * intermediate stage in the operation of the program |
| 391 * later it'll be converted into the 0xFExx LamAlefs |
| 392 * in the shaping function. |
| 393 */ |
| 394 static U_INLINE UChar |
| 395 changeLamAlef(UChar ch) { |
| 396 switch(ch) { |
| 397 case 0x0622 : |
| 398 return 0x065C; |
| 399 case 0x0623 : |
| 400 return 0x065D; |
| 401 case 0x0625 : |
| 402 return 0x065E; |
| 403 case 0x0627 : |
| 404 return 0x065F; |
| 405 } |
| 406 return 0; |
| 407 } |
| 408 |
| 409 /* |
| 410 *Name : getLink |
| 411 *Function : Resolves the link between the characters as |
| 412 * Arabic characters have four forms : |
| 413 * Isolated, Initial, Middle and Final Form |
| 414 */ |
| 415 static UChar |
| 416 getLink(UChar ch) { |
| 417 if(ch >= 0x0622 && ch <= 0x06D3) { |
| 418 return(araLink[ch-0x0622]); |
| 419 } else if(ch == 0x200D) { |
| 420 return(3); |
| 421 } else if(ch >= 0x206D && ch <= 0x206F) { |
| 422 return(4); |
| 423 }else if(ch >= 0xFB50 && ch <= 0xFC62) { |
| 424 return(presALink[ch-0xFB50]); |
| 425 } else if(ch >= 0xFE70 && ch <= 0xFEFC) { |
| 426 return(presBLink[ch-0xFE70]); |
| 427 }else { |
| 428 return(0); |
| 429 } |
| 430 } |
| 431 |
| 432 /* |
| 433 *Name : countSpaces |
| 434 *Function : Counts the number of spaces |
| 435 * at each end of the logical buffer |
| 436 */ |
| 437 static void |
| 438 countSpaces(UChar *dest,int32_t size,uint32_t options,int32_t *spacesCountl,int3
2_t *spacesCountr) { |
| 439 int32_t i = 0; |
| 440 int32_t countl = 0,countr = 0; |
| 441 while(dest[i] == SPACE_CHAR) { |
| 442 countl++; |
| 443 i++; |
| 444 } |
| 445 while(dest[size-1] == SPACE_CHAR) { |
| 446 countr++; |
| 447 size--; |
| 448 } |
| 449 *spacesCountl = countl; |
| 450 *spacesCountr = countr; |
| 451 } |
| 452 |
| 453 /* |
| 454 *Name : isTashkeelChar |
| 455 *Function : Returns 1 for Tashkeel characters in 06 range else return 0 |
| 456 */ |
| 457 static U_INLINE int32_t |
| 458 isTashkeelChar(UChar ch) { |
| 459 return (int32_t)( ch>=0x064B && ch<= 0x0652 ); |
| 460 } |
| 461 |
| 462 /* |
| 463 *Name : isTashkeelCharFE |
| 464 *Function : Returns 1 for Tashkeel characters in FE range else return 0 |
| 465 */ |
| 466 static U_INLINE int32_t |
| 467 isTashkeelCharFE(UChar ch) { |
| 468 return (int32_t)( ch>=0xFE70 && ch<= 0xFE7F ); |
| 469 } |
| 470 |
| 471 /* |
| 472 *Name : isAlefChar |
| 473 *Function : Returns 1 for Alef characters else return 0 |
| 474 */ |
| 475 static U_INLINE int32_t |
| 476 isAlefChar(UChar ch) { |
| 477 return (int32_t)( (ch==0x0622)||(ch==0x0623)||(ch==0x0625)||(ch==0x0627) ); |
| 478 } |
| 479 |
| 480 /* |
| 481 *Name : isLamAlefChar |
| 482 *Function : Returns 1 for LamAlef characters else return 0 |
| 483 */ |
| 484 static U_INLINE int32_t |
| 485 isLamAlefChar(UChar ch) { |
| 486 return (int32_t)((ch>=0xFEF5)&&(ch<=0xFEFC) ); |
| 487 } |
| 488 |
| 489 /*BIDI |
| 490 *Name : isTailChar |
| 491 *Function : returns 1 if the character matches one of the tail characters (0xfe
73 or 0x200b) otherwise returns 0 |
| 492 */ |
| 493 |
| 494 static U_INLINE int32_t |
| 495 isTailChar(UChar ch) { |
| 496 if(ch == OLD_TAIL_CHAR || ch == NEW_TAIL_CHAR){ |
| 497 return 1; |
| 498 }else{ |
| 499 return 0; |
| 500 } |
| 501 } |
| 502 |
| 503 /*BIDI |
| 504 *Name : isSeenTailFamilyChar |
| 505 *Function : returns 1 if the character is a seen family isolated character |
| 506 * in the FE range otherwise returns 0 |
| 507 */ |
| 508 |
| 509 static U_INLINE int32_t |
| 510 isSeenTailFamilyChar(UChar ch) { |
| 511 if(ch >= 0xfeb1 && ch < 0xfebf){ |
| 512 return tailFamilyIsolatedFinal [ch - 0xFEB1]; |
| 513 }else{ |
| 514 return 0; |
| 515 } |
| 516 } |
| 517 |
| 518 /* Name : isSeenFamilyChar |
| 519 * Function : returns 1 if the character is a seen family character in the Unic
ode |
| 520 * 06 range otherwise returns 0 |
| 521 */ |
| 522 |
| 523 static U_INLINE int32_t |
| 524 isSeenFamilyChar(UChar ch){ |
| 525 if(ch >= 0x633 && ch <= 0x636){ |
| 526 return 1; |
| 527 }else { |
| 528 return 0; |
| 529 } |
| 530 } |
| 531 |
| 532 /*Start of BIDI*/ |
| 533 /* |
| 534 *Name : isAlefMaksouraChar |
| 535 *Function : returns 1 if the character is a Alef Maksoura Final or isolated |
| 536 * otherwise returns 0 |
| 537 */ |
| 538 static U_INLINE int32_t |
| 539 isAlefMaksouraChar(UChar ch) { |
| 540 return (int32_t)( (ch == 0xFEEF) || ( ch == 0xFEF0) || (ch == 0x0649)); |
| 541 } |
| 542 |
| 543 /* |
| 544 * Name : isYehHamzaChar |
| 545 * Function : returns 1 if the character is a yehHamza isolated or yehhamza |
| 546 * final is found otherwise returns 0 |
| 547 */ |
| 548 static U_INLINE int32_t |
| 549 isYehHamzaChar(UChar ch) { |
| 550 if((ch==0xFE89)||(ch==0xFE8A)){ |
| 551 return 1; |
| 552 }else{ |
| 553 return 0; |
| 554 } |
| 555 } |
| 556 |
| 557 /* |
| 558 * Name: isTashkeelOnTatweelChar |
| 559 * Function: Checks if the Tashkeel Character is on Tatweel or not,if the |
| 560 * Tashkeel on tatweel (FE range), it returns 1 else if the |
| 561 * Tashkeel with shadda on tatweel (FC range)return 2 otherwise |
| 562 * returns 0 |
| 563 */ |
| 564 static U_INLINE int32_t |
| 565 isTashkeelOnTatweelChar(UChar ch){ |
| 566 if(ch >= 0xfe70 && ch <= 0xfe7f && ch != NEW_TAIL_CHAR && ch != 0xFE75 && ch
!= SHADDA_TATWEEL_CHAR) |
| 567 { |
| 568 return tashkeelMedial [ch - 0xFE70]; |
| 569 }else if( (ch >= 0xfcf2 && ch <= 0xfcf4) || (ch == SHADDA_TATWEEL_CHAR)) { |
| 570 return 2; |
| 571 }else{ |
| 572 return 0; |
| 573 } |
| 574 } |
| 575 |
| 576 /* |
| 577 * Name: isIsolatedTashkeelChar |
| 578 * Function: Checks if the Tashkeel Character is in the isolated form |
| 579 * (i.e. Unicode FE range) returns 1 else if the Tashkeel |
| 580 * with shadda is in the isolated form (i.e. Unicode FC range) |
| 581 * returns 2 otherwise returns 0 |
| 582 */ |
| 583 static U_INLINE int32_t |
| 584 isIsolatedTashkeelChar(UChar ch){ |
| 585 if(ch >= 0xfe70 && ch <= 0xfe7f && ch != NEW_TAIL_CHAR && ch != 0xFE75){ |
| 586 return (1 - tashkeelMedial [ch - 0xFE70]); |
| 587 }else if(ch >= 0xfc5e && ch <= 0xfc63){ |
| 588 return 1; |
| 589 }else{ |
| 590 return 0; |
| 591 } |
| 592 } |
| 593 |
| 594 |
| 595 |
| 596 |
| 597 /* |
| 598 *Name : calculateSize |
| 599 *Function : This function calculates the destSize to be used in preflighting |
| 600 * when the destSize is equal to 0 |
| 601 * It is used also to calculate the new destsize in case the |
| 602 * destination buffer will be resized. |
| 603 */ |
| 604 |
| 605 static int32_t |
| 606 calculateSize(const UChar *source, int32_t sourceLength, |
| 607 int32_t destSize,uint32_t options) { |
| 608 int32_t i = 0; |
| 609 |
| 610 int lamAlefOption = 0; |
| 611 int tashkeelOption = 0; |
| 612 |
| 613 destSize = sourceLength; |
| 614 |
| 615 if (((options&U_SHAPE_LETTERS_MASK) == U_SHAPE_LETTERS_SHAPE || |
| 616 ((options&U_SHAPE_LETTERS_MASK) == U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLAT
ED )) && |
| 617 ((options&U_SHAPE_LAMALEF_MASK) == U_SHAPE_LAMALEF_RESIZE )){ |
| 618 lamAlefOption = 1; |
| 619 } |
| 620 if((options&U_SHAPE_LETTERS_MASK) == U_SHAPE_LETTERS_SHAPE && |
| 621 ((options&U_SHAPE_TASHKEEL_MASK) == U_SHAPE_TASHKEEL_RESIZE ) ){ |
| 622 tashkeelOption = 1; |
| 623 } |
| 624 |
| 625 if(lamAlefOption || tashkeelOption){ |
| 626 if((options&U_SHAPE_TEXT_DIRECTION_MASK)==U_SHAPE_TEXT_DIRECTION_VISUAL_
LTR) { |
| 627 for(i=0;i<sourceLength;i++) { |
| 628 if( ((isAlefChar(source[i]))&& (i<(sourceLength-1)) &&(source[i+
1] == LAM_CHAR)) || (isTashkeelCharFE(source[i])) ) { |
| 629 destSize--; |
| 630 } |
| 631 } |
| 632 }else if((options&U_SHAPE_TEXT_DIRECTION_MASK)==U_SHAPE_TEXT_DIRECTI
ON_LOGICAL) { |
| 633 for(i=0;i<sourceLength;i++) { |
| 634 if( ( (source[i] == LAM_CHAR) && (i<(sourceLength-1)) && (is
AlefChar(source[i+1]))) || (isTashkeelCharFE(source[i])) ) { |
| 635 destSize--; |
| 636 } |
| 637 } |
| 638 } |
| 639 } |
| 640 |
| 641 if ((options&U_SHAPE_LETTERS_MASK) == U_SHAPE_LETTERS_UNSHAPE){ |
| 642 if ( (options&U_SHAPE_LAMALEF_MASK) == U_SHAPE_LAMALEF_RESIZE){ |
| 643 for(i=0;i<sourceLength;i++) { |
| 644 if(isLamAlefChar(source[i])) |
| 645 destSize++; |
| 646 } |
| 647 } |
| 648 } |
| 649 |
| 650 return destSize; |
| 651 } |
| 652 |
| 653 /* |
| 654 *Name : handleTashkeelWithTatweel |
| 655 *Function : Replaces Tashkeel as following: |
| 656 * Case 1 :if the Tashkeel on tatweel, replace it with Tatweel. |
| 657 * Case 2 :if the Tashkeel aggregated with Shadda on Tatweel, replace
|
| 658 * it with Shadda on Tatweel. |
| 659 * Case 3: if the Tashkeel is isolated replace it with Space. |
| 660 * |
| 661 */ |
| 662 static int32_t |
| 663 handleTashkeelWithTatweel(UChar *dest, int32_t sourceLength, |
| 664 int32_t destSize,uint32_t options, |
| 665 UErrorCode *pErrorCode) { |
| 666 int i; |
| 667 for(i = 0; i < sourceLength; i++){ |
| 668 if((isTashkeelOnTatweelChar(dest[i]) == 1)){ |
| 669 dest[i] = TATWEEL_CHAR; |
| 670 }else if((isTashkeelOnTatweelChar(dest[i]) == 2)){ |
| 671 dest[i] = SHADDA_TATWEEL_CHAR; |
| 672 }else if(isIsolatedTashkeelChar(dest[i]) && dest[i] != SHADD
A_CHAR){ |
| 673 dest[i] = SPACE_CHAR; |
| 674 } |
| 675 } |
| 676 return sourceLength; |
| 677 } |
| 678 |
| 679 |
| 680 |
| 681 /* |
| 682 *Name : handleGeneratedSpaces |
| 683 *Function : The shapeUnicode function converts Lam + Alef into LamAlef + space, |
| 684 * and Tashkeel to space. |
| 685 * handleGeneratedSpaces function puts these generated spaces |
| 686 * according to the options the user specifies. LamAlef and Tashkeel |
| 687 * spaces can be replaced at begin, at end, at near or decrease the |
| 688 * buffer size. |
| 689 * |
| 690 * There is also Auto option for LamAlef and tashkeel, which will put |
| 691 * the spaces at end of the buffer (or end of text if the user used |
| 692 * the option U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END). |
| 693 * |
| 694 * If the text type was visual_LTR and the option |
| 695 * U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END was selected the END |
| 696 * option will place the space at the beginning of the buffer and |
| 697 * BEGIN will place the space at the end of the buffer. |
| 698 */ |
| 699 |
| 700 static int32_t |
| 701 handleGeneratedSpaces(UChar *dest, int32_t sourceLength, |
| 702 int32_t destSize, |
| 703 uint32_t options, |
| 704 UErrorCode *pErrorCode ) { |
| 705 |
| 706 int32_t i = 0, j = 0; |
| 707 int32_t count = 0; |
| 708 UChar *tempbuffer=NULL; |
| 709 |
| 710 int lamAlefOption = 0; |
| 711 int tashkeelOption = 0; |
| 712 int shapingMode = SHAPE_MODE; |
| 713 |
| 714 if (shapingMode == 0){ |
| 715 if ( (options&U_SHAPE_LAMALEF_MASK) == U_SHAPE_LAMALEF_RESIZE ){ |
| 716 lamAlefOption = 1; |
| 717 } |
| 718 if ( (options&U_SHAPE_TASHKEEL_MASK) == U_SHAPE_TASHKEEL_RESIZE ){ |
| 719 tashkeelOption = 1; |
| 720 } |
| 721 } |
| 722 |
| 723 if (lamAlefOption || tashkeelOption){ |
| 724 tempbuffer = (UChar *)uprv_malloc((sourceLength+1)*U_SIZEOF_UCHAR); |
| 725 /* Test for NULL */ |
| 726 if(tempbuffer == NULL) { |
| 727 *pErrorCode = U_MEMORY_ALLOCATION_ERROR; |
| 728 return 0; |
| 729 } |
| 730 |
| 731 uprv_memset(tempbuffer, 0, (sourceLength+1)*U_SIZEOF_UCHAR); |
| 732 |
| 733 i = j = 0; count = 0; |
| 734 while(i < sourceLength) { |
| 735 if ( (lamAlefOption && dest[i] == LAMALEF_SPACE_SUB) || |
| 736 (tashkeelOption && dest[i] == TASHKEEL_SPACE_SUB) ){ |
| 737 j--; |
| 738 count++; |
| 739 } else { |
| 740 tempbuffer[j] = dest[i]; |
| 741 } |
| 742 i++; |
| 743 j++; |
| 744 } |
| 745 |
| 746 while(count >= 0) { |
| 747 tempbuffer[i] = 0x0000; |
| 748 i--; |
| 749 count--; |
| 750 } |
| 751 |
| 752 uprv_memcpy(dest, tempbuffer, sourceLength*U_SIZEOF_UCHAR); |
| 753 destSize = u_strlen(dest); |
| 754 } |
| 755 |
| 756 lamAlefOption = 0; |
| 757 |
| 758 if (shapingMode == 0){ |
| 759 if ( (options&U_SHAPE_LAMALEF_MASK) == U_SHAPE_LAMALEF_NEAR ){ |
| 760 lamAlefOption = 1; |
| 761 } |
| 762 } |
| 763 |
| 764 if (lamAlefOption){ |
| 765 /* Lam+Alef is already shaped into LamAlef + FFFF */ |
| 766 i = 0; |
| 767 while(i < sourceLength) { |
| 768 if(lamAlefOption&&dest[i] == LAMALEF_SPACE_SUB){ |
| 769 dest[i] = SPACE_CHAR; |
| 770 } |
| 771 i++; |
| 772 } |
| 773 destSize = sourceLength; |
| 774 } |
| 775 lamAlefOption = 0; |
| 776 tashkeelOption = 0; |
| 777 |
| 778 if (shapingMode == 0) { |
| 779 if ( ((options&U_SHAPE_LAMALEF_MASK) == uShapeLamalefBegin) || |
| 780 (((options&U_SHAPE_LAMALEF_MASK) == U_SHAPE_LAMALEF_AUTO ) |
| 781 && (spacesRelativeToTextBeginEnd==1)) ) { |
| 782 lamAlefOption = 1; |
| 783 } |
| 784 if ( (options&U_SHAPE_TASHKEEL_MASK) == uShapeTashkeelBegin ) { |
| 785 tashkeelOption = 1; |
| 786 } |
| 787 } |
| 788 |
| 789 if(lamAlefOption || tashkeelOption){ |
| 790 tempbuffer = (UChar *)uprv_malloc((sourceLength+1)*U_SIZEOF_UCHAR); |
| 791 |
| 792 /* Test for NULL */ |
| 793 if(tempbuffer == NULL) { |
| 794 *pErrorCode = U_MEMORY_ALLOCATION_ERROR; |
| 795 return 0; |
| 796 } |
| 797 |
| 798 uprv_memset(tempbuffer, 0, (sourceLength+1)*U_SIZEOF_UCHAR); |
| 799 i = j = sourceLength; count = 0; |
| 800 while(i >= 0) { |
| 801 if ( (lamAlefOption && dest[i] == LAMALEF_SPACE_SUB) || |
| 802 (tashkeelOption && dest[i] == TASHKEEL_SPACE_SUB) ){ |
| 803 j++; |
| 804 count++; |
| 805 }else { |
| 806 tempbuffer[j] = dest[i]; |
| 807 } |
| 808 i--; |
| 809 j--; |
| 810 } |
| 811 |
| 812 for(i=0 ;i < count; i++){ |
| 813 tempbuffer[i] = SPACE_CHAR; |
| 814 } |
| 815 |
| 816 uprv_memcpy(dest, tempbuffer, sourceLength*U_SIZEOF_UCHAR); |
| 817 destSize = sourceLength; |
| 818 } |
| 819 |
| 820 |
| 821 |
| 822 lamAlefOption = 0; |
| 823 tashkeelOption = 0; |
| 824 |
| 825 if (shapingMode == 0) { |
| 826 if ( ((options&U_SHAPE_LAMALEF_MASK) == uShapeLamalefEnd) || |
| 827 (((options&U_SHAPE_LAMALEF_MASK) == U_SHAPE_LAMALEF_AUTO ) |
| 828 && (spacesRelativeToTextBeginEnd==0)) ) { |
| 829 lamAlefOption = 1; |
| 830 } |
| 831 if ( (options&U_SHAPE_TASHKEEL_MASK) == uShapeTashkeelEnd ){ |
| 832 tashkeelOption = 1; |
| 833 } |
| 834 } |
| 835 |
| 836 if(lamAlefOption || tashkeelOption){ |
| 837 tempbuffer = (UChar *)uprv_malloc((sourceLength+1)*U_SIZEOF_UCHAR); |
| 838 /* Test for NULL */ |
| 839 if(tempbuffer == NULL) { |
| 840 *pErrorCode = U_MEMORY_ALLOCATION_ERROR; |
| 841 return 0; |
| 842 } |
| 843 |
| 844 uprv_memset(tempbuffer, 0, (sourceLength+1)*U_SIZEOF_UCHAR); |
| 845 |
| 846 i = j = 0; count = 0; |
| 847 while(i < sourceLength) { |
| 848 if ( (lamAlefOption && dest[i] == LAMALEF_SPACE_SUB) || |
| 849 (tashkeelOption && dest[i] == TASHKEEL_SPACE_SUB) ){ |
| 850 j--; |
| 851 count++; |
| 852 }else { |
| 853 tempbuffer[j] = dest[i]; |
| 854 } |
| 855 i++; |
| 856 j++; |
| 857 } |
| 858 |
| 859 while(count >= 0) { |
| 860 tempbuffer[i] = SPACE_CHAR; |
| 861 i--; |
| 862 count--; |
| 863 } |
| 864 |
| 865 uprv_memcpy(dest,tempbuffer, sourceLength*U_SIZEOF_UCHAR); |
| 866 destSize = sourceLength; |
| 867 } |
| 868 |
| 869 |
| 870 if(tempbuffer){ |
| 871 uprv_free(tempbuffer); |
| 872 } |
| 873 |
| 874 return destSize; |
| 875 } |
| 876 |
| 877 /* |
| 878 *Name :expandCompositCharAtBegin |
| 879 *Function :Expands the LamAlef character to Lam and Alef consuming the required |
| 880 * space from beginning of the buffer. If the text type was visual_LTR |
| 881 * and the option U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END was selected |
| 882 * the spaces will be located at end of buffer. |
| 883 * If there are no spaces to expand the LamAlef, an error |
| 884 * will be set to U_NO_SPACE_AVAILABLE as defined in utypes.h |
| 885 */ |
| 886 |
| 887 static int32_t |
| 888 expandCompositCharAtBegin(UChar *dest, int32_t sourceLength, int32_t destSize,UE
rrorCode *pErrorCode) { |
| 889 int32_t i = 0,j = 0; |
| 890 int32_t countl = 0; |
| 891 UChar *tempbuffer=NULL; |
| 892 |
| 893 tempbuffer = (UChar *)uprv_malloc((sourceLength+1)*U_SIZEOF_UCHAR); |
| 894 |
| 895 /* Test for NULL */ |
| 896 if(tempbuffer == NULL) { |
| 897 *pErrorCode = U_MEMORY_ALLOCATION_ERROR; |
| 898 return 0; |
| 899 } |
| 900 |
| 901 uprv_memset(tempbuffer, 0, (sourceLength+1)*U_SIZEOF_UCHAR); |
| 902 |
| 903 i = 0; |
| 904 while(dest[i] == SPACE_CHAR) { |
| 905 countl++; |
| 906 i++; |
| 907 } |
| 908 |
| 909 i = j = sourceLength-1; |
| 910 |
| 911 while(i >= 0 && j >= 0) { |
| 912 if( countl>0 && isLamAlefChar(dest[i])) { |
| 913 tempbuffer[j] = LAM_CHAR; |
| 914 tempbuffer[j-1] = convertLamAlef[ dest[i] - 0xFEF5 ]; |
| 915 j--; |
| 916 countl--; |
| 917 }else { |
| 918 if( countl == 0 && isLamAlefChar(dest[i]) ) { |
| 919 *pErrorCode=U_NO_SPACE_AVAILABLE; |
| 920 } |
| 921 tempbuffer[j] = dest[i]; |
| 922 } |
| 923 i--; |
| 924 j--; |
| 925 } |
| 926 uprv_memcpy(dest, tempbuffer, sourceLength*U_SIZEOF_UCHAR); |
| 927 |
| 928 uprv_free(tempbuffer); |
| 929 |
| 930 destSize = sourceLength; |
| 931 return destSize; |
| 932 } |
| 933 |
| 934 /* |
| 935 *Name : expandCompositCharAtEnd |
| 936 *Function : Expands the LamAlef character to Lam and Alef consuming the |
| 937 * required space from end of the buffer. If the text type was |
| 938 * Visual LTR and the option U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END |
| 939 * was used, the spaces will be consumed from begin of buffer. If |
| 940 * there are no spaces to expand the LamAlef, an error |
| 941 * will be set to U_NO_SPACE_AVAILABLE as defined in utypes.h |
| 942 */ |
| 943 |
| 944 static int32_t |
| 945 expandCompositCharAtEnd(UChar *dest, int32_t sourceLength, int32_t destSize,UErr
orCode *pErrorCode) { |
| 946 int32_t i = 0,j = 0; |
| 947 |
| 948 int32_t countr = 0; |
| 949 int32_t inpsize = sourceLength; |
| 950 |
| 951 UChar *tempbuffer=NULL; |
| 952 tempbuffer = (UChar *)uprv_malloc((sourceLength+1)*U_SIZEOF_UCHAR); |
| 953 |
| 954 /* Test for NULL */ |
| 955 if(tempbuffer == NULL) { |
| 956 *pErrorCode = U_MEMORY_ALLOCATION_ERROR; |
| 957 return 0; |
| 958 } |
| 959 |
| 960 uprv_memset(tempbuffer, 0, (sourceLength+1)*U_SIZEOF_UCHAR); |
| 961 |
| 962 while(dest[inpsize-1] == SPACE_CHAR) { |
| 963 countr++; |
| 964 inpsize--; |
| 965 } |
| 966 |
| 967 i = sourceLength - countr - 1; |
| 968 j = sourceLength - 1; |
| 969 |
| 970 while(i >= 0 && j >= 0) { |
| 971 if( countr>0 && isLamAlefChar(dest[i]) ) { |
| 972 tempbuffer[j] = LAM_CHAR; |
| 973 tempbuffer[j-1] = convertLamAlef[ dest[i] - 0xFEF5 ]; |
| 974 j--; |
| 975 countr--; |
| 976 }else { |
| 977 if ((countr == 0) && isLamAlefChar(dest[i]) ) { |
| 978 *pErrorCode=U_NO_SPACE_AVAILABLE; |
| 979 } |
| 980 tempbuffer[j] = dest[i]; |
| 981 } |
| 982 i--; |
| 983 j--; |
| 984 } |
| 985 |
| 986 if(countr > 0) { |
| 987 uprv_memmove(tempbuffer, tempbuffer+countr, sourceLength*U_SIZEOF_UCHAR)
; |
| 988 if(u_strlen(tempbuffer) < sourceLength) { |
| 989 for(i=sourceLength-1;i>=sourceLength-countr;i--) { |
| 990 tempbuffer[i] = SPACE_CHAR; |
| 991 } |
| 992 } |
| 993 } |
| 994 uprv_memcpy(dest, tempbuffer, sourceLength*U_SIZEOF_UCHAR); |
| 995 |
| 996 uprv_free(tempbuffer); |
| 997 |
| 998 destSize = sourceLength; |
| 999 return destSize; |
| 1000 } |
| 1001 |
| 1002 /* |
| 1003 *Name : expandCompositCharAtNear |
| 1004 *Function : Expands the LamAlef character into Lam + Alef, YehHamza character |
| 1005 * into Yeh + Hamza, SeenFamily character into SeenFamily character |
| 1006 * + Tail, while consuming the space next to the character. |
| 1007 * If there are no spaces next to the character, an error |
| 1008 * will be set to U_NO_SPACE_AVAILABLE as defined in utypes.h |
| 1009 */ |
| 1010 |
| 1011 static int32_t |
| 1012 expandCompositCharAtNear(UChar *dest, int32_t sourceLength, int32_t destSize,UEr
rorCode *pErrorCode, |
| 1013 int yehHamzaOption, int seenTailOption, int lamAlefOpti
on) { |
| 1014 int32_t i = 0; |
| 1015 |
| 1016 |
| 1017 UChar lamalefChar, yehhamzaChar; |
| 1018 |
| 1019 for(i = 0 ;i<=sourceLength-1;i++) { |
| 1020 if (seenTailOption && isSeenTailFamilyChar(dest[i])) { |
| 1021 if ((i>0) && (dest[i-1] == SPACE_CHAR) ) { |
| 1022 dest[i-1] = tailChar; |
| 1023 }else { |
| 1024 *pErrorCode=U_NO_SPACE_AVAILABLE; |
| 1025 } |
| 1026 }else if(yehHamzaOption && (isYehHamzaChar(dest[i])) ) { |
| 1027 if ((i>0) && (dest[i-1] == SPACE_CHAR) ) { |
| 1028 yehhamzaChar = dest[i]; |
| 1029 dest[i] = yehHamzaToYeh[yehhamzaChar - YEH_HAMZAFE_CHAR]; |
| 1030 dest[i-1] = HAMZAFE_CHAR; |
| 1031 }else { |
| 1032 |
| 1033 *pErrorCode=U_NO_SPACE_AVAILABLE; |
| 1034 } |
| 1035 }else if(lamAlefOption && isLamAlefChar(dest[i+1])) { |
| 1036 if(dest[i] == SPACE_CHAR){ |
| 1037 lamalefChar = dest[i+1]; |
| 1038 dest[i+1] = LAM_CHAR; |
| 1039 dest[i] = convertLamAlef[ lamalefChar - 0xFEF5 ]; |
| 1040 }else { |
| 1041 *pErrorCode=U_NO_SPACE_AVAILABLE; |
| 1042 } |
| 1043 } |
| 1044 } |
| 1045 destSize = sourceLength; |
| 1046 return destSize; |
| 1047 } |
| 1048 /* |
| 1049 * Name : expandCompositChar |
| 1050 * Function : LamAlef, need special handling, since it expands from one |
| 1051 * character into two characters while shaping or deshaping. |
| 1052 * In order to expand it, near or far spaces according to the |
| 1053 * options user specifies. Also buffer size can be increased. |
| 1054 * |
| 1055 * For SeenFamily characters and YehHamza only the near option is |
| 1056 * supported, while for LamAlef we can take spaces from begin, end, |
| 1057 * near or even increase the buffer size. |
| 1058 * There is also the Auto option for LamAlef only, which will first |
| 1059 * search for a space at end, begin then near, respectively. |
| 1060 * If there are no spaces to expand these characters, an error will b
e set to |
| 1061 * U_NO_SPACE_AVAILABLE as defined in utypes.h |
| 1062 */ |
| 1063 |
| 1064 static int32_t |
| 1065 expandCompositChar(UChar *dest, int32_t sourceLength, |
| 1066 int32_t destSize,uint32_t options, |
| 1067 UErrorCode *pErrorCode, int shapingMode) { |
| 1068 |
| 1069 int32_t i = 0,j = 0; |
| 1070 |
| 1071 UChar *tempbuffer=NULL; |
| 1072 int yehHamzaOption = 0; |
| 1073 int seenTailOption = 0; |
| 1074 int lamAlefOption = 0; |
| 1075 |
| 1076 if (shapingMode == 1){ |
| 1077 if ( (options&U_SHAPE_LAMALEF_MASK) == U_SHAPE_LAMALEF_AUTO){ |
| 1078 |
| 1079 if(spacesRelativeToTextBeginEnd == 0) { |
| 1080 destSize = expandCompositCharAtEnd(dest, sourceLength, destSize,
pErrorCode); |
| 1081 |
| 1082 if(*pErrorCode == U_NO_SPACE_AVAILABLE) { |
| 1083 *pErrorCode = U_ZERO_ERROR; |
| 1084 destSize = expandCompositCharAtBegin(dest, sourceLength, des
tSize, pErrorCode); |
| 1085 } |
| 1086 }else { |
| 1087 destSize = expandCompositCharAtBegin(dest, sourceLength, destSiz
e, pErrorCode); |
| 1088 |
| 1089 if(*pErrorCode == U_NO_SPACE_AVAILABLE) { |
| 1090 *pErrorCode = U_ZERO_ERROR; |
| 1091 destSize = expandCompositCharAtEnd(dest, sourceLength, destS
ize, pErrorCode); |
| 1092 } |
| 1093 } |
| 1094 |
| 1095 if(*pErrorCode == U_NO_SPACE_AVAILABLE) { |
| 1096 *pErrorCode = U_ZERO_ERROR; |
| 1097 destSize = expandCompositCharAtNear(dest, sourceLength, destSize
, pErrorCode, yehHamzaOption, |
| 1098 seenTailOption, 1); |
| 1099 } |
| 1100 } |
| 1101 } |
| 1102 |
| 1103 if (shapingMode == 1){ |
| 1104 if ( (options&U_SHAPE_LAMALEF_MASK) == uShapeLamalefEnd){ |
| 1105 destSize = expandCompositCharAtEnd(dest, sourceLength, destSize, pEr
rorCode); |
| 1106 } |
| 1107 } |
| 1108 |
| 1109 if (shapingMode == 1){ |
| 1110 if ( (options&U_SHAPE_LAMALEF_MASK) == uShapeLamalefBegin){ |
| 1111 destSize = expandCompositCharAtBegin(dest, sourceLength, destSize, p
ErrorCode); |
| 1112 } |
| 1113 } |
| 1114 |
| 1115 if (shapingMode == 0){ |
| 1116 if ((options&U_SHAPE_YEHHAMZA_MASK) == U_SHAPE_YEHHAMZA_TWOCELL_NEAR){ |
| 1117 yehHamzaOption = 1; |
| 1118 } |
| 1119 if ((options&U_SHAPE_SEEN_MASK) == U_SHAPE_SEEN_TWOCELL_NEAR){ |
| 1120 seenTailOption = 1; |
| 1121 } |
| 1122 } |
| 1123 if (shapingMode == 1) { |
| 1124 if ( (options&U_SHAPE_LAMALEF_MASK) == U_SHAPE_LAMALEF_NEAR) { |
| 1125 lamAlefOption = 1; |
| 1126 } |
| 1127 } |
| 1128 |
| 1129 |
| 1130 if (yehHamzaOption || seenTailOption || lamAlefOption){ |
| 1131 destSize = expandCompositCharAtNear(dest, sourceLength, destSize, pError
Code, yehHamzaOption, |
| 1132 seenTailOption,lamAlefOption); |
| 1133 } |
| 1134 |
| 1135 |
| 1136 if (shapingMode == 1){ |
| 1137 if ( (options&U_SHAPE_LAMALEF_MASK) == U_SHAPE_LAMALEF_RESIZE){ |
| 1138 destSize = calculateSize(dest,sourceLength,destSize,options); |
| 1139 tempbuffer = (UChar *)uprv_malloc((destSize+1)*U_SIZEOF_UCHAR); |
| 1140 |
| 1141 /* Test for NULL */ |
| 1142 if(tempbuffer == NULL) { |
| 1143 *pErrorCode = U_MEMORY_ALLOCATION_ERROR; |
| 1144 return 0; |
| 1145 } |
| 1146 |
| 1147 uprv_memset(tempbuffer, 0, (destSize+1)*U_SIZEOF_UCHAR); |
| 1148 |
| 1149 i = j = 0; |
| 1150 while(i < destSize && j < destSize) { |
| 1151 if(isLamAlefChar(dest[i]) ) { |
| 1152 tempbuffer[j] = convertLamAlef[ dest[i] - 0xFEF5 ]; |
| 1153 tempbuffer[j+1] = LAM_CHAR; |
| 1154 j++; |
| 1155 }else { |
| 1156 tempbuffer[j] = dest[i]; |
| 1157 } |
| 1158 i++; |
| 1159 j++; |
| 1160 } |
| 1161 |
| 1162 uprv_memcpy(dest, tempbuffer, destSize*U_SIZEOF_UCHAR); |
| 1163 } |
| 1164 } |
| 1165 |
| 1166 if(tempbuffer) { |
| 1167 uprv_free(tempbuffer); |
| 1168 } |
| 1169 return destSize; |
| 1170 } |
| 1171 |
/*
 *Name     : shapeUnicode
 *Function : Converts an Arabic Unicode buffer in 06xx Range into a shaped
 *           arabic Unicode buffer in FExx Range
 *
 *           dest         - text buffer, shaped in place
 *           sourceLength - number of UChars of text in dest; dest[sourceLength-1]
 *                          is read unconditionally, so it must be at least 1
 *           destSize     - ignored on input; recomputed before returning
 *           options      - U_SHAPE_* option bits, also forwarded to
 *                          handleGeneratedSpaces() and expandCompositChar()
 *           pErrorCode   - forwarded to the two helpers named above
 *           tashkeelFlag - 2: every tashkeel character in the 06xx block is
 *                             replaced by TASHKEEL_SPACE_SUB
 *                          1: a tashkeel character may get shape 1 when both
 *                             neighbours link to it
 *                          any other value: tashkeel always gets shape 0
 *
 *           Returns sourceLength, or whatever handleGeneratedSpaces() /
 *           expandCompositChar() return when they are invoked.
 */
static int32_t
shapeUnicode(UChar *dest, int32_t sourceLength,
             int32_t destSize,uint32_t options,
             UErrorCode *pErrorCode,
             int tashkeelFlag) {

    int32_t i, iend;
    int32_t step;
    int32_t lastPos,Nx, Nw;
    unsigned int Shape;
    int32_t lamalef_found = 0;
    int32_t seenfamFound = 0, yehhamzaFound =0, tashkeelFound = 0;
    UChar prevLink = 0, lastLink = 0, currLink, nextLink = 0;
    UChar wLamalef;

    /*
     * Converts the input buffer from FExx Range into 06xx Range
     * to make sure that all characters are in the 06xx range
     * even the lamalef is converted to the special region in
     * the 06xx range
     */
    if ((options & U_SHAPE_PRESERVE_PRESENTATION_MASK) == U_SHAPE_PRESERVE_PRESENTATION_NOOP) {
        for (i = 0; i < sourceLength; i++) {
            UChar inputChar = dest[i];
            if ( (inputChar >= 0xFB50) && (inputChar <= 0xFBFF)) {
                /* FBxx: a zero table entry means "no mapping", keep the character */
                UChar c = convertFBto06 [ (inputChar - 0xFB50) ];
                if (c != 0)
                    dest[i] = c;
            } else if ( (inputChar >= 0xFE70) && (inputChar <= 0xFEFC)) {
                dest[i] = convertFEto06 [ (inputChar - 0xFE70) ] ;
            } else {
                dest[i] = inputChar ;
            }
        }
    }


    /* sets the index to the end of the buffer, together with the step point to -1 */
    i = sourceLength - 1;
    iend = -1;
    step = -1;

    /*
     * This function resolves the link between the characters .
     * Arabic characters have four forms :
     * Isolated Form, Initial Form, Middle Form and Final Form
     */
    currLink = getLink(dest[i]);

    /*
     * State carried through the loop:
     *   currLink            - link value of dest[i]
     *   lastLink / lastPos  - link value and index of the most recently
     *                         visited character whose IRRELEVANT bit is clear
     *   prevLink            - value lastLink had before its latest update
     *   nextLink / Nx       - link value and index of the next character
     *                         (in scan direction) whose IRRELEVANT bit is
     *                         clear; Nx < 0 means "not looked up yet"
     */
    lastPos = i;
    Nx = -2, Nw = 0;

    while (i != iend) {
        /* If high byte of currLink > 0 then more than one shape */
        if ((currLink & 0xFF00) > 0 || (getLink(dest[i]) & IRRELEVANT) != 0) {
            Nw = i + step;
            while (Nx < 0) { /* we need to know about next char */
                if(Nw == iend) {
                    nextLink = 0;
                    /*
                     * Any non-negative value ends the look-ahead.
                     * NOTE(review): 3000 is also a valid buffer index for
                     * texts longer than 3000 units, in which case the
                     * "i == Nx" test below could match - confirm whether
                     * that matters.
                     */
                    Nx = 3000;
                } else {
                    nextLink = getLink(dest[Nw]);
                    if((nextLink & IRRELEVANT) == 0) {
                        Nx = Nw;
                    } else {
                        Nw = Nw + step;
                    }
                }
            }

            if ( ((currLink & ALEFTYPE) > 0) && ((lastLink & LAMTYPE) > 0) ) {
                lamalef_found = 1;
                wLamalef = changeLamAlef(dest[i]); /*get from 0x065C-0x065f */
                if ( wLamalef != 0) {
                    dest[i] = LAMALEF_SPACE_SUB;   /* The default case is to drop the Alef and replace */
                    dest[lastPos] =wLamalef;       /* it by LAMALEF_SPACE_SUB which is the last character in the */
                    i=lastPos;                     /* unicode private use area, this is done to make */
                }                                  /* sure that removeLamAlefSpaces() handles only the */
                lastLink = prevLink;               /* spaces generated during lamalef generation. */
                currLink = getLink(wLamalef);      /* LAMALEF_SPACE_SUB is added here and is replaced by spaces */
            }                                      /* in removeLamAlefSpaces() */

            /*
             * Remember whether a Seen-family character or YEH_HAMZA_CHAR was
             * met either at index 0 or directly after a stored space; only
             * then is expandCompositChar() called at the end.
             */
            if ((i > 0) && (dest[i-1] == SPACE_CHAR)){
                if ( isSeenFamilyChar(dest[i])){
                    seenfamFound = 1;
                } else if (dest[i] == YEH_HAMZA_CHAR) {
                    yehhamzaFound = 1;
                }
            }
            else if(i==0){
                if ( isSeenFamilyChar(dest[i])){
                    seenfamFound = 1;
                } else if (dest[i] == YEH_HAMZA_CHAR) {
                    yehhamzaFound = 1;
                }
            }

            /*
             * get the proper shape according to link ability of neighbors
             * and of character; depends on the order of the shapes
             * (isolated, initial, middle, final) in the compatibility area
             */
            Shape = shapeTable[nextLink & (LINKR + LINKL)]
                              [lastLink & (LINKR + LINKL)]
                              [currLink & (LINKR + LINKL)];

            if ((currLink & (LINKR+LINKL)) == 1) {
                /* only the low link bit is set: restrict Shape to 0 or 1 */
                Shape &= 1;
            } else if(isTashkeelChar(dest[i])) {
                /*
                 * Tashkeel gets shape 1 only with tashkeelFlag == 1, linking
                 * neighbours on both sides, a code point other than
                 * 0x064C/0x064D, and not between a Lam and an Alef.
                 */
                if( (lastLink & LINKL) && (nextLink & LINKR) && (tashkeelFlag == 1) &&
                     dest[i] != 0x064C && dest[i] != 0x064D )
                {
                    Shape = 1;
                    if( (nextLink&ALEFTYPE) == ALEFTYPE && (lastLink&LAMTYPE) == LAMTYPE ) {
                        Shape = 0;
                    }
                }
                else {
                    Shape = 0;
                }
            }
            /* true exactly when dest[i] is in 0x0600..0x06FF */
            if ((dest[i] ^ 0x0600) < 0x100) {
                if ( isTashkeelChar(dest[i]) ){
                    if (tashkeelFlag == 2){
                        dest[i] = TASHKEEL_SPACE_SUB;
                        tashkeelFound = 1;
                    }else {
                        dest[i] = 0xFE70 + IrrelevantPos[(dest[i] - 0x064B)] + Shape;
                    }
                }else if ((currLink & APRESENT) > 0) {
                    /* the high byte of the link value is the offset of the first form */
                    dest[i] = (UChar)(0xFB50 + (currLink >> 8) + Shape);
                }else if ((currLink >> 8) > 0 && (currLink & IRRELEVANT) == 0) {
                    dest[i] = (UChar)(0xFE70 + (currLink >> 8) + Shape);
                }
            }
        }

        /* move one notch forward */
        if ((currLink & IRRELEVANT) == 0) {
            prevLink = lastLink;
            lastLink = currLink;
            lastPos = i;
        }

        i = i + step;
        if (i == Nx) {
            /* reached the character found by the look-ahead: reuse its link */
            currLink = nextLink;
            Nx = -2;
        } else if(i != iend) {
            currLink = getLink(dest[i]);
        }
    }
    destSize = sourceLength;
    /* placeholders (LAMALEF_SPACE_SUB / TASHKEEL_SPACE_SUB) were written above */
    if ( (lamalef_found != 0 ) || (tashkeelFound != 0) ){
        destSize = handleGeneratedSpaces(dest,sourceLength,destSize,options,pErrorCode);
    }

    if ( (seenfamFound != 0) || (yehhamzaFound != 0) ) {
        destSize = expandCompositChar(dest, sourceLength,destSize,options,pErrorCode, SHAPE_MODE);
    }
    return destSize;
}
| 1339 |
| 1340 /* |
| 1341 *Name : deShapeUnicode |
| 1342 *Function : Converts an Arabic Unicode buffer in FExx Range into unshaped |
| 1343 * arabic Unicode buffer in 06xx Range |
| 1344 */ |
| 1345 static int32_t |
| 1346 deShapeUnicode(UChar *dest, int32_t sourceLength, |
| 1347 int32_t destSize,uint32_t options, |
| 1348 UErrorCode *pErrorCode) { |
| 1349 int32_t i = 0; |
| 1350 int32_t lamalef_found = 0; |
| 1351 int32_t yehHamzaComposeEnabled = 0; |
| 1352 int32_t seenComposeEnabled = 0; |
| 1353 |
| 1354 yehHamzaComposeEnabled = ((options&U_SHAPE_YEHHAMZA_MASK) == U_SHAPE_YEHHAMZ
A_TWOCELL_NEAR) ? 1 : 0; |
| 1355 seenComposeEnabled = ((options&U_SHAPE_SEEN_MASK) == U_SHAPE_SEEN_TWOCELL_NE
AR)? 1 : 0; |
| 1356 |
| 1357 /* |
| 1358 *This for loop changes the buffer from the Unicode FE range to |
| 1359 *the Unicode 06 range |
| 1360 */ |
| 1361 |
| 1362 for(i = 0; i < sourceLength; i++) { |
| 1363 UChar inputChar = dest[i]; |
| 1364 if ( (inputChar >= 0xFB50) && (inputChar <= 0xFBFF)) { /* FBxx Arabic ra
nge */ |
| 1365 UChar c = convertFBto06 [ (inputChar - 0xFB50) ]; |
| 1366 if (c != 0) |
| 1367 dest[i] = c; |
| 1368 } else if( (yehHamzaComposeEnabled == 1) && ((inputChar == HAMZA06_CHAR)
|| (inputChar == HAMZAFE_CHAR)) |
| 1369 && (i < (sourceLength - 1)) && isAlefMaksouraChar(dest[i+1] )) { |
| 1370 dest[i] = SPACE_CHAR; |
| 1371 dest[i+1] = YEH_HAMZA_CHAR; |
| 1372 } else if ( (seenComposeEnabled == 1) && (isTailChar(inputChar)) && (i<
(sourceLength - 1)) |
| 1373 && (isSeenTailFamilyChar(dest[i+1])) ) { |
| 1374 dest[i] = SPACE_CHAR; |
| 1375 } else if (( inputChar >= 0xFE70) && (inputChar <= 0xFEF4 )) { /* FExx A
rabic range */ |
| 1376 dest[i] = convertFEto06 [ (inputChar - 0xFE70) ]; |
| 1377 } else { |
| 1378 dest[i] = inputChar ; |
| 1379 } |
| 1380 |
| 1381 if( isLamAlefChar(dest[i]) ) |
| 1382 lamalef_found = 1; |
| 1383 } |
| 1384 |
| 1385 destSize = sourceLength; |
| 1386 if (lamalef_found != 0){ |
| 1387 destSize = expandCompositChar(dest,sourceLength,destSize,options,pErro
rCode,DESHAPE_MODE); |
| 1388 } |
| 1389 return destSize; |
| 1390 } |
| 1391 |
/*
 ****************************************
 * u_shapeArabic
 ****************************************
 *
 * Entry point: validates the arguments, optionally performs letter shaping
 * or unshaping on a temporary copy of the text, copies the result into dest
 * and finally performs digit shaping in place on dest.
 *
 * source        - input text; must not be NULL and must not overlap dest
 * sourceLength  - number of UChars in source, or -1 to have it measured
 *                 with u_strlen()
 * dest          - output buffer; may be NULL only when destCapacity is 0
 * destCapacity  - capacity of dest in UChars, must not be negative
 * options       - combination of U_SHAPE_* bits; reserved or inconsistent
 *                 combinations are rejected
 * pErrorCode    - in/out error code; the function does nothing if it is NULL
 *                 or already holds a failure
 *
 * Returns 0 on argument or allocation errors, the required length together
 * with U_BUFFER_OVERFLOW_ERROR when dest is too small, and otherwise the
 * value of u_terminateUChars() for the produced length.
 *
 * NOTE(review): spacesRelativeToTextBeginEnd, uShapeLamalef*, uShapeTashkeel*
 * and tailChar are assigned here and are not declared in this function; if
 * they are file-scope variables, concurrent calls would share them - confirm.
 */

U_CAPI int32_t U_EXPORT2
u_shapeArabic(const UChar *source, int32_t sourceLength,
              UChar *dest, int32_t destCapacity,
              uint32_t options,
              UErrorCode *pErrorCode) {

    int32_t destLength;

    /* reset the per-call settings to their defaults (done before any validation) */
    spacesRelativeToTextBeginEnd = 0;
    uShapeLamalefBegin = U_SHAPE_LAMALEF_BEGIN;
    uShapeLamalefEnd = U_SHAPE_LAMALEF_END;
    uShapeTashkeelBegin = U_SHAPE_TASHKEEL_BEGIN;
    uShapeTashkeelEnd = U_SHAPE_TASHKEEL_END;

    /* usual error checking */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* make sure that no reserved options values are used; allow dest==NULL only for preflighting */
    if( source==NULL || sourceLength<-1 || (dest==NULL && destCapacity!=0) || destCapacity<0 ||
                /* a tashkeel option cannot be combined with the TASHKEEL_ISOLATED letters mode */
                (((options&U_SHAPE_TASHKEEL_MASK) > 0) &&
                 ((options&U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED) == U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED) ) ||
                /* ... nor with unshaping */
                (((options&U_SHAPE_TASHKEEL_MASK) > 0) &&
                 ((options&U_SHAPE_LETTERS_MASK) == U_SHAPE_LETTERS_UNSHAPE)) ||
                (options&U_SHAPE_DIGIT_TYPE_RESERVED)==U_SHAPE_DIGIT_TYPE_RESERVED ||
                (options&U_SHAPE_DIGITS_MASK)==U_SHAPE_DIGITS_RESERVED ||
                /* any aggregate-tashkeel bit requires LAMALEF_RESIZE ... */
                ((options&U_SHAPE_LAMALEF_MASK) != U_SHAPE_LAMALEF_RESIZE &&
                (options&U_SHAPE_AGGREGATE_TASHKEEL_MASK) != 0) ||
                /* ... and U_SHAPE_AGGREGATE_TASHKEEL requires the TASHKEEL_ISOLATED letters mode */
                ((options&U_SHAPE_AGGREGATE_TASHKEEL_MASK) == U_SHAPE_AGGREGATE_TASHKEEL &&
                (options&U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED) != U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED)
    )
    {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    /* Validate lamalef options */
    if(((options&U_SHAPE_LAMALEF_MASK) > 0)&&
              !(((options & U_SHAPE_LAMALEF_MASK)==U_SHAPE_LAMALEF_BEGIN) ||
                ((options & U_SHAPE_LAMALEF_MASK)==U_SHAPE_LAMALEF_END ) ||
                ((options & U_SHAPE_LAMALEF_MASK)==U_SHAPE_LAMALEF_RESIZE )||
                 ((options & U_SHAPE_LAMALEF_MASK)==U_SHAPE_LAMALEF_AUTO) ||
                 ((options & U_SHAPE_LAMALEF_MASK)==U_SHAPE_LAMALEF_NEAR)))
    {
         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    /* Validate Tashkeel options */
    if(((options&U_SHAPE_TASHKEEL_MASK) > 0)&&
                   !(((options & U_SHAPE_TASHKEEL_MASK)==U_SHAPE_TASHKEEL_BEGIN) ||
                     ((options & U_SHAPE_TASHKEEL_MASK)==U_SHAPE_TASHKEEL_END )
                    ||((options & U_SHAPE_TASHKEEL_MASK)==U_SHAPE_TASHKEEL_RESIZE )||
                    ((options & U_SHAPE_TASHKEEL_MASK)==U_SHAPE_TASHKEEL_REPLACE_BY_TATWEEL)))
    {
         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    /* determine the source length */
    if(sourceLength==-1) {
        sourceLength=u_strlen(source);
    }
    /* empty input: nothing to shape, just terminate the output */
    if(sourceLength<=0) {
        return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
    }

    /* check that source and destination do not overlap */
    if( dest!=NULL &&
        ((source<=dest && dest<source+sourceLength) ||
         (dest<=source && source<dest+destCapacity))) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    /* Does Options contain the new Seen Tail Unicode code point option */
    if ( (options&SHAPE_TAIL_TYPE_MASK) == SHAPE_TAIL_NEW_UNICODE){
        tailChar = NEW_TAIL_CHAR;
    }else {
        tailChar = OLD_TAIL_CHAR;
    }

    if((options&U_SHAPE_LETTERS_MASK)!=U_SHAPE_LETTERS_NOOP) {
        /* buffer: stack storage used instead of the heap for short texts */
        UChar buffer[300];
        UChar *tempbuffer, *tempsource = NULL;
        int32_t outputSize, spacesCountl=0, spacesCountr=0;

        /*
         * Aggregate-tashkeel pre-pass: copy source into a heap buffer
         * (tempsource), merging a pair of adjacent characters into a single
         * FCxx code point when their combined link values contain all COMBINE
         * bits. From here on "source"/"sourceLength" describe that copy.
         */
        if((options&U_SHAPE_AGGREGATE_TASHKEEL_MASK)>0) {
            int32_t logical_order = (options&U_SHAPE_TEXT_DIRECTION_MASK) == U_SHAPE_TEXT_DIRECTION_LOGICAL;
            int32_t aggregate_tashkeel =
                        (options&(U_SHAPE_AGGREGATE_TASHKEEL_MASK+U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED)) ==
                        (U_SHAPE_AGGREGATE_TASHKEEL+U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED);
            /* logical text is walked front to back, otherwise back to front */
            int step=logical_order?1:-1;
            /* j: write position in tempsource; filled from the front or from the back */
            int j=logical_order?-1:2*sourceLength;
            int i=logical_order?-1:sourceLength;
            int end=logical_order?sourceLength:-1;
            /* cleared after a merge so that the merged unit is not merged again right away */
            int aggregation_possible = 1;
            UChar prev = 0;
            UChar prevLink, currLink = 0;
            int newSourceLength = 0;
            tempsource = (UChar *)uprv_malloc(2*sourceLength*U_SIZEOF_UCHAR);
            if(tempsource == NULL) {
                *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
                return 0;
            }

            while ((i+=step) != end) {
                prevLink = currLink;
                currLink = getLink(source[i]);
                if (aggregate_tashkeel && ((prevLink|currLink)&COMBINE) == COMBINE && aggregation_possible) {
                    aggregation_possible = 0;
                    /* overwrite the unit written last with the merged code point */
                    tempsource[j] = (prev<source[i]?prev:source[i])-0x064C+0xFC5E;
                    currLink = getLink(tempsource[j]);
                } else {
                    aggregation_possible = 1;
                    tempsource[j+=step] = source[i];
                    prev = source[i];
                    newSourceLength++;
                }
            }
            /* when filled from the back, the text starts at the last written position */
            source = tempsource+(logical_order?0:j);
            sourceLength = newSourceLength;
        }

        /* calculate destination size */
        /* TODO: do we ever need to do this pure preflighting? */
        if(((options&U_SHAPE_LAMALEF_MASK)==U_SHAPE_LAMALEF_RESIZE) ||
           ((options&U_SHAPE_TASHKEEL_MASK)==U_SHAPE_TASHKEEL_RESIZE)) {
            outputSize=calculateSize(source,sourceLength,destCapacity,options);
        } else {
            outputSize=sourceLength;
        }

        /* this also covers preflighting (dest==NULL, destCapacity==0) */
        if(outputSize>destCapacity) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                if (tempsource != NULL) uprv_free(tempsource);
            return outputSize;
        }

        /*
         * need a temporary buffer of size max(outputSize, sourceLength)
         * because at first we copy source->temp
         */
        if(sourceLength>outputSize) {
            outputSize=sourceLength;
        }

        /* Start of Arabic letter shaping part */
        if(outputSize<=sizeof(buffer)/U_SIZEOF_UCHAR) {
            /* fits on the stack; outputSize becomes the full stack-buffer capacity */
            outputSize=sizeof(buffer)/U_SIZEOF_UCHAR;
            tempbuffer=buffer;
        } else {
            tempbuffer = (UChar *)uprv_malloc(outputSize*U_SIZEOF_UCHAR);

            /*Test for NULL*/
            if(tempbuffer == NULL) {
                *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
                if (tempsource != NULL) uprv_free(tempsource);
                return 0;
            }
        }
        uprv_memcpy(tempbuffer, source, sourceLength*U_SIZEOF_UCHAR);
        /* the aggregated copy is no longer needed once it is in tempbuffer */
        if (tempsource != NULL){
            uprv_free(tempsource);
        }

        /* zero the part of the work buffer that lies behind the text */
        if(sourceLength<outputSize) {
            uprv_memset(tempbuffer+sourceLength, 0, (outputSize-sourceLength)*U_SIZEOF_UCHAR);
        }

        /* logical text is inverted before shaping and inverted back afterwards */
        if((options&U_SHAPE_TEXT_DIRECTION_MASK) == U_SHAPE_TEXT_DIRECTION_LOGICAL) {
            countSpaces(tempbuffer,sourceLength,options,&spacesCountl,&spacesCountr);
            invertBuffer(tempbuffer,sourceLength,options,spacesCountl,spacesCountr);
        }

        /*
         * Visual LTR text with spaces relative to the text begin/end:
         * swap the meaning of the BEGIN and END options for this call.
         */
        if((options&U_SHAPE_TEXT_DIRECTION_MASK) == U_SHAPE_TEXT_DIRECTION_VISUAL_LTR) {
            if((options&U_SHAPE_SPACES_RELATIVE_TO_TEXT_MASK) == U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END) {
                spacesRelativeToTextBeginEnd = 1;
                uShapeLamalefBegin = U_SHAPE_LAMALEF_END;
                uShapeLamalefEnd = U_SHAPE_LAMALEF_BEGIN;

                uShapeTashkeelBegin = U_SHAPE_TASHKEEL_END;
                uShapeTashkeelEnd = U_SHAPE_TASHKEEL_BEGIN;
            }
        }

        switch(options&U_SHAPE_LETTERS_MASK) {
        case U_SHAPE_LETTERS_SHAPE :
             if( (options&U_SHAPE_TASHKEEL_MASK)> 0
                 && ((options&U_SHAPE_TASHKEEL_MASK) !=U_SHAPE_TASHKEEL_REPLACE_BY_TATWEEL)) {
                /* Call the shaping function with tashkeel flag == 2 for removal of tashkeel */
                destLength = shapeUnicode(tempbuffer,sourceLength,destCapacity,options,pErrorCode,2);
             }else {
                /* default Call the shaping function with tashkeel flag == 1 */
                destLength = shapeUnicode(tempbuffer,sourceLength,destCapacity,options,pErrorCode,1);

                /*After shaping text check if user wants to remove tashkeel and replace it with tatweel*/
                if( (options&U_SHAPE_TASHKEEL_MASK) == U_SHAPE_TASHKEEL_REPLACE_BY_TATWEEL){
                  destLength = handleTashkeelWithTatweel(tempbuffer,destLength,destCapacity,options,pErrorCode);
                }
            }
            break;
        case U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED :
            /* Call the shaping function with tashkeel flag == 0 */
            destLength = shapeUnicode(tempbuffer,sourceLength,destCapacity,options,pErrorCode,0);
            break;

        case U_SHAPE_LETTERS_UNSHAPE :
            /* Call the deshaping function */
            destLength = deShapeUnicode(tempbuffer,sourceLength,destCapacity,options,pErrorCode);
            break;
        default :
            /* will never occur because of validity checks above */
            destLength = 0;
            break;
        }

        /*
         * TODO: (markus 2002aug01)
         * For as long as we always preflight the outputSize above
         * we should U_ASSERT(outputSize==destLength)
         * except for the adjustment above before the tempbuffer allocation
         */

        if((options&U_SHAPE_TEXT_DIRECTION_MASK) == U_SHAPE_TEXT_DIRECTION_LOGICAL) {
            countSpaces(tempbuffer,destLength,options,&spacesCountl,&spacesCountr);
            invertBuffer(tempbuffer,destLength,options,spacesCountl,spacesCountr);
        }
        /* never copy more than dest can hold; overflow is reported just below */
        uprv_memcpy(dest, tempbuffer, uprv_min(destLength, destCapacity)*U_SIZEOF_UCHAR);

        if(tempbuffer!=buffer) {
            uprv_free(tempbuffer);
        }

        if(destLength>destCapacity) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return destLength;
        }

        /* End of Arabic letter shaping part */
    } else {
        /*
         * No letter shaping:
         * just make sure the destination is large enough and copy the string.
         */
        if(destCapacity<sourceLength) {
            /* this catches preflighting, too */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return sourceLength;
        }
        uprv_memcpy(dest, source, sourceLength*U_SIZEOF_UCHAR);
        destLength=sourceLength;
    }

    /*
     * Perform number shaping.
     * With UTF-16 or UTF-32, the length of the string is constant.
     * The easiest way to do this is to operate on the destination and
     * "shape" the digits in-place.
     */
    if((options&U_SHAPE_DIGITS_MASK)!=U_SHAPE_DIGITS_NOOP) {
        UChar digitBase;
        int32_t i;

        /* select the requested digit group */
        switch(options&U_SHAPE_DIGIT_TYPE_MASK) {
        case U_SHAPE_DIGIT_TYPE_AN:
            digitBase=0x660; /* Unicode: "Arabic-Indic digits" */
            break;
        case U_SHAPE_DIGIT_TYPE_AN_EXTENDED:
            digitBase=0x6f0; /* Unicode: "Eastern Arabic-Indic digits (Persian and Urdu)" */
            break;
        default:
            /* will never occur because of validity checks above */
            digitBase=0;
            break;
        }

        /* perform the requested operation */
        switch(options&U_SHAPE_DIGITS_MASK) {
        case U_SHAPE_DIGITS_EN2AN:
            /* add (digitBase-'0') to each European (ASCII) digit code point */
            digitBase-=0x30;
            for(i=0; i<destLength; ++i) {
                /* unsigned subtraction: one comparison tests '0' <= c <= '9' */
                if(((uint32_t)dest[i]-0x30)<10) {
                    dest[i]+=digitBase;
                }
            }
            break;
        case U_SHAPE_DIGITS_AN2EN:
            /* subtract (digitBase-'0') from each Arabic digit code point */
            for(i=0; i<destLength; ++i) {
                if(((uint32_t)dest[i]-(uint32_t)digitBase)<10) {
                    dest[i]-=digitBase-0x30;
                }
            }
            break;
        case U_SHAPE_DIGITS_ALEN2AN_INIT_LR:
            /* context-dependent shaping; the last argument is FALSE here, TRUE for INIT_AL */
            _shapeToArabicDigitsWithContext(dest, destLength,
                                            digitBase,
                                            (UBool)((options&U_SHAPE_TEXT_DIRECTION_MASK)==U_SHAPE_TEXT_DIRECTION_LOGICAL),
                                            FALSE);
            break;
        case U_SHAPE_DIGITS_ALEN2AN_INIT_AL:
            _shapeToArabicDigitsWithContext(dest, destLength,
                                            digitBase,
                                            (UBool)((options&U_SHAPE_TEXT_DIRECTION_MASK)==U_SHAPE_TEXT_DIRECTION_LOGICAL),
                                            TRUE);
            break;
        default:
            /* will never occur because of validity checks above */
            break;
        }
    }

    return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}
OLD | NEW |