OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * |
| 4 * Copyright (C) 2000-2008, International Business Machines |
| 5 * Corporation and others. All Rights Reserved. |
| 6 * |
| 7 ******************************************************************************* |
| 8 * file name: genmbcs.h |
| 9 * encoding: US-ASCII |
| 10 * tab size: 8 (not used) |
| 11 * indentation:4 |
| 12 * |
| 13 * created on: 2000jul10 |
| 14 * created by: Markus W. Scherer |
| 15 */ |
| 16 |
| 17 #ifndef __GENMBCS_H__ |
| 18 #define __GENMBCS_H__ |
| 19 |
| 20 #include "makeconv.h" |
| 21 |
| 22 enum { |
| 23 /* |
| 24 * TODO: Consider using ucnvmbcs.h constants. |
| 25 * However, not all values need to be exactly the same, for example |
| 26 * the xxx_UTF8_MAX values may be different. (Especially SBCS_UTF8_MAX |
| 27 * may be higher in makeconv than in the runtime code because that |
| 28 * affects only a small number of .cnv files [if any] but all |
| 29 * runtime UConverterSharedData objects.) |
| 30 */ |
| 31 MBCS_STAGE_2_SHIFT=4, |
| 32 MBCS_STAGE_2_BLOCK_SIZE=0x40, /* =64=1<<6 for 6 bits in stage 2 */ |
| 33 MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6, /* log2(MBCS_STAGE_2_BLOCK_SIZE) */ |
| 34 MBCS_STAGE_2_BLOCK_MASK=0x3f, /* for after shifting by MBCS_STAGE_2_SH
IFT */ |
| 35 MBCS_STAGE_1_SHIFT=10, |
| 36 MBCS_STAGE_1_BMP_SIZE=0x40, /* 0x10000>>MBCS_STAGE_1_SHIFT, or 64 for one en
try per 1k code points on the BMP */ |
| 37 MBCS_STAGE_1_SIZE=0x440, /* 0x110000>>MBCS_STAGE_1_SHIFT, or 17*64 for on
e entry per 1k code points */ |
| 38 MBCS_STAGE_2_SIZE=0xfbc0, /* 0x10000-MBCS_STAGE_1_SIZE: stages 1 & 2 share
a 16-bit-indexed array */ |
| 39 MBCS_MAX_STAGE_2_TOP=MBCS_STAGE_2_SIZE, |
| 40 MBCS_STAGE_2_MAX_BLOCKS=MBCS_STAGE_2_SIZE>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT, |
| 41 |
| 42 MBCS_STAGE_2_ALL_UNASSIGNED_INDEX=0, /* stage 1 entry for the all-unassigned
stage 2 block */ |
| 43 MBCS_STAGE_2_FIRST_ASSIGNED=MBCS_STAGE_2_BLOCK_SIZE, /* start of the first s
tage 2 block after the all-unassigned one */ |
| 44 |
| 45 MBCS_STAGE_3_BLOCK_SIZE=16, /* =16=1<<4 for 4 bits in stage 3 */ |
| 46 MBCS_STAGE_3_BLOCK_MASK=0xf, |
| 47 MBCS_STAGE_3_FIRST_ASSIGNED=MBCS_STAGE_3_BLOCK_SIZE, /* start of the first s
tage 3 block after the all-unassigned one */ |
| 48 |
| 49 MBCS_STAGE_3_GRANULARITY=16, /* =1<<4: MBCS stage 2 indexes are shift
ed left 4 */ |
| 50 MBCS_STAGE_3_SBCS_SIZE=0x10000, /* max 64k mappings for SBCS */ |
| 51 MBCS_STAGE_3_MBCS_SIZE=0x10000*MBCS_STAGE_3_GRANULARITY, /* max mappings for
MBCS */ |
| 52 |
| 53 /* |
| 54 * SBCS_UTF8_MAX: Maximum code point with UTF-8-friendly SBCS data structure
s. |
| 55 * Possible values are 0x01ff..0xffff, in steps of 0x100. |
| 56 * |
| 57 * Unlike for MBCS, this constant only affects the stage 3 block allocation
size; |
| 58 * there is no additional stage 1/2 table stored in the .cnv file. |
| 59 * The max value should be at least 0x7ff to cover 2-byte UTF-8. |
| 60 * 0xfff also covers a number of other small scripts which have legacy charsets |
| 61 * (like Thai). |
| 62 * Higher values up to 0x1fff are harmless and potentially useful because |
| 63 * that covers small-script blocks which usually have either dense mappings |
| 64 * or no mappings at all. |
| 65 * Starting at U+2000, there are mostly symbols and format characters |
| 66 * with a low density of SBCS mappings, which would result in more wasted |
| 67 * stage 3 entries with the larger block size. |
| 68 */ |
| 69 SBCS_UTF8_MAX=0x1fff, |
| 70 |
| 71 /* |
| 72 * MBCS_UTF8_MAX: Maximum code point with UTF-8-friendly MBCS data structure
s. |
| 73 * Possible values are 0x01ff..0xffff, in steps of 0x100. |
| 74 * |
| 75 * Note that with 0xffff, MBCSAddFromUnicode() may overflow the additional U
TF-8 stage table |
| 76 * with extreme input data. The function checks for this overflow. |
| 77 * |
| 78 * 0xd7ff is chosen for the majority of common characters including Unihan a
nd Hangul. |
| 79 * At U+d800 there are mostly surrogates, private use codes, compatibility c
haracters, etc. |
| 80 * Larger values cause slightly larger MBCS .cnv files. |
| 81 */ |
| 82 MBCS_UTF8_MAX=0xd7ff, |
| 83 MBCS_UTF8_LIMIT=MBCS_UTF8_MAX+1, /* =0xd800 */ |
| 84 |
| 85 MBCS_UTF8_STAGE_SHIFT=6, |
| 86 MBCS_UTF8_STAGE_3_BLOCK_SIZE=0x40, /* =64=1<<6 for 6 bits from last trail b
yte */ |
| 87 MBCS_UTF8_STAGE_3_BLOCK_MASK=0x3f, |
| 88 |
| 89 /* size of the single-stage table for up to U+d7ff (used instead of stage1/2
) */ |
| 90 MBCS_UTF8_STAGE_SIZE=MBCS_UTF8_LIMIT>>MBCS_UTF8_STAGE_SHIFT, /* =0x360 */ |
| 91 |
| 92 MBCS_FROM_U_EXT_FLAG=0x10, /* UCMapping.f bit for base table mappin
gs that fit into the base toU table */ |
| 93 MBCS_FROM_U_EXT_MASK=0x0f, /* but need to go into the extension fro
mU table */ |
| 94 |
| 95 /* =4 number of regular stage 3 blocks for final UTF-8 trail byte */ |
| 96 MBCS_UTF8_STAGE_3_BLOCKS=MBCS_UTF8_STAGE_3_BLOCK_SIZE/MBCS_STAGE_3_BLOCK_SIZ
E, |
| 97 |
| 98 MBCS_MAX_FALLBACK_COUNT=8192 |
| 99 }; |
| 100 |
| 101 U_CFUNC NewConverter * |
| 102 MBCSOpen(UCMFile *ucm); |
| 103 |
| 104 struct MBCSData; |
| 105 typedef struct MBCSData MBCSData; |
| 106 |
| 107 /* |
| 108 * Get a dummy MBCSData for use with MBCSOkForBaseFromUnicode() |
| 109 * for creating an extension-only file. |
| 110 * Assume maxCharLength>1. |
| 111 */ |
| 112 U_CFUNC const MBCSData * |
| 113 MBCSGetDummy(void); |
| 114 |
| 115 /* Test if a 1:1 mapping fits into the MBCS base table's fromUnicode structure.
*/ |
| 116 U_CFUNC UBool |
| 117 MBCSOkForBaseFromUnicode(const MBCSData *mbcsData, |
| 118 const uint8_t *bytes, int32_t length, |
| 119 UChar32 c, int8_t flag); |
| 120 |
| 121 U_CFUNC NewConverter * |
| 122 CnvExtOpen(UCMFile *ucm); |
| 123 |
| 124 #endif /* __GENMBCS_H__ */ |
OLD | NEW |