| Index: icu46/source/tools/toolutil/ucm.c
|
| ===================================================================
|
| --- icu46/source/tools/toolutil/ucm.c (revision 0)
|
| +++ icu46/source/tools/toolutil/ucm.c (revision 0)
|
| @@ -0,0 +1,1188 @@
|
| +/*
|
| +*******************************************************************************
|
| +*
|
| +* Copyright (C) 2003-2009, International Business Machines
|
| +* Corporation and others. All Rights Reserved.
|
| +*
|
| +*******************************************************************************
|
| +* file name: ucm.c
|
| +* encoding: US-ASCII
|
| +* tab size: 8 (not used)
|
| +* indentation:4
|
| +*
|
| +* created on: 2003jun20
|
| +* created by: Markus W. Scherer
|
| +*
|
| +* This file reads a .ucm file, stores its mappings and sorts them.
|
| +* It implements handling of Unicode conversion mappings from .ucm files
|
| +* for makeconv, canonucm, rptp2ucm, etc.
|
| +*
|
| +* Unicode code point sequences with a length of more than 1,
|
| +* as well as byte sequences with more than 4 bytes or more than one complete
|
| +* character sequence are handled to support m:n mappings.
|
| +*/
|
| +
|
| +#include "unicode/utypes.h"
|
| +#include "unicode/ustring.h"
|
| +#include "cstring.h"
|
| +#include "cmemory.h"
|
| +#include "filestrm.h"
|
| +#include "uarrsort.h"
|
| +#include "ucnvmbcs.h"
|
| +#include "ucnv_bld.h"
|
| +#include "ucnv_ext.h"
|
| +#include "uparse.h"
|
| +#include "ucm.h"
|
| +#include <stdio.h>
|
| +
|
| +#if !UCONFIG_NO_CONVERSION
|
| +
|
| +/* -------------------------------------------------------------------------- */
|
| +
|
| +/*
|
| + * Write one mapping to f as a single line:
|
| + * "<UXXXX>" for each of the m->uLen code points, one space,
|
| + * "\xXX" for each of the m->bLen bytes, and, if m->f>=0, " |" plus the flag.
|
| + * The line always ends with a newline.
|
| + *
|
| + * codePoints and bytes must provide at least m->uLen and m->bLen elements.
|
| + */
|
| +static void
|
| +printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
|
| +    int32_t j;
|
| +
|
| +    for(j=0; j<m->uLen; ++j) {
|
| +        /* %lX requires an unsigned long argument */
|
| +        fprintf(f, "<U%04lX>", (unsigned long)codePoints[j]);
|
| +    }
|
| +
|
| +    fputc(' ', f);
|
| +
|
| +    for(j=0; j<m->bLen; ++j) {
|
| +        /* %X requires an unsigned int argument */
|
| +        fprintf(f, "\\x%02X", (unsigned int)bytes[j]);
|
| +    }
|
| +
|
| +    if(m->f>=0) {
|
| +        /* m->f is non-negative here; %u requires an unsigned int argument */
|
| +        fprintf(f, " |%u\n", (unsigned int)m->f);
|
| +    } else {
|
| +        /* no flag is printed for a negative f */
|
| +        fputs("\n", f);
|
| +    }
|
| +}
|
| +
|
| +/*
|
| + * Print one mapping of a table to f.
|
| + * The code points and bytes of m are looked up through the table
|
| + * with UCM_GET_CODE_POINTS() and UCM_GET_BYTES().
|
| + */
|
| +U_CAPI void U_EXPORT2
|
| +ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
|
| +    UChar32 *cps=UCM_GET_CODE_POINTS(table, m);
|
| +    uint8_t *bs=UCM_GET_BYTES(table, m);
|
| +
|
| +    printMapping(m, cps, bs, f);
|
| +}
|
| +
|
| +/*
|
| + * Print every mapping of a table to f.
|
| + * If byUnicode is set, the mappings are printed in the order of the
|
| + * mappings array; otherwise they are printed in the order given by
|
| + * the table's reverseMap index array.
|
| + */
|
| +U_CAPI void U_EXPORT2
|
| +ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
|
| +    UCMapping *first=table->mappings;
|
| +    int32_t count=table->mappingsLength;
|
| +    int32_t i;
|
| +
|
| +    if(!byUnicode) {
|
| +        /* indirect: reverseMap[i] is an index into the mappings array */
|
| +        const int32_t *order=table->reverseMap;
|
| +        for(i=0; i<count; ++i) {
|
| +            ucm_printMapping(table, first+order[i], f);
|
| +        }
|
| +        return;
|
| +    }
|
| +
|
| +    /* direct: walk the mappings array front to back */
|
| +    for(i=0; i<count; ++i) {
|
| +        ucm_printMapping(table, first+i, f);
|
| +    }
|
| +}
|
| +
|
| +/* mapping comparisons ------------------------------------------------------ */
|
| +
|
| +/*
|
| + * Compare the Unicode sides of two mappings.
|
| + * Returns <0, 0 or >0: the difference of the first differing code points,
|
| + * or, if one sequence is a prefix of the other, the difference of the lengths.
|
| + */
|
| +static int32_t
|
| +compareUnicode(UCMTable *lTable, const UCMapping *l,
|
| +               UCMTable *rTable, const UCMapping *r) {
|
| +    const UChar32 *lcp, *rcp;
|
| +    int32_t diff, k, shared;
|
| +
|
| +    /* shortcut: both sides are single code points stored directly in u */
|
| +    if(l->uLen==1 && r->uLen==1) {
|
| +        return l->u-r->u;
|
| +    }
|
| +
|
| +    lcp=UCM_GET_CODE_POINTS(lTable, l);
|
| +    rcp=UCM_GET_CODE_POINTS(rTable, r);
|
| +
|
| +    /* number of code points that both sequences have */
|
| +    shared= l->uLen<=r->uLen ? l->uLen : r->uLen;
|
| +
|
| +    for(k=0; k<shared; ++k) {
|
| +        diff=lcp[k]-rcp[k];
|
| +        if(diff!=0) {
|
| +            return diff;
|
| +        }
|
| +    }
|
| +
|
| +    /* equal up to the shorter length: the shorter sequence sorts first */
|
| +    return l->uLen-r->uLen;
|
| +}
|
| +
|
| +/*
|
| + * Compare the byte sides of two mappings; returns <0, 0 or >0.
|
| + *
|
| + * lexical==TRUE: compare byte by byte and use the lengths only as a
|
| + * tie-breaker. The builder sorts this way so that it can search efficiently
|
| + * for a byte sequence that could be a prefix of a previously entered one.
|
| + *
|
| + * lexical==FALSE: compare the lengths first, then the bytes, for
|
| + * compatibility with old .ucm tools like canonucm and rptp2ucm.
|
| + */
|
| +static int32_t
|
| +compareBytes(UCMTable *lTable, const UCMapping *l,
|
| +             UCMTable *rTable, const UCMapping *r,
|
| +             UBool lexical) {
|
| +    const uint8_t *lBytes, *rBytes;
|
| +    int32_t diff, k, count;
|
| +
|
| +    if(!lexical) {
|
| +        /* different lengths decide immediately */
|
| +        diff=l->bLen-r->bLen;
|
| +        if(diff!=0) {
|
| +            return diff;
|
| +        }
|
| +        count=l->bLen;
|
| +    } else {
|
| +        /* only the bytes that both sequences have can be compared */
|
| +        count= l->bLen<=r->bLen ? l->bLen : r->bLen;
|
| +    }
|
| +
|
| +    lBytes=UCM_GET_BYTES(lTable, l);
|
| +    rBytes=UCM_GET_BYTES(rTable, r);
|
| +
|
| +    for(k=0; k<count; ++k) {
|
| +        diff=lBytes[k]-rBytes[k];
|
| +        if(diff!=0) {
|
| +            return diff;
|
| +        }
|
| +    }
|
| +
|
| +    /* equal up to count bytes: the shorter sequence sorts first */
|
| +    return l->bLen-r->bLen;
|
| +}
|
| +
|
| +/*
|
| + * Sort comparison of two UCMappings; returns <0, 0 or >0.
|
| + * uFirst==TRUE: Unicode side, then bytes (lengths first, like canonucm).
|
| + * uFirst==FALSE: bytes (lexically, for the builder), then Unicode side.
|
| + * If both sides are equal, the flags f decide.
|
| + */
|
| +static int32_t
|
| +compareMappings(UCMTable *lTable, const UCMapping *l,
|
| +                UCMTable *rTable, const UCMapping *r,
|
| +                UBool uFirst) {
|
| +    int32_t diff;
|
| +
|
| +    if(uFirst) {
|
| +        diff=compareUnicode(lTable, l, rTable, r);
|
| +        if(diff!=0) {
|
| +            return diff;
|
| +        }
|
| +        diff=compareBytes(lTable, l, rTable, r, FALSE);
|
| +    } else {
|
| +        diff=compareBytes(lTable, l, rTable, r, TRUE);
|
| +        if(diff!=0) {
|
| +            return diff;
|
| +        }
|
| +        diff=compareUnicode(lTable, l, rTable, r);
|
| +    }
|
| +
|
| +    /* fall back to the flags only when both sides are identical */
|
| +    return diff!=0 ? diff : l->f-r->f;
|
| +}
|
| +
|
| +/*
|
| + * Comparison callback for sorting the mappings array itself:
|
| + * context is the UCMTable, left and right point to UCMappings.
|
| + */
|
| +static int32_t
|
| +compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
|
| +    UCMTable *table=(UCMTable *)context;
|
| +    const UCMapping *lm=(const UCMapping *)left;
|
| +    const UCMapping *rm=(const UCMapping *)right;
|
| +
|
| +    return compareMappings(table, lm, table, rm, TRUE);
|
| +}
|
| +
|
| +/*
|
| + * Comparison callback for sorting the reverseMap:
|
| + * context is the UCMTable, left and right point to int32_t indexes
|
| + * into the table's mappings array.
|
| + */
|
| +static int32_t
|
| +compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
|
| +    UCMTable *table=(UCMTable *)context;
|
| +    const UCMapping *lm=table->mappings+*(const int32_t *)left;
|
| +    const UCMapping *rm=table->mappings+*(const int32_t *)right;
|
| +
|
| +    return compareMappings(table, lm, table, rm, FALSE);
|
| +}
|
| +
|
| +/*
|
| + * Sort a table unless it is already marked as sorted:
|
| + * the mappings array is sorted by Unicode first, and the reverseMap
|
| + * (an array of indexes into mappings) is sorted by bytes first.
|
| + * Prints a message and exits the process if the reverseMap cannot be
|
| + * allocated or if sorting fails.
|
| + */
|
| +U_CAPI void U_EXPORT2
|
| +ucm_sortTable(UCMTable *t) {
|
| +    UErrorCode status;
|
| +    int32_t idx;
|
| +
|
| +    if(t->isSorted) {
|
| +        return;
|
| +    }
|
| +
|
| +    status=U_ZERO_ERROR;
|
| +
|
| +    /* step 1: the mappings themselves, Unicode first */
|
| +    uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
|
| +                   compareMappingsUnicodeFirst, t,
|
| +                   FALSE, &status);
|
| +
|
| +    if(t->reverseMap==NULL) {
|
| +        /*
|
| +         * Size the reverseMap by mappingsCapacity rather than mappingsLength
|
| +         * so that it need not be reallocated each time mappings are added
|
| +         * (see ucm_moveMappings() and ucm_addMapping()).
|
| +         */
|
| +        t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
|
| +        if(t->reverseMap==NULL) {
|
| +            fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
|
| +            exit(U_MEMORY_ALLOCATION_ERROR);
|
| +        }
|
| +    }
|
| +
|
| +    /* start from the identity permutation */
|
| +    idx=0;
|
| +    while(idx<t->mappingsLength) {
|
| +        t->reverseMap[idx]=idx;
|
| +        ++idx;
|
| +    }
|
| +
|
| +    /* step 2: the index array, ordered by the mappings' bytes first */
|
| +    uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
|
| +                   compareMappingsBytesFirst, t,
|
| +                   FALSE, &status);
|
| +
|
| +    /* one check covers both sort calls because they share status */
|
| +    if(U_FAILURE(status)) {
|
| +        fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
|
| +                u_errorName(status));
|
| +        exit(status);
|
| +    }
|
| +
|
| +    t->isSorted=TRUE;
|
| +}
|
| +
|
| +/*
|
| + * Remove all mappings whose moveFlag is set from the base table.
|
| + * Those flagged with UCM_MOVE_TO_EXT are first added to the ext table,
|
| + * if ext is not NULL. A removed mapping is overwritten with the last
|
| + * mapping of the base table, so the base table is marked as unsorted.
|
| + */
|
| +U_CAPI void U_EXPORT2
|
| +ucm_moveMappings(UCMTable *base, UCMTable *ext) {
|
| +    UCMapping *cur, *end;
|
| +    int8_t flag;
|
| +
|
| +    cur=base->mappings;
|
| +    end=cur+base->mappingsLength;
|
| +
|
| +    while(cur<end) {
|
| +        flag=cur->moveFlag;
|
| +        if(flag==0) {
|
| +            /* this mapping stays */
|
| +            ++cur;
|
| +            continue;
|
| +        }
|
| +
|
| +        /* clear the flag before the mapping is copied anywhere */
|
| +        cur->moveFlag=0;
|
| +
|
| +        if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
|
| +            ucm_addMapping(ext, cur, UCM_GET_CODE_POINTS(base, cur), UCM_GET_BYTES(base, cur));
|
| +        }
|
| +
|
| +        /*
|
| +         * Shrink the table by one and fill the hole with the former last
|
| +         * mapping; cur is not advanced so that the moved-down mapping
|
| +         * is examined next.
|
| +         */
|
| +        --end;
|
| +        if(cur<end) {
|
| +            uprv_memcpy(cur, end, sizeof(UCMapping));
|
| +        }
|
| +        --base->mappingsLength;
|
| +        base->isSorted=FALSE;
|
| +    }
|
| +}
|
| +
|
| +/* result bits returned by checkBaseExtUnicode() and checkBaseExtBytes() */
|
| +enum {
|
| +    NEEDS_MOVE=0x1,
|
| +    HAS_ERRORS=0x2
|
| +};
|
| +
|
| +/*
|
| + * Walk the base and extension mappings in parallel, in the order of their
|
| + * mappings arrays, and compare their Unicode sides.
|
| + * Only mappings with a flag f of 0..2 are considered on either side.
|
| + *
|
| + * Sets moveFlag bits on mappings (UCM_MOVE_TO_EXT on base mappings,
|
| + * UCM_REMOVE_MAPPING on extension mappings) and returns a combination of
|
| + * NEEDS_MOVE and HAS_ERRORS. Errors are reported on stderr.
|
| + * baseStates is not used here.
|
| + */
|
| +static uint8_t
|
| +checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
|
| +                    UBool moveToExt, UBool intersectBase) {
|
| +    UCMapping *bm, *bmEnd, *em, *emEnd;
|
| +    int32_t order;
|
| +    uint8_t flags;
|
| +
|
| +    bm=base->mappings;
|
| +    bmEnd=bm+base->mappingsLength;
|
| +
|
| +    em=ext->mappings;
|
| +    emEnd=em+ext->mappingsLength;
|
| +
|
| +    flags=0;
|
| +
|
| +    for(;;) {
|
| +        /* advance each side to its next relevant mapping; stop when one side runs out */
|
| +        while(bm!=bmEnd && !(0<=bm->f && bm->f<=2)) {
|
| +            ++bm;
|
| +        }
|
| +        if(bm==bmEnd) {
|
| +            return flags;
|
| +        }
|
| +
|
| +        while(em!=emEnd && !(0<=em->f && em->f<=2)) {
|
| +            ++em;
|
| +        }
|
| +        if(em==emEnd) {
|
| +            return flags;
|
| +        }
|
| +
|
| +        order=compareUnicode(base, bm, ext, em);
|
| +        if(order>0) {
|
| +            /* the extension mapping sorts first: try the next one */
|
| +            ++em;
|
| +            continue;
|
| +        }
|
| +
|
| +        if(order<0) {
|
| +            if(intersectBase && (intersectBase!=2 || bm->bLen>1)) {
|
| +                /*
|
| +                 * mapping in base but not in ext, move it
|
| +                 *
|
| +                 * if ext is DBCS, move DBCS mappings here
|
| +                 * and check SBCS ones for Unicode prefix below
|
| +                 */
|
| +                bm->moveFlag|=UCM_MOVE_TO_EXT;
|
| +                flags|=NEEDS_MOVE;
|
| +            } else if( bm->uLen<em->uLen &&
|
| +                       0==uprv_memcmp(UCM_GET_CODE_POINTS(base, bm), UCM_GET_CODE_POINTS(ext, em), 4*bm->uLen)
|
| +            ) {
|
| +                /* the base input sequence is a prefix of the extension input sequence */
|
| +                if(moveToExt) {
|
| +                    bm->moveFlag|=UCM_MOVE_TO_EXT;
|
| +                    flags|=NEEDS_MOVE;
|
| +                } else {
|
| +                    fprintf(stderr,
|
| +                            "ucm error: the base table contains a mapping whose input sequence\n"
|
| +                            "           is a prefix of the input sequence of an extension mapping\n");
|
| +                    ucm_printMapping(base, bm, stderr);
|
| +                    ucm_printMapping(ext, em, stderr);
|
| +                    flags|=HAS_ERRORS;
|
| +                }
|
| +            }
|
| +        } else {
|
| +            /* same Unicode input on both sides */
|
| +            if( bm->f==em->f && bm->bLen==em->bLen &&
|
| +                0==uprv_memcmp(UCM_GET_BYTES(base, bm), UCM_GET_BYTES(ext, em), bm->bLen)
|
| +            ) {
|
| +                /* same output as well: the extension mapping is redundant */
|
| +                em->moveFlag|=UCM_REMOVE_MAPPING;
|
| +                flags|=NEEDS_MOVE;
|
| +            } else if(intersectBase) {
|
| +                /* mapping in base but not in ext, move it */
|
| +                bm->moveFlag|=UCM_MOVE_TO_EXT;
|
| +                flags|=NEEDS_MOVE;
|
| +            } else {
|
| +                fprintf(stderr,
|
| +                        "ucm error: the base table contains a mapping whose input sequence\n"
|
| +                        "           is the same as the input sequence of an extension mapping\n"
|
| +                        "           but it maps differently\n");
|
| +                ucm_printMapping(base, bm, stderr);
|
| +                ucm_printMapping(ext, em, stderr);
|
| +                flags|=HAS_ERRORS;
|
| +            }
|
| +        }
|
| +
|
| +        /* in both remaining cases the base side advances */
|
| +        ++bm;
|
| +    }
|
| +}
|
| +
|
| +/*
|
| + * Walk the base and extension mappings in parallel, in the order of their
|
| + * reverseMap index arrays, and compare their byte sides lexically.
|
| + * Only mappings with a flag f of 0 or 3 are considered on either side;
|
| + * if intersectBase==2, single-byte base mappings are skipped as well.
|
| + *
|
| + * Sets moveFlag bits on mappings (UCM_MOVE_TO_EXT on base mappings,
|
| + * UCM_REMOVE_MAPPING on extension mappings) and returns a combination of
|
| + * NEEDS_MOVE and HAS_ERRORS. Errors are reported on stderr.
|
| + */
|
| +static uint8_t
|
| +checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
|
| +                  UBool moveToExt, UBool intersectBase) {
|
| +    UCMapping *bm, *em;
|
| +    int32_t *baseOrder, *extOrder;
|
| +    int32_t b, e, bEnd, eEnd, order;
|
| +    uint8_t flags;
|
| +    UBool isSISO;
|
| +
|
| +    baseOrder=base->reverseMap;
|
| +    extOrder=ext->reverseMap;
|
| +
|
| +    b=0;
|
| +    e=0;
|
| +    bEnd=base->mappingsLength;
|
| +    eEnd=ext->mappingsLength;
|
| +
|
| +    flags=0;
|
| +
|
| +    isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
|
| +
|
| +    for(;;) {
|
| +        /* find the next relevant base mapping; stop when the base side runs out */
|
| +        for(;;) {
|
| +            if(b==bEnd) {
|
| +                return flags;
|
| +            }
|
| +            bm=base->mappings+baseOrder[b];
|
| +
|
| +            /*
|
| +             * comparing a base against a DBCS extension (intersectBase==2):
|
| +             * leave SBCS base mappings alone
|
| +             */
|
| +            if(!(intersectBase==2 && bm->bLen==1) && (bm->f==0 || bm->f==3)) {
|
| +                break;
|
| +            }
|
| +            ++b;
|
| +        }
|
| +
|
| +        /* find the next relevant extension mapping; stop when that side runs out */
|
| +        for(;;) {
|
| +            if(e==eEnd) {
|
| +                return flags;
|
| +            }
|
| +            em=ext->mappings+extOrder[e];
|
| +
|
| +            if(em->f==0 || em->f==3) {
|
| +                break;
|
| +            }
|
| +            ++e;
|
| +        }
|
| +
|
| +        order=compareBytes(base, bm, ext, em, TRUE);
|
| +        if(order>0) {
|
| +            /* the extension mapping sorts first: try the next one */
|
| +            ++e;
|
| +            continue;
|
| +        }
|
| +
|
| +        if(order<0) {
|
| +            if(intersectBase) {
|
| +                /* mapping in base but not in ext, move it */
|
| +                bm->moveFlag|=UCM_MOVE_TO_EXT;
|
| +                flags|=NEEDS_MOVE;
|
| +            } else if( bm->bLen<em->bLen &&
|
| +                       (!isSISO || bm->bLen>1) &&
|
| +                       0==uprv_memcmp(UCM_GET_BYTES(base, bm), UCM_GET_BYTES(ext, em), bm->bLen)
|
| +            ) {
|
| +                /*
|
| +                 * the base input sequence is a prefix of the extension one;
|
| +                 * for SI/SO tables, a single byte is never a prefix because
|
| +                 * it occurs in a separate single-byte state
|
| +                 */
|
| +                if(moveToExt) {
|
| +                    bm->moveFlag|=UCM_MOVE_TO_EXT;
|
| +                    flags|=NEEDS_MOVE;
|
| +                } else {
|
| +                    fprintf(stderr,
|
| +                            "ucm error: the base table contains a mapping whose input sequence\n"
|
| +                            "           is a prefix of the input sequence of an extension mapping\n");
|
| +                    ucm_printMapping(base, bm, stderr);
|
| +                    ucm_printMapping(ext, em, stderr);
|
| +                    flags|=HAS_ERRORS;
|
| +                }
|
| +            }
|
| +        } else {
|
| +            /* same byte input on both sides */
|
| +            if( bm->f==em->f && bm->uLen==em->uLen &&
|
| +                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, bm), UCM_GET_CODE_POINTS(ext, em), 4*bm->uLen)
|
| +            ) {
|
| +                /* same output as well: the extension mapping is redundant */
|
| +                em->moveFlag|=UCM_REMOVE_MAPPING;
|
| +                flags|=NEEDS_MOVE;
|
| +            } else if(intersectBase) {
|
| +                /* mapping in base but not in ext, move it */
|
| +                bm->moveFlag|=UCM_MOVE_TO_EXT;
|
| +                flags|=NEEDS_MOVE;
|
| +            } else {
|
| +                fprintf(stderr,
|
| +                        "ucm error: the base table contains a mapping whose input sequence\n"
|
| +                        "           is the same as the input sequence of an extension mapping\n"
|
| +                        "           but it maps differently\n");
|
| +                ucm_printMapping(base, bm, stderr);
|
| +                ucm_printMapping(ext, em, stderr);
|
| +                flags|=HAS_ERRORS;
|
| +            }
|
| +        }
|
| +
|
| +        /* in both remaining cases the base side advances */
|
| +        ++b;
|
| +    }
|
| +}
|
| +
|
| +/*
|
| + * Check the byte sequences of all mappings of a table against baseStates.
|
| + * Every mapping for which ucm_countChars() returns less than 1 is printed
|
| + * to stderr. Returns TRUE if no such mapping was found.
|
| + */
|
| +U_CAPI UBool U_EXPORT2
|
| +ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
|
| +    UCMapping *cur, *end;
|
| +    UBool valid;
|
| +
|
| +    valid=TRUE;
|
| +    cur=table->mappings;
|
| +    end=cur+table->mappingsLength;
|
| +
|
| +    /* keep going after a failure so that all bad mappings get reported */
|
| +    for(; cur<end; ++cur) {
|
| +        if(ucm_countChars(baseStates, UCM_GET_BYTES(table, cur), cur->bLen)<1) {
|
| +            ucm_printMapping(table, cur, stderr);
|
| +            valid=FALSE;
|
| +        }
|
| +    }
|
| +
|
| +    return valid;
|
| +}
|
| +
|
| +/*
|
| + * Check a base table against an extension table, on both the Unicode and
|
| + * the byte sides, and apply the resulting moves and removals.
|
| + *
|
| + * Both tables must use explicit precision flags. Both get sorted.
|
| + * Base mappings flagged for moving are added to moveTarget if it is not
|
| + * NULL; with a NULL moveTarget, prefix conflicts are errors instead.
|
| + * Returns FALSE if implicit flags or conflicting mappings were found.
|
| + */
|
| +U_CAPI UBool U_EXPORT2
|
| +ucm_checkBaseExt(UCMStates *baseStates,
|
| +                 UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
|
| +                 UBool intersectBase) {
|
| +    UBool canMove=(UBool)(moveTarget!=NULL);
|
| +    uint8_t outcome;
|
| +
|
| +    /* with an extension table, precision flags are mandatory on both sides */
|
| +    if(base->flagsType&UCM_FLAGS_IMPLICIT) {
|
| +        fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
|
| +        return FALSE;
|
| +    }
|
| +    if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
|
| +        fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
|
| +        return FALSE;
|
| +    }
|
| +
|
| +    /* the parallel walks below rely on sorted tables */
|
| +    ucm_sortTable(base);
|
| +    ucm_sortTable(ext);
|
| +
|
| +    outcome=
|
| +        checkBaseExtUnicode(baseStates, base, ext, canMove, intersectBase)|
|
| +        checkBaseExtBytes(baseStates, base, ext, canMove, intersectBase);
|
| +
|
| +    if(outcome&HAS_ERRORS) {
|
| +        return FALSE;
|
| +    }
|
| +    if(!(outcome&NEEDS_MOVE)) {
|
| +        return TRUE;
|
| +    }
|
| +
|
| +    /* drop flagged extension mappings, then move flagged base mappings */
|
| +    ucm_moveMappings(ext, NULL);
|
| +    ucm_moveMappings(base, moveTarget);
|
| +
|
| +    /* moving leaves the tables unsorted */
|
| +    ucm_sortTable(base);
|
| +    ucm_sortTable(ext);
|
| +    if(moveTarget!=NULL) {
|
| +        ucm_sortTable(moveTarget);
|
| +    }
|
| +
|
| +    return TRUE;
|
| +}
|
| +
|
| +/* merge tables for rptp2ucm ------------------------------------------------ */
|
| +
|
| +/*
|
| + * Flag a fromU mapping that has no toU counterpart:
|
| + * f=2 (SUB mapping) if its bytes equal subchar, or if subchar1!=0 and
|
| + * its single byte equals subchar1; otherwise f=1 (normal fallback).
|
| + */
|
| +static void
|
| +setFromUFallbackFlag(UCMTable *table, UCMapping *m,
|
| +                     const uint8_t *subchar, int32_t subcharLength,
|
| +                     uint8_t subchar1) {
|
| +    if( (m->bLen==subcharLength &&
|
| +         0==uprv_memcmp(UCM_GET_BYTES(table, m), subchar, subcharLength)) ||
|
| +        (subchar1!=0 && m->bLen==1 && m->b.bytes[0]==subchar1)
|
| +    ) {
|
| +        m->f=2;
|
| +    } else {
|
| +        m->f=1;
|
| +    }
|
| +}
|
| +
|
| +/* Does m map to a single code point U+FFFD or U+001A (Unicode SUB)? */
|
| +static UBool
|
| +isReverseFallbackToSub(const UCMapping *m) {
|
| +    return (UBool)(m->uLen==1 && (m->u==0xfffd || m->u==0x1a));
|
| +}
|
| +
|
| +/*
|
| + * Merge the toU table into the fromU table (for rptp2ucm).
|
| + * Both tables get sorted and are then walked in parallel:
|
| + * - mappings present in both are roundtrips and keep their flags
|
| + * - mappings only in fromUTable get f=1 or f=2 (fallback or SUB mapping)
|
| + * - mappings only in toUTable get f=3 (reverse fallback) and are added
|
| + *   to fromUTable, except for those that map to Unicode SUB
|
| + * fromUTable is marked as unsorted at the end.
|
| + */
|
| +U_CAPI void U_EXPORT2
|
| +ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
|
| +                const uint8_t *subchar, int32_t subcharLength,
|
| +                uint8_t subchar1) {
|
| +    UCMapping *from, *to;
|
| +    int32_t fromIdx, toIdx, fromCount, toCount, order;
|
| +
|
| +    ucm_sortTable(fromUTable);
|
| +    ucm_sortTable(toUTable);
|
| +
|
| +    from=fromUTable->mappings;
|
| +    to=toUTable->mappings;
|
| +
|
| +    /* the counts are fixed now; mappings added below are not revisited */
|
| +    fromCount=fromUTable->mappingsLength;
|
| +    toCount=toUTable->mappingsLength;
|
| +
|
| +    fromIdx=0;
|
| +    toIdx=0;
|
| +
|
| +    while(fromIdx<fromCount && toIdx<toCount) {
|
| +        order=compareMappings(fromUTable, from, toUTable, to, TRUE);
|
| +        if(order<0) {
|
| +            /* fromU only: fallback Unicode->codepage */
|
| +            setFromUFallbackFlag(fromUTable, from, subchar, subcharLength, subchar1);
|
| +
|
| +            ++from;
|
| +            ++fromIdx;
|
| +        } else if(order>0) {
|
| +            /* toU only: (reverse) fallback codepage->Unicode */
|
| +            if(!isReverseFallbackToSub(to)) {
|
| +                to->f=3;
|
| +                ucm_addMapping(fromUTable, to, UCM_GET_CODE_POINTS(toUTable, to), UCM_GET_BYTES(toUTable, to));
|
| +
|
| +                /* adding may have reallocated the fromU mappings array */
|
| +                from=fromUTable->mappings+fromIdx;
|
| +            }
|
| +
|
| +            ++to;
|
| +            ++toIdx;
|
| +        } else {
|
| +            /* equal: roundtrip, nothing to do (flags are initially 0) */
|
| +            ++from;
|
| +            ++to;
|
| +
|
| +            ++fromIdx;
|
| +            ++toIdx;
|
| +        }
|
| +    }
|
| +
|
| +    /* at most one of the two loops below has anything left to do */
|
| +    for(; fromIdx<fromCount; ++from, ++fromIdx) {
|
| +        setFromUFallbackFlag(fromUTable, from, subchar, subcharLength, subchar1);
|
| +    }
|
| +
|
| +    for(; toIdx<toCount; ++to, ++toIdx) {
|
| +        if(!isReverseFallbackToSub(to)) {
|
| +            to->f=3;
|
| +            ucm_addMapping(fromUTable, to, UCM_GET_CODE_POINTS(toUTable, to), UCM_GET_BYTES(toUTable, to));
|
| +        }
|
| +    }
|
| +
|
| +    fromUTable->isSorted=FALSE;
|
| +}
|
| +
|
| +/* separate extension mappings out of base table for rptp2ucm --------------- */
|
| +
|
| +/*
|
| + * Separate the mappings of ucm->base into base and extension mappings
|
| + * (for rptp2ucm).
|
| + * With isSISO set, single-byte mappings for 0x0e and 0x0f are removed
|
| + * with a warning. Mappings for which ucm_mappingType() returns >0 are
|
| + * moved to ucm->ext; a negative type makes the function fail.
|
| + * Returns FALSE on failure; on success the base table is sorted.
|
| + */
|
| +U_CAPI UBool U_EXPORT2
|
| +ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
|
| +    UCMTable *base;
|
| +    UCMapping *cur, *end;
|
| +    int32_t kind;
|
| +    UBool mustMove, valid;
|
| +
|
| +    base=ucm->base;
|
| +    cur=base->mappings;
|
| +    end=cur+base->mappingsLength;
|
| +
|
| +    mustMove=FALSE;
|
| +    valid=TRUE;
|
| +
|
| +    for(; cur<end; ++cur) {
|
| +        if(isSISO && cur->bLen==1 && (cur->b.bytes[0]==0xe || cur->b.bytes[0]==0xf)) {
|
| +            fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
|
| +            ucm_printMapping(base, cur, stderr);
|
| +            cur->moveFlag|=UCM_REMOVE_MAPPING;
|
| +            mustMove=TRUE;
|
| +            continue;
|
| +        }
|
| +
|
| +        kind=ucm_mappingType(
|
| +                &ucm->states, cur,
|
| +                UCM_GET_CODE_POINTS(base, cur), UCM_GET_BYTES(base, cur));
|
| +        if(kind<0) {
|
| +            /* illegal byte sequence: report it and keep checking the rest */
|
| +            printMapping(cur, UCM_GET_CODE_POINTS(base, cur), UCM_GET_BYTES(base, cur), stderr);
|
| +            valid=FALSE;
|
| +        } else if(kind>0) {
|
| +            cur->moveFlag|=UCM_MOVE_TO_EXT;
|
| +            mustMove=TRUE;
|
| +        }
|
| +    }
|
| +
|
| +    if(!valid) {
|
| +        return FALSE;
|
| +    }
|
| +    if(!mustMove) {
|
| +        ucm_sortTable(ucm->base);
|
| +        return TRUE;
|
| +    }
|
| +
|
| +    ucm_moveMappings(ucm->base, ucm->ext);
|
| +    return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
|
| +}
|
| +
|
| +/* ucm parser --------------------------------------------------------------- */
|
| +
|
| +/* Is c one of 0-9, a-f, A-F? */
|
| +static UBool
|
| +isHexDigit(char c) {
|
| +    return (UBool)(('0'<=c && c<='9') || ('a'<=c && c<='f') || ('A'<=c && c<='F'));
|
| +}
|
| +
|
| +/*
|
| + * Parse a sequence of bytes in the form \xXX[[+]\xXX...] starting at *ps.
|
| + *
|
| + * bytes receives at most UCNV_EXT_MAX_BYTES parsed bytes.
|
| + * line is only used for error messages.
|
| + * Returns the number of bytes parsed (0 if *ps does not start with a
|
| + * backslash) and sets *ps to the first character after the sequence.
|
| + * On a malformed byte or too many bytes, prints a message to stderr and
|
| + * returns -1 without changing *ps.
|
| + */
|
| +U_CAPI int8_t U_EXPORT2
|
| +ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
|
| +    const char *s=*ps;
|
| +    char *end;
|
| +    uint8_t byte;
|
| +    int8_t bLen;
|
| +
|
| +    bLen=0;
|
| +    for(;;) {
|
| +        /* skip an optional plus sign */
|
| +        if(bLen>0 && *s=='+') {
|
| +            ++s;
|
| +        }
|
| +        if(*s!='\\') {
|
| +            break;
|
| +        }
|
| +
|
| +        /*
|
| +         * Require exactly two hex digits before converting them.
|
| +         * Checking only the end pointer is not enough if uprv_strtoul()
|
| +         * behaves like strtoul(), which skips leading white space and
|
| +         * accepts a sign: "\x 1", "\x+1" and "\x-1" would all end at s+4.
|
| +         * The tests short-circuit, so s[3] is not read if s[2] is the
|
| +         * terminating NUL.
|
| +         */
|
| +        if( s[1]!='x' ||
|
| +            !isHexDigit(s[2]) || !isHexDigit(s[3]) ||
|
| +            (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
|
| +        ) {
|
| +            fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
|
| +            return -1;
|
| +        }
|
| +
|
| +        if(bLen==UCNV_EXT_MAX_BYTES) {
|
| +            fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
|
| +            return -1;
|
| +        }
|
| +        bytes[bLen++]=byte;
|
| +        s=end;
|
| +    }
|
| +
|
| +    *ps=s;
|
| +    return bLen;
|
| +}
|
| +
|
| +/*
|
| + * Parse one mapping line, which must not be empty:
|
| + * <UXXXX>[[+]<UXXXX>...] white space \xXX[[+]\xXX...] [... |f]
|
| + *
|
| + * codePoints and bytes receive the parsed sequences.
|
| + * On success, m->uLen, m->bLen and m->f are set (f=-1 if the line has no
|
| + * fallback indicator), m->u is set for a single code point, and
|
| + * m->b.bytes is filled for up to 4 bytes; returns TRUE.
|
| + * On failure, prints a message to stderr and returns FALSE.
|
| + */
|
| +U_CAPI UBool U_EXPORT2
|
| +ucm_parseMappingLine(UCMapping *m,
|
| +                     UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
|
| +                     uint8_t bytes[UCNV_EXT_MAX_BYTES],
|
| +                     const char *line) {
|
| +    const char *s;
|
| +    char *end;
|
| +    UChar32 cp;
|
| +    int32_t u16Length;
|
| +    int8_t uLen, bLen, f;
|
| +    UBool wellFormed;
|
| +
|
| +    s=line;
|
| +    uLen=0;
|
| +    bLen=0;
|
| +
|
| +    /* Unicode side: one or more <UXXXX>, optionally joined with '+' */
|
| +    for(;;) {
|
| +        if(uLen>0 && *s=='+') {
|
| +            ++s;
|
| +        }
|
| +        if(*s!='<') {
|
| +            break;
|
| +        }
|
| +
|
| +        /* need 'U', at least one parsed digit, and the closing '>' */
|
| +        wellFormed=FALSE;
|
| +        if(s[1]=='U') {
|
| +            cp=(UChar32)uprv_strtoul(s+2, &end, 16);
|
| +            wellFormed=(UBool)(end!=s+2 && *end=='>');
|
| +        }
|
| +        if(!wellFormed) {
|
| +            fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
|
| +            return FALSE;
|
| +        }
|
| +        if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
|
| +            fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
|
| +            return FALSE;
|
| +        }
|
| +
|
| +        if(uLen==UCNV_EXT_MAX_UCHARS) {
|
| +            fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
|
| +            return FALSE;
|
| +        }
|
| +        codePoints[uLen++]=cp;
|
| +        s=end+1;
|
| +    }
|
| +
|
| +    if(uLen==0) {
|
| +        fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
|
| +        return FALSE;
|
| +    }
|
| +    if(uLen==1) {
|
| +        /* a single code point is stored in the mapping itself */
|
| +        m->u=codePoints[0];
|
| +    } else {
|
| +        /* preflight the UTF-16 length of the sequence */
|
| +        UErrorCode errorCode=U_ZERO_ERROR;
|
| +        u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
|
| +        if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
|
| +            u16Length>UCNV_EXT_MAX_UCHARS
|
| +        ) {
|
| +            fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
|
| +            return FALSE;
|
| +        }
|
| +    }
|
| +
|
| +    s=u_skipWhitespace(s);
|
| +
|
| +    /* byte side */
|
| +    bLen=ucm_parseBytes(bytes, line, &s);
|
| +    if(bLen<0) {
|
| +        return FALSE;
|
| +    }
|
| +    if(bLen==0) {
|
| +        fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
|
| +        return FALSE;
|
| +    }
|
| +    if(bLen<=4) {
|
| +        /* up to 4 bytes are stored in the mapping itself */
|
| +        uprv_memcpy(m->b.bytes, bytes, bLen);
|
| +    }
|
| +
|
| +    /* skip everything until the fallback indicator, even the start of a comment */
|
| +    while(*s!=0 && *s!='|') {
|
| +        ++s;
|
| +    }
|
| +    if(*s==0) {
|
| +        f=-1; /* no fallback indicator */
|
| +    } else {
|
| +        f=(int8_t)(s[1]-'0');
|
| +        if((uint8_t)f>3) {
|
| +            fprintf(stderr, "ucm error: fallback indicator must be |0..|3 - \"%s\"\n", line);
|
| +            return FALSE;
|
| +        }
|
| +    }
|
| +
|
| +    m->uLen=uLen;
|
| +    m->bLen=bLen;
|
| +    m->f=f;
|
| +    return TRUE;
|
| +}
|
| +
|
| +/* general APIs ------------------------------------------------------------- */
|
| +
|
| +/*
|
| + * Allocate a new UCMTable with all fields set to zero.
|
| + * Prints a message and exits the process if the allocation fails.
|
| + */
|
| +U_CAPI UCMTable * U_EXPORT2
|
| +ucm_openTable() {
|
| +    UCMTable *fresh;
|
| +
|
| +    fresh=(UCMTable *)uprv_malloc(sizeof(UCMTable));
|
| +    if(fresh==NULL) {
|
| +        fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
|
| +        exit(U_MEMORY_ALLOCATION_ERROR);
|
| +    }
|
| +
|
| +    /* an all-zero table has no arrays and zero lengths and capacities */
|
| +    memset(fresh, 0, sizeof(UCMTable));
|
| +    return fresh;
|
| +}
|
| +
|
| +/*
|
| + * Free a table together with its mappings, codePoints, bytes and
|
| + * reverseMap arrays. A NULL table is ignored.
|
| + */
|
| +U_CAPI void U_EXPORT2
|
| +ucm_closeTable(UCMTable *table) {
|
| +    if(table==NULL) {
|
| +        return;
|
| +    }
|
| +
|
| +    uprv_free(table->mappings);
|
| +    uprv_free(table->codePoints);
|
| +    uprv_free(table->bytes);
|
| +    uprv_free(table->reverseMap);
|
| +    uprv_free(table);
|
| +}
|
| +
|
| +/*
|
| + * Empty a table without freeing anything: the lengths, flagsType and
|
| + * unicodeMask are set to zero and the table is marked as unsorted.
|
| + * The arrays and their capacities are left alone. A NULL table is ignored.
|
| + */
|
| +U_CAPI void U_EXPORT2
|
| +ucm_resetTable(UCMTable *table) {
|
| +    if(table==NULL) {
|
| +        return;
|
| +    }
|
| +
|
| +    table->mappingsLength=0;
|
| +    table->flagsType=0;
|
| +    table->unicodeMask=0;
|
| +    table->codePointsLength=0;
|
| +    table->bytesLength=0;
|
| +    table->isSorted=FALSE;
|
| +}
|
| +
|
| +/*
|
| + * Append a copy of the mapping m to a table.
|
| + *
|
| + * The mappings array grows as needed (1000 entries at first, then by a
|
| + * factor of 10); growing it discards the reverseMap.
|
| + * Code point sequences longer than 1 and byte sequences longer than 4 are
|
| + * copied into the table's codePoints and bytes arrays, and m->u or
|
| + * m->b.idx is set to their start index there. Note that this modifies
|
| + * the caller's m before it is copied.
|
| + * Also updates the table's unicodeMask and flagsType and marks the table
|
| + * as unsorted.
|
| + * Prints a message and exits the process if memory runs out or if the
|
| + * fixed-size codePoints or bytes array is full.
|
| + */
|
| +U_CAPI void U_EXPORT2
|
| +ucm_addMapping(UCMTable *table,
|
| +               UCMapping *m,
|
| +               UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
|
| +               uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
|
| +    UCMapping *slot;
|
| +    UChar32 cp;
|
| +    int32_t i, start;
|
| +
|
| +    /* make room for one more mapping */
|
| +    if(table->mappingsLength>=table->mappingsCapacity) {
|
| +        if(table->mappingsCapacity!=0) {
|
| +            table->mappingsCapacity*=10;
|
| +        } else {
|
| +            table->mappingsCapacity=1000;
|
| +        }
|
| +        table->mappings=(UCMapping *)uprv_realloc(table->mappings,
|
| +                                                  table->mappingsCapacity*sizeof(UCMapping));
|
| +        if(table->mappings==NULL) {
|
| +            fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
|
| +                    (int)table->mappingsCapacity);
|
| +            exit(U_MEMORY_ALLOCATION_ERROR);
|
| +        }
|
| +
|
| +        /* the reverseMap is now too small; the next sort allocates a new one */
|
| +        if(table->reverseMap!=NULL) {
|
| +            uprv_free(table->reverseMap);
|
| +            table->reverseMap=NULL;
|
| +        }
|
| +    }
|
| +
|
| +    /* the side arrays are allocated on first need, with a fixed capacity */
|
| +    if(m->uLen>1 && table->codePointsCapacity==0) {
|
| +        table->codePointsCapacity=10000;
|
| +        table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
|
| +        if(table->codePoints==NULL) {
|
| +            fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
|
| +                    (int)table->codePointsCapacity);
|
| +            exit(U_MEMORY_ALLOCATION_ERROR);
|
| +        }
|
| +    }
|
| +
|
| +    if(m->bLen>4 && table->bytesCapacity==0) {
|
| +        table->bytesCapacity=10000;
|
| +        table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
|
| +        if(table->bytes==NULL) {
|
| +            fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
|
| +                    (int)table->bytesCapacity);
|
| +            exit(U_MEMORY_ALLOCATION_ERROR);
|
| +        }
|
| +    }
|
| +
|
| +    /* store a multi-code point sequence out of line */
|
| +    if(m->uLen>1) {
|
| +        start=table->codePointsLength;
|
| +        table->codePointsLength+=m->uLen;
|
| +        if(table->codePointsLength>table->codePointsCapacity) {
|
| +            fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
|
| +            exit(U_MEMORY_ALLOCATION_ERROR);
|
| +        }
|
| +
|
| +        uprv_memcpy(table->codePoints+start, codePoints, m->uLen*4);
|
| +        m->u=start;
|
| +    }
|
| +
|
| +    /* store a byte sequence of more than 4 bytes out of line */
|
| +    if(m->bLen>4) {
|
| +        start=table->bytesLength;
|
| +        table->bytesLength+=m->bLen;
|
| +        if(table->bytesLength>table->bytesCapacity) {
|
| +            fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
|
| +            exit(U_MEMORY_ALLOCATION_ERROR);
|
| +        }
|
| +
|
| +        uprv_memcpy(table->bytes+start, bytes, m->bLen);
|
| +        m->b.idx=start;
|
| +    }
|
| +
|
| +    /* record whether supplementary or surrogate code points occur */
|
| +    for(i=0; i<m->uLen; ++i) {
|
| +        cp=codePoints[i];
|
| +        if(cp>=0x10000) {
|
| +            table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY;
|
| +        } else if(U_IS_SURROGATE(cp)) {
|
| +            table->unicodeMask|=UCNV_HAS_SURROGATES;
|
| +        }
|
| +    }
|
| +
|
| +    /* record whether mappings with and without precision flags occur */
|
| +    if(m->f>=0) {
|
| +        table->flagsType|=UCM_FLAGS_EXPLICIT;
|
| +    } else {
|
| +        table->flagsType|=UCM_FLAGS_IMPLICIT;
|
| +    }
|
| +
|
| +    /* copy the mapping into the next free slot */
|
| +    slot=table->mappings+table->mappingsLength;
|
| +    ++table->mappingsLength;
|
| +    uprv_memcpy(slot, m, sizeof(UCMapping));
|
| +
|
| +    table->isSorted=FALSE;
|
| +}
|
| +
|
| +/*
|
| + * Allocate a new UCMFile with empty base and extension tables
|
| + * and with initial values in its states.
|
| + * Prints a message and exits the process if the allocation fails.
|
| + */
|
| +U_CAPI UCMFile * U_EXPORT2
|
| +ucm_open() {
|
| +    UCMFile *file;
|
| +
|
| +    file=(UCMFile *)uprv_malloc(sizeof(UCMFile));
|
| +    if(file==NULL) {
|
| +        fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
|
| +        exit(U_MEMORY_ALLOCATION_ERROR);
|
| +    }
|
| +
|
| +    /* start from all zeroes, then set what must not be zero */
|
| +    memset(file, 0, sizeof(UCMFile));
|
| +
|
| +    file->base=ucm_openTable();
|
| +    file->ext=ucm_openTable();
|
| +
|
| +    file->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
|
| +    file->states.conversionType=UCNV_UNSUPPORTED_CONVERTER;
|
| +    file->states.outputType=-1;
|
| +    file->states.maxCharLength=1;
|
| +    file->states.minCharLength=1;
|
| +
|
| +    return file;
|
| +}
|
| +
|
| +/*
|
| + * Free a UCMFile together with its base and extension tables.
|
| + * A NULL ucm is ignored.
|
| + */
|
| +U_CAPI void U_EXPORT2
|
| +ucm_close(UCMFile *ucm) {
|
| +    if(ucm!=NULL) {
|
| +        /*
|
| +         * The tables were created with ucm_openTable() and own their
|
| +         * mappings, codePoints, bytes and reverseMap arrays.
|
| +         * ucm_closeTable() frees those arrays as well as the table itself;
|
| +         * a plain uprv_free() of the table would leak them.
|
| +         */
|
| +        ucm_closeTable(ucm->base);
|
| +        ucm_closeTable(ucm->ext);
|
| +        uprv_free(ucm);
|
| +    }
|
| +}
|
| +
|
| +/*
|
| + * Classify a mapping:
|
| + * returns -1 if its byte sequence is illegal (ucm_countChars()<1),
|
| + * 0 if it is suitable for a base table,
|
| + * 1 if it needs to go into an extension table.
|
| + * The codePoints parameter is not used.
|
| + *
|
| + * Suitable for an ICU conversion base table means:
|
| + * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
|
| + * - SBCS: any 1:1 mapping
|
| + *   (the table stores additional bits to distinguish mapping types)
|
| + * - MBCS: not a |2 SUB mapping for <subchar1>
|
| + * - MBCS: not a |1 fallback to 0x00
|
| + * - MBCS: not a multi-byte mapping with leading 0x00 bytes
|
| + *
|
| + * Further restrictions for fromUnicode tables
|
| + * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
|
| + *
|
| + * All of the MBCS fromUnicode specific tests could be removed from here,
|
| + * but the ones above are for unusual mappings, and removing the tests
|
| + * from here would change canonucm output which seems gratuitous.
|
| + * (Markus Scherer 2006-nov-28)
|
| + *
|
| + * Exception: All implicit mappings (f<0) that need to be moved
|
| + * because of fromUnicode restrictions _must_ be moved here because
|
| + * makeconv uses a hack for moving mappings only for the fromUnicode table
|
| + * that only works with non-negative values of f.
|
| + */
|
| +U_CAPI int32_t U_EXPORT2
|
| +ucm_mappingType(UCMStates *baseStates,
|
| +                UCMapping *m,
|
| +                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
|
| +                uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
|
| +    int32_t charCount;
|
| +
|
| +    /* check validity of the bytes and count the characters in them */
|
| +    charCount=ucm_countChars(baseStates, bytes, m->bLen);
|
| +    if(charCount<1) {
|
| +        return -1;
|
| +    }
|
| +
|
| +    /* anything but 1:1 goes into an extension table */
|
| +    if(m->uLen!=1 || charCount!=1) {
|
| +        return 1;
|
| +    }
|
| +
|
| +    /* SBCS: any 1:1 mapping is fine */
|
| +    if(baseStates->maxCharLength==1) {
|
| +        return 0;
|
| +    }
|
| +
|
| +    /* MBCS: the three exclusions listed above */
|
| +    if(m->f==2 && m->bLen==1) {
|
| +        return 1;
|
| +    }
|
| +    if(m->f==1 && bytes[0]==0) {
|
| +        return 1;
|
| +    }
|
| +    if(m->f<=1 && m->bLen>1 && bytes[0]==0) {
|
| +        return 1;
|
| +    }
|
| +
|
| +    return 0;
|
| +}
|
| +
|
| +/*
|
| + * Add a mapping to the base or the extension table of ucm.
|
| + * The mapping goes into the base table only if forBase is set and
|
| + * ucm_mappingType() finds it suitable; without baseStates it always goes
|
| + * into the extension table.
|
| + * Returns FALSE, after printing the mapping to stderr, for a |2 mapping
|
| + * from multiple code points or for an illegal byte sequence.
|
| + */
|
| +U_CAPI UBool U_EXPORT2
|
| +ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
|
| +                   UCMapping *m,
|
| +                   UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
|
| +                   uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
|
| +    UCMTable *target;
|
| +    int32_t kind;
|
| +
|
| +    if(m->f==2 && m->uLen>1) {
|
| +        fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
|
| +        printMapping(m, codePoints, bytes, stderr);
|
| +        return FALSE;
|
| +    }
|
| +
|
| +    /*
|
| +     * Without baseStates the bytes cannot be checked; this is the case of
|
| +     * adding a mapping for an extension-only table before its base table
|
| +     * is read, and the mapping is treated as an extension mapping.
|
| +     */
|
| +    kind=1;
|
| +    if(baseStates!=NULL) {
|
| +        kind=ucm_mappingType(baseStates, m, codePoints, bytes);
|
| +        if(kind<0) {
|
| +            /* illegal byte sequence */
|
| +            printMapping(m, codePoints, bytes, stderr);
|
| +            return FALSE;
|
| +        }
|
| +    }
|
| +
|
| +    if(forBase && kind==0) {
|
| +        target=ucm->base;
|
| +    } else {
|
| +        target=ucm->ext;
|
| +    }
|
| +    ucm_addMapping(target, m, codePoints, bytes);
|
| +
|
| +    return TRUE;
|
| +}
|
| +
|
| +/*
|
| + * Parse one line and add its mapping to ucm.
|
| + * A line that starts with '#', or that contains nothing but white space
|
| + * before its end or a CR/LF, is ignored and counts as success.
|
| + * Returns FALSE if parsing or adding fails.
|
| + */
|
| +U_CAPI UBool U_EXPORT2
|
| +ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
|
| +    UCMapping m={ 0 };
|
| +    UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
|
| +    uint8_t bytes[UCNV_EXT_MAX_BYTES];
|
| +
|
| +    const char *firstChar;
|
| +
|
| +    /* comment line */
|
| +    if(line[0]=='#') {
|
| +        return TRUE;
|
| +    }
|
| +
|
| +    /* empty line */
|
| +    firstChar=u_skipWhitespace(line);
|
| +    if(*firstChar==0 || *firstChar=='\n' || *firstChar=='\r') {
|
| +        return TRUE;
|
| +    }
|
| +
|
| +    /* the parser gets the whole line, including leading white space */
|
| +    if(!ucm_parseMappingLine(&m, codePoints, bytes, line)) {
|
| +        return FALSE;
|
| +    }
|
| +    return ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
|
| +}
|
| +
|
| +/*
|
| + * Read mapping lines from convFile up to a line "END CHARMAP"
|
| + * and add their mappings to ucm.
|
| + * Does nothing if *pErrorCode already indicates a failure.
|
| + * Sets *pErrorCode to U_INVALID_TABLE_FORMAT if the file ends before
|
| + * "END CHARMAP" or if any line could not be added; reading continues
|
| + * after a bad line.
|
| + */
|
| +U_CAPI void U_EXPORT2
|
| +ucm_readTable(UCMFile *ucm, FileStream* convFile,
|
| +              UBool forBase, UCMStates *baseStates,
|
| +              UErrorCode *pErrorCode) {
|
| +    char line[500];
|
| +    char *tail;
|
| +    UBool allGood;
|
| +
|
| +    if(U_FAILURE(*pErrorCode)) {
|
| +        return;
|
| +    }
|
| +
|
| +    allGood=TRUE;
|
| +
|
| +    for(;;) {
|
| +        if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
|
| +            fprintf(stderr, "incomplete charmap section\n");
|
| +            allGood=FALSE;
|
| +            break;
|
| +        }
|
| +
|
| +        /* cut off trailing CR and LF characters */
|
| +        tail=uprv_strchr(line, 0);
|
| +        while(line<tail && (tail[-1]=='\r' || tail[-1]=='\n')) {
|
| +            --tail;
|
| +        }
|
| +        *tail=0;
|
| +
|
| +        /* ignore empty and comment lines */
|
| +        if(line[0]==0 || line[0]=='#') {
|
| +            continue;
|
| +        }
|
| +
|
| +        /* the mapping table ends here */
|
| +        if(0==uprv_strcmp(line, "END CHARMAP")) {
|
| +            break;
|
| +        }
|
| +
|
| +        if(!ucm_addMappingFromLine(ucm, line, forBase, baseStates)) {
|
| +            allGood=FALSE;
|
| +        }
|
| +    }
|
| +
|
| +    if(!allGood) {
|
| +        *pErrorCode=U_INVALID_TABLE_FORMAT;
|
| +    }
|
| +}
|
| +#endif
|
|
|
| Property changes on: icu46/source/tools/toolutil/ucm.c
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|