icu46/source/common/unames.c - Issue 5516007: Check in the pristine copy of ICU 4.6...

Unified Diff: icu46/source/common/unames.c

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu46/source/common/unames.c

===================================================================

--- icu46/source/common/unames.c (revision 0)

+++ icu46/source/common/unames.c (revision 0)

@@ -0,0 +1,2113 @@

+/*

+******************************************************************************

+* file name: unames.c

+* encoding: US-ASCII

+* tab size: 8 (not used)

+* indentation:4

+* created on: 1999oct04

+* created by: Markus W. Scherer

+*/

+#include "unicode/utypes.h"

+#include "unicode/putil.h"

+#include "unicode/uchar.h"

+#include "unicode/udata.h"

+#include "ustr_imp.h"

+#include "umutex.h"

+#include "cmemory.h"

+#include "cstring.h"

+#include "ucln_cmn.h"

+#include "udataswp.h"

+#include "uprops.h"

+/* prototypes ------------------------------------------------------------- */

/* Element count of a true array (do not use on pointers or array parameters). */
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))

/* Name and type of the data item; passed to udata_openChoice() in isDataLoaded(). */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";

/*
 * Names are stored in groups of LINES_PER_GROUP consecutive code points:
 * code>>GROUP_SHIFT selects the group, code&GROUP_MASK the line within it.
 */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP-1)

/*
 * This struct was replaced by explicitly accessing equivalent
 * fields from triples of uint16_t.
 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
 * would advance by 6 bytes (3 uint16_t).
 *
 * We can't just change the data structure because it's loaded from a data file,
 * and we don't want to make it less compact, so we changed the access code.
 *
 * For details see ICU tickets 6331 and 6008.
typedef struct {
    uint16_t groupMSB,
             offsetHigh, offsetLow; / * avoid padding * /
} Group;
 */

/* Indexes of the fields inside one group triple of uint16_t. */
enum {
    GROUP_MSB,          /* code point >> GROUP_SHIFT of this group */
    GROUP_OFFSET_HIGH,  /* upper 16 bits of the group's string offset */
    GROUP_OFFSET_LOW,   /* lower 16 bits of the group's string offset */
    GROUP_LENGTH        /* number of uint16_t per group */
};

/*
 * Get the 32-bit group offset.
 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
 * @return group offset (int32_t)
 */
#define GET_GROUP_OFFSET(group) ((int32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW])

/* Step forward/backward by one group triple. */
#define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
#define PREV_GROUP(group) ((group)-GROUP_LENGTH)

/*
 * Header of one algorithmic-name range.
 * The range-specific data (prefix string, factors, suffix elements) directly
 * follows this header in memory; see getAlgName().
 */
typedef struct {
    uint32_t start, end;    /* first and last code point of the range (inclusive) */
    uint8_t type, variant;  /* type 0: variant=number of hex digits; type 1: variant=number of factors */
    uint16_t size;          /* byte distance from this header to the next range header */
} AlgorithmicRange;

/*
 * Index header at the start of the loaded data.
 * All offsets are byte offsets relative to the start of this struct.
 */
typedef struct {
    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
} UCharNames;

/*
 * Get the groups table from a UCharNames struct.
 * The groups table consists of one uint16_t groupCount followed by
 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
 * and the comment for the old struct Group above.
 *
 * The argument and the whole expansion are parenthesized so that the macro
 * stays correct for any argument expression and in any surrounding expression.
 *
 * @param names (const UCharNames *) pointer to the UCharNames indexes
 * @return (const uint16_t *) pointer to the groups table
 */
#define GET_GROUPS(names) ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))

+typedef struct {

+ const char *otherName;

+ UChar32 code;

+} FindName;

+#define DO_FIND_NAME NULL

+static UDataMemory *uCharNamesData=NULL;

+static UCharNames *uCharNames=NULL;

+static UErrorCode gLoadErrorCode=U_ZERO_ERROR;

+/*

+ * Maximum length of character names (regular & 1.0).

+ */

+static int32_t gMaxNameLength=0;

+/*

+ * Set of chars used in character names (regular & 1.0).

+ * Chars are platform-dependent (can be EBCDIC).

+ */

+static uint32_t gNameSet[8]={ 0 };

+#define U_NONCHARACTER_CODE_POINT U_CHAR_CATEGORY_COUNT

+#define U_LEAD_SURROGATE U_CHAR_CATEGORY_COUNT + 1

+#define U_TRAIL_SURROGATE U_CHAR_CATEGORY_COUNT + 2

+#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)

+static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {

+ "unassigned",

+ "uppercase letter",

+ "lowercase letter",

+ "titlecase letter",

+ "modifier letter",

+ "other letter",

+ "non spacing mark",

+ "enclosing mark",

+ "combining spacing mark",

+ "decimal digit number",

+ "letter number",

+ "other number",

+ "space separator",

+ "line separator",

+ "paragraph separator",

+ "control",

+ "format",

+ "private use area",

+ "surrogate",

+ "dash punctuation",

+ "start punctuation",

+ "end punctuation",

+ "connector punctuation",

+ "other punctuation",

+ "math symbol",

+ "currency symbol",

+ "modifier symbol",

+ "other symbol",

+ "initial punctuation",

+ "final punctuation",

+ "noncharacter",

+ "lead surrogate",

+ "trail surrogate"

+};

+/* implementation ----------------------------------------------------------- */

+static UBool U_CALLCONV unames_cleanup(void)

+ if(uCharNamesData) {

+ udata_close(uCharNamesData);

+ uCharNamesData = NULL;

+ }

+ if(uCharNames) {

+ uCharNames = NULL;

+ }

+ gMaxNameLength=0;

+ return TRUE;

+static UBool U_CALLCONV

+isAcceptable(void *context,

+ const char *type, const char *name,

+ const UDataInfo *pInfo) {

+ return (UBool)(

+ pInfo->size>=20 &&

+ pInfo->isBigEndian==U_IS_BIG_ENDIAN &&

+ pInfo->charsetFamily==U_CHARSET_FAMILY &&

+ pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */

+ pInfo->dataFormat[1]==0x6e &&

+ pInfo->dataFormat[2]==0x61 &&

+ pInfo->dataFormat[3]==0x6d &&

+ pInfo->formatVersion[0]==1);

+static UBool

+isDataLoaded(UErrorCode *pErrorCode) {

+ /* load UCharNames from file if necessary */

+ UBool isCached;

+ /* do this because double-checked locking is broken */

+ UMTX_CHECK(NULL, (uCharNames!=NULL), isCached);

+ if(!isCached) {

+ UCharNames *names;

+ UDataMemory *data;

+ /* check error code from previous attempt */

+ if(U_FAILURE(gLoadErrorCode)) {

+ *pErrorCode=gLoadErrorCode;

+ return FALSE;

+ }

+ /* open the data outside the mutex block */

+ data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode);

+ if(U_FAILURE(*pErrorCode)) {

+ gLoadErrorCode=*pErrorCode;

+ return FALSE;

+ }

+ names=(UCharNames *)udata_getMemory(data);

+ /* in the mutex block, set the data for this process */

+ {

+ umtx_lock(NULL);

+ if(uCharNames==NULL) {

+ uCharNamesData=data;

+ uCharNames=names;

+ data=NULL;

+ names=NULL;

+ ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);

+ }

+ umtx_unlock(NULL);

+ }

+ /* if a different thread set it first, then close the extra data */

+ if(data!=NULL) {

+ udata_close(data); /* NULL if it was set correctly */

+ }

+ return TRUE;

/*
 * Append one char to an output buffer with preflighting:
 * c is stored and buffer/bufferLength are advanced only while there is
 * room left, but bufferPos is always incremented, so it ends up as the
 * length that the complete output needs.
 * buffer, bufferLength and bufferPos must be modifiable lvalues.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
    if((bufferLength)>0) { \
        *(buffer)++=(c); \
        --(bufferLength); \
    } \
    ++(bufferPos); \
}

/* Internal name choice value one past the public UCharNameChoice values. */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT

+/*

+ * Important: expandName() and compareName() are almost the same -

+ * apply fixes to both.

+ *

+ * UnicodeData.txt uses ';' as a field separator, so no

+ * field can contain ';' as part of its contents.

+ * In unames.dat, it is marked as token[';']==-1 only if the

+ * semicolon is used in the data file - which is iff we

+ * have Unicode 1.0 names or ISO comments or aliases.

+ * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases

+ * although we know that it will never be part of a name.

+ */

+static uint16_t

+expandName(UCharNames *names,

+ const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,

+ char *buffer, uint16_t bufferLength) {

+ uint16_t *tokens=(uint16_t *)names+8;

+ uint16_t token, tokenCount=*tokens++, bufferPos=0;

+ uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;

+ uint8_t c;

+ if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {

+ /*

+ * skip the modern name if it is not requested _and_

+ * if the semicolon byte value is a character, not a token number

+ */

+ if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {

+ int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;

+ do {

+ while(nameLength>0) {

+ --nameLength;

+ if(*name++==';') {

+ break;

+ }

+ } while(--fieldIndex>0);

+ } else {

+ /*

+ * the semicolon byte value is a token number, therefore

+ * only modern names are stored in unames.dat and there is no

+ * such requested alternate name here

+ */

+ nameLength=0;

+ }

+ /* write each letter directly, and write a token word per token */

+ while(nameLength>0) {

+ --nameLength;

+ c=*name++;

+ if(c>=tokenCount) {

+ if(c!=';') {

+ /* implicit letter */

+ WRITE_CHAR(buffer, bufferLength, bufferPos, c);

+ } else {

+ /* finished */

+ break;

+ }

+ } else {

+ token=tokens[c];

+ if(token==(uint16_t)(-2)) {

+ /* this is a lead byte for a double-byte token */

+ token=tokens[c<<8|*name++];

+ --nameLength;

+ }

+ if(token==(uint16_t)(-1)) {

+ if(c!=';') {

+ /* explicit letter */

+ WRITE_CHAR(buffer, bufferLength, bufferPos, c);

+ } else {

+ /* stop, but skip the semicolon if we are seeking

+ extended names and there was no 2.0 name but there

+ is a 1.0 name. */

+ if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {

+ if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {

+ continue;

+ }

+ /* finished */

+ break;

+ }

+ } else {

+ /* write token word */

+ uint8_t *tokenString=tokenStrings+token;

+ while((c=*tokenString++)!=0) {

+ WRITE_CHAR(buffer, bufferLength, bufferPos, c);

+ }

+ /* zero-terminate */

+ if(bufferLength>0) {

+ *buffer=0;

+ }

+ return bufferPos;

+/*

+ * compareName() is almost the same as expandName() except that it compares

+ * the currently expanded name to an input name.

+ * It returns the match/no match result as soon as possible.

+ */

+static UBool

+compareName(UCharNames *names,

+ const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,

+ const char *otherName) {

+ uint16_t *tokens=(uint16_t *)names+8;

+ uint16_t token, tokenCount=*tokens++;

+ uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;

+ uint8_t c;

+ const char *origOtherName = otherName;

+ if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {

+ /*

+ * skip the modern name if it is not requested _and_

+ * if the semicolon byte value is a character, not a token number

+ */

+ if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {

+ int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;

+ do {

+ while(nameLength>0) {

+ --nameLength;

+ if(*name++==';') {

+ break;

+ }

+ } while(--fieldIndex>0);

+ } else {

+ /*

+ * the semicolon byte value is a token number, therefore

+ * only modern names are stored in unames.dat and there is no

+ * such requested alternate name here

+ */

+ nameLength=0;

+ }

+ /* compare each letter directly, and compare a token word per token */

+ while(nameLength>0) {

+ --nameLength;

+ c=*name++;

+ if(c>=tokenCount) {

+ if(c!=';') {

+ /* implicit letter */

+ if((char)c!=*otherName++) {

+ return FALSE;

+ }

+ } else {

+ /* finished */

+ break;

+ }

+ } else {

+ token=tokens[c];

+ if(token==(uint16_t)(-2)) {

+ /* this is a lead byte for a double-byte token */

+ token=tokens[c<<8|*name++];

+ --nameLength;

+ }

+ if(token==(uint16_t)(-1)) {

+ if(c!=';') {

+ /* explicit letter */

+ if((char)c!=*otherName++) {

+ return FALSE;

+ }

+ } else {

+ /* stop, but skip the semicolon if we are seeking

+ extended names and there was no 2.0 name but there

+ is a 1.0 name. */

+ if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {

+ if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {

+ continue;

+ }

+ /* finished */

+ break;

+ }

+ } else {

+ /* write token word */

+ uint8_t *tokenString=tokenStrings+token;

+ while((c=*tokenString++)!=0) {

+ if((char)c!=*otherName++) {

+ return FALSE;

+ }

+ /* complete match? */

+ return (UBool)(*otherName==0);

+static uint8_t getCharCat(UChar32 cp) {

+ uint8_t cat;

+ if (UTF_IS_UNICODE_NONCHAR(cp)) {

+ return U_NONCHARACTER_CODE_POINT;

+ }

+ if ((cat = u_charType(cp)) == U_SURROGATE) {

+ cat = UTF_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;

+ }

+ return cat;

+static const char *getCharCatName(UChar32 cp) {

+ uint8_t cat = getCharCat(cp);

+ /* Return unknown if the table of names above is not up to

+ date. */

+ if (cat >= LENGTHOF(charCatNames)) {

+ return "unknown";

+ } else {

+ return charCatNames[cat];

+ }

/*
 * Write the extended name "<category-XXXX>" of a code point, where XXXX
 * is the code point in uppercase hexadecimal with at least 4 digits.
 *
 * Every character goes through WRITE_CHAR, so nothing is ever stored
 * beyond bufferLength. (The previous digit loop stored through
 * buffer[--i] starting at ndigits while limited only by the remaining
 * capacity, which wrote past the end of a buffer shorter than the name.)
 * The output is not NUL-terminated here; callers do that.
 *
 * @param code         the code point
 * @param buffer       output buffer
 * @param bufferLength capacity of buffer
 * @return length of the full extended name, even if it did not fit
 */
static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
    const char *catname = getCharCatName(code);
    const char *p;
    uint16_t length = 0;
    uint32_t rest;
    int ndigits, i;

    WRITE_CHAR(buffer, bufferLength, length, '<');
    for (p = catname; *p != 0; ++p) {
        WRITE_CHAR(buffer, bufferLength, length, *p);
    }
    WRITE_CHAR(buffer, bufferLength, length, '-');

    /* count the hex digits; unsigned so that the shift always reaches 0 */
    ndigits = 0;
    for (rest = code; rest != 0; rest >>= 4) {
        ++ndigits;
    }
    if (ndigits < 4) {
        ndigits = 4;
    }

    /* write the digits from the most significant one down */
    for (i = ndigits; i > 0;) {
        uint8_t v = (uint8_t)((code >> (4 * --i)) & 0xf);
        char digit = (char)(v < 10 ? '0' + v : 'A' + v - 10);
        WRITE_CHAR(buffer, bufferLength, length, digit);
    }

    WRITE_CHAR(buffer, bufferLength, length, '>');

    return length;
}

+/*

+ * getGroup() does a binary search for the group that contains the

+ * Unicode code point "code".

+ * The return value is always a valid Group* that may contain "code"

+ * or else is the highest group before "code".

+ * If the lowest group is after "code", then that one is returned.

+ */

+static const uint16_t *

+getGroup(UCharNames *names, uint32_t code) {

+ const uint16_t *groups=GET_GROUPS(names);

+ uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),

+ start=0,

+ limit=*groups++,

+ number;

+ /* binary search for the group of names that contains the one for code */

+ while(start<limit-1) {

+ number=(uint16_t)((start+limit)/2);

+ if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {

+ limit=number;

+ } else {

+ start=number;

+ }

+ /* return this regardless of whether it is an exact match */

+ return groups+start*GROUP_LENGTH;

+/*

+ * expandGroupLengths() reads a block of compressed lengths of 32 strings and

+ * expands them into offsets and lengths for each string.

+ * Lengths are stored with a variable-width encoding in consecutive nibbles:

+ * If a nibble<0xc, then it is the length itself (0=empty string).

+ * If a nibble>=0xc, then it forms a length value with the following nibble.

+ * Calculation see below.

+ * The offsets and lengths arrays must be at least 33 (one more) long because

+ * there is no check here at the end if the last nibble is still used.

+ */

+static const uint8_t *

+expandGroupLengths(const uint8_t *s,

+ uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {

+ /* read the lengths of the 32 strings in this group and get each string's offset */

+ uint16_t i=0, offset=0, length=0;

+ uint8_t lengthByte;

+ /* all 32 lengths must be read to get the offset of the first group string */

+ while(i<LINES_PER_GROUP) {

+ lengthByte=*s++;

+ /* read even nibble - MSBs of lengthByte */

+ if(length>=12) {

+ /* double-nibble length spread across two bytes */

+ length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);

+ lengthByte&=0xf;

+ } else if((lengthByte /* &0xf0 */)>=0xc0) {

+ /* double-nibble length spread across this one byte */

+ length=(uint16_t)((lengthByte&0x3f)+12);

+ } else {

+ /* single-nibble length in MSBs */

+ length=(uint16_t)(lengthByte>>4);

+ lengthByte&=0xf;

+ }

+ *offsets++=offset;

+ *lengths++=length;

+ offset+=length;

+ ++i;

+ /* read odd nibble - LSBs of lengthByte */

+ if((lengthByte&0xf0)==0) {

+ /* this nibble was not consumed for a double-nibble length above */

+ length=lengthByte;

+ if(length<12) {

+ /* single-nibble length in LSBs */

+ *offsets++=offset;

+ *lengths++=length;

+ offset+=length;

+ ++i;

+ }

+ } else {

+ length=0; /* prevent double-nibble detection in the next iteration */

+ }

+ /* now, s is at the first group string */

+ return s;

+static uint16_t

+expandGroupName(UCharNames *names, const uint16_t *group,

+ uint16_t lineNumber, UCharNameChoice nameChoice,

+ char *buffer, uint16_t bufferLength) {

+ uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];

+ const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);

+ s=expandGroupLengths(s, offsets, lengths);

+ return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,

+ buffer, bufferLength);

+static uint16_t

+getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,

+ char *buffer, uint16_t bufferLength) {

+ const uint16_t *group=getGroup(names, code);

+ if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {

+ return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,

+ buffer, bufferLength);

+ } else {

+ /* group not found */

+ /* zero-terminate */

+ if(bufferLength>0) {

+ *buffer=0;

+ }

+ return 0;

+ }

+/*

+ * enumGroupNames() enumerates all the names in a 32-group

+ * and either calls the enumerator function or finds a given input name.

+ */

+static UBool

+enumGroupNames(UCharNames *names, const uint16_t *group,

+ UChar32 start, UChar32 end,

+ UEnumCharNamesFn *fn, void *context,

+ UCharNameChoice nameChoice) {

+ uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];

+ const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);

+ s=expandGroupLengths(s, offsets, lengths);

+ if(fn!=DO_FIND_NAME) {

+ char buffer[200];

+ uint16_t length;

+ while(start<=end) {

+ length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));

+ if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {

+ buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;

+ }

+ /* here, we assume that the buffer is large enough */

+ if(length>0) {

+ if(!fn(context, start, nameChoice, buffer, length)) {

+ return FALSE;

+ }

+ ++start;

+ }

+ } else {

+ const char *otherName=((FindName *)context)->otherName;

+ while(start<=end) {

+ if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {

+ ((FindName *)context)->code=start;

+ return FALSE;

+ }

+ ++start;

+ }

+ return TRUE;

+/*

+ * enumExtNames enumerate extended names.

+ * It only needs to do it if it is called with a real function and not

+ * with the dummy DO_FIND_NAME, because u_charFromName() does a check

+ * for extended names by itself.

+ */

+static UBool

+enumExtNames(UChar32 start, UChar32 end,

+ UEnumCharNamesFn *fn, void *context)

+ if(fn!=DO_FIND_NAME) {

+ char buffer[200];

+ uint16_t length;

+ while(start<=end) {

+ buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;

+ /* here, we assume that the buffer is large enough */

+ if(length>0) {

+ if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {

+ return FALSE;

+ }

+ ++start;

+ }

+ return TRUE;

+static UBool

+enumNames(UCharNames *names,

+ UChar32 start, UChar32 limit,

+ UEnumCharNamesFn *fn, void *context,

+ UCharNameChoice nameChoice) {

+ uint16_t startGroupMSB, endGroupMSB, groupCount;

+ const uint16_t *group, *groupLimit;

+ startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);

+ endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);

+ /* find the group that contains start, or the highest before it */

+ group=getGroup(names, start);

+ if(startGroupMSB==endGroupMSB) {

+ if(startGroupMSB==group[GROUP_MSB]) {

+ /* if start and limit-1 are in the same group, then enumerate only in that one */

+ return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);

+ }

+ } else {

+ const uint16_t *groups=GET_GROUPS(names);

+ groupCount=*groups++;

+ groupLimit=groups+groupCount*GROUP_LENGTH;

+ if(startGroupMSB==group[GROUP_MSB]) {

+ /* enumerate characters in the partial start group */

+ if((start&GROUP_MASK)!=0) {

+ if(!enumGroupNames(names, group,

+ start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,

+ fn, context, nameChoice)) {

+ return FALSE;

+ }

+ group=NEXT_GROUP(group); /* continue with the next group */

+ }

+ } else if(startGroupMSB>group[GROUP_MSB]) {

+ /* make sure that we start enumerating with the first group after start */

+ const uint16_t *nextGroup=NEXT_GROUP(group);

+ if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {

+ UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;

+ if (end > limit) {

+ end = limit;

+ }

+ if (!enumExtNames(start, end - 1, fn, context)) {

+ return FALSE;

+ }

+ group=nextGroup;

+ }

+ /* enumerate entire groups between the start- and end-groups */

+ while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {

+ const uint16_t *nextGroup;

+ start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;

+ if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {

+ return FALSE;

+ }

+ nextGroup=NEXT_GROUP(group);

+ if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {

+ UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;

+ if (end > limit) {

+ end = limit;

+ }

+ if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {

+ return FALSE;

+ }

+ group=nextGroup;

+ }

+ /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */

+ if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {

+ return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);

+ } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {

+ UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;

+ if (next > start) {

+ start = next;

+ }

+ } else {

+ return TRUE;

+ }

+ /* we have not found a group, which means everything is made of

+ extended names. */

+ if (nameChoice == U_EXTENDED_CHAR_NAME) {

+ if (limit > UCHAR_MAX_VALUE + 1) {

+ limit = UCHAR_MAX_VALUE + 1;

+ }

+ return enumExtNames(start, limit - 1, fn, context);

+ }

+ return TRUE;

/*
 * Write the factorized suffix of an algorithmic name (range type 1).
 * code is split into one index per factor by modulo arithmetic; each index
 * selects one of the NUL-terminated element strings stored for that factor
 * in s, and the selected elements are concatenated into buffer.
 *
 * @param factors      the factors of this range
 * @param count        number of factors
 * @param s            the element strings of all factors, in factor order
 * @param code         code point offset from the start of the range
 * @param indexes      output: the index chosen for each factor
 * @param elementBases output, may be NULL: first element string of each factor
 * @param elements     output, may be NULL: selected element string of each factor
 * @param buffer       output buffer; NUL-terminated if there is room
 * @param bufferLength capacity of buffer
 * @return length of the full suffix, even if it did not fit
 */
static uint16_t
writeFactorSuffix(const uint16_t *factors, uint16_t count,
                  const char *s, /* suffix elements */
                  uint32_t code,
                  uint16_t indexes[8], /* output fields from here */
                  const char *elementBases[8], const char *elements[8],
                  char *buffer, uint16_t bufferLength) {
    uint16_t i, factor, bufferPos=0;
    char c;

    /* write elements according to the factors */

    /*
     * the factorized elements are determined by modulo arithmetic
     * with the factors of this algorithm
     *
     * note that for fewer operations, count is decremented here
     */
    --count;
    for(i=count; i>0; --i) {
        factor=factors[i];
        indexes[i]=(uint16_t)(code%factor);
        code/=factor;
    }
    /*
     * we don't need to calculate the last modulus because start<=code<=end
     * guarantees here that code<=factors[0]
     */
    indexes[0]=(uint16_t)code;

    /* write each element; i==0 here after the loop above */
    for(;;) {
        if(elementBases!=NULL) {
            *elementBases++=s;
        }

        /* skip indexes[i] strings */
        factor=indexes[i];
        while(factor>0) {
            while(*s++!=0) {}
            --factor;
        }
        if(elements!=NULL) {
            *elements++=s;
        }

        /* write element */
        while((c=*s++)!=0) {
            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
        }

        /* we do not need to perform the rest of this loop for i==count - break here */
        if(i>=count) {
            break;
        }

        /* skip the rest of the strings for this factors[i] */
        factor=(uint16_t)(factors[i]-indexes[i]-1);
        while(factor>0) {
            while(*s++!=0) {}
            --factor;
        }

        ++i;
    }

    /* zero-terminate */
    if(bufferLength>0) {
        *buffer=0;
    }

    return bufferPos;
}

+/*

+ * Important:

+ * Parts of findAlgName() are almost the same as some of getAlgName().

+ * Fixes must be applied to both.

+ */

+static uint16_t

+getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,

+ char *buffer, uint16_t bufferLength) {

+ uint16_t bufferPos=0;

+ /* Only the normative character name can be algorithmic. */

+ if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {

+ /* zero-terminate */

+ if(bufferLength>0) {

+ *buffer=0;

+ }

+ return 0;

+ }

+ switch(range->type) {

+ case 0: {

+ /* name = prefix hex-digits */

+ const char *s=(const char *)(range+1);

+ char c;

+ uint16_t i, count;

+ /* copy prefix */

+ while((c=*s++)!=0) {

+ WRITE_CHAR(buffer, bufferLength, bufferPos, c);

+ }

+ /* write hexadecimal code point value */

+ count=range->variant;

+ /* zero-terminate */

+ if(count<bufferLength) {

+ buffer[count]=0;

+ }

+ for(i=count; i>0;) {

+ if(--i<bufferLength) {

+ c=(char)(code&0xf);

+ if(c<10) {

+ c+='0';

+ } else {

+ c+='A'-10;

+ }

+ buffer[i]=c;

+ }

+ code>>=4;

+ }

+ bufferPos+=count;

+ break;

+ }

+ case 1: {

+ /* name = prefix factorized-elements */

+ uint16_t indexes[8];

+ const uint16_t *factors=(const uint16_t *)(range+1);

+ uint16_t count=range->variant;

+ const char *s=(const char *)(factors+count);

+ char c;

+ /* copy prefix */

+ while((c=*s++)!=0) {

+ WRITE_CHAR(buffer, bufferLength, bufferPos, c);

+ }

+ bufferPos+=writeFactorSuffix(factors, count,

+ s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);

+ break;

+ }

+ default:

+ /* undefined type */

+ /* zero-terminate */

+ if(bufferLength>0) {

+ *buffer=0;

+ }

+ break;

+ }

+ return bufferPos;

+/*

+ * Important: enumAlgNames() and findAlgName() are almost the same.

+ * Any fix must be applied to both.

+ */

+static UBool

+enumAlgNames(AlgorithmicRange *range,

+ UChar32 start, UChar32 limit,

+ UEnumCharNamesFn *fn, void *context,

+ UCharNameChoice nameChoice) {

+ char buffer[200];

+ uint16_t length;

+ if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {

+ return TRUE;

+ }

+ switch(range->type) {

+ case 0: {

+ char *s, *end;

+ char c;

+ /* get the full name of the start character */

+ length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));

+ if(length<=0) {

+ return TRUE;

+ }

+ /* call the enumerator function with this first character */

+ if(!fn(context, start, nameChoice, buffer, length)) {

+ return FALSE;

+ }

+ /* go to the end of the name; all these names have the same length */

+ end=buffer;

+ while(*end!=0) {

+ ++end;

+ }

+ /* enumerate the rest of the names */

+ while(++start<limit) {

+ /* increment the hexadecimal number on a character-basis */

+ s=end;

+ for (;;) {

+ c=*--s;

+ if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {

+ *s=(char)(c+1);

+ break;

+ } else if(c=='9') {

+ *s='A';

+ break;

+ } else if(c=='F') {

+ *s='0';

+ }

+ if(!fn(context, start, nameChoice, buffer, length)) {

+ return FALSE;

+ }

+ break;

+ }

+ case 1: {

+ uint16_t indexes[8];

+ const char *elementBases[8], *elements[8];

+ const uint16_t *factors=(const uint16_t *)(range+1);

+ uint16_t count=range->variant;

+ const char *s=(const char *)(factors+count);

+ char *suffix, *t;

+ uint16_t prefixLength, i, idx;

+ char c;

+ /* name = prefix factorized-elements */

+ /* copy prefix */

+ suffix=buffer;

+ prefixLength=0;

+ while((c=*s++)!=0) {

+ *suffix++=c;

+ ++prefixLength;

+ }

+ /* append the suffix of the start character */

+ length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,

+ s, (uint32_t)start-range->start,

+ indexes, elementBases, elements,

+ suffix, (uint16_t)(sizeof(buffer)-prefixLength)));

+ /* call the enumerator function with this first character */

+ if(!fn(context, start, nameChoice, buffer, length)) {

+ return FALSE;

+ }

+ /* enumerate the rest of the names */

+ while(++start<limit) {

+ /* increment the indexes in lexical order bound by the factors */

+ i=count;

+ for (;;) {

+ idx=(uint16_t)(indexes[--i]+1);

+ if(idx<factors[i]) {

+ /* skip one index and its element string */

+ indexes[i]=idx;

+ s=elements[i];

+ while(*s++!=0) {

+ }

+ elements[i]=s;

+ break;

+ } else {

+ /* reset this index to 0 and its element string to the first one */

+ indexes[i]=0;

+ elements[i]=elementBases[i];

+ }

+ /* to make matters a little easier, just append all elements to the suffix */

+ t=suffix;

+ length=prefixLength;

+ for(i=0; i<count; ++i) {

+ s=elements[i];

+ while((c=*s++)!=0) {

+ *t++=c;

+ ++length;

+ }

+ /* zero-terminate */

+ *t=0;

+ if(!fn(context, start, nameChoice, buffer, length)) {

+ return FALSE;

+ }

+ break;

+ }

+ default:

+ /* undefined type */

+ break;

+ }

+ return TRUE;

+/*

+ * findAlgName() is almost the same as enumAlgNames() except that it

+ * returns the code point for a name if it fits into the range.

+ * It returns 0xffff otherwise.

+ */

+static UChar32

+findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {

+ UChar32 code;

+ if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {

+ return 0xffff;

+ }

+ switch(range->type) {

+ case 0: {

+ /* name = prefix hex-digits */

+ const char *s=(const char *)(range+1);

+ char c;

+ uint16_t i, count;

+ /* compare prefix */

+ while((c=*s++)!=0) {

+ if((char)c!=*otherName++) {

+ return 0xffff;

+ }

+ /* read hexadecimal code point value */

+ count=range->variant;

+ code=0;

+ for(i=0; i<count; ++i) {

+ c=*otherName++;

+ if('0'<=c && c<='9') {

+ code=(code<<4)|(c-'0');

+ } else if('A'<=c && c<='F') {

+ code=(code<<4)|(c-'A'+10);

+ } else {

+ return 0xffff;

+ }

+ /* does it fit into the range? */

+ if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {

+ return code;

+ }

+ break;

+ }

+ case 1: {

+ char buffer[64];

+ uint16_t indexes[8];

+ const char *elementBases[8], *elements[8];

+ const uint16_t *factors=(const uint16_t *)(range+1);

+ uint16_t count=range->variant;

+ const char *s=(const char *)(factors+count), *t;

+ UChar32 start, limit;

+ uint16_t i, idx;

+ char c;

+ /* name = prefix factorized-elements */

+ /* compare prefix */

+ while((c=*s++)!=0) {

+ if((char)c!=*otherName++) {

+ return 0xffff;

+ }

+ start=(UChar32)range->start;

+ limit=(UChar32)(range->end+1);

+ /* initialize the suffix elements for enumeration; indexes should all be set to 0 */

+ writeFactorSuffix(factors, count, s, 0,

+ indexes, elementBases, elements, buffer, sizeof(buffer));

+ /* compare the first suffix */

+ if(0==uprv_strcmp(otherName, buffer)) {

+ return start;

+ }

+ /* enumerate and compare the rest of the suffixes */

+ while(++start<limit) {

+ /* increment the indexes in lexical order bound by the factors */

+ i=count;

+ for (;;) {

+ idx=(uint16_t)(indexes[--i]+1);

+ if(idx<factors[i]) {

+ /* skip one index and its element string */

+ indexes[i]=idx;

+ s=elements[i];

+ while(*s++!=0) {}

+ elements[i]=s;

+ break;

+ } else {

+ /* reset this index to 0 and its element string to the first one */

+ indexes[i]=0;

+ elements[i]=elementBases[i];

+ }

+ /* to make matters a little easier, just compare all elements of the suffix */

+ t=otherName;

+ for(i=0; i<count; ++i) {

+ s=elements[i];

+ while((c=*s++)!=0) {

+ if(c!=*t++) {

+ s=""; /* does not match */

+ i=99;

+ }

+ if(i<99 && *t==0) {

+ return start;

+ }

+ break;

+ }

+ default:

+ /* undefined type */

+ break;

+ }

+ return 0xffff;

/* sets of name characters, maximum name lengths ---------------------------- */

/*
 * A set is a 256-bit bitmap in uint32_t[8], one bit per char value.
 * The argument c is parenthesized under the cast so that the macros are
 * also correct when c is an expression.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)

/*
 * Add all chars of a NUL-terminated string to a char set.
 * @param set the 256-bit char set to add to
 * @param s   the string
 * @return the length of s
 */
static int32_t
calcStringSetLength(uint32_t set[8], const char *s) {
    int32_t length=0;
    char c;

    while((c=*s++)!=0) {
        SET_ADD(set, c);
        ++length;
    }
    return length;
}

+static int32_t

+calcAlgNameSetsLengths(int32_t maxNameLength) {

+ AlgorithmicRange *range;

+ uint32_t *p;

+ uint32_t rangeCount;

+ int32_t length;

+ /* enumerate algorithmic ranges */

+ p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);

+ rangeCount=*p;

+ range=(AlgorithmicRange *)(p+1);

+ while(rangeCount>0) {

+ switch(range->type) {

+ case 0:

+ /* name = prefix + (range->variant times) hex-digits */

+ /* prefix */

+ length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;

+ if(length>maxNameLength) {

+ maxNameLength=length;

+ }

+ break;

+ case 1: {

+ /* name = prefix factorized-elements */

+ const uint16_t *factors=(const uint16_t *)(range+1);

+ const char *s;

+ int32_t i, count=range->variant, factor, factorLength, maxFactorLength;

+ /* prefix length */

+ s=(const char *)(factors+count);

+ length=calcStringSetLength(gNameSet, s);

+ s+=length+1; /* start of factor suffixes */

+ /* get the set and maximum factor suffix length for each factor */

+ for(i=0; i<count; ++i) {

+ maxFactorLength=0;

+ for(factor=factors[i]; factor>0; --factor) {

+ factorLength=calcStringSetLength(gNameSet, s);

+ s+=factorLength+1;

+ if(factorLength>maxFactorLength) {

+ maxFactorLength=factorLength;

+ }

+ length+=maxFactorLength;

+ }

+ if(length>maxNameLength) {

+ maxNameLength=length;

+ }

+ break;

+ }

+ default:

+ /* unknown type */

+ break;

+ }

+ range=(AlgorithmicRange *)((uint8_t *)range+range->size);

+ --rangeCount;

+ }

+ return maxNameLength;

+static int32_t

+calcExtNameSetsLengths(int32_t maxNameLength) {

+ int32_t i, length;

+ for(i=0; i<LENGTHOF(charCatNames); ++i) {

+ /*

+ * for each category, count the length of the category name

+ * plus 9=

+ * 2 for <>

+ * 1 for -

+ * 6 for most hex digits per code point

+ */

+ length=9+calcStringSetLength(gNameSet, charCatNames[i]);

+ if(length>maxNameLength) {

+ maxNameLength=length;

+ }

+ return maxNameLength;

/**
 * Scans one ';'-terminated field of a compressed name line, adds the
 * characters of the expanded name to a character set, and returns the
 * expanded length.
 *
 * @param tokens       token table; (uint16_t)-1 marks a byte that stands for
 *                     itself, (uint16_t)-2 marks a lead byte of a double-byte
 *                     token, any other value is an offset into tokenStrings.
 * @param tokenCount   bytes at or above this value are implicit letters.
 * @param tokenStrings base of the NUL-terminated token words.
 * @param tokenLengths optional cache of token word lengths indexed by token
 *                     code (0 = not yet computed); may be NULL.
 * @param set          character set that receives the name characters.
 * @param pLine        in: start of the field; out: position after the field's
 *                     ';' or lineLimit, whichever was reached.
 * @param lineLimit    end of the line.
 * @return the length of the expanded field.
 */
static int32_t
calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
                  uint32_t set[8],
                  const uint8_t **pLine, const uint8_t *lineLimit) {
    const uint8_t *line=*pLine;
    int32_t length=0, tokenLength;
    uint16_t c, token;

    while(line!=lineLimit && (c=*line++)!=(uint8_t)';') {
        if(c>=tokenCount) {
            /* implicit letter */
            SET_ADD(set, c);
            ++length;
        } else {
            token=tokens[c];
            if(token==(uint16_t)(-2)) {
                /* this is a lead byte for a double-byte token */
                if(line==lineLimit) {
                    /* truncated line: do not read the trail byte past the limit */
                    break;
                }
                c=(uint16_t)(c<<8|*line++);
                token=tokens[c];
            }
            if(token==(uint16_t)(-1)) {
                /* explicit letter */
                SET_ADD(set, c);
                ++length;
            } else {
                /* count token word */
                if(tokenLengths!=NULL) {
                    /* use cached token length */
                    tokenLength=tokenLengths[c];
                    if(tokenLength==0) {
                        tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
                        tokenLengths[c]=(int8_t)tokenLength;
                    }
                } else {
                    tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
                }
                length+=tokenLength;
            }
        }
    }

    *pLine=line;
    return length;
}

+static void

+calcGroupNameSetsLengths(int32_t maxNameLength) {

+ uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];

+ uint16_t *tokens=(uint16_t *)uCharNames+8;

+ uint16_t tokenCount=*tokens++;

+ uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;

+ int8_t *tokenLengths;

+ const uint16_t *group;

+ const uint8_t *s, *line, *lineLimit;

+ int32_t groupCount, lineNumber, length;

+ tokenLengths=(int8_t *)uprv_malloc(tokenCount);

+ if(tokenLengths!=NULL) {

+ uprv_memset(tokenLengths, 0, tokenCount);

+ }

+ group=GET_GROUPS(uCharNames);

+ groupCount=*group++;

+ /* enumerate all groups */

+ while(groupCount>0) {

+ s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);

+ s=expandGroupLengths(s, offsets, lengths);

+ /* enumerate all lines in each group */

+ for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {

+ line=s+offsets[lineNumber];

+ length=lengths[lineNumber];

+ if(length==0) {

+ continue;

+ }

+ lineLimit=line+length;

+ /* read regular name */

+ length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);

+ if(length>maxNameLength) {

+ maxNameLength=length;

+ }

+ if(line==lineLimit) {

+ continue;

+ }

+ /* read Unicode 1.0 name */

+ length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);

+ if(length>maxNameLength) {

+ maxNameLength=length;

+ }

+ if(line==lineLimit) {

+ continue;

+ }

+ /* read ISO comment */

+ /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/

+ }

+ group=NEXT_GROUP(group);

+ --groupCount;

+ }

+ if(tokenLengths!=NULL) {

+ uprv_free(tokenLengths);

+ }

+ /* set gMax... - name length last for threading */

+ gMaxNameLength=maxNameLength;

+static UBool

+calcNameSetsLengths(UErrorCode *pErrorCode) {

+ static const char extChars[]="0123456789ABCDEF<>-";

+ int32_t i, maxNameLength;

+ if(gMaxNameLength!=0) {

+ return TRUE;

+ }

+ if(!isDataLoaded(pErrorCode)) {

+ return FALSE;

+ }

+ /* set hex digits, used in various names, and <>-, used in extended names */

+ for(i=0; i<sizeof(extChars)-1; ++i) {

+ SET_ADD(gNameSet, extChars[i]);

+ }

+ /* set sets and lengths from algorithmic names */

+ maxNameLength=calcAlgNameSetsLengths(0);

+ /* set sets and lengths from extended names */

+ maxNameLength=calcExtNameSetsLengths(maxNameLength);

+ /* set sets and lengths from group names, set global maximum values */

+ calcGroupNameSetsLengths(maxNameLength);

+ return TRUE;

+/* public API --------------------------------------------------------------- */

+U_CAPI int32_t U_EXPORT2

+u_charName(UChar32 code, UCharNameChoice nameChoice,

+ char *buffer, int32_t bufferLength,

+ UErrorCode *pErrorCode) {

+ AlgorithmicRange *algRange;

+ uint32_t *p;

+ uint32_t i;

+ int32_t length;

+ /* check the argument values */

+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {

+ return 0;

+ } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||

+ bufferLength<0 || (bufferLength>0 && buffer==NULL)

+ ) {

+ *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;

+ return 0;

+ }

+ if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {

+ return u_terminateChars(buffer, bufferLength, 0, pErrorCode);

+ }

+ length=0;

+ /* try algorithmic names first */

+ p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);

+ i=*p;

+ algRange=(AlgorithmicRange *)(p+1);

+ while(i>0) {

+ if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {

+ length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);

+ break;

+ }

+ algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);

+ --i;

+ }

+ if(i==0) {

+ if (nameChoice == U_EXTENDED_CHAR_NAME) {

+ length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);

+ if (!length) {

+ /* extended character name */

+ length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);

+ }

+ } else {

+ /* normal character name */

+ length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);

+ }

+ return u_terminateChars(buffer, bufferLength, length, pErrorCode);

+U_CAPI int32_t U_EXPORT2

+u_getISOComment(UChar32 c,

+ char *dest, int32_t destCapacity,

+ UErrorCode *pErrorCode) {

+ int32_t length;

+ /* check the argument values */

+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {

+ return 0;

+ } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {

+ *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;

+ return 0;

+ }

+ if((uint32_t)c>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {

+ return u_terminateChars(dest, destCapacity, 0, pErrorCode);

+ }

+ /* the ISO comment is stored like a normal character name */

+ length=getName(uCharNames, (uint32_t)c, U_ISO_COMMENT, dest, (uint16_t)destCapacity);

+ return u_terminateChars(dest, destCapacity, length, pErrorCode);

+U_CAPI UChar32 U_EXPORT2

+u_charFromName(UCharNameChoice nameChoice,

+ const char *name,

+ UErrorCode *pErrorCode) {

+ char upper[120], lower[120];

+ FindName findName;

+ AlgorithmicRange *algRange;

+ uint32_t *p;

+ uint32_t i;

+ UChar32 cp = 0;

+ char c0;

+ UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */

+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {

+ return error;

+ }

+ if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {

+ *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;

+ return error;

+ }

+ if(!isDataLoaded(pErrorCode)) {

+ return error;

+ }

+ /* construct the uppercase and lowercase of the name first */

+ for(i=0; i<sizeof(upper); ++i) {

+ if((c0=*name++)!=0) {

+ upper[i]=uprv_toupper(c0);

+ lower[i]=uprv_tolower(c0);

+ } else {

+ upper[i]=lower[i]=0;

+ break;

+ }

+ if(i==sizeof(upper)) {

+ /* name too long, there is no such character */

+ *pErrorCode = U_ILLEGAL_CHAR_FOUND;

+ return error;

+ }

+ /* try extended names first */

+ if (lower[0] == '<') {

+ if (nameChoice == U_EXTENDED_CHAR_NAME) {

+ if (lower[--i] == '>') {

+ for (--i; lower[i] && lower[i] != '-'; --i) {

+ }

+ if (lower[i] == '-') { /* We've got a category. */

+ uint32_t cIdx;

+ lower[i] = 0;

+ for (++i; lower[i] != '>'; ++i) {

+ if (lower[i] >= '0' && lower[i] <= '9') {

+ cp = (cp << 4) + lower[i] - '0';

+ } else if (lower[i] >= 'a' && lower[i] <= 'f') {

+ cp = (cp << 4) + lower[i] - 'a' + 10;

+ } else {

+ *pErrorCode = U_ILLEGAL_CHAR_FOUND;

+ return error;

+ }

+ /* Now validate the category name.

+ We could use a binary search, or a trie, if

+ we really wanted to. */

+ for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) {

+ if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {

+ if (getCharCat(cp) == cIdx) {

+ return cp;

+ }

+ break;

+ }

+ *pErrorCode = U_ILLEGAL_CHAR_FOUND;

+ return error;

+ }

+ /* try algorithmic names now */

+ p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);

+ i=*p;

+ algRange=(AlgorithmicRange *)(p+1);

+ while(i>0) {

+ if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {

+ return cp;

+ }

+ algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);

+ --i;

+ }

+ /* normal character name */

+ findName.otherName=upper;

+ findName.code=error;

+ enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);

+ if (findName.code == error) {

+ *pErrorCode = U_ILLEGAL_CHAR_FOUND;

+ }

+ return findName.code;

+U_CAPI void U_EXPORT2

+u_enumCharNames(UChar32 start, UChar32 limit,

+ UEnumCharNamesFn *fn,

+ void *context,

+ UCharNameChoice nameChoice,

+ UErrorCode *pErrorCode) {

+ AlgorithmicRange *algRange;

+ uint32_t *p;

+ uint32_t i;

+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {

+ return;

+ }

+ if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {

+ *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;

+ return;

+ }

+ if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {

+ limit = UCHAR_MAX_VALUE + 1;

+ }

+ if((uint32_t)start>=(uint32_t)limit) {

+ return;

+ }

+ if(!isDataLoaded(pErrorCode)) {

+ return;

+ }

+ /* interleave the data-driven ones with the algorithmic ones */

+ /* iterate over all algorithmic ranges; assume that they are in ascending order */

+ p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);

+ i=*p;

+ algRange=(AlgorithmicRange *)(p+1);

+ while(i>0) {

+ /* enumerate the character names before the current algorithmic range */

+ /* here: start<limit */

+ if((uint32_t)start<algRange->start) {

+ if((uint32_t)limit<=algRange->start) {

+ enumNames(uCharNames, start, limit, fn, context, nameChoice);

+ return;

+ }

+ if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {

+ return;

+ }

+ start=(UChar32)algRange->start;

+ }

+ /* enumerate the character names in the current algorithmic range */

+ /* here: algRange->start<=start<limit */

+ if((uint32_t)start<=algRange->end) {

+ if((uint32_t)limit<=(algRange->end+1)) {

+ enumAlgNames(algRange, start, limit, fn, context, nameChoice);

+ return;

+ }

+ if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {

+ return;

+ }

+ start=(UChar32)algRange->end+1;

+ }

+ /* continue to the next algorithmic range (here: start<limit) */

+ algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);

+ --i;

+ }

+ /* enumerate the character names after the last algorithmic range */

+ enumNames(uCharNames, start, limit, fn, context, nameChoice);

+U_CAPI int32_t U_EXPORT2

+uprv_getMaxCharNameLength() {

+ UErrorCode errorCode=U_ZERO_ERROR;

+ if(calcNameSetsLengths(&errorCode)) {

+ return gMaxNameLength;

+ } else {

+ return 0;

+ }

+/**

+ * Converts the char set cset into a Unicode set uset.

+ * @param cset Set of 256 bit flags corresponding to a set of chars.

+ * @param uset USet to receive characters. Existing contents are deleted.

+ */

+static void

+charSetToUSet(uint32_t cset[8], const USetAdder *sa) {

+ UChar us[256];

+ char cs[256];

+ int32_t i, length;

+ UErrorCode errorCode;

+ errorCode=U_ZERO_ERROR;

+ if(!calcNameSetsLengths(&errorCode)) {

+ return;

+ }

+ /* build a char string with all chars that are used in character names */

+ length=0;

+ for(i=0; i<256; ++i) {

+ if(SET_CONTAINS(cset, i)) {

+ cs[length++]=(char)i;

+ }

+ /* convert the char string to a UChar string */

+ u_charsToUChars(cs, us, length);

+ /* add each UChar to the USet */

+ for(i=0; i<length; ++i) {

+ if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */

+ sa->add(sa->set, us[i]);

+ }

+/**

+ * Fills set with characters that are used in Unicode character names.

+ * @param set USet to receive characters.

+ */

+U_CAPI void U_EXPORT2

+uprv_getCharNameCharacters(const USetAdder *sa) {

+ charSetToUSet(gNameSet, sa);

+/* data swapping ------------------------------------------------------------ */

+/*

+ * The token table contains non-negative entries for token bytes,

+ * and -1 for bytes that represent themselves in the data file's charset.

+ * -2 entries are used for lead bytes.

+ *

+ * Direct bytes (-1 entries) must be translated from the input charset family

+ * to the output charset family.

+ * makeTokenMap() writes a permutation mapping for this.

+ * Use it once for single-/lead-byte tokens and once more for all trail byte

+ * tokens. (';' is an unused trail byte marked with -1.)

+ */

+static void

+makeTokenMap(const UDataSwapper *ds,

+ int16_t tokens[], uint16_t tokenCount,

+ uint8_t map[256],

+ UErrorCode *pErrorCode) {

+ UBool usedOutChar[256];

+ uint16_t i, j;

+ uint8_t c1, c2;

+ if(U_FAILURE(*pErrorCode)) {

+ return;

+ }

+ if(ds->inCharset==ds->outCharset) {

+ /* Same charset family: identity permutation */

+ for(i=0; i<256; ++i) {

+ map[i]=(uint8_t)i;

+ }

+ } else {

+ uprv_memset(map, 0, 256);

+ uprv_memset(usedOutChar, 0, 256);

+ if(tokenCount>256) {

+ tokenCount=256;

+ }

+ /* set the direct bytes (byte 0 always maps to itself) */

+ for(i=1; i<tokenCount; ++i) {

+ if(tokens[i]==-1) {

+ /* convert the direct byte character */

+ c1=(uint8_t)i;

+ ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);

+ if(U_FAILURE(*pErrorCode)) {

+ udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",

+ i, ds->inCharset);

+ return;

+ }

+ /* enter the converted character into the map and mark it used */

+ map[c1]=c2;

+ usedOutChar[c2]=TRUE;

+ }

+ /* set the mappings for the rest of the permutation */

+ for(i=j=1; i<tokenCount; ++i) {

+ /* set mappings that were not set for direct bytes */

+ if(map[i]==0) {

+ /* set an output byte value that was not used as an output byte above */

+ while(usedOutChar[j]) {

+ ++j;

+ }

+ map[i]=(uint8_t)j++;

+ }

+ /*

+ * leave mappings at tokenCount and above unset if tokenCount<256

+ * because they won't be used

+ */

+ }

+U_CAPI int32_t U_EXPORT2

+uchar_swapNames(const UDataSwapper *ds,

+ const void *inData, int32_t length, void *outData,

+ UErrorCode *pErrorCode) {

+ const UDataInfo *pInfo;

+ int32_t headerSize;

+ const uint8_t *inBytes;

+ uint8_t *outBytes;

+ uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,

+ offset, i, count, stringsCount;

+ const AlgorithmicRange *inRange;

+ AlgorithmicRange *outRange;

+ /* udata_swapDataHeader checks the arguments */

+ headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);

+ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {

+ return 0;

+ }

+ /* check data format and format version */

+ pInfo=(const UDataInfo *)((const char *)inData+4);

+ if(!(

+ pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */

+ pInfo->dataFormat[1]==0x6e &&

+ pInfo->dataFormat[2]==0x61 &&

+ pInfo->dataFormat[3]==0x6d &&

+ pInfo->formatVersion[0]==1

+ )) {

+ udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",

+ pInfo->dataFormat[0], pInfo->dataFormat[1],

+ pInfo->dataFormat[2], pInfo->dataFormat[3],

+ pInfo->formatVersion[0]);

+ *pErrorCode=U_UNSUPPORTED_ERROR;

+ return 0;

+ }

+ inBytes=(const uint8_t *)inData+headerSize;

+ outBytes=(uint8_t *)outData+headerSize;

+ if(length<0) {

+ algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);

+ } else {

+ length-=headerSize;

+ if( length<20 ||

+ (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))

+ ) {

+ udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",

+ length);

+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;

+ return 0;

+ }

+ if(length<0) {

+ /* preflighting: iterate through algorithmic ranges */

+ offset=algNamesOffset;

+ count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));

+ offset+=4;

+ for(i=0; i<count; ++i) {

+ inRange=(const AlgorithmicRange *)(inBytes+offset);

+ offset+=ds->readUInt16(inRange->size);

+ }

+ } else {

+ /* swap data */

+ const uint16_t *p;

+ uint16_t *q, *temp;

+ int16_t tokens[512];

+ uint16_t tokenCount;

+ uint8_t map[256], trailMap[256];

+ /* copy the data for inaccessible bytes */

+ if(inBytes!=outBytes) {

+ uprv_memcpy(outBytes, inBytes, length);

+ }

+ /* the initial 4 offsets first */

+ tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);

+ groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);

+ groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);

+ ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);

+ /*

+ * now the tokens table

+ * it needs to be permutated along with the compressed name strings

+ */

+ p=(const uint16_t *)(inBytes+16);

+ q=(uint16_t *)(outBytes+16);

+ /* read and swap the tokenCount */

+ tokenCount=ds->readUInt16(*p);

+ ds->swapArray16(ds, p, 2, q, pErrorCode);

+ ++p;

+ ++q;

+ /* read the first 512 tokens and make the token maps */

+ if(tokenCount<=512) {

+ count=tokenCount;

+ } else {

+ count=512;

+ }

+ for(i=0; i<count; ++i) {

+ tokens[i]=udata_readInt16(ds, p[i]);

+ }

+ for(; i<512; ++i) {

+ tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */

+ }

+ makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);

+ makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);

+ if(U_FAILURE(*pErrorCode)) {

+ return 0;

+ }

+ /*

+ * swap and permutate the tokens

+ * go through a temporary array to support in-place swapping

+ */

+ temp=(uint16_t *)uprv_malloc(tokenCount*2);

+ if(temp==NULL) {

+ udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",

+ tokenCount);

+ *pErrorCode=U_MEMORY_ALLOCATION_ERROR;

+ return 0;

+ }

+ /* swap and permutate single-/lead-byte tokens */

+ for(i=0; i<tokenCount && i<256; ++i) {

+ ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);

+ }

+ /* swap and permutate trail-byte tokens */

+ for(; i<tokenCount; ++i) {

+ ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);

+ }

+ /* copy the result into the output and free the temporary array */

+ uprv_memcpy(q, temp, tokenCount*2);

+ uprv_free(temp);

+ /*

+ * swap the token strings but not a possible padding byte after

+ * the terminating NUL of the last string

+ */

+ udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),

+ outBytes+tokenStringOffset, pErrorCode);

+ if(U_FAILURE(*pErrorCode)) {

+ udata_printError(ds, "uchar_swapNames(token strings) failed\n");

+ return 0;

+ }

+ /* swap the group table */

+ count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));

+ ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),

+ outBytes+groupsOffset, pErrorCode);

+ /*

+ * swap the group strings

+ * swap the string bytes but not the nibble-encoded string lengths

+ */

+ if(ds->inCharset!=ds->outCharset) {

+ uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];

+ const uint8_t *inStrings, *nextInStrings;

+ uint8_t *outStrings;

+ uint8_t c;

+ inStrings=inBytes+groupStringOffset;

+ outStrings=outBytes+groupStringOffset;

+ stringsCount=algNamesOffset-groupStringOffset;

+ /* iterate through string groups until only a few padding bytes are left */

+ while(stringsCount>32) {

+ nextInStrings=expandGroupLengths(inStrings, offsets, lengths);

+ /* move past the length bytes */

+ stringsCount-=(uint32_t)(nextInStrings-inStrings);

+ outStrings+=nextInStrings-inStrings;

+ inStrings=nextInStrings;

+ count=offsets[31]+lengths[31]; /* total number of string bytes in this group */

+ stringsCount-=count;

+ /* swap the string bytes using map[] and trailMap[] */

+ while(count>0) {

+ c=*inStrings++;

+ *outStrings++=map[c];

+ if(tokens[c]!=-2) {

+ --count;

+ } else {

+ /* token lead byte: swap the trail byte, too */

+ *outStrings++=trailMap[*inStrings++];

+ count-=2;

+ }

+ /* swap the algorithmic ranges */

+ offset=algNamesOffset;

+ count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));

+ ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);

+ offset+=4;

+ for(i=0; i<count; ++i) {

+ if(offset>(uint32_t)length) {

+ udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",

+ length, i);

+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;

+ return 0;

+ }

+ inRange=(const AlgorithmicRange *)(inBytes+offset);

+ outRange=(AlgorithmicRange *)(outBytes+offset);

+ offset+=ds->readUInt16(inRange->size);

+ ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);

+ ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);

+ switch(inRange->type) {

+ case 0:

+ /* swap prefix string */

+ ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),

+ outRange+1, pErrorCode);

+ if(U_FAILURE(*pErrorCode)) {

+ udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",

+ i);

+ return 0;

+ }

+ break;

+ case 1:

+ {

+ /* swap factors and the prefix and factor strings */

+ uint32_t factorsCount;

+ factorsCount=inRange->variant;

+ p=(const uint16_t *)(inRange+1);

+ q=(uint16_t *)(outRange+1);

+ ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);

+ /* swap the strings, up to the last terminating NUL */

+ p+=factorsCount;

+ q+=factorsCount;

+ stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);

+ while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {

+ --stringsCount;

+ }

+ ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);

+ }

+ break;

+ default:

+ udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",

+ inRange->type, i);

+ *pErrorCode=U_UNSUPPORTED_ERROR;

+ return 0;

+ }

+ return headerSize+(int32_t)offset;

+/*

+ * Hey, Emacs, please set the following:

+ *

+ * Local Variables:

+ * indent-tabs-mode: nil

+ * End:

+ *

+ */

Property changes on: icu46/source/common/unames.c

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu46/source/common/umutex.c ('k') | icu46/source/common/unicode/brkiter.h » ('j') | no next file with comments »