Index: source/i18n/collationdata.cpp |
diff --git a/source/i18n/collationdata.cpp b/source/i18n/collationdata.cpp |
index 42b5bedb5ab57e0ba22987cb1c5186d5bdd25e3a..7f781f50f1973782f14afbad97f3ede1644f4649 100644 |
--- a/source/i18n/collationdata.cpp |
+++ b/source/i18n/collationdata.cpp |
@@ -1,6 +1,6 @@ |
/* |
******************************************************************************* |
-* Copyright (C) 2012-2014, International Business Machines |
+* Copyright (C) 2012-2015, International Business Machines |
* Corporation and others. All Rights Reserved. |
******************************************************************************* |
* collationdata.cpp |
@@ -21,6 +21,7 @@ |
#include "collationdata.h" |
#include "uassert.h" |
#include "utrie2.h" |
+#include "uvectr32.h" |
U_NAMESPACE_BEGIN |
@@ -114,48 +115,57 @@ CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const { |
uint32_t |
CollationData::getFirstPrimaryForGroup(int32_t script) const { |
- int32_t index = findScript(script); |
- if(index < 0) { |
- return 0; |
- } |
- uint32_t head = scripts[index]; |
- return (head & 0xff00) << 16; |
+ int32_t index = getScriptIndex(script); |
+ return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16; |
} |
uint32_t |
CollationData::getLastPrimaryForGroup(int32_t script) const { |
- int32_t index = findScript(script); |
- if(index < 0) { |
+ int32_t index = getScriptIndex(script); |
+ if(index == 0) { |
return 0; |
} |
- uint32_t head = scripts[index]; |
- uint32_t lastByte = head & 0xff; |
- return ((lastByte + 1) << 24) - 1; |
+ uint32_t limit = scriptStarts[index + 1]; |
+ return (limit << 16) - 1; |
} |
int32_t |
CollationData::getGroupForPrimary(uint32_t p) const { |
- p >>= 24; // Reordering groups are distinguished by primary lead bytes. |
- for(int32_t i = 0; i < scriptsLength; i = i + 2 + scripts[i + 1]) { |
- uint32_t lastByte = scripts[i] & 0xff; |
- if(p <= lastByte) { |
- return scripts[i + 2]; |
+ p >>= 16; |
+ if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) { |
+ return -1; |
+ } |
+ int32_t index = 1; |
+ while(p >= scriptStarts[index + 1]) { ++index; } |
+ for(int32_t i = 0; i < numScripts; ++i) { |
+ if(scriptsIndex[i] == index) { |
+ return i; |
+ } |
+ } |
+ for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) { |
+ if(scriptsIndex[numScripts + i] == index) { |
+ return UCOL_REORDER_CODE_FIRST + i; |
} |
} |
return -1; |
} |
int32_t |
-CollationData::findScript(int32_t script) const { |
- if(script < 0 || 0xffff < script) { return -1; } |
- for(int32_t i = 0; i < scriptsLength;) { |
- int32_t limit = i + 2 + scripts[i + 1]; |
- for(int32_t j = i + 2; j < limit; ++j) { |
- if(script == scripts[j]) { return i; } |
+CollationData::getScriptIndex(int32_t script) const { |
+ if(script < 0) { |
+ return 0; |
+ } else if(script < numScripts) { |
+ return scriptsIndex[script]; |
+ } else if(script < UCOL_REORDER_CODE_FIRST) { |
+ return 0; |
+ } else { |
+ script -= UCOL_REORDER_CODE_FIRST; |
+ if(script < MAX_NUM_SPECIAL_REORDER_CODES) { |
+ return scriptsIndex[numScripts + script]; |
+ } else { |
+ return 0; |
} |
- i = limit; |
} |
- return -1; |
} |
int32_t |
@@ -163,85 +173,114 @@ CollationData::getEquivalentScripts(int32_t script, |
int32_t dest[], int32_t capacity, |
UErrorCode &errorCode) const { |
if(U_FAILURE(errorCode)) { return 0; } |
- int32_t i = findScript(script); |
- if(i < 0) { return 0; } |
- int32_t length = scripts[i + 1]; |
- U_ASSERT(length != 0); |
- if(length > capacity) { |
- errorCode = U_BUFFER_OVERFLOW_ERROR; |
- return length; |
+ int32_t index = getScriptIndex(script); |
+ if(index == 0) { return 0; } |
+ if(script >= UCOL_REORDER_CODE_FIRST) { |
+ // Special groups have no aliases. |
+ if(capacity > 0) { |
+ dest[0] = script; |
+ } else { |
+ errorCode = U_BUFFER_OVERFLOW_ERROR; |
+ } |
+ return 1; |
} |
- i += 2; |
- dest[0] = scripts[i++]; |
- for(int32_t j = 1; j < length; ++j) { |
- script = scripts[i++]; |
- // Sorted insertion. |
- for(int32_t k = j;; --k) { |
- // Invariant: dest[k] is free to receive either script or dest[k - 1]. |
- if(k > 0 && script < dest[k - 1]) { |
- dest[k] = dest[k - 1]; |
- } else { |
- dest[k] = script; |
- break; |
+ |
+ int32_t length = 0; |
+ for(int32_t i = 0; i < numScripts; ++i) { |
+ if(scriptsIndex[i] == index) { |
+ if(length < capacity) { |
+ dest[length] = i; |
} |
+ ++length; |
} |
} |
+ if(length > capacity) { |
+ errorCode = U_BUFFER_OVERFLOW_ERROR; |
+ } |
return length; |
} |
void |
-CollationData::makeReorderTable(const int32_t *reorder, int32_t length, |
- uint8_t table[256], UErrorCode &errorCode) const { |
- if(U_FAILURE(errorCode)) { return; } |
+CollationData::makeReorderRanges(const int32_t *reorder, int32_t length, |
+ UVector32 &ranges, UErrorCode &errorCode) const { |
+ makeReorderRanges(reorder, length, FALSE, ranges, errorCode); |
+} |
- // Initialize the table. |
- // Never reorder special low and high primary lead bytes. |
- int32_t lowByte; |
- for(lowByte = 0; lowByte <= Collation::MERGE_SEPARATOR_BYTE; ++lowByte) { |
- table[lowByte] = lowByte; |
+void |
+CollationData::makeReorderRanges(const int32_t *reorder, int32_t length, |
+ UBool latinMustMove, |
+ UVector32 &ranges, UErrorCode &errorCode) const { |
+ if(U_FAILURE(errorCode)) { return; } |
+ ranges.removeAllElements(); |
+ if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) { |
+ return; |
} |
- // lowByte == 03 |
- int32_t highByte; |
- for(highByte = 0xff; highByte >= Collation::TRAIL_WEIGHT_BYTE; --highByte) { |
- table[highByte] = highByte; |
- } |
- // highByte == FE |
+ // Maps each script-or-group range to a new lead byte. |
+ uint8_t table[MAX_NUM_SCRIPT_RANGES]; |
+ uprv_memset(table, 0, sizeof(table)); |
- // Set intermediate bytes to 0 to indicate that they have not been set yet. |
- for(int32_t i = lowByte; i <= highByte; ++i) { |
- table[i] = 0; |
+ { |
+ // Set "don't care" values for reserved ranges. |
+ int32_t index = scriptsIndex[ |
+ numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST]; |
+ if(index != 0) { |
+ table[index] = 0xff; |
+ } |
+ index = scriptsIndex[ |
+ numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST]; |
+ if(index != 0) { |
+ table[index] = 0xff; |
+ } |
} |
+ // Never reorder special low and high primary lead bytes. |
+ U_ASSERT(scriptStartsLength >= 2); |
+ U_ASSERT(scriptStarts[0] == 0); |
+ int32_t lowStart = scriptStarts[1]; |
+ U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8)); |
+ int32_t highLimit = scriptStarts[scriptStartsLength - 1]; |
+ U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8)); |
+ |
// Get the set of special reorder codes in the input list. |
- // This supports up to 32 special reorder codes; |
+ // This supports a fixed number of special reorder codes; |
// it works for data with codes beyond UCOL_REORDER_CODE_LIMIT. |
uint32_t specials = 0; |
for(int32_t i = 0; i < length; ++i) { |
int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST; |
- if(0 <= reorderCode && reorderCode <= 31) { |
+ if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) { |
specials |= (uint32_t)1 << reorderCode; |
} |
} |
// Start the reordering with the special low reorder codes that do not occur in the input. |
- for(int32_t i = 0;; i += 3) { |
- if(scripts[i + 1] != 1) { break; } // Went beyond special single-code reorder codes. |
- int32_t reorderCode = (int32_t)scripts[i + 2] - UCOL_REORDER_CODE_FIRST; |
- if(reorderCode < 0) { break; } // Went beyond special reorder codes. |
- if((specials & ((uint32_t)1 << reorderCode)) == 0) { |
- int32_t head = scripts[i]; |
- int32_t firstByte = head >> 8; |
- int32_t lastByte = head & 0xff; |
- do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte); |
+ for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) { |
+ int32_t index = scriptsIndex[numScripts + i]; |
+ if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) { |
+ lowStart = addLowScriptRange(table, index, lowStart); |
} |
} |
- // Reorder according to the input scripts, continuing from the bottom of the bytes range. |
+ // Skip the reserved range before Latin if Latin is the first script, |
+ // so that we do not move it unnecessarily. |
+ int32_t skippedReserved = 0; |
+ if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) { |
+ int32_t index = scriptsIndex[USCRIPT_LATIN]; |
+ U_ASSERT(index != 0); |
+ int32_t start = scriptStarts[index]; |
+ U_ASSERT(lowStart <= start); |
+ skippedReserved = start - lowStart; |
+ lowStart = start; |
+ } |
+ |
+ // Reorder according to the input scripts, continuing from the bottom of the primary range. |
+ int32_t originalLength = length; // length will be decremented if "others" is in the list. |
+ UBool hasReorderToEnd = FALSE; |
for(int32_t i = 0; i < length;) { |
int32_t script = reorder[i++]; |
if(script == USCRIPT_UNKNOWN) { |
// Put the remaining scripts at the top. |
+ hasReorderToEnd = TRUE; |
while(i < length) { |
script = reorder[--length]; |
if(script == USCRIPT_UNKNOWN || // Must occur at most once. |
@@ -249,16 +288,13 @@ CollationData::makeReorderTable(const int32_t *reorder, int32_t length, |
errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
return; |
} |
- int32_t index = findScript(script); |
- if(index < 0) { continue; } |
- int32_t head = scripts[index]; |
- int32_t firstByte = head >> 8; |
- int32_t lastByte = head & 0xff; |
- if(table[firstByte] != 0) { // Duplicate or equivalent script. |
+ int32_t index = getScriptIndex(script); |
+ if(index == 0) { continue; } |
+ if(table[index] != 0) { // Duplicate or equivalent script. |
errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
return; |
} |
- do { table[lastByte--] = highByte--; } while(firstByte <= lastByte); |
+ highLimit = addHighScriptRange(table, index, highLimit); |
} |
break; |
} |
@@ -268,24 +304,83 @@ CollationData::makeReorderTable(const int32_t *reorder, int32_t length, |
errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
return; |
} |
- int32_t index = findScript(script); |
- if(index < 0) { continue; } |
- int32_t head = scripts[index]; |
- int32_t firstByte = head >> 8; |
- int32_t lastByte = head & 0xff; |
- if(table[firstByte] != 0) { // Duplicate or equivalent script. |
+ int32_t index = getScriptIndex(script); |
+ if(index == 0) { continue; } |
+ if(table[index] != 0) { // Duplicate or equivalent script. |
errorCode = U_ILLEGAL_ARGUMENT_ERROR; |
return; |
} |
- do { table[firstByte++] = lowByte++; } while(firstByte <= lastByte); |
+ lowStart = addLowScriptRange(table, index, lowStart); |
} |
// Put all remaining scripts into the middle. |
- // Avoid table[0] which must remain 0. |
- for(int32_t i = 1; i <= 0xff; ++i) { |
- if(table[i] == 0) { table[i] = lowByte++; } |
+ for(int32_t i = 1; i < scriptStartsLength - 1; ++i) { |
+ int32_t leadByte = table[i]; |
+ if(leadByte != 0) { continue; } |
+ int32_t start = scriptStarts[i]; |
+ if(!hasReorderToEnd && start > lowStart) { |
+ // No need to move this script. |
+ lowStart = start; |
+ } |
+ lowStart = addLowScriptRange(table, i, lowStart); |
+ } |
+ if(lowStart > highLimit) { |
+ if((lowStart - (skippedReserved & 0xff00)) <= highLimit) { |
+ // Try not skipping the before-Latin reserved range. |
+ makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode); |
+ return; |
+ } |
+ // We need more primary lead bytes than available, despite the reserved ranges. |
+ errorCode = U_BUFFER_OVERFLOW_ERROR; |
+ return; |
+ } |
+ |
+ // Turn lead bytes into a list of (limit, offset) pairs. |
+ // Encode each pair in one list element: |
+ // Upper 16 bits = limit, lower 16 = signed lead byte offset. |
+ int32_t offset = 0; |
+ for(int32_t i = 1;; ++i) { |
+ int32_t nextOffset = offset; |
+ while(i < scriptStartsLength - 1) { |
+ int32_t newLeadByte = table[i]; |
+ if(newLeadByte == 0xff) { |
+ // "Don't care" lead byte for reserved range, continue with current offset. |
+ } else { |
+ nextOffset = newLeadByte - (scriptStarts[i] >> 8); |
+ if(nextOffset != offset) { break; } |
+ } |
+ ++i; |
+ } |
+ if(offset != 0 || i < scriptStartsLength - 1) { |
+ ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xffff), errorCode); |
+ } |
+ if(i == scriptStartsLength - 1) { break; } |
+ offset = nextOffset; |
+ } |
+} |
+ |
+int32_t |
+CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const { |
+ int32_t start = scriptStarts[index]; |
+ if((start & 0xff) < (lowStart & 0xff)) { |
+ lowStart += 0x100; |
+ } |
+ table[index] = (uint8_t)(lowStart >> 8); |
+ int32_t limit = scriptStarts[index + 1]; |
+ lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff); |
+ return lowStart; |
+} |
+ |
+int32_t |
+CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const { |
+ int32_t limit = scriptStarts[index + 1]; |
+ if((limit & 0xff) > (highLimit & 0xff)) { |
+ highLimit -= 0x100; |
} |
- U_ASSERT(lowByte == highByte + 1); |
+ int32_t start = scriptStarts[index]; |
+ highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff); |
+ table[index] = (uint8_t)(highLimit >> 8); |
+ return highLimit; |
} |
U_NAMESPACE_END |