Index: source/i18n/collationdatareader.h |
diff --git a/source/i18n/collationdatareader.h b/source/i18n/collationdatareader.h |
index df65b862ab4049459287d9d56016166275995c50..4a9fa5eecadada2dcb420abeb040367753a1466c 100644 |
--- a/source/i18n/collationdatareader.h |
+++ b/source/i18n/collationdatareader.h |
@@ -1,6 +1,6 @@ |
/* |
******************************************************************************* |
-* Copyright (C) 2013-2014, International Business Machines |
+* Copyright (C) 2013-2015, International Business Machines |
* Corporation and others. All Rights Reserved. |
******************************************************************************* |
* collationdatareader.h |
@@ -109,7 +109,7 @@ private: |
/* |
* Format of collation data (ucadata.icu, binary data in coll/ *.res files). |
- * Format version 4.0. |
+ * Format version 5. |
* |
* The root collation data is stored in the ucadata.icu file. |
* Tailorings are stored inside .res resource bundle files, with a complete file header. |
@@ -151,10 +151,30 @@ private: |
* int32_t reorderCodes[]; -- empty in root |
* The list of script and reordering codes. |
* |
+ * Beginning with format version 5, this array may optionally |
+ * have trailing entries with a full list of reorder ranges |
+ * as described for CollationSettings::reorderRanges. |
+ * |
+ * Script or reorder codes are first and do not exceed 16-bit values. |
+ * Range limits are stored in the upper 16 bits, and are never 0. |
+ * Split this array into reorder codes and ranges at the first entry |
+ * with non-zero upper 16 bits. |
+ * |
+ * If the ranges are missing but needed for split-reordered primary lead bytes, |
+ * then they are regenerated at load time. |
+ * |
* uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes |
* Primary-weight lead byte permutation table. |
* Normally present when the reorderCodes are, but can be built at load time. |
* |
+ * Beginning with format version 5, a 0 entry at a non-zero index |
+ * (which is otherwise an illegal value) |
+ * means that the primary lead byte is "split" |
+ * (there are different offsets for primaries that share that lead byte) |
+ * and the reordering offset must be determined via the reorder ranges |
+ * that are either stored as part of the reorderCodes array |
+ * or regenerated at load time. |
+ * |
* UTrie2 trie; -- see utrie2_impl.h and utrie2.h |
* The trie holds the main collation data. Each code point is mapped to a 32-bit value. |
* It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set, |
@@ -194,12 +214,35 @@ private: |
* See the CollationFastLatin class. |
* |
* uint16_t scripts[]; -- empty in all tailorings |
+ * Format version 5: |
+ * uint16_t numScripts; |
+ * uint16_t scriptsIndex[numScripts+16]; |
+ * uint16_t scriptStarts[]; |
+ * See CollationData::numScripts etc. |
+ * |
+ * Format version 4: |
* Table of the reordering groups with their first and last lead bytes, |
* and their script and reordering codes. |
* See CollationData::scripts. |
* |
* UBool compressibleBytes[]; -- empty in all tailorings |
* Flag for getSortKey(), indicating primary weight lead bytes that are compressible. |
+ * |
+ * ----------------- |
+ * Changes for formatVersion 5 (ICU 55) |
+ * |
+ * Reordering moves single scripts, not groups of scripts. |
+ * Reorder ranges are optionally appended to the reorderCodes, |
+ * and a 0 entry in the reorderTable indicates a split lead byte. |
+ * The scripts data has a new format. |
+ * |
+ * The rootElements may contain secondary and tertiary weights below common=05. |
+ * (Used for small Hiragana letters.) |
+ * Where this occurs, there is also an explicit unit with common secondary & tertiary weights. |
+ * There are no other data structure changes, but builder code needs to be able to handle such data. |
+ * |
+ * The collation element for the merge separator code point U+FFFE |
+ * does not necessarily have special, unique secondary/tertiary weights any more. |
*/ |
U_NAMESPACE_END |