| Index: icu46/patches/segmentation.patch
|
| ===================================================================
|
| --- icu46/patches/segmentation.patch (revision 69841)
|
| +++ icu46/patches/segmentation.patch (working copy)
|
| @@ -1,14 +1,6 @@
|
| ---- source/common/brkeng.cpp 2007-09-11 20:53:13.000000000 -0700
|
| -+++ source/common/brkeng.cpp 2009-07-29 12:57:49.973382000 -0700
|
| -@@ -24,6 +24,7 @@
|
| - #include "umutex.h"
|
| - #include "uresimp.h"
|
| - #include "ubrkimpl.h"
|
| -+#include <stdio.h>
|
| -
|
| - U_NAMESPACE_BEGIN
|
| -
|
| -@@ -226,6 +227,30 @@
|
| +--- source/common/brkeng.cpp 2009-11-11 07:47:22.000000000 -0800
|
| ++++ source/common/brkeng.cpp 2011-01-21 14:12:45.479922000 -0800
|
| +@@ -226,6 +226,30 @@
|
| case USCRIPT_THAI:
|
| engine = new ThaiBreakEngine(dict, status);
|
| break;
|
| @@ -39,7 +31,7 @@
|
| default:
|
| break;
|
| }
|
| -@@ -281,6 +306,13 @@
|
| +@@ -281,6 +305,13 @@
|
| dict = NULL;
|
| }
|
| return dict;
|
| @@ -54,20 +46,18 @@
|
| return NULL;
|
| }
|
| --- source/common/dictbe.cpp 2008-06-13 12:21:12.000000000 -0700
|
| -+++ source/common/dictbe.cpp 2009-11-11 12:58:40.199829000 -0800
|
| -@@ -16,6 +16,11 @@
|
| ++++ source/common/dictbe.cpp 2011-01-21 14:12:45.468928000 -0800
|
| +@@ -16,6 +16,9 @@
|
| #include "unicode/ubrk.h"
|
| #include "uvector.h"
|
| #include "triedict.h"
|
| +#include "uassert.h"
|
| +#include "unicode/normlzr.h"
|
| +#include "cmemory.h"
|
| -+
|
| -+#include <stdio.h>
|
|
|
| U_NAMESPACE_BEGIN
|
|
|
| -@@ -422,6 +427,294 @@
|
| +@@ -422,6 +425,294 @@
|
| return wordsFound;
|
| }
|
|
|
| @@ -363,7 +353,7 @@
|
|
|
| #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
| --- source/common/dictbe.h 2006-09-29 17:37:45.000000000 -0700
|
| -+++ source/common/dictbe.h 2009-07-27 13:01:17.704415000 -0700
|
| ++++ source/common/dictbe.h 2011-01-21 14:12:45.492920000 -0800
|
| @@ -1,8 +1,8 @@
|
| /**
|
| - *******************************************************************************
|
| @@ -371,13 +361,13 @@
|
| - * All Rights Reserved. *
|
| - *******************************************************************************
|
| + **********************************************************************************
|
| -+ * Copyright (C) 2006,2007, International Business Machines Corporation and others.
|
| ++ * Copyright (C) 2006-2010, International Business Machines Corporation and others.
|
| + * All Rights Reserved.
|
| + **********************************************************************************
|
| */
|
|
|
| #ifndef DICTBE_H
|
| -@@ -65,37 +65,37 @@
|
| +@@ -65,31 +65,31 @@
|
| */
|
| virtual ~DictionaryBreakEngine();
|
|
|
| @@ -430,19 +420,8 @@
|
| + * @return The number of breaks found.
|
| + */
|
| virtual int32_t findBreaks( UText *text,
|
| -- int32_t startPos,
|
| -- int32_t endPos,
|
| -- UBool reverse,
|
| -- int32_t breakType,
|
| -- UStack &foundBreaks ) const;
|
| -+ int32_t startPos,
|
| -+ int32_t endPos,
|
| -+ UBool reverse,
|
| -+ int32_t breakType,
|
| -+ UStack &foundBreaks ) const;
|
| -
|
| - protected:
|
| -
|
| + int32_t startPos,
|
| + int32_t endPos,
|
| @@ -114,7 +114,7 @@
|
| // virtual void setBreakTypes( uint32_t breakTypes );
|
|
|
| @@ -461,16 +440,7 @@
|
| *
|
| * @param text A UText representing the text
|
| * @param rangeStart The start of the range of dictionary characters
|
| -@@ -180,12 +180,72 @@
|
| - * @return The number of breaks found
|
| - */
|
| - virtual int32_t divideUpDictionaryRange( UText *text,
|
| -- int32_t rangeStart,
|
| -- int32_t rangeEnd,
|
| -- UStack &foundBreaks ) const;
|
| -+ int32_t rangeStart,
|
| -+ int32_t rangeEnd,
|
| -+ UStack &foundBreaks ) const;
|
| +@@ -186,6 +186,66 @@
|
|
|
| };
|
|
|
| @@ -537,34 +507,24 @@
|
|
|
| U_NAMESPACE_END
|
|
|
| ---- source/common/rbbi.cpp 2008-09-24 22:48:27.000000000 -0700
|
| -+++ source/common/rbbi.cpp 2009-07-27 13:01:17.710416000 -0700
|
| -@@ -29,6 +29,7 @@
|
| -
|
| - #include "uassert.h"
|
| - #include "uvector.h"
|
| -+#include <stdio.h>
|
| -
|
| - // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
|
| - #if U_LOCAL_SERVICE_HOOK
|
| -@@ -1552,10 +1553,14 @@
|
| +--- source/common/rbbi.cpp 2010-07-22 17:15:37.000000000 -0700
|
| ++++ source/common/rbbi.cpp 2011-01-21 14:12:45.457938000 -0800
|
| +@@ -1555,10 +1555,12 @@
|
| int32_t endPos,
|
| UBool reverse) {
|
| // Reset the old break cache first.
|
| - uint32_t dictionaryCount = fDictionaryCharCount;
|
| -+// uint32_t dictionaryCount = fDictionaryCharCount;
|
| reset();
|
|
|
| - if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
|
| + // note: code segment below assumes that dictionary chars are in the
|
| + // startPos-endPos range
|
| + // value returned should be next character in sequence
|
| -+// if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
|
| + if ((endPos - startPos) <= 1) {
|
| return (reverse ? startPos : endPos);
|
| }
|
|
|
| -@@ -1684,7 +1689,7 @@
|
| +@@ -1711,7 +1713,7 @@
|
| // proposed break by one of the breaks we found. Use following() and
|
| // preceding() to do the work. They should never recurse in this case.
|
| if (reverse) {
|
| @@ -574,7 +534,7 @@
|
| else {
|
| return following(startPos);
|
| --- source/common/triedict.cpp 2008-02-13 01:35:50.000000000 -0800
|
| -+++ source/common/triedict.cpp 2009-07-27 13:01:17.718409000 -0700
|
| ++++ source/common/triedict.cpp 2011-01-21 14:12:45.271006000 -0800
|
| @@ -20,6 +20,7 @@
|
| #include "uvector.h"
|
| #include "uvectr32.h"
|
| @@ -613,7 +573,7 @@
|
|
|
| -MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) {
|
| +MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status,
|
| -+ UBool containsValue /* = FALSE */ ) {
|
| ++ UBool containsValue /* = FALSE */ ) {
|
| // Start the trie off with something. Having the root node already present
|
| // cuts a special case out of the search/insertion functions.
|
| // Making it a median character cuts the worse case for searches from
|
| @@ -627,7 +587,7 @@
|
|
|
| -MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) {
|
| +MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status,
|
| -+ UBool containsValue /* = false */ ) {
|
| ++ UBool containsValue /* = false */ ) {
|
| fTrie = NULL;
|
| fIter = utext_openUChars(NULL, NULL, 0, &status);
|
| if (U_SUCCESS(status) && fIter == NULL) {
|
| @@ -638,13 +598,23 @@
|
| }
|
|
|
| MutableTrieDictionary::~MutableTrieDictionary() {
|
| -@@ -113,7 +130,8 @@
|
| - int &count,
|
| - int limit,
|
| - TernaryNode *&parent,
|
| +@@ -108,12 +125,13 @@
|
| +
|
| + int32_t
|
| + MutableTrieDictionary::search( UText *text,
|
| +- int32_t maxLength,
|
| +- int32_t *lengths,
|
| +- int &count,
|
| +- int limit,
|
| +- TernaryNode *&parent,
|
| - UBool &pMatched ) const {
|
| -+ UBool &pMatched,
|
| -+ uint16_t *values /*=NULL*/) const {
|
| ++ int32_t maxLength,
|
| ++ int32_t *lengths,
|
| ++ int &count,
|
| ++ int limit,
|
| ++ TernaryNode *&parent,
|
| ++ UBool &pMatched,
|
| ++ uint16_t *values /*=NULL*/) const {
|
| // TODO: current implementation works in UTF-16 space
|
| const TernaryNode *up = NULL;
|
| const TernaryNode *p = fTrie;
|
| @@ -700,20 +670,31 @@
|
| U_ASSERT(uc != U_SENTINEL);
|
| TernaryNode *newNode = new TernaryNode(uc);
|
| if (newNode == NULL) {
|
| -@@ -199,7 +226,11 @@
|
| +@@ -199,30 +226,23 @@
|
| parent = newNode;
|
| }
|
|
|
| - parent->flags |= kEndsWord;
|
| +-}
|
| +-
|
| +-#if 0
|
| +-void
|
| +-MutableTrieDictionary::addWords( UEnumeration *words,
|
| +- UErrorCode &status ) {
|
| +- int32_t length;
|
| +- const UChar *word;
|
| +- while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) {
|
| +- addWord(word, length, status);
|
| + if(fValued && value > 0){
|
| + parent->flags = value;
|
| + } else {
|
| + parent->flags |= kEndsWord;
|
| -+ }
|
| + }
|
| }
|
| +-#endif
|
|
|
| - #if 0
|
| -@@ -219,10 +250,11 @@
|
| + int32_t
|
| + MutableTrieDictionary::matches( UText *text,
|
| int32_t maxLength,
|
| int32_t *lengths,
|
| int &count,
|
| @@ -727,7 +708,7 @@
|
| }
|
|
|
| // Implementation of iteration for MutableTrieDictionary
|
| -@@ -277,7 +309,7 @@
|
| +@@ -277,7 +297,7 @@
|
| break;
|
| }
|
| case kEqual:
|
| @@ -736,7 +717,7 @@
|
| equal = (node->equal != NULL);
|
| // If this node should be part of the next emitted string, append
|
| // the UChar to the string, and make sure we pop it when we come
|
| -@@ -299,7 +331,7 @@
|
| +@@ -299,7 +319,7 @@
|
| }
|
| case kGreaterThan:
|
| // If this node's character is in the string, remove it.
|
| @@ -745,11 +726,12 @@
|
| unistr.truncate(unistr.length()-1);
|
| }
|
| if (node->high != NULL) {
|
| -@@ -354,12 +386,74 @@
|
| +@@ -354,12 +374,75 @@
|
| * CompactTrieDictionary
|
| */
|
|
|
| -+//TODO if time permits: minimise size of trie with logprobs by storing values
|
| ++//TODO further optimization:
|
| ++// minimise size of trie with logprobs by storing values
|
| +// for terminal nodes directly in offsets[]
|
| +// --> calculating from next offset *might* be simpler, but would have to add
|
| +// one last offset for logprob of last node
|
| @@ -821,7 +803,7 @@
|
| };
|
|
|
| // Note that to avoid platform-specific alignment issues, all members of the node
|
| -@@ -375,10 +469,14 @@
|
| +@@ -375,10 +458,14 @@
|
| enum CompactTrieNodeFlags {
|
| kVerticalNode = 0x1000, // This is a vertical node
|
| kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word
|
| @@ -839,7 +821,7 @@
|
| };
|
|
|
| // The two node types are distinguished by the kVerticalNode flag.
|
| -@@ -402,63 +500,177 @@
|
| +@@ -402,63 +489,177 @@
|
| uint16_t chars[1]; // Code units
|
| };
|
|
|
| @@ -914,23 +896,23 @@
|
| CompactTrieDictionary::data() const {
|
| - return fData;
|
| + return fInfo->address;
|
| - }
|
| -
|
| --// This function finds the address of a node for us, given its node ID
|
| ++}
|
| ++
|
| +//This function finds the address of a node for us, given its node ID
|
| - static inline const CompactTrieNode *
|
| --getCompactNode(const CompactTrieHeader *header, uint16_t node) {
|
| -- return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);
|
| ++static inline const CompactTrieNode *
|
| +getCompactNode(const CompactTrieInfo *info, uint32_t node) {
|
| + if(node < info->root-1) {
|
| + return (const CompactTrieNode *)(&info->offsets[node]);
|
| + } else {
|
| + return (const CompactTrieNode *)(info->address + info->offsets[node]);
|
| + }
|
| -+}
|
| -+
|
| + }
|
| +
|
| +-// This function finds the address of a node for us, given its node ID
|
| +//this version of getCompactNode is currently only used in compactMutableTrieDictionary()
|
| -+static inline const CompactTrieNode *
|
| + static inline const CompactTrieNode *
|
| +-getCompactNode(const CompactTrieHeader *header, uint16_t node) {
|
| +- return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);
|
| +getCompactNode(const CompactTrieHeader *header, uint32_t node) {
|
| + if(node < header->root-1) {
|
| + return (const CompactTrieNode *)(&header->offsets[node]);
|
| @@ -1037,7 +1019,7 @@
|
| }
|
|
|
| int32_t
|
| -@@ -466,17 +678,38 @@
|
| +@@ -466,17 +667,38 @@
|
| int32_t maxLength,
|
| int32_t *lengths,
|
| int &count,
|
| @@ -1078,7 +1060,7 @@
|
| lengths[mycount++] = i;
|
| --limit;
|
| }
|
| -@@ -487,7 +720,7 @@
|
| +@@ -487,7 +709,7 @@
|
| break;
|
| }
|
|
|
| @@ -1087,7 +1069,7 @@
|
| if (nodeCount == 0) {
|
| // Special terminal node; return now
|
| break;
|
| -@@ -507,35 +740,27 @@
|
| +@@ -507,35 +729,27 @@
|
| // To get here we must have come through the whole list successfully;
|
| // go on to the next node. Note that a word cannot end in the middle
|
| // of a vertical node.
|
| @@ -1137,7 +1119,7 @@
|
| count = mycount;
|
| return i;
|
| }
|
| -@@ -545,16 +770,16 @@
|
| +@@ -545,16 +759,16 @@
|
| private:
|
| UVector32 fNodeStack; // Stack of nodes to process
|
| UVector32 fIndexStack; // Stack of where in node we are
|
| @@ -1158,7 +1140,7 @@
|
| fIndexStack.push(0, status);
|
| unistr.remove();
|
| }
|
| -@@ -564,14 +789,14 @@
|
| +@@ -564,14 +778,14 @@
|
|
|
| virtual StringEnumeration *clone() const {
|
| UErrorCode status = U_ZERO_ERROR;
|
| @@ -1175,7 +1157,7 @@
|
| int32_t result = 0;
|
| while (counter.snext(status) != NULL && U_SUCCESS(status)) {
|
| ++result;
|
| -@@ -582,7 +807,7 @@
|
| +@@ -582,7 +796,7 @@
|
| virtual void reset(UErrorCode &status) {
|
| fNodeStack.removeAllElements();
|
| fIndexStack.removeAllElements();
|
| @@ -1184,7 +1166,7 @@
|
| fIndexStack.push(0, status);
|
| unistr.remove();
|
| }
|
| -@@ -595,26 +820,34 @@
|
| +@@ -595,26 +809,34 @@
|
| if (fNodeStack.empty() || U_FAILURE(status)) {
|
| return NULL;
|
| }
|
| @@ -1225,7 +1207,7 @@
|
| where = fIndexStack.push(0, status);
|
| goingDown = TRUE;
|
| }
|
| -@@ -623,7 +856,7 @@
|
| +@@ -623,7 +845,7 @@
|
| unistr.truncate(unistr.length()-nodeCount);
|
| fNodeStack.popi();
|
| fIndexStack.popi();
|
| @@ -1234,7 +1216,7 @@
|
| where = fIndexStack.peeki();
|
| }
|
| }
|
| -@@ -638,7 +871,7 @@
|
| +@@ -638,7 +860,7 @@
|
| // Push on next node
|
| unistr.append((UChar)hnode->entries[where].ch);
|
| fIndexStack.setElementAt(where+1, fIndexStack.size()-1);
|
| @@ -1243,7 +1225,7 @@
|
| where = fIndexStack.push(0, status);
|
| goingDown = TRUE;
|
| }
|
| -@@ -646,12 +879,14 @@
|
| +@@ -646,12 +868,14 @@
|
| // Going up
|
| fNodeStack.popi();
|
| fIndexStack.popi();
|
| @@ -1259,7 +1241,7 @@
|
| if (goingDown && (node->flagscount & kParentEndsWord)) {
|
| return &unistr;
|
| }
|
| -@@ -664,7 +899,7 @@
|
| +@@ -664,7 +888,7 @@
|
| if (U_FAILURE(status)) {
|
| return NULL;
|
| }
|
| @@ -1268,7 +1250,7 @@
|
| }
|
|
|
| //
|
| -@@ -672,21 +907,36 @@
|
| +@@ -672,21 +896,36 @@
|
| // and back again
|
| //
|
|
|
| @@ -1311,7 +1293,7 @@
|
| nodes.push(this, status);
|
| }
|
|
|
| -@@ -694,87 +944,225 @@
|
| +@@ -694,87 +933,225 @@
|
| }
|
|
|
| virtual uint32_t size() {
|
| @@ -1553,7 +1535,7 @@
|
| }
|
|
|
| void addChar(UChar ch) {
|
| -@@ -784,60 +1172,85 @@
|
| +@@ -784,60 +1161,85 @@
|
| void setLink(BuildCompactTrieNode *node) {
|
| fEqual = node;
|
| }
|
| @@ -1651,16 +1633,16 @@
|
| }
|
| result = vResult;
|
| }
|
| -@@ -849,19 +1262,28 @@
|
| +@@ -849,19 +1251,28 @@
|
| // Uses recursion.
|
|
|
| static void walkHorizontal(const TernaryNode *node,
|
| - BuildCompactTrieHorizontalNode *building,
|
| - UStack &nodes,
|
| - UErrorCode &status) {
|
| -+ BuildCompactTrieHorizontalNode *building,
|
| -+ UStack &nodes,
|
| -+ UErrorCode &status, Hashtable *values = NULL) {
|
| ++ BuildCompactTrieHorizontalNode *building,
|
| ++ UStack &nodes,
|
| ++ UErrorCode &status, Hashtable *values = NULL) {
|
| while (U_SUCCESS(status) && node != NULL) {
|
| if (node->low != NULL) {
|
| - walkHorizontal(node->low, building, nodes, status);
|
| @@ -1687,7 +1669,7 @@
|
| }
|
| if (U_SUCCESS(status) && link != NULL) {
|
| building->addNode(node->ch, link, status);
|
| -@@ -881,13 +1303,15 @@
|
| +@@ -881,13 +1292,15 @@
|
| _sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) {
|
| BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl;
|
| BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr;
|
| @@ -1705,7 +1687,7 @@
|
| }
|
| // Next, the "parent ends word" flag. If that differs, we cannot coalesce.
|
| if (left->fParentEndsWord != right->fParentEndsWord) {
|
| -@@ -898,12 +1322,19 @@
|
| +@@ -898,12 +1311,19 @@
|
| if (result != 0) {
|
| return result;
|
| }
|
| @@ -1728,7 +1710,7 @@
|
| // We need to compare the links vectors. They should be the
|
| // same size because the strings were equal.
|
| // We compare the node IDs instead of the pointers, to handle
|
| -@@ -914,9 +1345,10 @@
|
| +@@ -914,9 +1334,10 @@
|
| int32_t count = hleft->fLinks.size();
|
| for (int32_t i = 0; i < count && result == 0; ++i) {
|
| result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID -
|
| @@ -1740,7 +1722,7 @@
|
| // If they are equal to each other, mark them (speeds coalescing)
|
| if (result == 0) {
|
| left->fHasDuplicate = TRUE;
|
| -@@ -1031,20 +1463,25 @@
|
| +@@ -1031,20 +1452,25 @@
|
| // Add node 0, used as the NULL pointer/sentinel.
|
| nodes.addElement((int32_t)0, status);
|
|
|
| @@ -1770,7 +1752,7 @@
|
| #ifdef DEBUG_TRIE_DICT
|
| (void) ::times(&timing);
|
| fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n",
|
| -@@ -1077,21 +1514,37 @@
|
| +@@ -1077,21 +1503,37 @@
|
| return NULL;
|
| }
|
|
|
| @@ -1814,7 +1796,7 @@
|
| status = U_ILLEGAL_ARGUMENT_ERROR;
|
| return NULL;
|
| }
|
| -@@ -1111,9 +1564,14 @@
|
| +@@ -1111,9 +1553,14 @@
|
| status = U_MEMORY_ALLOCATION_ERROR;
|
| return NULL;
|
| }
|
| @@ -1831,7 +1813,7 @@
|
| header->nodeCount = nodeCount;
|
| header->offsets[0] = 0; // Sentinel
|
| header->root = translate.elementAti(root->fNodeID);
|
| -@@ -1123,23 +1581,40 @@
|
| +@@ -1123,23 +1570,40 @@
|
| }
|
| #endif
|
| uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint32_t));
|
| @@ -1875,7 +1857,7 @@
|
| // Collect statistics on node types and sizes
|
| int hCount = 0;
|
| int vCount = 0;
|
| -@@ -1148,68 +1623,85 @@
|
| +@@ -1148,68 +1612,85 @@
|
| size_t hItemCount = 0;
|
| size_t vItemCount = 0;
|
| uint32_t previousOff = offset;
|
| @@ -1981,7 +1963,7 @@
|
| if (nodeCount == 0 || U_FAILURE(status)) {
|
| // Failure, or terminal node
|
| return NULL;
|
| -@@ -1234,29 +1726,41 @@
|
| +@@ -1234,29 +1715,41 @@
|
| previous = latest;
|
| }
|
| if (latest != NULL) {
|
| @@ -2029,7 +2011,7 @@
|
| if (U_FAILURE(status)) {
|
| delete root; // Clean up
|
| delete result;
|
| -@@ -1270,8 +1774,8 @@
|
| +@@ -1270,8 +1763,8 @@
|
|
|
| U_CAPI int32_t U_EXPORT2
|
| triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
|
| @@ -2040,7 +2022,7 @@
|
| if (status == NULL || U_FAILURE(*status)) {
|
| return 0;
|
| }
|
| -@@ -1286,14 +1790,14 @@
|
| +@@ -1286,14 +1779,14 @@
|
| //
|
| const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4);
|
| if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */
|
| @@ -2062,7 +2044,7 @@
|
| *status=U_UNSUPPORTED_ERROR;
|
| return 0;
|
| }
|
| -@@ -1311,8 +1815,10 @@
|
| +@@ -1311,8 +1804,10 @@
|
| //
|
| const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
|
| const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes;
|
| @@ -2075,7 +2057,7 @@
|
| {
|
| udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n");
|
| *status=U_UNSUPPORTED_ERROR;
|
| -@@ -1333,10 +1839,10 @@
|
| +@@ -1333,10 +1828,10 @@
|
| //
|
| if (length < sizeWithUData) {
|
| udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",
|
| @@ -2088,7 +2070,7 @@
|
|
|
| //
|
| // Swap the Data. Do the data itself first, then the CompactTrieHeader, because
|
| -@@ -1355,20 +1861,38 @@
|
| +@@ -1355,20 +1850,38 @@
|
| }
|
|
|
| // We need to loop through all the nodes in the offset table, and swap each one.
|
| @@ -2133,7 +2115,7 @@
|
| uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal);
|
| ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal));
|
| }
|
| -@@ -1381,26 +1905,62 @@
|
| +@@ -1381,26 +1894,62 @@
|
| word = ds->readUInt16(inHNode->entries[j].equal);
|
| ds->writeUInt16(&outHNode->entries[j].equal, word);
|
| }
|
| @@ -2209,7 +2191,7 @@
|
| return sizeWithUData;
|
| }
|
| --- source/common/triedict.h 2006-06-06 15:38:49.000000000 -0700
|
| -+++ source/common/triedict.h 2009-07-27 13:01:17.723390000 -0700
|
| ++++ source/common/triedict.h 2011-01-21 14:12:45.496927000 -0800
|
| @@ -47,7 +47,6 @@
|
| U_NAMESPACE_BEGIN
|
|
|
| @@ -2448,27 +2430,22 @@
|
| - /* TRIEDICT_H */
|
| +/* TRIEDICT_H */
|
| #endif
|
| ---- source/data/brkitr/brkfiles.mk 2009-04-21 15:42:37.000000000 -0700
|
| -+++ source/data/brkitr/brkfiles.mk 2009-07-27 13:01:17.730379000 -0700
|
| -@@ -34,13 +34,12 @@
|
| +--- source/data/Makefile.in 2010-10-29 13:21:33.000000000 -0700
|
| ++++ source/data/Makefile.in 2011-01-26 16:24:24.856798000 -0800
|
| +@@ -509,8 +520,9 @@
|
| + #################################################### CTD
|
| + # CTD FILES
|
|
|
| +-$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)
|
| +- $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<
|
| ++# .ctd file now generated regardless of whether dictionary file exists
|
| ++$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)
|
| ++ $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F).txt
|
|
|
| - # List of compact trie dictionary files (ctd).
|
| --BRK_CTD_SOURCE = thaidict.txt
|
| -+BRK_CTD_SOURCE = thaidict.txt cjdict.txt
|
| -
|
| -
|
| - # List of break iterator files (brk).
|
| --BRK_SOURCE = word_POSIX.txt word_ja.txt sent_el.txt char_th.txt char.txt word.txt line.txt sent.txt title.txt
|
| -+BRK_SOURCE = word_POSIX.txt sent_el.txt char_th.txt char.txt word.txt line.txt sent.txt title.txt
|
| -
|
| -
|
| - # Ordinary resources
|
| --BRK_RES_SOURCE = el.txt en.txt en_US.txt en_US_POSIX.txt ja.txt th.txt
|
| --
|
| -+BRK_RES_SOURCE = el.txt en.txt en_US.txt en_US_POSIX.txt th.txt
|
| ---- source/data/brkitr/root.txt 2009-06-24 14:06:38.000000000 -0700
|
| -+++ source/data/brkitr/root.txt 2009-07-27 13:01:17.733382000 -0700
|
| + #################################################### CFU
|
| + # CFU FILES
|
| +--- source/data/brkitr/root.txt 2010-07-28 17:18:28.000000000 -0700
|
| ++++ source/data/brkitr/root.txt 2011-01-21 14:12:45.653922000 -0800
|
| @@ -17,5 +17,8 @@
|
| }
|
| dictionaries{
|
| @@ -2478,173 +2455,8 @@
|
| + Kata:process(dependency){"cjdict.ctd"}
|
| }
|
| }
|
| ---- source/data/brkitr/word.txt 2009-06-24 14:06:38.000000000 -0700
|
| -+++ source/data/brkitr/word.txt 2010-08-27 16:24:25.969372000 -0700
|
| -@@ -29,29 +29,49 @@
|
| - $Newline = [\p{Word_Break = Newline}];
|
| - $Extend = [\p{Word_Break = Extend}];
|
| - $Format = [\p{Word_Break = Format}];
|
| -+$Hiragana = [:Hiragana:];
|
| - $Katakana = [\p{Word_Break = Katakana}];
|
| -+$Han = [:Han:];
|
| - $ALetter = [\p{Word_Break = ALetter}];
|
| --$MidNumLet = [\p{Word_Break = MidNumLet}];
|
| -+# Remove two full stop characters from $MidNumLet and add them to $MidNum
|
| -+# to break a hostname into its components at the cost of breaking
|
| -+# 'e.g.' and 'i.e.' as well.
|
| -+# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12.
|
| -+# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected
|
| -+# while rules 6/7 are reverted to the old behavior we want.
|
| -+$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];
|
| - $MidLetter = [\p{Word_Break = MidLetter}];
|
| --$MidNum = [\p{Word_Break = MidNum}];
|
| --$Numeric = [\p{Word_Break = Numeric}];
|
| -+$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]];
|
| -+$Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits
|
| - $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
| -
|
| -+# Extra sets not to break 'HebrewLetter U+0022 HebrewLetter'.
|
| -+$HebrewLet = [\p{Word_Break = ALetter} & \p{Script = Hebrew} - [\u05F3]];
|
| -+# U+05F3 is ALetter and U+05F4 is MidLetter so that they're covered by
|
| -+# the current rule 6/7.
|
| -+$HebrewMidLet = [\u0022];
|
| -
|
| - # Dictionary character set, for triggering language-based break engines. Currently
|
| --# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
| --# 5.0 or later as the definition of Complex_Context was corrected to include all
|
| -+# limited to LineBreak=Complex_Context and CJK. Note that this set only works
|
| -+# in Unicode 5.0 or later as the definition of Complex_Context was corrected to include all
|
| - # characters requiring dictionary break.
|
| -
|
| --$dictionary = [:LineBreak = Complex_Context:];
|
| - $Control = [\p{Grapheme_Cluster_Break = Control}];
|
| --$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not
|
| -- # include the dictionary characters.
|
| -+$HangulSyllable = [\uac00-\ud7a3];
|
| -+$ComplexContext = [:LineBreak = Complex_Context:];
|
| -+$KanaKanji = [$Han $Hiragana $Katakana];
|
| -+$dictionaryCJK = [$KanaKanji $HangulSyllable];
|
| -+$dictionary = [$ComplexContext $dictionaryCJK];
|
| -+
|
| -+# leave CJK scripts out of ALetterPlus
|
| -+$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
|
| -+
|
| -
|
| - #
|
| - # Rules 4 Ignore Format and Extend characters,
|
| - # except when they appear at the beginning of a region of text.
|
| - #
|
| -+# TODO: check if handling of katakana in dictionary makes rules incorrect/void.
|
| - $KatakanaEx = $Katakana ($Extend | $Format)*;
|
| - $ALetterEx = $ALetterPlus ($Extend | $Format)*;
|
| - $MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
| -@@ -59,8 +79,8 @@
|
| - $MidNumEx = $MidNum ($Extend | $Format)*;
|
| - $NumericEx = $Numeric ($Extend | $Format)*;
|
| - $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
| -+$HebrewLetEx = $HebrewLet ($Extend | $Format)*;
|
| -
|
| --$Hiragana = [\p{script=Hiragana}];
|
| - $Ideographic = [\p{Ideographic}];
|
| - $HiraganaEx = $Hiragana ($Extend | $Format)*;
|
| - $IdeographicEx = $Ideographic ($Extend | $Format)*;
|
| -@@ -79,12 +99,14 @@
|
| - # begins with a group of Format chars, or with a "word" consisting of a single
|
| - # char that is not in any of the listed word break categories followed by
|
| - # format char(s).
|
| --[^$CR $LF $Newline]? ($Extend | $Format)+;
|
| -+ # format char(s), or is not a CJK dictionary character.
|
| -+[^$CR $LF $Newline $dictionaryCJK]? ($Extend | $Format)+;
|
| -
|
| - $NumericEx {100};
|
| - $ALetterEx {200};
|
| --$KatakanaEx {300}; # note: these status values override those from rule 5
|
| --$HiraganaEx {300}; # by virtual of being numerically larger.
|
| -+$HangulSyllable {200};
|
| -+$KatakanaEx {400}; #originally 300
|
| -+$HiraganaEx {400}; #originally 300
|
| - $IdeographicEx {400}; #
|
| -
|
| - #
|
| -@@ -96,6 +118,9 @@
|
| - # rule 6 and 7
|
| - $ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
|
| -
|
| -+# Chrome addition
|
| -+$HebrewLetEx $HebrewMidLet $HebrewLetEx {200};
|
| -+
|
| - # rule 8
|
| -
|
| - $NumericEx $NumericEx {100};
|
| -@@ -114,19 +139,25 @@
|
| -
|
| - # rule 13
|
| -
|
| --$KatakanaEx $KatakanaEx {300};
|
| -+# To be consistent with '$KanaKanji $KanaKanji', changed
|
| -+# from 300 to 400.
|
| -+# See also TestRuleStatus in intltest/rbbiapts.cpp
|
| -+$KatakanaEx $KatakanaEx {400};
|
| -
|
| - # rule 13a/b
|
| -
|
| - $ALetterEx $ExtendNumLetEx {200}; # (13a)
|
| - $NumericEx $ExtendNumLetEx {100}; # (13a)
|
| --$KatakanaEx $ExtendNumLetEx {300}; # (13a)
|
| -+$KatakanaEx $ExtendNumLetEx {400}; # (13a)
|
| - $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
|
| -
|
| - $ExtendNumLetEx $ALetterEx {200}; # (13b)
|
| - $ExtendNumLetEx $NumericEx {100}; # (13b)
|
| --$ExtendNumLetEx $KatakanaEx {300}; # (13b)
|
| --
|
| -+$ExtendNumLetEx $KatakanaEx {400}; # (13b)
|
| -+
|
| -+# special handling for CJK characters: chain for later dictionary segmentation
|
| -+$HangulSyllable $HangulSyllable {200};
|
| -+$KanaKanji $KanaKanji {400}; #different rule status if both kanji and kana found
|
| -
|
| -
|
| - ## -------------------------------------------------
|
| -@@ -139,13 +170,15 @@
|
| - $BackMidNumEx = ($Format | $Extend)* $MidNum;
|
| - $BackMidLetterEx = ($Format | $Extend)* $MidLetter;
|
| - $BackKatakanaEx = ($Format | $Extend)* $Katakana;
|
| -+$BackHiraganaEx = ($Extend | $Format)* $Hiragana;
|
| - $BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;
|
| -+$BackHebrewLetEx = ($Format | $Extend)* $HebrewLet;
|
| -
|
| - # rule 3
|
| - $LF $CR;
|
| -
|
| - # rule 4
|
| --($Format | $Extend)* [^$CR $LF $Newline]?;
|
| -+($Format | $Extend)* [^$CR $LF $Newline $dictionaryCJK]?;
|
| -
|
| - # rule 5
|
| -
|
| -@@ -155,6 +188,8 @@
|
| -
|
| - $BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;
|
| -
|
| -+# Chrome addition
|
| -+$BackHebrewLetEx $HebrewMidLet $BackHebrewLetEx;
|
| -
|
| - # rule 8
|
| -
|
| -@@ -181,6 +216,10 @@
|
| - $BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
|
| - ($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
|
| -
|
| -+# special handling for CJK characters: chain for later dictionary segmentation
|
| -+$HangulSyllable $HangulSyllable;
|
| -+$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
|
| -+
|
| - ## -------------------------------------------------
|
| -
|
| - !!safe_reverse;
|
| ---- source/data/xml/brkitr/root.xml 2007-08-28 23:10:43.000000000 -0700
|
| -+++ source/data/xml/brkitr/root.xml 2009-07-27 13:01:17.746367000 -0700
|
| +--- source/data/xml/brkitr/root.xml 2010-03-01 15:13:18.000000000 -0800
|
| ++++ source/data/xml/brkitr/root.xml 2011-01-21 14:12:45.735922000 -0800
|
| @@ -25,6 +25,9 @@
|
| </icu:boundaries>
|
| <icu:dictionaries>
|
| @@ -2655,9 +2467,9 @@
|
| </icu:dictionaries>
|
| </icu:breakIteratorData>
|
| </special>
|
| ---- source/test/cintltst/creststn.c 2009-06-26 09:49:55.000000000 -0700
|
| -+++ source/test/cintltst/creststn.c 2009-07-29 12:46:05.997405000 -0700
|
| -@@ -2181,21 +2181,21 @@
|
| +--- source/test/cintltst/creststn.c 2010-10-28 10:44:02.000000000 -0700
|
| ++++ source/test/cintltst/creststn.c 2011-01-21 14:12:44.995020000 -0800
|
| +@@ -2188,21 +2188,21 @@
|
|
|
|
|
| {
|
| @@ -2684,13 +2496,8 @@
|
| status = U_ZERO_ERROR;
|
| }
|
| /* simple alias */
|
| -@@ -3024,4 +3024,3 @@
|
| - }
|
| -
|
| - }
|
| --
|
| ---- source/test/intltest/rbbiapts.cpp 2009-06-26 09:49:55.000000000 -0700
|
| -+++ source/test/intltest/rbbiapts.cpp 2009-07-28 13:56:30.208042000 -0700
|
| +--- source/test/intltest/rbbiapts.cpp 2010-07-12 11:03:29.000000000 -0700
|
| ++++ source/test/intltest/rbbiapts.cpp 2011-01-21 14:12:45.033014000 -0800
|
| @@ -156,9 +156,13 @@
|
| if(*a!=*b){
|
| errln("Failed: boilerplate method operator!= does not return correct results");
|
| @@ -2716,7 +2523,7 @@
|
| }
|
|
|
| void RBBIAPITest::TestgetRules()
|
| -@@ -643,21 +648,21 @@
|
| +@@ -635,21 +640,21 @@
|
| //
|
| void RBBIAPITest::TestRuleStatus() {
|
| UChar str[30];
|
| @@ -2746,7 +2553,7 @@
|
|
|
| UErrorCode status=U_ZERO_ERROR;
|
|
|
| -@@ -896,9 +901,11 @@
|
| +@@ -888,9 +893,11 @@
|
|
|
| URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD, status);
|
| {
|
| @@ -2758,9 +2565,9 @@
|
| }
|
|
|
| {
|
| ---- source/test/intltest/rbbitst.cpp 2009-06-26 09:49:55.000000000 -0700
|
| -+++ source/test/intltest/rbbitst.cpp 2009-07-28 15:35:18.933226000 -0700
|
| -@@ -33,6 +33,8 @@
|
| +--- source/test/intltest/rbbitst.cpp 2010-10-08 18:23:28.000000000 -0700
|
| ++++ source/test/intltest/rbbitst.cpp 2011-01-21 14:12:45.180030000 -0800
|
| +@@ -35,6 +35,8 @@
|
| #include <string.h>
|
| #include <stdio.h>
|
| #include <stdlib.h>
|
| @@ -2769,25 +2576,32 @@
|
|
|
| #define TEST_ASSERT(x) {if (!(x)) { \
|
| errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
|
| -@@ -108,6 +110,8 @@
|
| +@@ -138,11 +140,13 @@
|
| if (exec) TestThaiBreaks(); break;
|
| case 23: name = "TestTailoredBreaks";
|
| if (exec) TestTailoredBreaks(); break;
|
| + case 24: name = "TestTrieDictWithValue";
|
| + if(exec) TestTrieDictWithValue(); break;
|
| + #else
|
| +- case 21: case 22: case 23: name = "skip";
|
| ++ case 21: case 22: case 23: case 24: name = "skip";
|
| + break;
|
| + #endif
|
| +- case 24: name = "TestDictRules";
|
| ++ case 25: name = "TestDictRules";
|
| + if (exec) TestDictRules(); break;
|
| + case 25: name = "TestBug5532";
|
| + if (exec) TestBug5532(); break;
|
| +@@ -607,6 +611,8 @@
|
|
|
| - default: name = ""; break; //needed to end loop
|
| - }
|
| -@@ -570,6 +574,8 @@
|
|
|
| -
|
| void RBBITest::TestJapaneseWordBreak() {
|
| +// TODO: Rewrite this test for a dictionary-based word breaking.
|
| +#if 0
|
| UErrorCode status = U_ZERO_ERROR;
|
| BITestData japaneseWordSelection(status);
|
|
|
| -@@ -591,6 +597,7 @@
|
| +@@ -628,6 +634,7 @@
|
|
|
| generalIteratorTest(*e, japaneseWordSelection);
|
| delete e;
|
| @@ -2795,7 +2609,7 @@
|
| }
|
|
|
| void RBBITest::TestTrieDict() {
|
| -@@ -812,6 +819,372 @@
|
| +@@ -849,6 +856,372 @@
|
| delete compact2;
|
| }
|
|
|
| @@ -3168,7 +2982,7 @@
|
|
|
| //----------------------------------------------------------------------------
|
| //
|
| -@@ -1832,8 +2205,15 @@
|
| +@@ -1870,8 +2243,15 @@
|
| // Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
|
| static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
|
| "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
|
| @@ -3184,7 +2998,7 @@
|
|
|
| // UBreakIteratorType UBRK_SENTENCE, Locale "el"
|
| // Add break after Greek question mark (cldrbug #2069).
|
| -@@ -2580,6 +2960,8 @@
|
| +@@ -2672,6 +3052,8 @@
|
| UnicodeSet *fNewlineSet;
|
| UnicodeSet *fKatakanaSet;
|
| UnicodeSet *fALetterSet;
|
| @@ -3193,7 +3007,7 @@
|
| UnicodeSet *fMidNumLetSet;
|
| UnicodeSet *fMidLetterSet;
|
| UnicodeSet *fMidNumSet;
|
| -@@ -2588,6 +2970,7 @@
|
| +@@ -2680,6 +3062,7 @@
|
| UnicodeSet *fOtherSet;
|
| UnicodeSet *fExtendSet;
|
| UnicodeSet *fExtendNumLetSet;
|
| @@ -3201,7 +3015,7 @@
|
|
|
| RegexMatcher *fMatcher;
|
|
|
| -@@ -2604,12 +2987,24 @@
|
| +@@ -2696,12 +3079,24 @@
|
| fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
|
| fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
|
| fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
|
| @@ -3228,7 +3042,7 @@
|
| fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
|
| fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
|
| fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
|
| -@@ -2633,13 +3028,14 @@
|
| +@@ -2725,13 +3120,14 @@
|
| fOtherSet->removeAll(*fFormatSet);
|
| fOtherSet->removeAll(*fExtendSet);
|
| // Inhibit dictionary characters from being tested at all.
|
| @@ -3244,7 +3058,7 @@
|
| fSets->addElement(fMidLetterSet, status);
|
| fSets->addElement(fMidNumLetSet, status);
|
| fSets->addElement(fMidNumSet, status);
|
| -@@ -3871,6 +4267,7 @@
|
| +@@ -3978,6 +4374,7 @@
|
| for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
|
| count --;
|
| if (forward[count] != i) {
|
| @@ -3252,7 +3066,7 @@
|
| test->errln("happy break test previous() failed: expected %d but got %d",
|
| forward[count], i);
|
| break;
|
| -@@ -3904,23 +4301,25 @@
|
| +@@ -4011,23 +4408,25 @@
|
| UErrorCode status = U_ZERO_ERROR;
|
| // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
|
| BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
|
| @@ -3282,7 +3096,7 @@
|
| "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
|
| "\\u0027\\u11af\\U000e0057\\u0602",
|
| "\\U0001d7f2\\U000e007\\u0004\\u0589",
|
| -@@ -3932,7 +4331,7 @@
|
| +@@ -4039,7 +4438,7 @@
|
| "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
|
| "\\u0233\\U000e0020\\u0a69\\u0d6a",
|
| "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
|
| @@ -3291,7 +3105,7 @@
|
| "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
|
| "\\ua183\\u102d\\u0bec\\u003a",
|
| "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
|
| -@@ -3942,7 +4341,7 @@
|
| +@@ -4049,7 +4448,7 @@
|
| "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
|
| "\\u003a\\u0664\\u00b7\\u1fba",
|
| "\\u003b\\u0027\\u00b7\\u47a3",
|
| @@ -3300,7 +3114,7 @@
|
| "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
|
| "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
|
| };
|
| -@@ -3997,12 +4396,12 @@
|
| +@@ -4104,12 +4503,12 @@
|
| "\\U0001d7f2\\U000e007d\\u0004\\u0589",
|
| "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
|
| "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
|
| @@ -3315,19 +3129,19 @@
|
| "\\ua183\\u102d\\u0bec\\u003a",
|
| "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
|
| "\\u003a\\u0e57\\u0fad\\u002e",
|
| ---- source/test/intltest/rbbitst.h 2009-04-22 00:53:50.000000000 -0700
|
| -+++ source/test/intltest/rbbitst.h 2009-07-27 13:01:17.767342000 -0700
|
| +--- source/test/intltest/rbbitst.h 2010-07-22 17:15:37.000000000 -0700
|
| ++++ source/test/intltest/rbbitst.h 2011-01-21 14:12:45.152007000 -0800
|
| @@ -70,6 +70,7 @@
|
| void TestBug5775();
|
| void TestThaiBreaks();
|
| void TestTailoredBreaks();
|
| + void TestTrieDictWithValue();
|
| + void TestDictRules();
|
| + void TestBug5532();
|
|
|
| - void TestDebug();
|
| -
|
| ---- source/test/testdata/rbbitst.txt 2009-06-24 14:06:38.000000000 -0700
|
| -+++ source/test/testdata/rbbitst.txt 2009-07-29 12:56:31.483710000 -0700
|
| -@@ -162,7 +162,23 @@
|
| +--- source/test/testdata/rbbitst.txt 2010-07-28 17:18:28.000000000 -0700
|
| ++++ source/test/testdata/rbbitst.txt 2011-01-21 14:12:45.221011000 -0800
|
| +@@ -161,7 +161,23 @@
|
| <data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data>
|
|
|
| # Hiragana & Katakana stay together, but separates from each other and Latin.
|
| @@ -3352,7 +3166,7 @@
|
|
|
| # Words with interior formatting characters
|
| <data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data>
|
| -@@ -170,6 +186,8 @@
|
| +@@ -169,6 +185,8 @@
|
| # to test for bug #4097779
|
| <data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
|
|
|
| @@ -3361,7 +3175,7 @@
|
|
|
| # to test for bug #4098467
|
| # What follows is a string of Korean characters (I found it in the Yellow Pages
|
| -@@ -179,9 +197,15 @@
|
| +@@ -178,9 +196,15 @@
|
| # precomposed syllables...
|
| <data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data>
|
|
|
| @@ -3379,7 +3193,7 @@
|
|
|
| #
|
| # Try some words from other scripts.
|
| -@@ -492,8 +516,7 @@
|
| +@@ -491,8 +515,7 @@
|
| <data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c•</data>
|
|
|
| # conjoining jamo...
|
| @@ -3389,8 +3203,8 @@
|
|
|
| # to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
|
| <data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data>
|
| ---- source/test/testdata/testaliases.txt 2009-06-24 14:06:38.000000000 -0700
|
| -+++ source/test/testdata/testaliases.txt 2009-07-28 17:07:26.251120000 -0700
|
| +--- source/test/testdata/testaliases.txt 2009-11-12 13:53:42.000000000 -0800
|
| ++++ source/test/testdata/testaliases.txt 2011-01-21 14:12:45.204005000 -0800
|
| @@ -28,7 +28,7 @@
|
| LocaleScript:alias { "/ICUDATA/ja/LocaleScript" }
|
|
|
| @@ -3400,13 +3214,13 @@
|
|
|
| // aliasing arrays
|
| zoneTests {
|
| ---- source/tools/genctd/genctd.cpp 2006-09-04 09:28:24.000000000 -0700
|
| -+++ source/tools/genctd/genctd.cpp 2009-07-27 13:01:17.776335000 -0700
|
| +--- source/tools/genctd/genctd.cpp 2009-08-04 14:09:17.000000000 -0700
|
| ++++ source/tools/genctd/genctd.cpp 2011-01-21 14:12:45.564923000 -0800
|
| @@ -1,6 +1,6 @@
|
| /*
|
| **********************************************************************
|
| --* Copyright (C) 2002-2006, International Business Machines
|
| -+* Copyright (C) 2002-2006,2008, International Business Machines
|
| +-* Copyright (C) 2002-2009, International Business Machines
|
| ++* Copyright (C) 2002-2010, International Business Machines
|
| * Corporation and others. All Rights Reserved.
|
| **********************************************************************
|
| *
|
| @@ -3426,7 +3240,7 @@
|
|
|
| #include <stdio.h>
|
| #include <stdlib.h>
|
| -@@ -198,147 +201,191 @@
|
| +@@ -199,147 +202,191 @@
|
| long wordFileSize;
|
| FILE *file;
|
| char *wordBufferC;
|
| @@ -3748,13 +3562,13 @@
|
| // Now, create a CompactTrieDictionary from the mutable dictionary
|
| CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
|
| if (U_FAILURE(status)) {
|
| -@@ -392,4 +439,3 @@
|
| +@@ -393,4 +440,3 @@
|
|
|
| #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
| }
|
| -
|
| --- source/tools/genctd/Makefile.in 2006-12-16 13:07:01.000000000 -0800
|
| -+++ source/tools/genctd/Makefile.in 2009-07-27 13:01:17.782326000 -0700
|
| ++++ source/tools/genctd/Makefile.in 2011-01-21 14:12:45.555920000 -0800
|
| @@ -23,13 +23,13 @@
|
| ## Extra files to remove for 'make clean'
|
| CLEANFILES = *~ $(DEPS) $(MAN_FILES)
|
| @@ -3771,17 +3585,3 @@
|
| LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
|
|
|
| OBJECTS = genctd.o
|
| ---- source/data/Makefile.in 2009-05-20 23:03:54.000000000 -0700
|
| -+++ source/data/Makefile.in 2009-10-21 15:43:18.235201000 -0700
|
| -@@ -452,8 +452,9 @@
|
| - #################################################### CTD
|
| - # CTD FILES
|
| -
|
| --$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)
|
| -- $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<
|
| -+# .ctd file now generated regardless of whether dictionary file exists
|
| -+$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)
|
| -+ $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F).txt
|
| -
|
| - #################################################### CFU
|
| - # CFU FILES
|
|
|