icu46/patches/segmentation.patch - Issue 6370014: CJK segmentation patch for ICU 4.6...

Unified Diff: icu46/patches/segmentation.patch

Issue 6370014: CJK segmentation patch for ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 9 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu46/patches/segmentation.patch

===================================================================

--- icu46/patches/segmentation.patch (revision 69841)

+++ icu46/patches/segmentation.patch (working copy)

@@ -1,14 +1,6 @@

---- source/common/brkeng.cpp 2007-09-11 20:53:13.000000000 -0700

-+++ source/common/brkeng.cpp 2009-07-29 12:57:49.973382000 -0700

-@@ -24,6 +24,7 @@

- #include "umutex.h"

- #include "uresimp.h"

- #include "ubrkimpl.h"

-+#include <stdio.h>

- U_NAMESPACE_BEGIN

-@@ -226,6 +227,30 @@

+--- source/common/brkeng.cpp 2009-11-11 07:47:22.000000000 -0800

++++ source/common/brkeng.cpp 2011-01-21 14:12:45.479922000 -0800

+@@ -226,6 +226,30 @@

case USCRIPT_THAI:

engine = new ThaiBreakEngine(dict, status);

break;

@@ -39,7 +31,7 @@

default:

break;

}

-@@ -281,6 +306,13 @@

+@@ -281,6 +305,13 @@

dict = NULL;

}

return dict;

@@ -54,20 +46,18 @@

return NULL;

}

--- source/common/dictbe.cpp 2008-06-13 12:21:12.000000000 -0700

-+++ source/common/dictbe.cpp 2009-11-11 12:58:40.199829000 -0800

-@@ -16,6 +16,11 @@

++++ source/common/dictbe.cpp 2011-01-21 14:12:45.468928000 -0800

+@@ -16,6 +16,9 @@

#include "unicode/ubrk.h"

#include "uvector.h"

#include "triedict.h"

+#include "uassert.h"

+#include "unicode/normlzr.h"

+#include "cmemory.h"

-+#include <stdio.h>

U_NAMESPACE_BEGIN

-@@ -422,6 +427,294 @@

+@@ -422,6 +425,294 @@

return wordsFound;

}

@@ -363,7 +353,7 @@

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

--- source/common/dictbe.h 2006-09-29 17:37:45.000000000 -0700

-+++ source/common/dictbe.h 2009-07-27 13:01:17.704415000 -0700

++++ source/common/dictbe.h 2011-01-21 14:12:45.492920000 -0800

@@ -1,8 +1,8 @@

/**

- *******************************************************************************

@@ -371,13 +361,13 @@

- *******************************************************************************

+ **********************************************************************************

#ifndef DICTBE_H

-@@ -65,37 +65,37 @@

+@@ -65,31 +65,31 @@

virtual ~DictionaryBreakEngine();

@@ -430,19 +420,8 @@

+ * @return The number of breaks found.

+ */

virtual int32_t findBreaks( UText *text,

-- int32_t startPos,

-- int32_t endPos,

-- UBool reverse,

-- int32_t breakType,

-- UStack &foundBreaks ) const;

-+ int32_t startPos,

-+ int32_t endPos,

-+ UBool reverse,

-+ int32_t breakType,

-+ UStack &foundBreaks ) const;

- protected:

+ int32_t startPos,

+ int32_t endPos,

@@ -114,7 +114,7 @@

// virtual void setBreakTypes( uint32_t breakTypes );

@@ -461,16 +440,7 @@

* @param text A UText representing the text

* @param rangeStart The start of the range of dictionary characters

-@@ -180,12 +180,72 @@

- * @return The number of breaks found

- */

- virtual int32_t divideUpDictionaryRange( UText *text,

-- int32_t rangeStart,

-- int32_t rangeEnd,

-- UStack &foundBreaks ) const;

-+ int32_t rangeStart,

-+ int32_t rangeEnd,

-+ UStack &foundBreaks ) const;

+@@ -186,6 +186,66 @@

};

@@ -537,34 +507,24 @@

U_NAMESPACE_END

---- source/common/rbbi.cpp 2008-09-24 22:48:27.000000000 -0700

-+++ source/common/rbbi.cpp 2009-07-27 13:01:17.710416000 -0700

-@@ -29,6 +29,7 @@

- #include "uassert.h"

- #include "uvector.h"

-+#include <stdio.h>

- // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.

- #if U_LOCAL_SERVICE_HOOK

-@@ -1552,10 +1553,14 @@

+--- source/common/rbbi.cpp 2010-07-22 17:15:37.000000000 -0700

++++ source/common/rbbi.cpp 2011-01-21 14:12:45.457938000 -0800

+@@ -1555,10 +1555,12 @@

int32_t endPos,

UBool reverse) {

// Reset the old break cache first.

- uint32_t dictionaryCount = fDictionaryCharCount;

-+// uint32_t dictionaryCount = fDictionaryCharCount;

reset();

- if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {

+ // note: code segment below assumes that dictionary chars are in the

+ // startPos-endPos range

+ // value returned should be next character in sequence

-+// if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {

+ if ((endPos - startPos) <= 1) {

return (reverse ? startPos : endPos);

}

-@@ -1684,7 +1689,7 @@

+@@ -1711,7 +1713,7 @@

// proposed break by one of the breaks we found. Use following() and

// preceding() to do the work. They should never recurse in this case.

if (reverse) {

@@ -574,7 +534,7 @@

else {

return following(startPos);

--- source/common/triedict.cpp 2008-02-13 01:35:50.000000000 -0800

-+++ source/common/triedict.cpp 2009-07-27 13:01:17.718409000 -0700

++++ source/common/triedict.cpp 2011-01-21 14:12:45.271006000 -0800

@@ -20,6 +20,7 @@

#include "uvector.h"

#include "uvectr32.h"

@@ -613,7 +573,7 @@

-MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) {

+MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status,

-+ UBool containsValue /* = FALSE */ ) {

++ UBool containsValue /* = FALSE */ ) {

// Start the trie off with something. Having the root node already present

// cuts a special case out of the search/insertion functions.

// Making it a median character cuts the worse case for searches from

@@ -627,7 +587,7 @@

-MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) {

+MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status,

-+ UBool containsValue /* = false */ ) {

++ UBool containsValue /* = false */ ) {

fTrie = NULL;

fIter = utext_openUChars(NULL, NULL, 0, &status);

if (U_SUCCESS(status) && fIter == NULL) {

@@ -638,13 +598,23 @@

}

MutableTrieDictionary::~MutableTrieDictionary() {

-@@ -113,7 +130,8 @@

- int &count,

- int limit,

- TernaryNode *&parent,

+@@ -108,12 +125,13 @@

+ int32_t

+ MutableTrieDictionary::search( UText *text,

+- int32_t maxLength,

+- int32_t *lengths,

+- int &count,

+- int limit,

+- TernaryNode *&parent,

- UBool &pMatched ) const {

-+ UBool &pMatched,

-+ uint16_t *values /*=NULL*/) const {

++ int32_t maxLength,

++ int32_t *lengths,

++ int &count,

++ int limit,

++ TernaryNode *&parent,

++ UBool &pMatched,

++ uint16_t *values /*=NULL*/) const {

// TODO: current implementation works in UTF-16 space

const TernaryNode *up = NULL;

const TernaryNode *p = fTrie;

@@ -700,20 +670,31 @@

U_ASSERT(uc != U_SENTINEL);

TernaryNode *newNode = new TernaryNode(uc);

if (newNode == NULL) {

-@@ -199,7 +226,11 @@

+@@ -199,30 +226,23 @@

parent = newNode;

}

- parent->flags |= kEndsWord;

+-}

+-#if 0

+-void

+-MutableTrieDictionary::addWords( UEnumeration *words,

+- UErrorCode &status ) {

+- int32_t length;

+- const UChar *word;

+- while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) {

+- addWord(word, length, status);

+ if(fValued && value > 0){

+ parent->flags = value;

+ } else {

+ parent->flags |= kEndsWord;

-+ }

+ }

}

+-#endif

- #if 0

-@@ -219,10 +250,11 @@

+ int32_t

+ MutableTrieDictionary::matches( UText *text,

int32_t maxLength,

int32_t *lengths,

int &count,

@@ -727,7 +708,7 @@

}

// Implementation of iteration for MutableTrieDictionary

-@@ -277,7 +309,7 @@

+@@ -277,7 +297,7 @@

break;

}

case kEqual:

@@ -736,7 +717,7 @@

equal = (node->equal != NULL);

// If this node should be part of the next emitted string, append

// the UChar to the string, and make sure we pop it when we come

-@@ -299,7 +331,7 @@

+@@ -299,7 +319,7 @@

}

case kGreaterThan:

// If this node's character is in the string, remove it.

@@ -745,11 +726,12 @@

unistr.truncate(unistr.length()-1);

}

if (node->high != NULL) {

-@@ -354,12 +386,74 @@

+@@ -354,12 +374,75 @@

* CompactTrieDictionary

-+//TODO if time permits: minimise size of trie with logprobs by storing values

++//TODO further optimization:

++// minimise size of trie with logprobs by storing values

+// for terminal nodes directly in offsets[]

+// --> calculating from next offset *might* be simpler, but would have to add

+// one last offset for logprob of last node

@@ -821,7 +803,7 @@

};

// Note that to avoid platform-specific alignment issues, all members of the node

-@@ -375,10 +469,14 @@

+@@ -375,10 +458,14 @@

enum CompactTrieNodeFlags {

kVerticalNode = 0x1000, // This is a vertical node

kParentEndsWord = 0x2000, // The node whose equal link points to this ends a word

@@ -839,7 +821,7 @@

};

// The two node types are distinguished by the kVerticalNode flag.

-@@ -402,63 +500,177 @@

+@@ -402,63 +489,177 @@

uint16_t chars[1]; // Code units

};

@@ -914,23 +896,23 @@

CompactTrieDictionary::data() const {

- return fData;

+ return fInfo->address;

- }

--// This function finds the address of a node for us, given its node ID

++}

+//This function finds the address of a node for us, given its node ID

- static inline const CompactTrieNode *

--getCompactNode(const CompactTrieHeader *header, uint16_t node) {

-- return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);

++static inline const CompactTrieNode *

+getCompactNode(const CompactTrieInfo *info, uint32_t node) {

+ if(node < info->root-1) {

+ return (const CompactTrieNode *)(&info->offsets[node]);

+ } else {

+ return (const CompactTrieNode *)(info->address + info->offsets[node]);

+ }

-+}

+ }

+-// This function finds the address of a node for us, given its node ID

+//this version of getCompactNode is currently only used in compactMutableTrieDictionary()

-+static inline const CompactTrieNode *

+ static inline const CompactTrieNode *

+-getCompactNode(const CompactTrieHeader *header, uint16_t node) {

+- return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);

+getCompactNode(const CompactTrieHeader *header, uint32_t node) {

+ if(node < header->root-1) {

+ return (const CompactTrieNode *)(&header->offsets[node]);

@@ -1037,7 +1019,7 @@

}

int32_t

-@@ -466,17 +678,38 @@

+@@ -466,17 +667,38 @@

int32_t maxLength,

int32_t *lengths,

int &count,

@@ -1078,7 +1060,7 @@

lengths[mycount++] = i;

--limit;

}

-@@ -487,7 +720,7 @@

+@@ -487,7 +709,7 @@

break;

}

@@ -1087,7 +1069,7 @@

if (nodeCount == 0) {

// Special terminal node; return now

break;

-@@ -507,35 +740,27 @@

+@@ -507,35 +729,27 @@

// To get here we must have come through the whole list successfully;

// go on to the next node. Note that a word cannot end in the middle

// of a vertical node.

@@ -1137,7 +1119,7 @@

count = mycount;

return i;

}

-@@ -545,16 +770,16 @@

+@@ -545,16 +759,16 @@

private:

UVector32 fNodeStack; // Stack of nodes to process

UVector32 fIndexStack; // Stack of where in node we are

@@ -1158,7 +1140,7 @@

fIndexStack.push(0, status);

unistr.remove();

}

-@@ -564,14 +789,14 @@

+@@ -564,14 +778,14 @@

virtual StringEnumeration *clone() const {

UErrorCode status = U_ZERO_ERROR;

@@ -1175,7 +1157,7 @@

int32_t result = 0;

while (counter.snext(status) != NULL && U_SUCCESS(status)) {

++result;

-@@ -582,7 +807,7 @@

+@@ -582,7 +796,7 @@

virtual void reset(UErrorCode &status) {

fNodeStack.removeAllElements();

fIndexStack.removeAllElements();

@@ -1184,7 +1166,7 @@

fIndexStack.push(0, status);

unistr.remove();

}

-@@ -595,26 +820,34 @@

+@@ -595,26 +809,34 @@

if (fNodeStack.empty() || U_FAILURE(status)) {

return NULL;

}

@@ -1225,7 +1207,7 @@

where = fIndexStack.push(0, status);

goingDown = TRUE;

}

-@@ -623,7 +856,7 @@

+@@ -623,7 +845,7 @@

unistr.truncate(unistr.length()-nodeCount);

fNodeStack.popi();

fIndexStack.popi();

@@ -1234,7 +1216,7 @@

where = fIndexStack.peeki();

}

-@@ -638,7 +871,7 @@

+@@ -638,7 +860,7 @@

// Push on next node

unistr.append((UChar)hnode->entries[where].ch);

fIndexStack.setElementAt(where+1, fIndexStack.size()-1);

@@ -1243,7 +1225,7 @@

where = fIndexStack.push(0, status);

goingDown = TRUE;

}

-@@ -646,12 +879,14 @@

+@@ -646,12 +868,14 @@

// Going up

fNodeStack.popi();

fIndexStack.popi();

@@ -1259,7 +1241,7 @@

if (goingDown && (node->flagscount & kParentEndsWord)) {

return &unistr;

}

-@@ -664,7 +899,7 @@

+@@ -664,7 +888,7 @@

if (U_FAILURE(status)) {

return NULL;

}

@@ -1268,7 +1250,7 @@

}

-@@ -672,21 +907,36 @@

+@@ -672,21 +896,36 @@

// and back again

@@ -1311,7 +1293,7 @@

nodes.push(this, status);

}

-@@ -694,87 +944,225 @@

+@@ -694,87 +933,225 @@

}

virtual uint32_t size() {

@@ -1553,7 +1535,7 @@

}

void addChar(UChar ch) {

-@@ -784,60 +1172,85 @@

+@@ -784,60 +1161,85 @@

void setLink(BuildCompactTrieNode *node) {

fEqual = node;

}

@@ -1651,16 +1633,16 @@

}

result = vResult;

}

-@@ -849,19 +1262,28 @@

+@@ -849,19 +1251,28 @@

// Uses recursion.

static void walkHorizontal(const TernaryNode *node,

- BuildCompactTrieHorizontalNode *building,

- UStack &nodes,

- UErrorCode &status) {

-+ BuildCompactTrieHorizontalNode *building,

-+ UStack &nodes,

-+ UErrorCode &status, Hashtable *values = NULL) {

++ BuildCompactTrieHorizontalNode *building,

++ UStack &nodes,

++ UErrorCode &status, Hashtable *values = NULL) {

while (U_SUCCESS(status) && node != NULL) {

if (node->low != NULL) {

- walkHorizontal(node->low, building, nodes, status);

@@ -1687,7 +1669,7 @@

}

if (U_SUCCESS(status) && link != NULL) {

building->addNode(node->ch, link, status);

-@@ -881,13 +1303,15 @@

+@@ -881,13 +1292,15 @@

_sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) {

BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl;

BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr;

@@ -1705,7 +1687,7 @@

}

// Next, the "parent ends word" flag. If that differs, we cannot coalesce.

if (left->fParentEndsWord != right->fParentEndsWord) {

-@@ -898,12 +1322,19 @@

+@@ -898,12 +1311,19 @@

if (result != 0) {

return result;

}

@@ -1728,7 +1710,7 @@

// We need to compare the links vectors. They should be the

// same size because the strings were equal.

// We compare the node IDs instead of the pointers, to handle

-@@ -914,9 +1345,10 @@

+@@ -914,9 +1334,10 @@

int32_t count = hleft->fLinks.size();

for (int32_t i = 0; i < count && result == 0; ++i) {

result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID -

@@ -1740,7 +1722,7 @@

// If they are equal to each other, mark them (speeds coalescing)

if (result == 0) {

left->fHasDuplicate = TRUE;

-@@ -1031,20 +1463,25 @@

+@@ -1031,20 +1452,25 @@

// Add node 0, used as the NULL pointer/sentinel.

nodes.addElement((int32_t)0, status);

@@ -1770,7 +1752,7 @@

#ifdef DEBUG_TRIE_DICT

(void) ::times(&timing);

fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n",

-@@ -1077,21 +1514,37 @@

+@@ -1077,21 +1503,37 @@

return NULL;

}

@@ -1814,7 +1796,7 @@

status = U_ILLEGAL_ARGUMENT_ERROR;

return NULL;

}

-@@ -1111,9 +1564,14 @@

+@@ -1111,9 +1553,14 @@

status = U_MEMORY_ALLOCATION_ERROR;

return NULL;

}

@@ -1831,7 +1813,7 @@

header->nodeCount = nodeCount;

header->offsets[0] = 0; // Sentinel

header->root = translate.elementAti(root->fNodeID);

-@@ -1123,23 +1581,40 @@

+@@ -1123,23 +1570,40 @@

}

#endif

uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint32_t));

@@ -1875,7 +1857,7 @@

// Collect statistics on node types and sizes

int hCount = 0;

int vCount = 0;

-@@ -1148,68 +1623,85 @@

+@@ -1148,68 +1612,85 @@

size_t hItemCount = 0;

size_t vItemCount = 0;

uint32_t previousOff = offset;

@@ -1981,7 +1963,7 @@

if (nodeCount == 0 || U_FAILURE(status)) {

// Failure, or terminal node

return NULL;

-@@ -1234,29 +1726,41 @@

+@@ -1234,29 +1715,41 @@

previous = latest;

}

if (latest != NULL) {

@@ -2029,7 +2011,7 @@

if (U_FAILURE(status)) {

delete root; // Clean up

delete result;

-@@ -1270,8 +1774,8 @@

+@@ -1270,8 +1763,8 @@

U_CAPI int32_t U_EXPORT2

triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,

@@ -2040,7 +2022,7 @@

if (status == NULL || U_FAILURE(*status)) {

return 0;

}

-@@ -1286,14 +1790,14 @@

+@@ -1286,14 +1779,14 @@

const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4);

if(!( pInfo->dataFormat[0]==0x54 && /* dataFormat="TrDc" */

@@ -2062,7 +2044,7 @@

*status=U_UNSUPPORTED_ERROR;

return 0;

}

-@@ -1311,8 +1815,10 @@

+@@ -1311,8 +1804,10 @@

const uint8_t *inBytes =(const uint8_t *)inData+headerSize;

const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes;

@@ -2075,7 +2057,7 @@

{

udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n");

*status=U_UNSUPPORTED_ERROR;

-@@ -1333,10 +1839,10 @@

+@@ -1333,10 +1828,10 @@

if (length < sizeWithUData) {

udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",

@@ -2088,7 +2070,7 @@

// Swap the Data. Do the data itself first, then the CompactTrieHeader, because

-@@ -1355,20 +1861,38 @@

+@@ -1355,20 +1850,38 @@

}

// We need to loop through all the nodes in the offset table, and swap each one.

@@ -2133,7 +2115,7 @@

uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal);

ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal));

}

-@@ -1381,26 +1905,62 @@

+@@ -1381,26 +1894,62 @@

word = ds->readUInt16(inHNode->entries[j].equal);

ds->writeUInt16(&outHNode->entries[j].equal, word);

}

@@ -2209,7 +2191,7 @@

return sizeWithUData;

}

--- source/common/triedict.h 2006-06-06 15:38:49.000000000 -0700

-+++ source/common/triedict.h 2009-07-27 13:01:17.723390000 -0700

++++ source/common/triedict.h 2011-01-21 14:12:45.496927000 -0800

@@ -47,7 +47,6 @@

U_NAMESPACE_BEGIN

@@ -2448,27 +2430,22 @@

- /* TRIEDICT_H */

+/* TRIEDICT_H */

#endif

---- source/data/brkitr/brkfiles.mk 2009-04-21 15:42:37.000000000 -0700

-+++ source/data/brkitr/brkfiles.mk 2009-07-27 13:01:17.730379000 -0700

-@@ -34,13 +34,12 @@

+--- source/data/Makefile.in 2010-10-29 13:21:33.000000000 -0700

++++ source/data/Makefile.in 2011-01-26 16:24:24.856798000 -0800

+@@ -509,8 +520,9 @@

+ #################################################### CTD

+ # CTD FILES

+-$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)

+- $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<

++# .ctd file now generated regardless of whether dictionary file exists

++$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)

++ $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F).txt

- # List of compact trie dictionary files (ctd).

--BRK_CTD_SOURCE = thaidict.txt

-+BRK_CTD_SOURCE = thaidict.txt cjdict.txt

- # List of break iterator files (brk).

--BRK_SOURCE = word_POSIX.txt word_ja.txt sent_el.txt char_th.txt char.txt word.txt line.txt sent.txt title.txt

-+BRK_SOURCE = word_POSIX.txt sent_el.txt char_th.txt char.txt word.txt line.txt sent.txt title.txt

- # Ordinary resources

--BRK_RES_SOURCE = el.txt en.txt en_US.txt en_US_POSIX.txt ja.txt th.txt

-+BRK_RES_SOURCE = el.txt en.txt en_US.txt en_US_POSIX.txt th.txt

---- source/data/brkitr/root.txt 2009-06-24 14:06:38.000000000 -0700

-+++ source/data/brkitr/root.txt 2009-07-27 13:01:17.733382000 -0700

+ #################################################### CFU

+ # CFU FILES

+--- source/data/brkitr/root.txt 2010-07-28 17:18:28.000000000 -0700

++++ source/data/brkitr/root.txt 2011-01-21 14:12:45.653922000 -0800

@@ -17,5 +17,8 @@

}

dictionaries{

@@ -2478,173 +2455,8 @@

+ Kata:process(dependency){"cjdict.ctd"}

}

---- source/data/brkitr/word.txt 2009-06-24 14:06:38.000000000 -0700

-+++ source/data/brkitr/word.txt 2010-08-27 16:24:25.969372000 -0700

-@@ -29,29 +29,49 @@

- $Newline = [\p{Word_Break = Newline}];

- $Extend = [\p{Word_Break = Extend}];

- $Format = [\p{Word_Break = Format}];

-+$Hiragana = [:Hiragana:];

- $Katakana = [\p{Word_Break = Katakana}];

-+$Han = [:Han:];

- $ALetter = [\p{Word_Break = ALetter}];

--$MidNumLet = [\p{Word_Break = MidNumLet}];

-+# Remove two full stop characters from $MidNumLet and add them to $MidNum

-+# to break a hostname into its components at the cost of breaking

-+# 'e.g.' and 'i.e.' as well.

-+# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12.

-+# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected

-+# while rules 6/7 are reverted to the old behavior we want.

-+$MidNumLet = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];

- $MidLetter = [\p{Word_Break = MidLetter}];

--$MidNum = [\p{Word_Break = MidNum}];

--$Numeric = [\p{Word_Break = Numeric}];

-+$MidNum = [\p{Word_Break = MidNum}[\u002E \uFF0E]];

-+$Numeric = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits

- $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];

-+# Extra sets not to break 'HebrewLetter U+0022 HebrewLetter'.

-+$HebrewLet = [\p{Word_Break = ALetter} & \p{Script = Hebrew} - [\u05F3]];

-+# U+05F3 is ALetter and U+05F4 is MidLetter so that they're covered by

-+# the current rule 6/7.

-+$HebrewMidLet = [\u0022];

- # Dictionary character set, for triggering language-based break engines. Currently

--# limited to LineBreak=Complex_Context. Note that this set only works in Unicode

--# 5.0 or later as the definition of Complex_Context was corrected to include all

-+# limited to LineBreak=Complex_Context and CJK. Note that this set only works

-+# in Unicode 5.0 or later as the definition of Complex_Context was corrected to include all

- # characters requiring dictionary break.

--$dictionary = [:LineBreak = Complex_Context:];

- $Control = [\p{Grapheme_Cluster_Break = Control}];

--$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not

-- # include the dictionary characters.

-+$HangulSyllable = [\uac00-\ud7a3];

-+$ComplexContext = [:LineBreak = Complex_Context:];

-+$KanaKanji = [$Han $Hiragana $Katakana];

-+$dictionaryCJK = [$KanaKanji $HangulSyllable];

-+$dictionary = [$ComplexContext $dictionaryCJK];

-+# leave CJK scripts out of ALetterPlus

-+$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];

- #

- # Rules 4 Ignore Format and Extend characters,

- # except when they appear at the beginning of a region of text.

- #

-+# TODO: check if handling of katakana in dictionary makes rules incorrect/void.

- $KatakanaEx = $Katakana ($Extend | $Format)*;

- $ALetterEx = $ALetterPlus ($Extend | $Format)*;

- $MidNumLetEx = $MidNumLet ($Extend | $Format)*;

-@@ -59,8 +79,8 @@

- $MidNumEx = $MidNum ($Extend | $Format)*;

- $NumericEx = $Numeric ($Extend | $Format)*;

- $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;

-+$HebrewLetEx = $HebrewLet ($Extend | $Format)*;

--$Hiragana = [\p{script=Hiragana}];

- $Ideographic = [\p{Ideographic}];

- $HiraganaEx = $Hiragana ($Extend | $Format)*;

- $IdeographicEx = $Ideographic ($Extend | $Format)*;

-@@ -79,12 +99,14 @@

- # begins with a group of Format chars, or with a "word" consisting of a single

- # char that is not in any of the listed word break categories followed by

- # format char(s).

--[^$CR $LF $Newline]? ($Extend | $Format)+;

-+ # format char(s), or is not a CJK dictionary character.

-+[^$CR $LF $Newline $dictionaryCJK]? ($Extend | $Format)+;

- $NumericEx {100};

- $ALetterEx {200};

--$KatakanaEx {300}; # note: these status values override those from rule 5

--$HiraganaEx {300}; # by virtual of being numerically larger.

-+$HangulSyllable {200};

-+$KatakanaEx {400}; #originally 300

-+$HiraganaEx {400}; #originally 300

- $IdeographicEx {400}; #

- #

-@@ -96,6 +118,9 @@

- # rule 6 and 7

- $ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};

-+# Chrome addition

-+$HebrewLetEx $HebrewMidLet $HebrewLetEx {200};

- # rule 8

- $NumericEx $NumericEx {100};

-@@ -114,19 +139,25 @@

- # rule 13

--$KatakanaEx $KatakanaEx {300};

-+# To be consistent with '$KanaKanji $KanaKanji', changed

-+# from 300 to 400.

-+# See also TestRuleStatus in intltest/rbbiapts.cpp

-+$KatakanaEx $KatakanaEx {400};

- # rule 13a/b

- $ALetterEx $ExtendNumLetEx {200}; # (13a)

- $NumericEx $ExtendNumLetEx {100}; # (13a)

--$KatakanaEx $ExtendNumLetEx {300}; # (13a)

-+$KatakanaEx $ExtendNumLetEx {400}; # (13a)

- $ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)

- $ExtendNumLetEx $ALetterEx {200}; # (13b)

- $ExtendNumLetEx $NumericEx {100}; # (13b)

--$ExtendNumLetEx $KatakanaEx {300}; # (13b)

-+$ExtendNumLetEx $KatakanaEx {400}; # (13b)

-+# special handling for CJK characters: chain for later dictionary segmentation

-+$HangulSyllable $HangulSyllable {200};

-+$KanaKanji $KanaKanji {400}; #different rule status if both kanji and kana found

- ## -------------------------------------------------

-@@ -139,13 +170,15 @@

- $BackMidNumEx = ($Format | $Extend)* $MidNum;

- $BackMidLetterEx = ($Format | $Extend)* $MidLetter;

- $BackKatakanaEx = ($Format | $Extend)* $Katakana;

-+$BackHiraganaEx = ($Extend | $Format)* $Hiragana;

- $BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;

-+$BackHebrewLetEx = ($Format | $Extend)* $HebrewLet;

- # rule 3

- $LF $CR;

- # rule 4

--($Format | $Extend)* [^$CR $LF $Newline]?;

-+($Format | $Extend)* [^$CR $LF $Newline $dictionaryCJK]?;

- # rule 5

-@@ -155,6 +188,8 @@

- $BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;

-+# Chrome addition

-+$BackHebrewLetEx $HebrewMidLet $BackHebrewLetEx;

- # rule 8

-@@ -181,6 +216,10 @@

- $BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);

- ($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;

-+# special handling for CJK characters: chain for later dictionary segmentation

-+$HangulSyllable $HangulSyllable;

-+$KanaKanji $KanaKanji; #different rule status if both kanji and kana found

- ## -------------------------------------------------

- !!safe_reverse;

---- source/data/xml/brkitr/root.xml 2007-08-28 23:10:43.000000000 -0700

-+++ source/data/xml/brkitr/root.xml 2009-07-27 13:01:17.746367000 -0700

+--- source/data/xml/brkitr/root.xml 2010-03-01 15:13:18.000000000 -0800

++++ source/data/xml/brkitr/root.xml 2011-01-21 14:12:45.735922000 -0800

@@ -25,6 +25,9 @@

</icu:boundaries>

<icu:dictionaries>

@@ -2655,9 +2467,9 @@

</icu:dictionaries>

</icu:breakIteratorData>

</special>

---- source/test/cintltst/creststn.c 2009-06-26 09:49:55.000000000 -0700

-+++ source/test/cintltst/creststn.c 2009-07-29 12:46:05.997405000 -0700

-@@ -2181,21 +2181,21 @@

+--- source/test/cintltst/creststn.c 2010-10-28 10:44:02.000000000 -0700

++++ source/test/cintltst/creststn.c 2011-01-21 14:12:44.995020000 -0800

+@@ -2188,21 +2188,21 @@

{

@@ -2684,13 +2496,8 @@

status = U_ZERO_ERROR;

}

/* simple alias */

-@@ -3024,4 +3024,3 @@

- }

---- source/test/intltest/rbbiapts.cpp 2009-06-26 09:49:55.000000000 -0700

-+++ source/test/intltest/rbbiapts.cpp 2009-07-28 13:56:30.208042000 -0700

+--- source/test/intltest/rbbiapts.cpp 2010-07-12 11:03:29.000000000 -0700

++++ source/test/intltest/rbbiapts.cpp 2011-01-21 14:12:45.033014000 -0800

@@ -156,9 +156,13 @@

if(*a!=*b){

errln("Failed: boilerplate method operator!= does not return correct results");

@@ -2716,7 +2523,7 @@

}

void RBBIAPITest::TestgetRules()

-@@ -643,21 +648,21 @@

+@@ -635,21 +640,21 @@

void RBBIAPITest::TestRuleStatus() {

UChar str[30];

@@ -2746,7 +2553,7 @@

UErrorCode status=U_ZERO_ERROR;

-@@ -896,9 +901,11 @@

+@@ -888,9 +893,11 @@

URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD, status);

{

@@ -2758,9 +2565,9 @@

}

{

---- source/test/intltest/rbbitst.cpp 2009-06-26 09:49:55.000000000 -0700

-+++ source/test/intltest/rbbitst.cpp 2009-07-28 15:35:18.933226000 -0700

-@@ -33,6 +33,8 @@

+--- source/test/intltest/rbbitst.cpp 2010-10-08 18:23:28.000000000 -0700

++++ source/test/intltest/rbbitst.cpp 2011-01-21 14:12:45.180030000 -0800

+@@ -35,6 +35,8 @@

#include <string.h>

#include <stdio.h>

#include <stdlib.h>

@@ -2769,25 +2576,32 @@

#define TEST_ASSERT(x) {if (!(x)) { \

errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

-@@ -108,6 +110,8 @@

+@@ -138,11 +140,13 @@

if (exec) TestThaiBreaks(); break;

case 23: name = "TestTailoredBreaks";

if (exec) TestTailoredBreaks(); break;

+ case 24: name = "TestTrieDictWithValue";

+ if(exec) TestTrieDictWithValue(); break;

+ #else

+- case 21: case 22: case 23: name = "skip";

++ case 21: case 22: case 23: case 24: name = "skip";

+ break;

+ #endif

+- case 24: name = "TestDictRules";

++ case 25: name = "TestDictRules";

+ if (exec) TestDictRules(); break;

+ case 25: name = "TestBug5532";

+ if (exec) TestBug5532(); break;

+@@ -607,6 +611,8 @@

- default: name = ""; break; //needed to end loop

- }

-@@ -570,6 +574,8 @@

void RBBITest::TestJapaneseWordBreak() {

+// TODO: Rewrite this test for a dictionary-based word breaking.

+#if 0

UErrorCode status = U_ZERO_ERROR;

BITestData japaneseWordSelection(status);

-@@ -591,6 +597,7 @@

+@@ -628,6 +634,7 @@

generalIteratorTest(*e, japaneseWordSelection);

delete e;

@@ -2795,7 +2609,7 @@

}

void RBBITest::TestTrieDict() {

-@@ -812,6 +819,372 @@

+@@ -849,6 +856,372 @@

delete compact2;

}

@@ -3168,7 +2982,7 @@

//----------------------------------------------------------------------------

-@@ -1832,8 +2205,15 @@

+@@ -1870,8 +2243,15 @@

// Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).

static const char jaWordText[] = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"

"\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";

@@ -3184,7 +2998,7 @@

// UBreakIteratorType UBRK_SENTENCE, Locale "el"

// Add break after Greek question mark (cldrbug #2069).

-@@ -2580,6 +2960,8 @@

+@@ -2672,6 +3052,8 @@

UnicodeSet *fNewlineSet;

UnicodeSet *fKatakanaSet;

UnicodeSet *fALetterSet;

@@ -3193,7 +3007,7 @@

UnicodeSet *fMidNumLetSet;

UnicodeSet *fMidLetterSet;

UnicodeSet *fMidNumSet;

-@@ -2588,6 +2970,7 @@

+@@ -2680,6 +3062,7 @@

UnicodeSet *fOtherSet;

UnicodeSet *fExtendSet;

UnicodeSet *fExtendNumLetSet;

@@ -3201,7 +3015,7 @@

RegexMatcher *fMatcher;

-@@ -2604,12 +2987,24 @@

+@@ -2696,12 +3079,24 @@

fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);

fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);

fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);

@@ -3228,7 +3042,7 @@

fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);

fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);

fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);

-@@ -2633,13 +3028,14 @@

+@@ -2725,13 +3120,14 @@

fOtherSet->removeAll(*fFormatSet);

fOtherSet->removeAll(*fExtendSet);

// Inhibit dictionary characters from being tested at all.

@@ -3244,7 +3058,7 @@

fSets->addElement(fMidLetterSet, status);

fSets->addElement(fMidNumLetSet, status);

fSets->addElement(fMidNumSet, status);

-@@ -3871,6 +4267,7 @@

+@@ -3978,6 +4374,7 @@

for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {

count --;

if (forward[count] != i) {

@@ -3252,7 +3066,7 @@

test->errln("happy break test previous() failed: expected %d but got %d",

forward[count], i);

break;

-@@ -3904,23 +4301,25 @@

+@@ -4011,23 +4408,25 @@

UErrorCode status = U_ZERO_ERROR;

// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);

BreakIterator *bi = BreakIterator::createWordInstance(locale, status);

@@ -3282,7 +3096,7 @@

"\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",

"\\u0027\\u11af\\U000e0057\\u0602",

"\\U0001d7f2\\U000e007\\u0004\\u0589",

-@@ -3932,7 +4331,7 @@

+@@ -4039,7 +4438,7 @@

"\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",

"\\u0233\\U000e0020\\u0a69\\u0d6a",

"\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",

@@ -3291,7 +3105,7 @@

"\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",

"\\ua183\\u102d\\u0bec\\u003a",

"\\u17e8\\u06e7\\u002e\\u096d\\u003b",

-@@ -3942,7 +4341,7 @@

+@@ -4049,7 +4448,7 @@

"\\U000e005d\\u2044\\u0731\\u0650\\u0061",

"\\u003a\\u0664\\u00b7\\u1fba",

"\\u003b\\u0027\\u00b7\\u47a3",

@@ -3300,7 +3114,7 @@

"\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",

"\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",

};

-@@ -3997,12 +4396,12 @@

+@@ -4104,12 +4503,12 @@

"\\U0001d7f2\\U000e007d\\u0004\\u0589",

"\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",

"\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",

@@ -3315,19 +3129,19 @@

"\\ua183\\u102d\\u0bec\\u003a",

"\\u17e8\\u06e7\\u002e\\u096d\\u003b",

"\\u003a\\u0e57\\u0fad\\u002e",

---- source/test/intltest/rbbitst.h 2009-04-22 00:53:50.000000000 -0700

-+++ source/test/intltest/rbbitst.h 2009-07-27 13:01:17.767342000 -0700

+--- source/test/intltest/rbbitst.h 2010-07-22 17:15:37.000000000 -0700

++++ source/test/intltest/rbbitst.h 2011-01-21 14:12:45.152007000 -0800

@@ -70,6 +70,7 @@

void TestBug5775();

void TestThaiBreaks();

void TestTailoredBreaks();

+ void TestTrieDictWithValue();

+ void TestDictRules();

+ void TestBug5532();

- void TestDebug();

---- source/test/testdata/rbbitst.txt 2009-06-24 14:06:38.000000000 -0700

-+++ source/test/testdata/rbbitst.txt 2009-07-29 12:56:31.483710000 -0700

-@@ -162,7 +162,23 @@

+--- source/test/testdata/rbbitst.txt 2010-07-28 17:18:28.000000000 -0700

++++ source/test/testdata/rbbitst.txt 2011-01-21 14:12:45.221011000 -0800

+@@ -161,7 +161,23 @@

# Hiragana & Katakana stay together, but separates from each other and Latin.

@@ -3352,7 +3166,7 @@

# Words with interior formatting characters

<data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data>

-@@ -170,6 +186,8 @@

+@@ -169,6 +185,8 @@

# to test for bug #4097779

<data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>

@@ -3361,7 +3175,7 @@

# to test for bug #4098467

# What follows is a string of Korean characters (I found it in the Yellow Pages

-@@ -179,9 +197,15 @@

+@@ -178,9 +196,15 @@

# precomposed syllables...

@@ -3379,7 +3193,7 @@

# Try some words from other scripts.

-@@ -492,8 +516,7 @@

+@@ -491,8 +515,7 @@

# conjoining jamo...

@@ -3389,8 +3203,8 @@

# to test for bug #4117554: Fullwidth .!? should be treated as postJwrd

---- source/test/testdata/testaliases.txt 2009-06-24 14:06:38.000000000 -0700

-+++ source/test/testdata/testaliases.txt 2009-07-28 17:07:26.251120000 -0700

+--- source/test/testdata/testaliases.txt 2009-11-12 13:53:42.000000000 -0800

++++ source/test/testdata/testaliases.txt 2011-01-21 14:12:45.204005000 -0800

@@ -28,7 +28,7 @@

LocaleScript:alias { "/ICUDATA/ja/LocaleScript" }

@@ -3400,13 +3214,13 @@

// aliasing arrays

zoneTests {

---- source/tools/genctd/genctd.cpp 2006-09-04 09:28:24.000000000 -0700

-+++ source/tools/genctd/genctd.cpp 2009-07-27 13:01:17.776335000 -0700

+--- source/tools/genctd/genctd.cpp 2009-08-04 14:09:17.000000000 -0700

++++ source/tools/genctd/genctd.cpp 2011-01-21 14:12:45.564923000 -0800

@@ -1,6 +1,6 @@

**********************************************************************

@@ -3426,7 +3240,7 @@

#include <stdio.h>

#include <stdlib.h>

-@@ -198,147 +201,191 @@

+@@ -199,147 +202,191 @@

long wordFileSize;

FILE *file;

char *wordBufferC;

@@ -3748,13 +3562,13 @@

// Now, create a CompactTrieDictionary from the mutable dictionary

CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);

if (U_FAILURE(status)) {

-@@ -392,4 +439,3 @@

+@@ -393,4 +440,3 @@

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

}

--- source/tools/genctd/Makefile.in 2006-12-16 13:07:01.000000000 -0800

-+++ source/tools/genctd/Makefile.in 2009-07-27 13:01:17.782326000 -0700

++++ source/tools/genctd/Makefile.in 2011-01-21 14:12:45.555920000 -0800

@@ -23,13 +23,13 @@

## Extra files to remove for 'make clean'

CLEANFILES = *~ $(DEPS) $(MAN_FILES)

@@ -3771,17 +3585,3 @@

LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)

OBJECTS = genctd.o

---- source/data/Makefile.in 2009-05-20 23:03:54.000000000 -0700

-+++ source/data/Makefile.in 2009-10-21 15:43:18.235201000 -0700

-@@ -452,8 +452,9 @@

- #################################################### CTD

- # CTD FILES

--$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)

-- $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<

-+# .ctd file now generated regardless of whether dictionary file exists

-+$(BRKBLDDIR)/%.ctd: $(TOOLBINDIR)/genctd$(TOOLEXEEXT) $(DAT_FILES)

-+ $(INVOKE) $(TOOLBINDIR)/genctd -c -i $(BUILDDIR) -o $@ $(BRKSRCDIR)/$(*F).txt

- #################################################### CFU

- # CFU FILES

« no previous file with comments | « no previous file | icu46/source/common/brkeng.cpp » ('j') | no next file with comments »