icu46/source/common/normlzr.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/normlzr.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 *************************************************************************

	3 * COPYRIGHT:

	4 * Copyright (c) 1996-2010, International Business Machines Corporation and

	5 * others. All Rights Reserved.

	6 *************************************************************************

	7 */

	8

	9 #include "unicode/utypes.h"

	10

	11 #if !UCONFIG_NO_NORMALIZATION

	12

	13 #include "unicode/uniset.h"

	14 #include "unicode/unistr.h"

	15 #include "unicode/chariter.h"

	16 #include "unicode/schriter.h"

	17 #include "unicode/uchriter.h"

	18 #include "unicode/normlzr.h"

	19 #include "cmemory.h"

	20 #include "normalizer2impl.h"

	21 #include "uprops.h" // for uniset_getUnicode32Instance()

	22

	23 U_NAMESPACE_BEGIN

	24

	25 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)

	26

	27 //-------------------------------------------------------------------------

	28 // Constructors and other boilerplate

	29 //-------------------------------------------------------------------------

	30

	31 Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :

	32 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),

	33 text(new StringCharacterIterator(str)),

	34 currentIndex(0), nextIndex(0),

	35 buffer(), bufferPos(0)

	36 {

	37 init();

	38 }

	39

	40 Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode ) :

	41 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),

	42 text(new UCharCharacterIterator(str, length)),

	43 currentIndex(0), nextIndex(0),

	44 buffer(), bufferPos(0)

	45 {

	46 init();

	47 }

	48

	49 Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :

	50 UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),

	51 text(iter.clone()),

	52 currentIndex(0), nextIndex(0),

	53 buffer(), bufferPos(0)

	54 {

	55 init();

	56 }

	57

	58 Normalizer::Normalizer(const Normalizer &copy) :

	59 UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOpt ions(copy.fOptions),

	60 text(copy.text->clone()),

	61 currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),

	62 buffer(copy.buffer), bufferPos(copy.bufferPos)

	63 {

	64 init();

	65 }

	66

	67 static const UChar _NUL=0;

	68

	69 void

	70 Normalizer::init() {

	71 UErrorCode errorCode=U_ZERO_ERROR;

	72 fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);

	73 if(fOptions&UNORM_UNICODE_3_2) {

	74 delete fFilteredNorm2;

	75 fNorm2=fFilteredNorm2=

	76 new FilteredNormalizer2(fNorm2, uniset_getUnicode32Instance(errorC ode));

	77 }

	78 if(U_FAILURE(errorCode)) {

	79 errorCode=U_ZERO_ERROR;

	80 fNorm2=Normalizer2Factory::getNoopInstance(errorCode);

	81 }

	82 }

	83

	84 Normalizer::~Normalizer()

	85 {

	86 delete fFilteredNorm2;

	87 delete text;

	88 }

	89

	90 Normalizer*

	91 Normalizer::clone() const

	92 {

	93 return new Normalizer(*this);

	94 }

	95

	96 /**

	97 * Generates a hash code for this iterator.

	98 */

	99 int32_t Normalizer::hashCode() const

	100 {

	101 return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;

	102 }

	103

	104 UBool Normalizer::operator==(const Normalizer& that) const

	105 {

	106 return

	107 this==&that \|\|

	108 (fUMode==that.fUMode &&

	109 fOptions==that.fOptions &&

	110 text==that.text &&

	111 buffer==that.buffer &&

	112 bufferPos==that.bufferPos &&

	113 nextIndex==that.nextIndex);

	114 }

	115

	116 //-------------------------------------------------------------------------

	117 // Static utility methods

	118 //-------------------------------------------------------------------------

	119

	120 void U_EXPORT2

	121 Normalizer::normalize(const UnicodeString& source,

	122 UNormalizationMode mode, int32_t options,

	123 UnicodeString& result,

	124 UErrorCode &status) {

	125 if(source.isBogus() \|\| U_FAILURE(status)) {

	126 result.setToBogus();

	127 if(U_SUCCESS(status)) {

	128 status=U_ILLEGAL_ARGUMENT_ERROR;

	129 }

	130 } else {

	131 UnicodeString localDest;

	132 UnicodeString *dest;

	133

	134 if(&source!=&result) {

	135 dest=&result;

	136 } else {

	137 // the source and result strings are the same object, use a temporar y one

	138 dest=&localDest;

	139 }

	140 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);

	141 if(U_SUCCESS(status)) {

	142 if(options&UNORM_UNICODE_3_2) {

	143 FilteredNormalizer2(n2, uniset_getUnicode32Instance(status)).

	144 normalize(source, *dest, status);

	145 } else {

	146 n2->normalize(source, *dest, status);

	147 }

	148 }

	149 if(dest==&localDest && U_SUCCESS(status)) {

	150 result=*dest;

	151 }

	152 }

	153 }

	154

	155 void U_EXPORT2

	156 Normalizer::compose(const UnicodeString& source,

	157 UBool compat, int32_t options,

	158 UnicodeString& result,

	159 UErrorCode &status) {

	160 normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);

	161 }

	162

	163 void U_EXPORT2

	164 Normalizer::decompose(const UnicodeString& source,

	165 UBool compat, int32_t options,

	166 UnicodeString& result,

	167 UErrorCode &status) {

	168 normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);

	169 }

	170

	171 UNormalizationCheckResult

	172 Normalizer::quickCheck(const UnicodeString& source,

	173 UNormalizationMode mode, int32_t options,

	174 UErrorCode &status) {

	175 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);

	176 if(U_SUCCESS(status)) {

	177 if(options&UNORM_UNICODE_3_2) {

	178 return FilteredNormalizer2(n2, uniset_getUnicode32Instance(status) ).

	179 quickCheck(source, status);

	180 } else {

	181 return n2->quickCheck(source, status);

	182 }

	183 } else {

	184 return UNORM_MAYBE;

	185 }

	186 }

	187

	188 UBool

	189 Normalizer::isNormalized(const UnicodeString& source,

	190 UNormalizationMode mode, int32_t options,

	191 UErrorCode &status) {

	192 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);

	193 if(U_SUCCESS(status)) {

	194 if(options&UNORM_UNICODE_3_2) {

	195 return FilteredNormalizer2(n2, uniset_getUnicode32Instance(status) ).

	196 isNormalized(source, status);

	197 } else {

	198 return n2->isNormalized(source, status);

	199 }

	200 } else {

	201 return FALSE;

	202 }

	203 }

	204

	205 UnicodeString & U_EXPORT2

	206 Normalizer::concatenate(UnicodeString &left, UnicodeString &right,

	207 UnicodeString &result,

	208 UNormalizationMode mode, int32_t options,

	209 UErrorCode &errorCode) {

	210 if(left.isBogus() \|\| right.isBogus() \|\| U_FAILURE(errorCode)) {

	211 result.setToBogus();

	212 if(U_SUCCESS(errorCode)) {

	213 errorCode=U_ILLEGAL_ARGUMENT_ERROR;

	214 }

	215 } else {

	216 UnicodeString localDest;

	217 UnicodeString *dest;

	218

	219 if(&right!=&result) {

	220 dest=&result;

	221 } else {

	222 // the right and result strings are the same object, use a temporary one

	223 dest=&localDest;

	224 }

	225 *dest=left;

	226 const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);

	227 if(U_SUCCESS(errorCode)) {

	228 if(options&UNORM_UNICODE_3_2) {

	229 FilteredNormalizer2(n2, uniset_getUnicode32Instance(errorCode) ).

	230 append(*dest, right, errorCode);

	231 } else {

	232 n2->append(*dest, right, errorCode);

	233 }

	234 }

	235 if(dest==&localDest && U_SUCCESS(errorCode)) {

	236 result=*dest;

	237 }

	238 }

	239 return result;

	240 }

	241

	242 //-------------------------------------------------------------------------

	243 // Iteration API

	244 //-------------------------------------------------------------------------

	245

	246 /**

	247 * Return the current character in the normalized text.

	248 */

	249 UChar32 Normalizer::current() {

	250 if(bufferPos<buffer.length() \|\| nextNormalize()) {

	251 return buffer.char32At(bufferPos);

	252 } else {

	253 return DONE;

	254 }

	255 }

	256

	257 /**

	258 * Return the next character in the normalized text and advance

	259 * the iteration position by one. If the end

	260 * of the text has already been reached, {@link #DONE} is returned.

	261 */

	262 UChar32 Normalizer::next() {

	263 if(bufferPos<buffer.length() \|\| nextNormalize()) {

	264 UChar32 c=buffer.char32At(bufferPos);

	265 bufferPos+=UTF_CHAR_LENGTH(c);

	266 return c;

	267 } else {

	268 return DONE;

	269 }

	270 }

	271

	272 /**

	273 * Return the previous character in the normalized text and decrement

	274 * the iteration position by one. If the beginning

	275 * of the text has already been reached, {@link #DONE} is returned.

	276 */

	277 UChar32 Normalizer::previous() {

	278 if(bufferPos>0 \|\| previousNormalize()) {

	279 UChar32 c=buffer.char32At(bufferPos-1);

	280 bufferPos-=UTF_CHAR_LENGTH(c);

	281 return c;

	282 } else {

	283 return DONE;

	284 }

	285 }

	286

	287 void Normalizer::reset() {

	288 currentIndex=nextIndex=text->setToStart();

	289 clearBuffer();

	290 }

	291

	292 void

	293 Normalizer::setIndexOnly(int32_t index) {

	294 text->setIndex(index); // pins index

	295 currentIndex=nextIndex=text->getIndex();

	296 clearBuffer();

	297 }

	298

	299 /**

	300 * Return the first character in the normalized text. This resets

	301 * the <tt>Normalizer's</tt> position to the beginning of the text.

	302 */

	303 UChar32 Normalizer::first() {

	304 reset();

	305 return next();

	306 }

	307

	308 /**

	309 * Return the last character in the normalized text. This resets

	310 * the <tt>Normalizer's</tt> position to be just before the

	311 * the input text corresponding to that normalized character.

	312 */

	313 UChar32 Normalizer::last() {

	314 currentIndex=nextIndex=text->setToEnd();

	315 clearBuffer();

	316 return previous();

	317 }

	318

	319 /**

	320 * Retrieve the current iteration position in the input text that is

	321 * being normalized. This method is useful in applications such as

	322 * searching, where you need to be able to determine the position in

	323 * the input text that corresponds to a given normalized output character.

	324 * <p>

	325 * <b>Note:</b> This method sets the position in the <em>input</em>, while

	326 * {@link #next} and {@link #previous} iterate through characters in the

	327 * <em>output</em>. This means that there is not necessarily a one-to-one

	328 * correspondence between characters returned by <tt>next</tt> and

	329 * <tt>previous</tt> and the indices passed to and returned from

	330 * <tt>setIndex</tt> and {@link #getIndex}.

	331 *

	332 */

	333 int32_t Normalizer::getIndex() const {

	334 if(bufferPos<buffer.length()) {

	335 return currentIndex;

	336 } else {

	337 return nextIndex;

	338 }

	339 }

	340

	341 /**

	342 * Retrieve the index of the start of the input text. This is the begin index

	343 * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt >

	344 * over which this <tt>Normalizer</tt> is iterating

	345 */

	346 int32_t Normalizer::startIndex() const {

	347 return text->startIndex();

	348 }

	349

	350 /**

	351 * Retrieve the index of the end of the input text. This is the end index

	352 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>

	353 * over which this <tt>Normalizer</tt> is iterating

	354 */

	355 int32_t Normalizer::endIndex() const {

	356 return text->endIndex();

	357 }

	358

	359 //-------------------------------------------------------------------------

	360 // Property access methods

	361 //-------------------------------------------------------------------------

	362

	363 void

	364 Normalizer::setMode(UNormalizationMode newMode)

	365 {

	366 fUMode = newMode;

	367 init();

	368 }

	369

	370 UNormalizationMode

	371 Normalizer::getUMode() const

	372 {

	373 return fUMode;

	374 }

	375

	376 void

	377 Normalizer::setOption(int32_t option,

	378 UBool value)

	379 {

	380 if (value) {

	381 fOptions \|= option;

	382 } else {

	383 fOptions &= (~option);

	384 }

	385 init();

	386 }

	387

	388 UBool

	389 Normalizer::getOption(int32_t option) const

	390 {

	391 return (fOptions & option) != 0;

	392 }

	393

	394 /**

	395 * Set the input text over which this <tt>Normalizer</tt> will iterate.

	396 * The iteration position is set to the beginning of the input text.

	397 */

	398 void

	399 Normalizer::setText(const UnicodeString& newText,

	400 UErrorCode &status)

	401 {

	402 if (U_FAILURE(status)) {

	403 return;

	404 }

	405 CharacterIterator *newIter = new StringCharacterIterator(newText);

	406 if (newIter == NULL) {

	407 status = U_MEMORY_ALLOCATION_ERROR;

	408 return;

	409 }

	410 delete text;

	411 text = newIter;

	412 reset();

	413 }

	414

	415 /**

	416 * Set the input text over which this <tt>Normalizer</tt> will iterate.

	417 * The iteration position is set to the beginning of the string.

	418 */

	419 void

	420 Normalizer::setText(const CharacterIterator& newText,

	421 UErrorCode &status)

	422 {

	423 if (U_FAILURE(status)) {

	424 return;

	425 }

	426 CharacterIterator *newIter = newText.clone();

	427 if (newIter == NULL) {

	428 status = U_MEMORY_ALLOCATION_ERROR;

	429 return;

	430 }

	431 delete text;

	432 text = newIter;

	433 reset();

	434 }

	435

	436 void

	437 Normalizer::setText(const UChar* newText,

	438 int32_t length,

	439 UErrorCode &status)

	440 {

	441 if (U_FAILURE(status)) {

	442 return;

	443 }

	444 CharacterIterator *newIter = new UCharCharacterIterator(newText, length);

	445 if (newIter == NULL) {

	446 status = U_MEMORY_ALLOCATION_ERROR;

	447 return;

	448 }

	449 delete text;

	450 text = newIter;

	451 reset();

	452 }

	453

	454 /**

	455 * Copies the text under iteration into the UnicodeString referred to by "result ".

	456 * @param result Receives a copy of the text under iteration.

	457 */

	458 void

	459 Normalizer::getText(UnicodeString& result)

	460 {

	461 text->getText(result);

	462 }

	463

	464 //-------------------------------------------------------------------------

	465 // Private utility methods

	466 //-------------------------------------------------------------------------

	467

	468 void Normalizer::clearBuffer() {

	469 buffer.remove();

	470 bufferPos=0;

	471 }

	472

	473 UBool

	474 Normalizer::nextNormalize() {

	475 clearBuffer();

	476 currentIndex=nextIndex;

	477 text->setIndex(nextIndex);

	478 if(!text->hasNext()) {

	479 return FALSE;

	480 }

	481 // Skip at least one character so we make progress.

	482 UnicodeString segment(text->next32PostInc());

	483 while(text->hasNext()) {

	484 UChar32 c;

	485 if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {

	486 text->move32(-1, CharacterIterator::kCurrent);

	487 break;

	488 }

	489 segment.append(c);

	490 }

	491 nextIndex=text->getIndex();

	492 UErrorCode errorCode=U_ZERO_ERROR;

	493 fNorm2->normalize(segment, buffer, errorCode);

	494 return U_SUCCESS(errorCode) && !buffer.isEmpty();

	495 }

	496

	497 UBool

	498 Normalizer::previousNormalize() {

	499 clearBuffer();

	500 nextIndex=currentIndex;

	501 text->setIndex(currentIndex);

	502 if(!text->hasPrevious()) {

	503 return FALSE;

	504 }

	505 UnicodeString segment;

	506 while(text->hasPrevious()) {

	507 UChar32 c=text->previous32();

	508 segment.insert(0, c);

	509 if(fNorm2->hasBoundaryBefore(c)) {

	510 break;

	511 }

	512 }

	513 currentIndex=text->getIndex();

	514 UErrorCode errorCode=U_ZERO_ERROR;

	515 fNorm2->normalize(segment, buffer, errorCode);

	516 bufferPos=buffer.length();

	517 return U_SUCCESS(errorCode) && !buffer.isEmpty();

	518 }

	519

	520 U_NAMESPACE_END

	521

	522 #endif /* #if !UCONFIG_NO_NORMALIZATION */

OLD	NEW

« no previous file with comments | « icu46/source/common/normalizer2impl.cpp ('k') | icu46/source/common/parsepos.cpp » ('j') | no next file with comments »