icu46/source/common/uts46.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/uts46.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 *******************************************************************************

	3 * Copyright (C) 2010, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 *******************************************************************************

	6 * file name: uts46.cpp

	7 * encoding: US-ASCII

	8 * tab size: 8 (not used)

	9 * indentation:4

	10 *

	11 * created on: 2010mar09

	12 * created by: Markus W. Scherer

	13 */

	14

	15 #include "unicode/utypes.h"

	16

	17 #if !UCONFIG_NO_IDNA

	18

	19 #include "unicode/idna.h"

	20 #include "unicode/normalizer2.h"

	21 #include "unicode/ustring.h"

	22 #include "cmemory.h"

	23 #include "cstring.h"

	24 #include "punycode.h"

	25 #include "ustr_imp.h"

	26

	// Number of elements in a C array, as an int32_t.
	// The whole expansion is parenthesized so that the cast always applies to the
	// complete quotient and the macro behaves like a single primary expression.
	#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))

	28

	29 // Note about tests for UIDNA_ERROR_DOMAIN_NAME_TOO_LONG:

	30 //

	31 // The domain name length limit is 255 octets in an internal DNS representation

	32 // where the last ("root") label is the empty label

	33 // represented by length byte 0 alone.

	34 // In a conventional string, this translates to 253 characters, or 254

	35 // if there is a trailing dot for the root label.

	36

	37 U_NAMESPACE_BEGIN

	38

	39 // Severe errors which usually result in a U+FFFD replacement character in the r esult string.

	40 const uint32_t severeErrors=

	41 UIDNA_ERROR_LEADING_COMBINING_MARK\|

	42 UIDNA_ERROR_DISALLOWED\|

	43 UIDNA_ERROR_PUNYCODE\|

	44 UIDNA_ERROR_LABEL_HAS_DOT\|

	45 UIDNA_ERROR_INVALID_ACE_LABEL;

	46

	47 static inline UBool

	48 isASCIIString(const UnicodeString &dest) {

	49 const UChar *s=dest.getBuffer();

	50 const UChar *limit=s+dest.length();

	51 while(s<limit) {

	52 if(*s++>0x7f) {

	53 return FALSE;

	54 }

	55 }

	56 return TRUE;

	57 }

	58

	59 static UBool

	60 isASCIIOkBiDi(const UChar *s, int32_t length);

	61

	62 static UBool

	63 isASCIIOkBiDi(const char *s, int32_t length);

	64

	65 // IDNA class default implementations -------------------------------------- ***

	66

	67 void

	68 IDNA::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest,

	69 IDNAInfo &info, UErrorCode &errorCode) const {

	70 if(U_SUCCESS(errorCode)) {

	71 UnicodeString destString;

	72 labelToASCII(UnicodeString::fromUTF8(label), destString,

	73 info, errorCode).toUTF8(dest);

	74 }

	75 }

	76

	77 void

	78 IDNA::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest,

	79 IDNAInfo &info, UErrorCode &errorCode) const {

	80 if(U_SUCCESS(errorCode)) {

	81 UnicodeString destString;

	82 labelToUnicode(UnicodeString::fromUTF8(label), destString,

	83 info, errorCode).toUTF8(dest);

	84 }

	85 }

	86

	87 void

	88 IDNA::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest,

	89 IDNAInfo &info, UErrorCode &errorCode) const {

	90 if(U_SUCCESS(errorCode)) {

	91 UnicodeString destString;

	92 nameToASCII(UnicodeString::fromUTF8(name), destString,

	93 info, errorCode).toUTF8(dest);

	94 }

	95 }

	96

	97 void

	98 IDNA::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest,

	99 IDNAInfo &info, UErrorCode &errorCode) const {

	100 if(U_SUCCESS(errorCode)) {

	101 UnicodeString destString;

	102 nameToUnicode(UnicodeString::fromUTF8(name), destString,

	103 info, errorCode).toUTF8(dest);

	104 }

	105 }

	106

	107 UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(IDNA)

	108

	109 // UTS46 class declaration ------------------------------------------------- ***

	110

	111 class UTS46 : public IDNA {

	112 public:

	113 UTS46(uint32_t options, UErrorCode &errorCode);

	114 virtual ~UTS46();

	115

	116 virtual UnicodeString &

	117 labelToASCII(const UnicodeString &label, UnicodeString &dest,

	118 IDNAInfo &info, UErrorCode &errorCode) const;

	119

	120 virtual UnicodeString &

	121 labelToUnicode(const UnicodeString &label, UnicodeString &dest,

	122 IDNAInfo &info, UErrorCode &errorCode) const;

	123

	124 virtual UnicodeString &

	125 nameToASCII(const UnicodeString &name, UnicodeString &dest,

	126 IDNAInfo &info, UErrorCode &errorCode) const;

	127

	128 virtual UnicodeString &

	129 nameToUnicode(const UnicodeString &name, UnicodeString &dest,

	130 IDNAInfo &info, UErrorCode &errorCode) const;

	131

	132 virtual void

	133 labelToASCII_UTF8(const StringPiece &label, ByteSink &dest,

	134 IDNAInfo &info, UErrorCode &errorCode) const;

	135

	136 virtual void

	137 labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest,

	138 IDNAInfo &info, UErrorCode &errorCode) const;

	139

	140 virtual void

	141 nameToASCII_UTF8(const StringPiece &name, ByteSink &dest,

	142 IDNAInfo &info, UErrorCode &errorCode) const;

	143

	144 virtual void

	145 nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest,

	146 IDNAInfo &info, UErrorCode &errorCode) const;

	147

	148 private:

	149 UnicodeString &

	150 process(const UnicodeString &src,

	151 UBool isLabel, UBool toASCII,

	152 UnicodeString &dest,

	153 IDNAInfo &info, UErrorCode &errorCode) const;

	154

	155 void

	156 processUTF8(const StringPiece &src,

	157 UBool isLabel, UBool toASCII,

	158 ByteSink &dest,

	159 IDNAInfo &info, UErrorCode &errorCode) const;

	160

	161 UnicodeString &

	162 processUnicode(const UnicodeString &src,

	163 int32_t labelStart, int32_t mappingStart,

	164 UBool isLabel, UBool toASCII,

	165 UnicodeString &dest,

	166 IDNAInfo &info, UErrorCode &errorCode) const;

	167

	168 // returns the new dest.length()

	169 int32_t

	170 mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,

	171 UErrorCode &errorCode) const;

	172

	173 // returns the new label length

	174 int32_t

	175 processLabel(UnicodeString &dest,

	176 int32_t labelStart, int32_t labelLength,

	177 UBool toASCII,

	178 IDNAInfo &info, UErrorCode &errorCode) const;

	179 int32_t

	180 markBadACELabel(UnicodeString &dest,

	181 int32_t labelStart, int32_t labelLength,

	182 UBool toASCII, IDNAInfo &info) const;

	183

	184 void

	185 checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) cons t;

	186

	187 UBool

	188 isLabelOkContextJ(const UChar *label, int32_t labelLength) const;

	189

	190 const Normalizer2 &uts46Norm2; // uts46.nrm

	191 uint32_t options;

	192 };

	193

	194 IDNA *

	195 IDNA::createUTS46Instance(uint32_t options, UErrorCode &errorCode) {

	196 if(U_SUCCESS(errorCode)) {

	197 IDNA *idna=new UTS46(options, errorCode);

	198 if(idna==NULL) {

	199 errorCode=U_MEMORY_ALLOCATION_ERROR;

	200 } else if(U_FAILURE(errorCode)) {

	201 delete idna;

	202 idna=NULL;

	203 }

	204 return idna;

	205 } else {

	206 return NULL;

	207 }

	208 }

	209

	210 // UTS46 implementation ---------------------------------------------------- ***

	211

	212 UTS46::UTS46(uint32_t opt, UErrorCode &errorCode)

	213 : uts46Norm2(*Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, er rorCode)),

	214 options(opt) {}

	215

	216 UTS46::~UTS46() {}

	217

	218 UnicodeString &

	219 UTS46::labelToASCII(const UnicodeString &label, UnicodeString &dest,

	220 IDNAInfo &info, UErrorCode &errorCode) const {

	221 return process(label, TRUE, TRUE, dest, info, errorCode);

	222 }

	223

	224 UnicodeString &

	225 UTS46::labelToUnicode(const UnicodeString &label, UnicodeString &dest,

	226 IDNAInfo &info, UErrorCode &errorCode) const {

	227 return process(label, TRUE, FALSE, dest, info, errorCode);

	228 }

	229

	230 UnicodeString &

	231 UTS46::nameToASCII(const UnicodeString &name, UnicodeString &dest,

	232 IDNAInfo &info, UErrorCode &errorCode) const {

	233 process(name, FALSE, TRUE, dest, info, errorCode);

	234 if( dest.length()>=254 && (info.errors&UIDNA_ERROR_DOMAIN_NAME_TOO_LONG)==0 &&

	235 isASCIIString(dest) &&

	236 (dest.length()>254 \|\| dest[253]!=0x2e)

	237 ) {

	238 info.errors\|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;

	239 }

	240 return dest;

	241 }

	242

	243 UnicodeString &

	244 UTS46::nameToUnicode(const UnicodeString &name, UnicodeString &dest,

	245 IDNAInfo &info, UErrorCode &errorCode) const {

	246 return process(name, FALSE, FALSE, dest, info, errorCode);

	247 }

	248

	249 void

	250 UTS46::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest,

	251 IDNAInfo &info, UErrorCode &errorCode) const {

	252 processUTF8(label, TRUE, TRUE, dest, info, errorCode);

	253 }

	254

	255 void

	256 UTS46::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest,

	257 IDNAInfo &info, UErrorCode &errorCode) const {

	258 processUTF8(label, TRUE, FALSE, dest, info, errorCode);

	259 }

	260

	261 void

	262 UTS46::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest,

	263 IDNAInfo &info, UErrorCode &errorCode) const {

	264 processUTF8(name, FALSE, TRUE, dest, info, errorCode);

	265 }

	266

	267 void

	268 UTS46::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest,

	269 IDNAInfo &info, UErrorCode &errorCode) const {

	270 processUTF8(name, FALSE, FALSE, dest, info, errorCode);

	271 }

	272

	// Per-character UTS #46 data for the 128 ASCII code points, indexed by code point.
	// The normalizer (using uts46.nrm) lowercases A-Z and passes all other ASCII
	// characters through unchanged; this table is what disallows non-LDH
	// characters when UIDNA_USE_STD3_RULES is set, and it also drives the
	// ASCII fastpath.
	// Entry values: -1 = disallowed, 0 = valid, 1 = mapped (to lowercase).
	static const int8_t asciiData[128]={
	    // 0000..000F
	    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	    // 0010..001F
	    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	    // 0020..002F; valid: 002D HYPHEN-MINUS, 002E FULL STOP
	    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1,
	    // 0030..003F; valid: 0030..0039 DIGIT ZERO..DIGIT NINE
	     0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
	    // 0040..005F; mapped: 0041..005A LATIN CAPITAL LETTER A..Z
	    -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1,
	    // 0060..007F; valid: 0061..007A LATIN SMALL LETTER A..Z
	    -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1
	};

	294

	295 UnicodeString &

	296 UTS46::process(const UnicodeString &src,

	297 UBool isLabel, UBool toASCII,

	298 UnicodeString &dest,

	299 IDNAInfo &info, UErrorCode &errorCode) const {

	300 // uts46Norm2.normalize() would do all of this error checking and setup,

	301 // but with the ASCII fastpath we do not always call it, and do not

	302 // call it first.

	303 if(U_FAILURE(errorCode)) {

	304 dest.setToBogus();

	305 return dest;

	306 }

	307 const UChar *srcArray=src.getBuffer();

	308 if(&dest==&src \|\| srcArray==NULL) {

	309 errorCode=U_ILLEGAL_ARGUMENT_ERROR;

	310 dest.setToBogus();

	311 return dest;

	312 }

	313 // Arguments are fine, reset output values.

	314 dest.remove();

	315 info.reset();

	316 int32_t srcLength=src.length();

	317 if(srcLength==0) {

	318 if(toASCII) {

	319 info.errors\|=UIDNA_ERROR_EMPTY_LABEL;

	320 }

	321 return dest;

	322 }

	323 UChar *destArray=dest.getBuffer(srcLength);

	324 if(destArray==NULL) {

	325 errorCode=U_MEMORY_ALLOCATION_ERROR;

	326 return dest;

	327 }

	328 // ASCII fastpath

	329 UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;

	330 int32_t labelStart=0;

	331 int32_t i;

	332 for(i=0;; ++i) {

	333 if(i==srcLength) {

	334 if(toASCII) {

	335 if((i-labelStart)>63) {

	336 info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;

	337 }

	338 // There is a trailing dot if labelStart==i.

	339 if(!isLabel && i>=254 && (i>254 \|\| labelStart<i)) {

	340 info.errors\|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;

	341 }

	342 }

	343 info.errors\|=info.labelErrors;

	344 dest.releaseBuffer(i);

	345 return dest;

	346 }

	347 UChar c=srcArray[i];

	348 if(c>0x7f) {

	349 break;

	350 }

	351 int cData=asciiData[c];

	352 if(cData>0) {

	353 destArray[i]=c+0x20; // Lowercase an uppercase ASCII letter.

	354 } else if(cData<0 && disallowNonLDHDot) {

	355 break; // Replacing with U+FFFD can be complicated for toASCII.

	356 } else {

	357 destArray[i]=c;

	358 if(c==0x2d) { // hyphen

	359 if(i==(labelStart+3) && srcArray[i-1]==0x2d) {

	360 // "??--..." is Punycode or forbidden.

	361 ++i; // '-' was copied to dest already

	362 break;

	363 }

	364 if(i==labelStart) {

	365 // label starts with "-"

	366 info.labelErrors\|=UIDNA_ERROR_LEADING_HYPHEN;

	367 }

	368 if((i+1)==srcLength \|\| srcArray[i+1]==0x2e) {

	369 // label ends with "-"

	370 info.labelErrors\|=UIDNA_ERROR_TRAILING_HYPHEN;

	371 }

	372 } else if(c==0x2e) { // dot

	373 if(isLabel) {

	374 // Replacing with U+FFFD can be complicated for toASCII.

	375 ++i; // '.' was copied to dest already

	376 break;

	377 }

	378 if(toASCII) {

	379 // Permit an empty label at the end but not elsewhere.

	380 if(i==labelStart && i<(srcLength-1)) {

	381 info.labelErrors\|=UIDNA_ERROR_EMPTY_LABEL;

	382 } else if((i-labelStart)>63) {

	383 info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;

	384 }

	385 }

	386 info.errors\|=info.labelErrors;

	387 info.labelErrors=0;

	388 labelStart=i+1;

	389 }

	390 }

	391 }

	392 info.errors\|=info.labelErrors;

	393 dest.releaseBuffer(i);

	394 processUnicode(src, labelStart, i, isLabel, toASCII, dest, info, errorCode);

	395 if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&

	396 (!info.isOkBiDi \|\| (labelStart>0 && !isASCIIOkBiDi(dest.getBuffer(), lab elStart)))

	397 ) {

	398 info.errors\|=UIDNA_ERROR_BIDI;

	399 }

	400 return dest;

	401 }

	402

	403 void

	404 UTS46::processUTF8(const StringPiece &src,

	405 UBool isLabel, UBool toASCII,

	406 ByteSink &dest,

	407 IDNAInfo &info, UErrorCode &errorCode) const {

	408 if(U_FAILURE(errorCode)) {

	409 return;

	410 }

	411 const char *srcArray=src.data();

	412 int32_t srcLength=src.length();

	413 if(srcArray==NULL && srcLength!=0) {

	414 errorCode=U_ILLEGAL_ARGUMENT_ERROR;

	415 return;

	416 }

	417 // Arguments are fine, reset output values.

	418 info.reset();

	419 if(srcLength==0) {

	420 if(toASCII) {

	421 info.errors\|=UIDNA_ERROR_EMPTY_LABEL;

	422 }

	423 dest.Flush();

	424 return;

	425 }

	426 UnicodeString destString;

	427 int32_t labelStart=0;

	428 if(srcLength<=256) { // length of stackArray[]

	429 // ASCII fastpath

	430 char stackArray[256];

	431 int32_t destCapacity;

	432 char *destArray=dest.GetAppendBuffer(srcLength, srcLength+20,

	433 stackArray, LENGTHOF(stackArray), & destCapacity);

	434 UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;

	435 int32_t i;

	436 for(i=0;; ++i) {

	437 if(i==srcLength) {

	438 if(toASCII) {

	439 if((i-labelStart)>63) {

	440 info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;

	441 }

	442 // There is a trailing dot if labelStart==i.

	443 if(!isLabel && i>=254 && (i>254 \|\| labelStart<i)) {

	444 info.errors\|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;

	445 }

	446 }

	447 info.errors\|=info.labelErrors;

	448 dest.Append(destArray, i);

	449 dest.Flush();

	450 return;

	451 }

	452 char c=srcArray[i];

	453 if((int8_t)c<0) { // (uint8_t)c>0x7f

	454 break;

	455 }

	456 int cData=asciiData[(int)c]; // Cast: gcc warns about indexing with a char.

	457 if(cData>0) {

	458 destArray[i]=c+0x20; // Lowercase an uppercase ASCII letter.

	459 } else if(cData<0 && disallowNonLDHDot) {

	460 break; // Replacing with U+FFFD can be complicated for toASCII.

	461 } else {

	462 destArray[i]=c;

	463 if(c==0x2d) { // hyphen

	464 if(i==(labelStart+3) && srcArray[i-1]==0x2d) {

	465 // "??--..." is Punycode or forbidden.

	466 break;

	467 }

	468 if(i==labelStart) {

	469 // label starts with "-"

	470 info.labelErrors\|=UIDNA_ERROR_LEADING_HYPHEN;

	471 }

	472 if((i+1)==srcLength \|\| srcArray[i+1]==0x2e) {

	473 // label ends with "-"

	474 info.labelErrors\|=UIDNA_ERROR_TRAILING_HYPHEN;

	475 }

	476 } else if(c==0x2e) { // dot

	477 if(isLabel) {

	478 break; // Replacing with U+FFFD can be complicated for toASCII.

	479 }

	480 if(toASCII) {

	481 // Permit an empty label at the end but not elsewhere.

	482 if(i==labelStart && i<(srcLength-1)) {

	483 info.labelErrors\|=UIDNA_ERROR_EMPTY_LABEL;

	484 } else if((i-labelStart)>63) {

	485 info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;

	486 }

	487 }

	488 info.errors\|=info.labelErrors;

	489 info.labelErrors=0;

	490 labelStart=i+1;

	491 }

	492 }

	493 }

	494 info.errors\|=info.labelErrors;

	495 // Convert the processed ASCII prefix of the current label to UTF-16.

	496 int32_t mappingStart=i-labelStart;

	497 destString=UnicodeString::fromUTF8(StringPiece(destArray+labelStart, map pingStart));

	498 // Output the previous ASCII labels and process the rest of src in UTF-1 6.

	499 dest.Append(destArray, labelStart);

	500 processUnicode(UnicodeString::fromUTF8(StringPiece(src, labelStart)), 0, mappingStart,

	501 isLabel, toASCII,

	502 destString, info, errorCode);

	503 } else {

	504 // src is too long for the ASCII fastpath implementation.

	505 processUnicode(UnicodeString::fromUTF8(src), 0, 0,

	506 isLabel, toASCII,

	507 destString, info, errorCode);

	508 }

	509 destString.toUTF8(dest); // calls dest.Flush()

	510 if(toASCII && !isLabel) {

	511 // length==labelStart==254 means that there is a trailing dot (ok) and

	512 // destString is empty (do not index at 253-labelStart).

	513 int32_t length=labelStart+destString.length();

	514 if( length>=254 && isASCIIString(destString) &&

	515 (length>254 \|\|

	516 (labelStart<254 && destString[253-labelStart]!=0x2e))

	517 ) {

	518 info.errors\|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;

	519 }

	520 }

	521 if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&

	522 (!info.isOkBiDi \|\| (labelStart>0 && !isASCIIOkBiDi(srcArray, labelStart) ))

	523 ) {

	524 info.errors\|=UIDNA_ERROR_BIDI;

	525 }

	526 }

	527

	528 UnicodeString &

	529 UTS46::processUnicode(const UnicodeString &src,

	530 int32_t labelStart, int32_t mappingStart,

	531 UBool isLabel, UBool toASCII,

	532 UnicodeString &dest,

	533 IDNAInfo &info, UErrorCode &errorCode) const {

	534 if(mappingStart==0) {

	535 uts46Norm2.normalize(src, dest, errorCode);

	536 } else {

	537 uts46Norm2.normalizeSecondAndAppend(dest, src.tempSubString(mappingStart ), errorCode);

	538 }

	539 if(U_FAILURE(errorCode)) {

	540 return dest;

	541 }

	542 UBool doMapDevChars=

	543 toASCII ? (options&UIDNA_NONTRANSITIONAL_TO_ASCII)==0 :

	544 (options&UIDNA_NONTRANSITIONAL_TO_UNICODE)==0;

	545 const UChar *destArray=dest.getBuffer();

	546 int32_t destLength=dest.length();

	547 int32_t labelLimit=labelStart;

	548 while(labelLimit<destLength) {

	549 UChar c=destArray[labelLimit];

	550 if(c==0x2e && !isLabel) {

	551 int32_t labelLength=labelLimit-labelStart;

	552 int32_t newLength=processLabel(dest, labelStart, labelLength,

	553 toASCII, info, errorCode);

	554 info.errors\|=info.labelErrors;

	555 info.labelErrors=0;

	556 if(U_FAILURE(errorCode)) {

	557 return dest;

	558 }

	559 destArray=dest.getBuffer();

	560 destLength+=newLength-labelLength;

	561 labelLimit=labelStart+=newLength+1;

	562 } else if(0xdf<=c && c<=0x200d && (c==0xdf \|\| c==0x3c2 \|\| c>=0x200c)) {

	563 info.isTransDiff=TRUE;

	564 if(doMapDevChars) {

	565 destLength=mapDevChars(dest, labelStart, labelLimit, errorCode);

	566 if(U_FAILURE(errorCode)) {

	567 return dest;

	568 }

	569 destArray=dest.getBuffer();

	570 // Do not increment labelLimit in case c was removed.

	571 // All deviation characters have been mapped, no need to check f or them again.

	572 doMapDevChars=FALSE;

	573 } else {

	574 ++labelLimit;

	575 }

	576 } else {

	577 ++labelLimit;

	578 }

	579 }

	580 // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)

	581 // but not an empty label elsewhere nor a completely empty domain name.

	582 // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0.

	583 if(0==labelStart \|\| labelStart<labelLimit) {

	584 processLabel(dest, labelStart, labelLimit-labelStart,

	585 toASCII, info, errorCode);

	586 info.errors\|=info.labelErrors;

	587 }

	588 return dest;

	589 }

	590

	591 int32_t

	592 UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart ,

	593 UErrorCode &errorCode) const {

	594 int32_t length=dest.length();

	595 UChar *s=dest.getBuffer(dest[mappingStart]==0xdf ? length+1 : length);

	596 if(s==NULL) {

	597 errorCode=U_MEMORY_ALLOCATION_ERROR;

	598 return length;

	599 }

	600 int32_t capacity=dest.getCapacity();

	601 UBool didMapDevChars=FALSE;

	602 int32_t readIndex=mappingStart, writeIndex=mappingStart;

	603 do {

	604 UChar c=s[readIndex++];

	605 switch(c) {

	606 case 0xdf:

	607 // Map sharp s to ss.

	608 didMapDevChars=TRUE;

	609 s[writeIndex++]=0x73; // Replace sharp s with first s.

	610 // Insert second s and account for possible buffer reallocation.

	611 if(writeIndex==readIndex) {

	612 if(length==capacity) {

	613 dest.releaseBuffer(length);

	614 s=dest.getBuffer(length+1);

	615 if(s==NULL) {

	616 errorCode=U_MEMORY_ALLOCATION_ERROR;

	617 return length;

	618 }

	619 capacity=dest.getCapacity();

	620 }

	621 u_memmove(s+writeIndex+1, s+writeIndex, length-writeIndex);

	622 ++readIndex;

	623 }

	624 s[writeIndex++]=0x73;

	625 ++length;

	626 break;

	627 case 0x3c2: // Map final sigma to nonfinal sigma.

	628 didMapDevChars=TRUE;

	629 s[writeIndex++]=0x3c3;

	630 break;

	631 case 0x200c: // Ignore/remove ZWNJ.

	632 case 0x200d: // Ignore/remove ZWJ.

	633 didMapDevChars=TRUE;

	634 --length;

	635 break;

	636 default:

	637 // Only really necessary if writeIndex was different from readIndex.

	638 s[writeIndex++]=c;

	639 break;

	640 }

	641 } while(writeIndex<length);

	642 dest.releaseBuffer(length);

	643 if(didMapDevChars) {

	644 // Mapping deviation characters might have resulted in an un-NFC string.

	645 // We could use either the NFC or the UTS #46 normalizer.

	646 // By using the UTS #46 normalizer again, we avoid having to load a seco nd .nrm data file.

	647 UnicodeString normalized;

	648 uts46Norm2.normalize(dest.tempSubString(labelStart), normalized, errorCo de);

	649 if(U_SUCCESS(errorCode)) {

	650 dest.replace(labelStart, 0x7fffffff, normalized);

	651 return dest.length();

	652 }

	653 }

	654 return length;

	655 }

	656

	657 // Some non-ASCII characters are equivalent to sequences with

	658 // non-LDH ASCII characters. To find them:

	659 // grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt)

	660 static inline UBool

	661 isNonASCIIDisallowedSTD3Valid(UChar32 c) {

	662 return c==0x2260 \|\| c==0x226E \|\| c==0x226F;

	663 }

	664

	665 // Replace the label in dest with the label string, if the label was modified.

	666 // If &label==&dest then the label was modified in-place and labelLength

	667 // is the new label length, different from label.length().

	668 // If &label!=&dest then labelLength==label.length().

	669 // Returns labelLength (= the new label length).

	670 static int32_t

	671 replaceLabel(UnicodeString &dest, int32_t destLabelStart, int32_t destLabelLengt h,

	672 const UnicodeString &label, int32_t labelLength) {

	673 if(&label!=&dest) {

	674 dest.replace(destLabelStart, destLabelLength, label);

	675 }

	676 return labelLength;

	677 }

	678

	679 int32_t

	680 UTS46::processLabel(UnicodeString &dest,

	681 int32_t labelStart, int32_t labelLength,

	682 UBool toASCII,

	683 IDNAInfo &info, UErrorCode &errorCode) const {

	684 UnicodeString fromPunycode;

	685 UnicodeString *labelString;

	686 const UChar *label=dest.getBuffer()+labelStart;

	687 int32_t destLabelStart=labelStart;

	688 int32_t destLabelLength=labelLength;

	689 UBool wasPunycode;

	690 if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && l abel[3]==0x2d) {

	691 // Label starts with "xn--", try to un-Punycode it.

	692 wasPunycode=TRUE;

	693 UChar *unicodeBuffer=fromPunycode.getBuffer(-1); // capacity==-1: most labels should fit

	694 if(unicodeBuffer==NULL) {

	695 // Should never occur if we used capacity==-1 which uses the interna l buffer.

	696 errorCode=U_MEMORY_ALLOCATION_ERROR;

	697 return labelLength;

	698 }

	699 UErrorCode punycodeErrorCode=U_ZERO_ERROR;

	700 int32_t unicodeLength=u_strFromPunycode(label+4, labelLength-4,

	701 unicodeBuffer, fromPunycode.getC apacity(),

	702 NULL, &punycodeErrorCode);

	703 if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) {

	704 fromPunycode.releaseBuffer(0);

	705 unicodeBuffer=fromPunycode.getBuffer(unicodeLength);

	706 if(unicodeBuffer==NULL) {

	707 errorCode=U_MEMORY_ALLOCATION_ERROR;

	708 return labelLength;

	709 }

	710 punycodeErrorCode=U_ZERO_ERROR;

	711 unicodeLength=u_strFromPunycode(label+4, labelLength-4,

	712 unicodeBuffer, fromPunycode.getCapac ity(),

	713 NULL, &punycodeErrorCode);

	714 }

	715 fromPunycode.releaseBuffer(unicodeLength);

	716 if(U_FAILURE(punycodeErrorCode)) {

	717 info.labelErrors\|=UIDNA_ERROR_PUNYCODE;

	718 return markBadACELabel(dest, labelStart, labelLength, toASCII, info) ;

	719 }

	720 // Check for NFC, and for characters that are not

	721 // valid or deviation characters according to the normalizer.

	722 // If there is something wrong, then the string will change.

	723 // Note that the normalizer passes through non-LDH ASCII and deviation c haracters.

	724 // Deviation characters are ok in Punycode even in transitional processi ng.

	725 // In the code further below, if we find non-LDH ASCII and we have UIDNA _USE_STD3_RULES

	726 // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.

	727 UBool isValid=uts46Norm2.isNormalized(fromPunycode, errorCode);

	728 if(U_FAILURE(errorCode)) {

	729 return labelLength;

	730 }

	731 if(!isValid) {

	732 info.labelErrors\|=UIDNA_ERROR_INVALID_ACE_LABEL;

	733 return markBadACELabel(dest, labelStart, labelLength, toASCII, info) ;

	734 }

	735 labelString=&fromPunycode;

	736 label=fromPunycode.getBuffer();

	737 labelStart=0;

	738 labelLength=fromPunycode.length();

	739 } else {

	740 wasPunycode=FALSE;

	741 labelString=&dest;

	742 }

	743 // Validity check

	744 if(labelLength==0) {

	745 if(toASCII) {

	746 info.labelErrors\|=UIDNA_ERROR_EMPTY_LABEL;

	747 }

	748 return replaceLabel(dest, destLabelStart, destLabelLength, *labelString, labelLength);

	749 }

	750 // labelLength>0

	751 if(labelLength>=4 && label[2]==0x2d && label[3]==0x2d) {

	752 // label starts with "??--"

	753 info.labelErrors\|=UIDNA_ERROR_HYPHEN_3_4;

	754 }

	755 if(label[0]==0x2d) {

	756 // label starts with "-"

	757 info.labelErrors\|=UIDNA_ERROR_LEADING_HYPHEN;

	758 }

	759 if(label[labelLength-1]==0x2d) {

	760 // label ends with "-"

	761 info.labelErrors\|=UIDNA_ERROR_TRAILING_HYPHEN;

	762 }

	763 // If the label was not a Punycode label, then it was the result of

	764 // mapping, normalization and label segmentation.

	765 // If the label was in Punycode, then we mapped it again above

	766 // and checked its validity.

	767 // Now we handle the STD3 restriction to LDH characters (if set)

	768 // and we look for U+FFFD which indicates disallowed characters

	769 // in a non-Punycode label or U+FFFD itself in a Punycode label.

	770 // We also check for dots which can come from the input to a single-label fu nction.

	771 // Ok to cast away const because we own the UnicodeString.

	772 UChar s=(UChar )label;

	773 const UChar *limit=label+labelLength;

	774 UChar oredChars=0;

	775 // If we enforce STD3 rules, then ASCII characters other than LDH and dot ar e disallowed.

	776 UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;

	777 do {

	778 UChar c=*s;

	779 if(c<=0x7f) {

	780 if(c==0x2e) {

	781 info.labelErrors\|=UIDNA_ERROR_LABEL_HAS_DOT;

	782 *s=0xfffd;

	783 } else if(disallowNonLDHDot && asciiData[c]<0) {

	784 info.labelErrors\|=UIDNA_ERROR_DISALLOWED;

	785 *s=0xfffd;

	786 }

	787 } else {

	788 oredChars\|=c;

	789 if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) {

	790 info.labelErrors\|=UIDNA_ERROR_DISALLOWED;

	791 *s=0xfffd;

	792 } else if(c==0xfffd) {

	793 info.labelErrors\|=UIDNA_ERROR_DISALLOWED;

	794 }

	795 }

	796 ++s;

	797 } while(s<limit);

	798 // Check for a leading combining mark after other validity checks

	799 // so that we don't report UIDNA_ERROR_DISALLOWED for the U+FFFD from here.

	800 UChar32 c;

	801 int32_t cpLength=0;

	802 // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD.

	803 U16_NEXT_UNSAFE(label, cpLength, c);

	804 if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) {

	805 info.labelErrors\|=UIDNA_ERROR_LEADING_COMBINING_MARK;

	806 labelString->replace(labelStart, cpLength, (UChar)0xfffd);

	807 label=labelString->getBuffer()+labelStart;

	808 labelLength+=1-cpLength;

	809 if(labelString==&dest) {

	810 destLabelLength=labelLength;

	811 }

	812 }

	813 if((info.labelErrors&severeErrors)==0) {

	814 // Do contextual checks only if we do not have U+FFFD from a severe erro r

	815 // because U+FFFD can make these checks fail.

	816 if((options&UIDNA_CHECK_BIDI)!=0 && (!info.isBiDi \|\| info.isOkBiDi)) {

	817 checkLabelBiDi(label, labelLength, info);

	818 }

	819 if( (options&UIDNA_CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c &&

	820 !isLabelOkContextJ(label, labelLength)

	821 ) {

	822 info.labelErrors\|=UIDNA_ERROR_CONTEXTJ;

	823 }

	824 if(toASCII) {

	825 if(wasPunycode) {

	826 // Leave a Punycode label unchanged if it has no severe errors.

	827 if(destLabelLength>63) {

	828 info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;

	829 }

	830 return destLabelLength;

	831 } else if(oredChars>=0x80) {

	832 // Contains non-ASCII characters.

	833 UnicodeString punycode;

	834 UChar *buffer=punycode.getBuffer(63); // 63==maximum DNS label length

	835 if(buffer==NULL) {

	836 errorCode=U_MEMORY_ALLOCATION_ERROR;

	837 return destLabelLength;

	838 }

	839 buffer[0]=0x78; // Write "xn--".

	840 buffer[1]=0x6e;

	841 buffer[2]=0x2d;

	842 buffer[3]=0x2d;

	843 int32_t punycodeLength=u_strToPunycode(label, labelLength,

	844 buffer+4, punycode.getCapa city()-4,

	845 NULL, &errorCode);

	846 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {

	847 errorCode=U_ZERO_ERROR;

	848 punycode.releaseBuffer(4);

	849 buffer=punycode.getBuffer(4+punycodeLength);

	850 if(buffer==NULL) {

	851 errorCode=U_MEMORY_ALLOCATION_ERROR;

	852 return destLabelLength;

	853 }

	854 punycodeLength=u_strToPunycode(label, labelLength,

	855 buffer+4, punycode.getCapacity ()-4,

	856 NULL, &errorCode);

	857 }

	858 punycodeLength+=4;

	859 punycode.releaseBuffer(punycodeLength);

	860 if(U_FAILURE(errorCode)) {

	861 return destLabelLength;

	862 }

	863 if(punycodeLength>63) {

	864 info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;

	865 }

	866 return replaceLabel(dest, destLabelStart, destLabelLength,

	867 punycode, punycodeLength);

	868 } else {

	869 // all-ASCII label

	870 if(labelLength>63) {

	871 info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;

	872 }

	873 }

	874 }

	875 } else {

	876 // If a Punycode label has severe errors,

	877 // then leave it but make sure it does not look valid.

	878 if(wasPunycode) {

	879 info.labelErrors\|=UIDNA_ERROR_INVALID_ACE_LABEL;

	880 return markBadACELabel(dest, destLabelStart, destLabelLength, toASCI I, info);

	881 }

	882 }

	883 return replaceLabel(dest, destLabelStart, destLabelLength, *labelString, lab elLength);

	884 }

	885

	886 // Make sure an ACE label does not look valid.

	887 // Append U+FFFD if the label has only LDH characters.

	888 // If UIDNA_USE_STD3_RULES, also replace disallowed ASCII characters with U+FFFD .

	889 int32_t

	890 UTS46::markBadACELabel(UnicodeString &dest,

	891 int32_t labelStart, int32_t labelLength,

	892 UBool toASCII, IDNAInfo &info) const {

	893 UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;

	894 UBool isASCII=TRUE;

	895 UBool onlyLDH=TRUE;

	896 const UChar *label=dest.getBuffer()+labelStart;

	897 // Ok to cast away const because we own the UnicodeString.

	898 UChar s=(UChar )label+4; // After the initial "xn--".

	899 const UChar *limit=label+labelLength;

	900 do {

	901 UChar c=*s;

	902 if(c<=0x7f) {

	903 if(c==0x2e) {

	904 info.labelErrors\|=UIDNA_ERROR_LABEL_HAS_DOT;

	905 *s=0xfffd;

	906 isASCII=onlyLDH=FALSE;

	907 } else if(asciiData[c]<0) {

	908 onlyLDH=FALSE;

	909 if(disallowNonLDHDot) {

	910 *s=0xfffd;

	911 isASCII=FALSE;

	912 }

	913 }

	914 } else {

	915 isASCII=onlyLDH=FALSE;

	916 }

	917 } while(++s<limit);

	918 if(onlyLDH) {

	919 dest.insert(labelStart+labelLength, (UChar)0xfffd);

	920 ++labelLength;

	921 } else {

	922 if(toASCII && isASCII && labelLength>63) {

	923 info.labelErrors\|=UIDNA_ERROR_LABEL_TOO_LONG;

	924 }

	925 }

	926 return labelLength;

	927 }

	928

	929 const uint32_t L_MASK=U_MASK(U_LEFT_TO_RIGHT);

	930 const uint32_t R_AL_MASK=U_MASK(U_RIGHT_TO_LEFT)\|U_MASK(U_RIGHT_TO_LEFT_ARABIC);

	931 const uint32_t L_R_AL_MASK=L_MASK\|R_AL_MASK;

	932

	933 const uint32_t R_AL_AN_MASK=R_AL_MASK\|U_MASK(U_ARABIC_NUMBER);

	934

	935 const uint32_t EN_AN_MASK=U_MASK(U_EUROPEAN_NUMBER)\|U_MASK(U_ARABIC_NUMBER);

	936 const uint32_t R_AL_EN_AN_MASK=R_AL_MASK\|EN_AN_MASK;

	937 const uint32_t L_EN_MASK=L_MASK\|U_MASK(U_EUROPEAN_NUMBER);

	938

	939 const uint32_t ES_CS_ET_ON_BN_NSM_MASK=

	940 U_MASK(U_EUROPEAN_NUMBER_SEPARATOR)\|

	941 U_MASK(U_COMMON_NUMBER_SEPARATOR)\|

	942 U_MASK(U_EUROPEAN_NUMBER_TERMINATOR)\|

	943 U_MASK(U_OTHER_NEUTRAL)\|

	944 U_MASK(U_BOUNDARY_NEUTRAL)\|

	945 U_MASK(U_DIR_NON_SPACING_MARK);

	946 const uint32_t L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK\|ES_CS_ET_ON_BN_NSM_MASK;

	947 const uint32_t R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK\|EN_AN_MASK\|ES_CS_ET_ ON_BN_NSM_MASK;

	948

	949 // We scan the whole label and check both for whether it contains RTL characters

	950 // and whether it passes the BiDi Rule.

	951 // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find

	952 // that a domain name is a BiDi domain name (has an RTL label) only after

	953 // processing several earlier labels.

	954 void

	955 UTS46::checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) c onst {

	956 // IDNA2008 BiDi rule

	957 // Get the directionality of the first character.

	958 UChar32 c;

	959 int32_t i=0;

	960 U16_NEXT_UNSAFE(label, i, c);

	961 uint32_t firstMask=U_MASK(u_charDirection(c));

	962 // 1. The first character must be a character with BIDI property L, R

	963 // or AL. If it has the R or AL property, it is an RTL label; if it

	964 // has the L property, it is an LTR label.

	965 if((firstMask&~L_R_AL_MASK)!=0) {

	966 info.isOkBiDi=FALSE;

	967 }

	968 // Get the directionality of the last non-NSM character.

	969 uint32_t lastMask;

	970 for(;;) {

	971 if(i>=labelLength) {

	972 lastMask=firstMask;

	973 break;

	974 }

	975 U16_PREV_UNSAFE(label, labelLength, c);

	976 UCharDirection dir=u_charDirection(c);

	977 if(dir!=U_DIR_NON_SPACING_MARK) {

	978 lastMask=U_MASK(dir);

	979 break;

	980 }

	981 }

	982 // 3. In an RTL label, the end of the label must be a character with

	983 // BIDI property R, AL, EN or AN, followed by zero or more

	984 // characters with BIDI property NSM.

	985 // 6. In an LTR label, the end of the label must be a character with

	986 // BIDI property L or EN, followed by zero or more characters with

	987 // BIDI property NSM.

	988 if( (firstMask&L_MASK)!=0 ?

	989 (lastMask&~L_EN_MASK)!=0 :

	990 (lastMask&~R_AL_EN_AN_MASK)!=0

	991 ) {

	992 info.isOkBiDi=FALSE;

	993 }

	994 // Get the directionalities of the intervening characters.

	995 uint32_t mask=0;

	996 while(i<labelLength) {

	997 U16_NEXT_UNSAFE(label, i, c);

	998 mask\|=U_MASK(u_charDirection(c));

	999 }

	1000 if(firstMask&L_MASK) {

	1001 // 5. In an LTR label, only characters with the BIDI properties L, EN,

	1002 // ES, CS, ET, ON, BN and NSM are allowed.

	1003 if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {

	1004 info.isOkBiDi=FALSE;

	1005 }

	1006 } else {

	1007 // 2. In an RTL label, only characters with the BIDI properties R, AL,

	1008 // AN, EN, ES, CS, ET, ON, BN and NSM are allowed.

	1009 if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {

	1010 info.isOkBiDi=FALSE;

	1011 }

	1012 // 4. In an RTL label, if an EN is present, no AN may be present, and

	1013 // vice versa.

	1014 if((mask&EN_AN_MASK)==EN_AN_MASK) {

	1015 info.isOkBiDi=FALSE;

	1016 }

	1017 }

	1018 // An RTL label is a label that contains at least one character of type

	1019 // R, AL or AN. [...]

	1020 // A "BIDI domain name" is a domain name that contains at least one RTL

	1021 // label. [...]

	1022 // The following rule, consisting of six conditions, applies to labels

	1023 // in BIDI domain names.

	1024 if(((firstMask\|mask\|lastMask)&R_AL_AN_MASK)!=0) {

	1025 info.isBiDi=TRUE;

	1026 }

	1027 }

	1028

	1029 // Special code for the ASCII prefix of a BiDi domain name.

	1030 // The ASCII prefix is all-LTR.

	1031

	1032 // IDNA2008 BiDi rule, parts relevant to ASCII labels:

	1033 // 1. The first character must be a character with BIDI property L [...]

	1034 // 5. In an LTR label, only characters with the BIDI properties L, EN,

	1035 // ES, CS, ET, ON, BN and NSM are allowed.

	1036 // 6. In an LTR label, the end of the label must be a character with

	1037 // BIDI property L or EN [...]

	1038

	1039 // UTF-16 version, called for mapped ASCII prefix.

	1040 // Cannot contain uppercase A-Z.

	1041 // s[length-1] must be the trailing dot.

	1042 static UBool

	1043 isASCIIOkBiDi(const UChar *s, int32_t length) {

	1044 int32_t labelStart=0;

	1045 for(int32_t i=0; i<length; ++i) {

	1046 UChar c=s[i];

	1047 if(c==0x2e) { // dot

	1048 if(i>labelStart) {

	1049 c=s[i-1];

	1050 if(!(0x61<=c && c<=0x7a) && !(0x30<=c && c<=0x39)) {

	1051 // Last character in the label is not an L or EN.

	1052 return FALSE;

	1053 }

	1054 }

	1055 labelStart=i+1;

	1056 } else if(i==labelStart) {

	1057 if(!(0x61<=c && c<=0x7a)) {

	1058 // First character in the label is not an L.

	1059 return FALSE;

	1060 }

	1061 } else {

	1062 if(c<=0x20 && (c>=0x1c \|\| (9<=c && c<=0xd))) {

	1063 // Intermediate character in the label is a B, S or WS.

	1064 return FALSE;

	1065 }

	1066 }

	1067 }

	1068 return TRUE;

	1069 }

	1070

	1071 // UTF-8 version, called for source ASCII prefix.

	1072 // Can contain uppercase A-Z.

	1073 // s[length-1] must be the trailing dot.

	1074 static UBool

	1075 isASCIIOkBiDi(const char *s, int32_t length) {

	1076 int32_t labelStart=0;

	1077 for(int32_t i=0; i<length; ++i) {

	1078 char c=s[i];

	1079 if(c==0x2e) { // dot

	1080 if(i>labelStart) {

	1081 c=s[i-1];

	1082 if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a) && !(0x30<=c & & c<=0x39)) {

	1083 // Last character in the label is not an L or EN.

	1084 return FALSE;

	1085 }

	1086 }

	1087 labelStart=i+1;

	1088 } else if(i==labelStart) {

	1089 if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a)) {

	1090 // First character in the label is not an L.

	1091 return FALSE;

	1092 }

	1093 } else {

	1094 if(c<=0x20 && (c>=0x1c \|\| (9<=c && c<=0xd))) {

	1095 // Intermediate character in the label is a B, S or WS.

	1096 return FALSE;

	1097 }

	1098 }

	1099 }

	1100 return TRUE;

	1101 }

	1102

	1103 UBool

	1104 UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const {

	1105 // [IDNA2008-Tables]

	1106 // 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER

	1107 for(int32_t i=0; i<labelLength; ++i) {

	1108 if(label[i]==0x200c) {

	1109 // Appendix A.1. ZERO WIDTH NON-JOINER

	1110 // Rule Set:

	1111 // False;

	1112 // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;

	1113 // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C

	1114 // (Joining_Type:T)*(Joining_Type:{R,D})) Then True;

	1115 if(i==0) {

	1116 return FALSE;

	1117 }

	1118 UChar32 c;

	1119 int32_t j=i;

	1120 U16_PREV_UNSAFE(label, j, c);

	1121 if(u_getCombiningClass(c)==9) {

	1122 continue;

	1123 }

	1124 // check precontext (Joining_Type:{L,D})(Joining_Type:T)*

	1125 for(;;) {

	1126 UJoiningType type=(UJoiningType)u_getIntPropertyValue(c, UCHAR_J OINING_TYPE);

	1127 if(type==U_JT_TRANSPARENT) {

	1128 if(j==0) {

	1129 return FALSE;

	1130 }

	1131 U16_PREV_UNSAFE(label, j, c);

	1132 } else if(type==U_JT_LEFT_JOINING \|\| type==U_JT_DUAL_JOINING) {

	1133 break; // precontext fulfilled

	1134 } else {

	1135 return FALSE;

	1136 }

	1137 }

	1138 // check postcontext (Joining_Type:T)*(Joining_Type:{R,D})

	1139 for(j=i+1;;) {

	1140 if(j==labelLength) {

	1141 return FALSE;

	1142 }

	1143 U16_NEXT_UNSAFE(label, j, c);

	1144 UJoiningType type=(UJoiningType)u_getIntPropertyValue(c, UCHAR_J OINING_TYPE);

	1145 if(type==U_JT_TRANSPARENT) {

	1146 // just skip this character

	1147 } else if(type==U_JT_RIGHT_JOINING \|\| type==U_JT_DUAL_JOINING) {

	1148 break; // postcontext fulfilled

	1149 } else {

	1150 return FALSE;

	1151 }

	1152 }

	1153 } else if(label[i]==0x200d) {

	1154 // Appendix A.2. ZERO WIDTH JOINER (U+200D)

	1155 // Rule Set:

	1156 // False;

	1157 // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;

	1158 if(i==0) {

	1159 return FALSE;

	1160 }

	1161 UChar32 c;

	1162 int32_t j=i;

	1163 U16_PREV_UNSAFE(label, j, c);

	1164 if(u_getCombiningClass(c)!=9) {

	1165 return FALSE;

	1166 }

	1167 }

	1168 }

	1169 return TRUE;

	1170 }

	1171

	1172 U_NAMESPACE_END

	1173

	1174 // C API ------------------------------------------------------------------- ***

	1175

	1176 U_NAMESPACE_USE

	1177

	1178 U_DRAFT UIDNA * U_EXPORT2

	1179 uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode) {

	1180 return reinterpret_cast<UIDNA >(IDNA::createUTS46Instance(options, pErrorC ode));

	1181 }

	1182

	1183 U_DRAFT void U_EXPORT2

	1184 uidna_close(UIDNA *idna) {

	1185 delete reinterpret_cast<IDNA *>(idna);

	1186 }

	1187

	1188 static UBool

	1189 checkArgs(const void *label, int32_t length,

	1190 void *dest, int32_t capacity,

	1191 UIDNAInfo pInfo, UErrorCode pErrorCode) {

	1192 if(U_FAILURE(*pErrorCode)) {

	1193 return FALSE;

	1194 }

	1195 // sizeof(UIDNAInfo)=16 in the first API version.

	1196 if(pInfo==NULL \|\| pInfo->size<16) {

	1197 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;

	1198 return FALSE;

	1199 }

	1200 if( (label==NULL ? length!=0 : length<-1) \|\|

	1201 (dest==NULL ? capacity!=0 : capacity<0) \|\|

	1202 (dest==label && label!=NULL)

	1203 ) {

	1204 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;

	1205 return FALSE;

	1206 }

	1207 // Set all *pInfo bytes to 0 except for the size field itself.

	1208 uprv_memset(&pInfo->size+1, 0, pInfo->size-sizeof(pInfo->size));

	1209 return TRUE;

	1210 }

	1211

	1212 static void

	1213 idnaInfoToStruct(IDNAInfo &info, UIDNAInfo *pInfo) {

	1214 pInfo->isTransitionalDifferent=info.isTransitionalDifferent();

	1215 pInfo->errors=info.getErrors();

	1216 }

	1217

	1218 U_DRAFT int32_t U_EXPORT2

	1219 uidna_labelToASCII(const UIDNA *idna,

	1220 const UChar *label, int32_t length,

	1221 UChar *dest, int32_t capacity,

	1222 UIDNAInfo pInfo, UErrorCode pErrorCode) {

	1223 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {

	1224 return 0;

	1225 }

	1226 UnicodeString src((UBool)(length<0), label, length);

	1227 UnicodeString destString(dest, 0, capacity);

	1228 IDNAInfo info;

	1229 reinterpret_cast<const IDNA >(idna)->labelToASCII(src, destString, info, p ErrorCode);

	1230 idnaInfoToStruct(info, pInfo);

	1231 return destString.extract(dest, capacity, *pErrorCode);

	1232 }

	1233

	1234 U_DRAFT int32_t U_EXPORT2

	1235 uidna_labelToUnicode(const UIDNA *idna,

	1236 const UChar *label, int32_t length,

	1237 UChar *dest, int32_t capacity,

	1238 UIDNAInfo pInfo, UErrorCode pErrorCode) {

	1239 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {

	1240 return 0;

	1241 }

	1242 UnicodeString src((UBool)(length<0), label, length);

	1243 UnicodeString destString(dest, 0, capacity);

	1244 IDNAInfo info;

	1245 reinterpret_cast<const IDNA >(idna)->labelToUnicode(src, destString, info, pErrorCode);

	1246 idnaInfoToStruct(info, pInfo);

	1247 return destString.extract(dest, capacity, *pErrorCode);

	1248 }

	1249

	1250 U_DRAFT int32_t U_EXPORT2

	1251 uidna_nameToASCII(const UIDNA *idna,

	1252 const UChar *name, int32_t length,

	1253 UChar *dest, int32_t capacity,

	1254 UIDNAInfo pInfo, UErrorCode pErrorCode) {

	1255 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {

	1256 return 0;

	1257 }

	1258 UnicodeString src((UBool)(length<0), name, length);

	1259 UnicodeString destString(dest, 0, capacity);

	1260 IDNAInfo info;

	1261 reinterpret_cast<const IDNA >(idna)->nameToASCII(src, destString, info, pE rrorCode);

	1262 idnaInfoToStruct(info, pInfo);

	1263 return destString.extract(dest, capacity, *pErrorCode);

	1264 }

	1265

	1266 U_DRAFT int32_t U_EXPORT2

	1267 uidna_nameToUnicode(const UIDNA *idna,

	1268 const UChar *name, int32_t length,

	1269 UChar *dest, int32_t capacity,

	1270 UIDNAInfo pInfo, UErrorCode pErrorCode) {

	1271 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {

	1272 return 0;

	1273 }

	1274 UnicodeString src((UBool)(length<0), name, length);

	1275 UnicodeString destString(dest, 0, capacity);

	1276 IDNAInfo info;

	1277 reinterpret_cast<const IDNA >(idna)->nameToUnicode(src, destString, info, pErrorCode);

	1278 idnaInfoToStruct(info, pInfo);

	1279 return destString.extract(dest, capacity, *pErrorCode);

	1280 }

	1281

	1282 U_DRAFT int32_t U_EXPORT2

	1283 uidna_labelToASCII_UTF8(const UIDNA *idna,

	1284 const char *label, int32_t length,

	1285 char *dest, int32_t capacity,

	1286 UIDNAInfo pInfo, UErrorCode pErrorCode) {

	1287 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {

	1288 return 0;

	1289 }

	1290 StringPiece src(label, length<0 ? uprv_strlen(label) : length);

	1291 CheckedArrayByteSink sink(dest, capacity);

	1292 IDNAInfo info;

	1293 reinterpret_cast<const IDNA >(idna)->labelToASCII_UTF8(src, sink, info, pE rrorCode);

	1294 idnaInfoToStruct(info, pInfo);

	1295 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pError Code);

	1296 }

	1297

	1298 U_DRAFT int32_t U_EXPORT2

	1299 uidna_labelToUnicodeUTF8(const UIDNA *idna,

	1300 const char *label, int32_t length,

	1301 char *dest, int32_t capacity,

	1302 UIDNAInfo pInfo, UErrorCode pErrorCode) {

	1303 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {

	1304 return 0;

	1305 }

	1306 StringPiece src(label, length<0 ? uprv_strlen(label) : length);

	1307 CheckedArrayByteSink sink(dest, capacity);

	1308 IDNAInfo info;

	1309 reinterpret_cast<const IDNA >(idna)->labelToUnicodeUTF8(src, sink, info, p ErrorCode);

	1310 idnaInfoToStruct(info, pInfo);

	1311 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pError Code);

	1312 }

	1313

	1314 U_DRAFT int32_t U_EXPORT2

	1315 uidna_nameToASCII_UTF8(const UIDNA *idna,

	1316 const char *name, int32_t length,

	1317 char *dest, int32_t capacity,

	1318 UIDNAInfo pInfo, UErrorCode pErrorCode) {

	1319 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {

	1320 return 0;

	1321 }

	1322 StringPiece src(name, length<0 ? uprv_strlen(name) : length);

	1323 CheckedArrayByteSink sink(dest, capacity);

	1324 IDNAInfo info;

	1325 reinterpret_cast<const IDNA >(idna)->nameToASCII_UTF8(src, sink, info, pEr rorCode);

	1326 idnaInfoToStruct(info, pInfo);

	1327 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pError Code);

	1328 }

	1329

	1330 U_DRAFT int32_t U_EXPORT2

	1331 uidna_nameToUnicodeUTF8(const UIDNA *idna,

	1332 const char *name, int32_t length,

	1333 char *dest, int32_t capacity,

	1334 UIDNAInfo pInfo, UErrorCode pErrorCode) {

	1335 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {

	1336 return 0;

	1337 }

	1338 StringPiece src(name, length<0 ? uprv_strlen(name) : length);

	1339 CheckedArrayByteSink sink(dest, capacity);

	1340 IDNAInfo info;

	1341 reinterpret_cast<const IDNA >(idna)->nameToUnicodeUTF8(src, sink, info, pE rrorCode);

	1342 idnaInfoToStruct(info, pInfo);

	1343 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pError Code);

	1344 }

	1345

	1346 #endif // UCONFIG_NO_IDNA

OLD	NEW

« no previous file with comments | « icu46/source/common/utrie2_impl.h ('k') | icu46/source/common/utypes.c » ('j') | no next file with comments »