OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ******************************************************************************* |
| 3 * Copyright (C) 2010, International Business Machines |
| 4 * Corporation and others. All Rights Reserved. |
| 5 ******************************************************************************* |
| 6 * file name: uts46.cpp |
| 7 * encoding: US-ASCII |
| 8 * tab size: 8 (not used) |
| 9 * indentation:4 |
| 10 * |
| 11 * created on: 2010mar09 |
| 12 * created by: Markus W. Scherer |
| 13 */ |
| 14 |
| 15 #include "unicode/utypes.h" |
| 16 |
| 17 #if !UCONFIG_NO_IDNA |
| 18 |
| 19 #include "unicode/idna.h" |
| 20 #include "unicode/normalizer2.h" |
| 21 #include "unicode/ustring.h" |
| 22 #include "cmemory.h" |
| 23 #include "cstring.h" |
| 24 #include "punycode.h" |
| 25 #include "ustr_imp.h" |
| 26 |
// Number of elements in a true array (not a pointer), as an int32_t.
// The whole expansion is parenthesized so that the macro behaves as a single
// primary expression in every context (e.g. as the operand of sizeof).
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
| 28 |
| 29 // Note about tests for UIDNA_ERROR_DOMAIN_NAME_TOO_LONG: |
| 30 // |
| 31 // The domain name length limit is 255 octets in an internal DNS representation |
| 32 // where the last ("root") label is the empty label |
| 33 // represented by length byte 0 alone. |
| 34 // In a conventional string, this translates to 253 characters, or 254 |
| 35 // if there is a trailing dot for the root label. |
| 36 |
| 37 U_NAMESPACE_BEGIN |
| 38 |
| 39 // Severe errors which usually result in a U+FFFD replacement character in the r
esult string. |
| 40 const uint32_t severeErrors= |
| 41 UIDNA_ERROR_LEADING_COMBINING_MARK| |
| 42 UIDNA_ERROR_DISALLOWED| |
| 43 UIDNA_ERROR_PUNYCODE| |
| 44 UIDNA_ERROR_LABEL_HAS_DOT| |
| 45 UIDNA_ERROR_INVALID_ACE_LABEL; |
| 46 |
| 47 static inline UBool |
| 48 isASCIIString(const UnicodeString &dest) { |
| 49 const UChar *s=dest.getBuffer(); |
| 50 const UChar *limit=s+dest.length(); |
| 51 while(s<limit) { |
| 52 if(*s++>0x7f) { |
| 53 return FALSE; |
| 54 } |
| 55 } |
| 56 return TRUE; |
| 57 } |
| 58 |
| 59 static UBool |
| 60 isASCIIOkBiDi(const UChar *s, int32_t length); |
| 61 |
| 62 static UBool |
| 63 isASCIIOkBiDi(const char *s, int32_t length); |
| 64 |
| 65 // IDNA class default implementations -------------------------------------- *** |
| 66 |
| 67 void |
| 68 IDNA::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest, |
| 69 IDNAInfo &info, UErrorCode &errorCode) const { |
| 70 if(U_SUCCESS(errorCode)) { |
| 71 UnicodeString destString; |
| 72 labelToASCII(UnicodeString::fromUTF8(label), destString, |
| 73 info, errorCode).toUTF8(dest); |
| 74 } |
| 75 } |
| 76 |
| 77 void |
| 78 IDNA::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest, |
| 79 IDNAInfo &info, UErrorCode &errorCode) const { |
| 80 if(U_SUCCESS(errorCode)) { |
| 81 UnicodeString destString; |
| 82 labelToUnicode(UnicodeString::fromUTF8(label), destString, |
| 83 info, errorCode).toUTF8(dest); |
| 84 } |
| 85 } |
| 86 |
| 87 void |
| 88 IDNA::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest, |
| 89 IDNAInfo &info, UErrorCode &errorCode) const { |
| 90 if(U_SUCCESS(errorCode)) { |
| 91 UnicodeString destString; |
| 92 nameToASCII(UnicodeString::fromUTF8(name), destString, |
| 93 info, errorCode).toUTF8(dest); |
| 94 } |
| 95 } |
| 96 |
| 97 void |
| 98 IDNA::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest, |
| 99 IDNAInfo &info, UErrorCode &errorCode) const { |
| 100 if(U_SUCCESS(errorCode)) { |
| 101 UnicodeString destString; |
| 102 nameToUnicode(UnicodeString::fromUTF8(name), destString, |
| 103 info, errorCode).toUTF8(dest); |
| 104 } |
| 105 } |
| 106 |
| 107 UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(IDNA) |
| 108 |
| 109 // UTS46 class declaration ------------------------------------------------- *** |
| 110 |
| 111 class UTS46 : public IDNA { |
| 112 public: |
| 113 UTS46(uint32_t options, UErrorCode &errorCode); |
| 114 virtual ~UTS46(); |
| 115 |
| 116 virtual UnicodeString & |
| 117 labelToASCII(const UnicodeString &label, UnicodeString &dest, |
| 118 IDNAInfo &info, UErrorCode &errorCode) const; |
| 119 |
| 120 virtual UnicodeString & |
| 121 labelToUnicode(const UnicodeString &label, UnicodeString &dest, |
| 122 IDNAInfo &info, UErrorCode &errorCode) const; |
| 123 |
| 124 virtual UnicodeString & |
| 125 nameToASCII(const UnicodeString &name, UnicodeString &dest, |
| 126 IDNAInfo &info, UErrorCode &errorCode) const; |
| 127 |
| 128 virtual UnicodeString & |
| 129 nameToUnicode(const UnicodeString &name, UnicodeString &dest, |
| 130 IDNAInfo &info, UErrorCode &errorCode) const; |
| 131 |
| 132 virtual void |
| 133 labelToASCII_UTF8(const StringPiece &label, ByteSink &dest, |
| 134 IDNAInfo &info, UErrorCode &errorCode) const; |
| 135 |
| 136 virtual void |
| 137 labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest, |
| 138 IDNAInfo &info, UErrorCode &errorCode) const; |
| 139 |
| 140 virtual void |
| 141 nameToASCII_UTF8(const StringPiece &name, ByteSink &dest, |
| 142 IDNAInfo &info, UErrorCode &errorCode) const; |
| 143 |
| 144 virtual void |
| 145 nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest, |
| 146 IDNAInfo &info, UErrorCode &errorCode) const; |
| 147 |
| 148 private: |
| 149 UnicodeString & |
| 150 process(const UnicodeString &src, |
| 151 UBool isLabel, UBool toASCII, |
| 152 UnicodeString &dest, |
| 153 IDNAInfo &info, UErrorCode &errorCode) const; |
| 154 |
| 155 void |
| 156 processUTF8(const StringPiece &src, |
| 157 UBool isLabel, UBool toASCII, |
| 158 ByteSink &dest, |
| 159 IDNAInfo &info, UErrorCode &errorCode) const; |
| 160 |
| 161 UnicodeString & |
| 162 processUnicode(const UnicodeString &src, |
| 163 int32_t labelStart, int32_t mappingStart, |
| 164 UBool isLabel, UBool toASCII, |
| 165 UnicodeString &dest, |
| 166 IDNAInfo &info, UErrorCode &errorCode) const; |
| 167 |
| 168 // returns the new dest.length() |
| 169 int32_t |
| 170 mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart, |
| 171 UErrorCode &errorCode) const; |
| 172 |
| 173 // returns the new label length |
| 174 int32_t |
| 175 processLabel(UnicodeString &dest, |
| 176 int32_t labelStart, int32_t labelLength, |
| 177 UBool toASCII, |
| 178 IDNAInfo &info, UErrorCode &errorCode) const; |
| 179 int32_t |
| 180 markBadACELabel(UnicodeString &dest, |
| 181 int32_t labelStart, int32_t labelLength, |
| 182 UBool toASCII, IDNAInfo &info) const; |
| 183 |
| 184 void |
| 185 checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) cons
t; |
| 186 |
| 187 UBool |
| 188 isLabelOkContextJ(const UChar *label, int32_t labelLength) const; |
| 189 |
| 190 const Normalizer2 &uts46Norm2; // uts46.nrm |
| 191 uint32_t options; |
| 192 }; |
| 193 |
| 194 IDNA * |
| 195 IDNA::createUTS46Instance(uint32_t options, UErrorCode &errorCode) { |
| 196 if(U_SUCCESS(errorCode)) { |
| 197 IDNA *idna=new UTS46(options, errorCode); |
| 198 if(idna==NULL) { |
| 199 errorCode=U_MEMORY_ALLOCATION_ERROR; |
| 200 } else if(U_FAILURE(errorCode)) { |
| 201 delete idna; |
| 202 idna=NULL; |
| 203 } |
| 204 return idna; |
| 205 } else { |
| 206 return NULL; |
| 207 } |
| 208 } |
| 209 |
| 210 // UTS46 implementation ---------------------------------------------------- *** |
| 211 |
| 212 UTS46::UTS46(uint32_t opt, UErrorCode &errorCode) |
| 213 : uts46Norm2(*Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, er
rorCode)), |
| 214 options(opt) {} |
| 215 |
| 216 UTS46::~UTS46() {} |
| 217 |
| 218 UnicodeString & |
| 219 UTS46::labelToASCII(const UnicodeString &label, UnicodeString &dest, |
| 220 IDNAInfo &info, UErrorCode &errorCode) const { |
| 221 return process(label, TRUE, TRUE, dest, info, errorCode); |
| 222 } |
| 223 |
| 224 UnicodeString & |
| 225 UTS46::labelToUnicode(const UnicodeString &label, UnicodeString &dest, |
| 226 IDNAInfo &info, UErrorCode &errorCode) const { |
| 227 return process(label, TRUE, FALSE, dest, info, errorCode); |
| 228 } |
| 229 |
| 230 UnicodeString & |
| 231 UTS46::nameToASCII(const UnicodeString &name, UnicodeString &dest, |
| 232 IDNAInfo &info, UErrorCode &errorCode) const { |
| 233 process(name, FALSE, TRUE, dest, info, errorCode); |
| 234 if( dest.length()>=254 && (info.errors&UIDNA_ERROR_DOMAIN_NAME_TOO_LONG)==0
&& |
| 235 isASCIIString(dest) && |
| 236 (dest.length()>254 || dest[253]!=0x2e) |
| 237 ) { |
| 238 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; |
| 239 } |
| 240 return dest; |
| 241 } |
| 242 |
| 243 UnicodeString & |
| 244 UTS46::nameToUnicode(const UnicodeString &name, UnicodeString &dest, |
| 245 IDNAInfo &info, UErrorCode &errorCode) const { |
| 246 return process(name, FALSE, FALSE, dest, info, errorCode); |
| 247 } |
| 248 |
| 249 void |
| 250 UTS46::labelToASCII_UTF8(const StringPiece &label, ByteSink &dest, |
| 251 IDNAInfo &info, UErrorCode &errorCode) const { |
| 252 processUTF8(label, TRUE, TRUE, dest, info, errorCode); |
| 253 } |
| 254 |
| 255 void |
| 256 UTS46::labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest, |
| 257 IDNAInfo &info, UErrorCode &errorCode) const { |
| 258 processUTF8(label, TRUE, FALSE, dest, info, errorCode); |
| 259 } |
| 260 |
| 261 void |
| 262 UTS46::nameToASCII_UTF8(const StringPiece &name, ByteSink &dest, |
| 263 IDNAInfo &info, UErrorCode &errorCode) const { |
| 264 processUTF8(name, FALSE, TRUE, dest, info, errorCode); |
| 265 } |
| 266 |
| 267 void |
| 268 UTS46::nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest, |
| 269 IDNAInfo &info, UErrorCode &errorCode) const { |
| 270 processUTF8(name, FALSE, FALSE, dest, info, errorCode); |
| 271 } |
| 272 |
// UTS #46 data for ASCII characters, indexed by code point 0..0x7f.
// The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase
// and passes through all other ASCII characters.
// If UIDNA_USE_STD3_RULES is set, then non-LDH characters are disallowed
// using this data.
// The ASCII fastpath also uses this data.
// Values: -1 = disallowed, 0 = valid, 1 = mapped (lowercase)
static const int8_t asciiData[128]={
    /* 00..0F */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 10..1F */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 20..2F: 002D..002E are valid (HYPHEN-MINUS..FULL STOP) */
                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1,
    /* 30..3F: 0030..0039 are valid (DIGIT ZERO..DIGIT NINE) */
                  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
    /* 40..5F: 0041..005A are mapped (LATIN CAPITAL LETTER A..Z) */
                 -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
                  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1,
    /* 60..7F: 0061..007A are valid (LATIN SMALL LETTER A..Z) */
                 -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
                  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1
};
| 294 |
| 295 UnicodeString & |
| 296 UTS46::process(const UnicodeString &src, |
| 297 UBool isLabel, UBool toASCII, |
| 298 UnicodeString &dest, |
| 299 IDNAInfo &info, UErrorCode &errorCode) const { |
| 300 // uts46Norm2.normalize() would do all of this error checking and setup, |
| 301 // but with the ASCII fastpath we do not always call it, and do not |
| 302 // call it first. |
| 303 if(U_FAILURE(errorCode)) { |
| 304 dest.setToBogus(); |
| 305 return dest; |
| 306 } |
| 307 const UChar *srcArray=src.getBuffer(); |
| 308 if(&dest==&src || srcArray==NULL) { |
| 309 errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 310 dest.setToBogus(); |
| 311 return dest; |
| 312 } |
| 313 // Arguments are fine, reset output values. |
| 314 dest.remove(); |
| 315 info.reset(); |
| 316 int32_t srcLength=src.length(); |
| 317 if(srcLength==0) { |
| 318 if(toASCII) { |
| 319 info.errors|=UIDNA_ERROR_EMPTY_LABEL; |
| 320 } |
| 321 return dest; |
| 322 } |
| 323 UChar *destArray=dest.getBuffer(srcLength); |
| 324 if(destArray==NULL) { |
| 325 errorCode=U_MEMORY_ALLOCATION_ERROR; |
| 326 return dest; |
| 327 } |
| 328 // ASCII fastpath |
| 329 UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0; |
| 330 int32_t labelStart=0; |
| 331 int32_t i; |
| 332 for(i=0;; ++i) { |
| 333 if(i==srcLength) { |
| 334 if(toASCII) { |
| 335 if((i-labelStart)>63) { |
| 336 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
| 337 } |
| 338 // There is a trailing dot if labelStart==i. |
| 339 if(!isLabel && i>=254 && (i>254 || labelStart<i)) { |
| 340 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; |
| 341 } |
| 342 } |
| 343 info.errors|=info.labelErrors; |
| 344 dest.releaseBuffer(i); |
| 345 return dest; |
| 346 } |
| 347 UChar c=srcArray[i]; |
| 348 if(c>0x7f) { |
| 349 break; |
| 350 } |
| 351 int cData=asciiData[c]; |
| 352 if(cData>0) { |
| 353 destArray[i]=c+0x20; // Lowercase an uppercase ASCII letter. |
| 354 } else if(cData<0 && disallowNonLDHDot) { |
| 355 break; // Replacing with U+FFFD can be complicated for toASCII. |
| 356 } else { |
| 357 destArray[i]=c; |
| 358 if(c==0x2d) { // hyphen |
| 359 if(i==(labelStart+3) && srcArray[i-1]==0x2d) { |
| 360 // "??--..." is Punycode or forbidden. |
| 361 ++i; // '-' was copied to dest already |
| 362 break; |
| 363 } |
| 364 if(i==labelStart) { |
| 365 // label starts with "-" |
| 366 info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN; |
| 367 } |
| 368 if((i+1)==srcLength || srcArray[i+1]==0x2e) { |
| 369 // label ends with "-" |
| 370 info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN; |
| 371 } |
| 372 } else if(c==0x2e) { // dot |
| 373 if(isLabel) { |
| 374 // Replacing with U+FFFD can be complicated for toASCII. |
| 375 ++i; // '.' was copied to dest already |
| 376 break; |
| 377 } |
| 378 if(toASCII) { |
| 379 // Permit an empty label at the end but not elsewhere. |
| 380 if(i==labelStart && i<(srcLength-1)) { |
| 381 info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL; |
| 382 } else if((i-labelStart)>63) { |
| 383 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
| 384 } |
| 385 } |
| 386 info.errors|=info.labelErrors; |
| 387 info.labelErrors=0; |
| 388 labelStart=i+1; |
| 389 } |
| 390 } |
| 391 } |
| 392 info.errors|=info.labelErrors; |
| 393 dest.releaseBuffer(i); |
| 394 processUnicode(src, labelStart, i, isLabel, toASCII, dest, info, errorCode); |
| 395 if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 && |
| 396 (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(dest.getBuffer(), lab
elStart))) |
| 397 ) { |
| 398 info.errors|=UIDNA_ERROR_BIDI; |
| 399 } |
| 400 return dest; |
| 401 } |
| 402 |
| 403 void |
| 404 UTS46::processUTF8(const StringPiece &src, |
| 405 UBool isLabel, UBool toASCII, |
| 406 ByteSink &dest, |
| 407 IDNAInfo &info, UErrorCode &errorCode) const { |
| 408 if(U_FAILURE(errorCode)) { |
| 409 return; |
| 410 } |
| 411 const char *srcArray=src.data(); |
| 412 int32_t srcLength=src.length(); |
| 413 if(srcArray==NULL && srcLength!=0) { |
| 414 errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 415 return; |
| 416 } |
| 417 // Arguments are fine, reset output values. |
| 418 info.reset(); |
| 419 if(srcLength==0) { |
| 420 if(toASCII) { |
| 421 info.errors|=UIDNA_ERROR_EMPTY_LABEL; |
| 422 } |
| 423 dest.Flush(); |
| 424 return; |
| 425 } |
| 426 UnicodeString destString; |
| 427 int32_t labelStart=0; |
| 428 if(srcLength<=256) { // length of stackArray[] |
| 429 // ASCII fastpath |
| 430 char stackArray[256]; |
| 431 int32_t destCapacity; |
| 432 char *destArray=dest.GetAppendBuffer(srcLength, srcLength+20, |
| 433 stackArray, LENGTHOF(stackArray), &
destCapacity); |
| 434 UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0; |
| 435 int32_t i; |
| 436 for(i=0;; ++i) { |
| 437 if(i==srcLength) { |
| 438 if(toASCII) { |
| 439 if((i-labelStart)>63) { |
| 440 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
| 441 } |
| 442 // There is a trailing dot if labelStart==i. |
| 443 if(!isLabel && i>=254 && (i>254 || labelStart<i)) { |
| 444 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; |
| 445 } |
| 446 } |
| 447 info.errors|=info.labelErrors; |
| 448 dest.Append(destArray, i); |
| 449 dest.Flush(); |
| 450 return; |
| 451 } |
| 452 char c=srcArray[i]; |
| 453 if((int8_t)c<0) { // (uint8_t)c>0x7f |
| 454 break; |
| 455 } |
| 456 int cData=asciiData[(int)c]; // Cast: gcc warns about indexing with
a char. |
| 457 if(cData>0) { |
| 458 destArray[i]=c+0x20; // Lowercase an uppercase ASCII letter. |
| 459 } else if(cData<0 && disallowNonLDHDot) { |
| 460 break; // Replacing with U+FFFD can be complicated for toASCII. |
| 461 } else { |
| 462 destArray[i]=c; |
| 463 if(c==0x2d) { // hyphen |
| 464 if(i==(labelStart+3) && srcArray[i-1]==0x2d) { |
| 465 // "??--..." is Punycode or forbidden. |
| 466 break; |
| 467 } |
| 468 if(i==labelStart) { |
| 469 // label starts with "-" |
| 470 info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN; |
| 471 } |
| 472 if((i+1)==srcLength || srcArray[i+1]==0x2e) { |
| 473 // label ends with "-" |
| 474 info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN; |
| 475 } |
| 476 } else if(c==0x2e) { // dot |
| 477 if(isLabel) { |
| 478 break; // Replacing with U+FFFD can be complicated for
toASCII. |
| 479 } |
| 480 if(toASCII) { |
| 481 // Permit an empty label at the end but not elsewhere. |
| 482 if(i==labelStart && i<(srcLength-1)) { |
| 483 info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL; |
| 484 } else if((i-labelStart)>63) { |
| 485 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
| 486 } |
| 487 } |
| 488 info.errors|=info.labelErrors; |
| 489 info.labelErrors=0; |
| 490 labelStart=i+1; |
| 491 } |
| 492 } |
| 493 } |
| 494 info.errors|=info.labelErrors; |
| 495 // Convert the processed ASCII prefix of the current label to UTF-16. |
| 496 int32_t mappingStart=i-labelStart; |
| 497 destString=UnicodeString::fromUTF8(StringPiece(destArray+labelStart, map
pingStart)); |
| 498 // Output the previous ASCII labels and process the rest of src in UTF-1
6. |
| 499 dest.Append(destArray, labelStart); |
| 500 processUnicode(UnicodeString::fromUTF8(StringPiece(src, labelStart)), 0,
mappingStart, |
| 501 isLabel, toASCII, |
| 502 destString, info, errorCode); |
| 503 } else { |
| 504 // src is too long for the ASCII fastpath implementation. |
| 505 processUnicode(UnicodeString::fromUTF8(src), 0, 0, |
| 506 isLabel, toASCII, |
| 507 destString, info, errorCode); |
| 508 } |
| 509 destString.toUTF8(dest); // calls dest.Flush() |
| 510 if(toASCII && !isLabel) { |
| 511 // length==labelStart==254 means that there is a trailing dot (ok) and |
| 512 // destString is empty (do not index at 253-labelStart). |
| 513 int32_t length=labelStart+destString.length(); |
| 514 if( length>=254 && isASCIIString(destString) && |
| 515 (length>254 || |
| 516 (labelStart<254 && destString[253-labelStart]!=0x2e)) |
| 517 ) { |
| 518 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; |
| 519 } |
| 520 } |
| 521 if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 && |
| 522 (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(srcArray, labelStart)
)) |
| 523 ) { |
| 524 info.errors|=UIDNA_ERROR_BIDI; |
| 525 } |
| 526 } |
| 527 |
| 528 UnicodeString & |
| 529 UTS46::processUnicode(const UnicodeString &src, |
| 530 int32_t labelStart, int32_t mappingStart, |
| 531 UBool isLabel, UBool toASCII, |
| 532 UnicodeString &dest, |
| 533 IDNAInfo &info, UErrorCode &errorCode) const { |
| 534 if(mappingStart==0) { |
| 535 uts46Norm2.normalize(src, dest, errorCode); |
| 536 } else { |
| 537 uts46Norm2.normalizeSecondAndAppend(dest, src.tempSubString(mappingStart
), errorCode); |
| 538 } |
| 539 if(U_FAILURE(errorCode)) { |
| 540 return dest; |
| 541 } |
| 542 UBool doMapDevChars= |
| 543 toASCII ? (options&UIDNA_NONTRANSITIONAL_TO_ASCII)==0 : |
| 544 (options&UIDNA_NONTRANSITIONAL_TO_UNICODE)==0; |
| 545 const UChar *destArray=dest.getBuffer(); |
| 546 int32_t destLength=dest.length(); |
| 547 int32_t labelLimit=labelStart; |
| 548 while(labelLimit<destLength) { |
| 549 UChar c=destArray[labelLimit]; |
| 550 if(c==0x2e && !isLabel) { |
| 551 int32_t labelLength=labelLimit-labelStart; |
| 552 int32_t newLength=processLabel(dest, labelStart, labelLength, |
| 553 toASCII, info, errorCode); |
| 554 info.errors|=info.labelErrors; |
| 555 info.labelErrors=0; |
| 556 if(U_FAILURE(errorCode)) { |
| 557 return dest; |
| 558 } |
| 559 destArray=dest.getBuffer(); |
| 560 destLength+=newLength-labelLength; |
| 561 labelLimit=labelStart+=newLength+1; |
| 562 } else if(0xdf<=c && c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) { |
| 563 info.isTransDiff=TRUE; |
| 564 if(doMapDevChars) { |
| 565 destLength=mapDevChars(dest, labelStart, labelLimit, errorCode); |
| 566 if(U_FAILURE(errorCode)) { |
| 567 return dest; |
| 568 } |
| 569 destArray=dest.getBuffer(); |
| 570 // Do not increment labelLimit in case c was removed. |
| 571 // All deviation characters have been mapped, no need to check f
or them again. |
| 572 doMapDevChars=FALSE; |
| 573 } else { |
| 574 ++labelLimit; |
| 575 } |
| 576 } else { |
| 577 ++labelLimit; |
| 578 } |
| 579 } |
| 580 // Permit an empty label at the end (0<labelStart==labelLimit==destLength is
ok) |
| 581 // but not an empty label elsewhere nor a completely empty domain name. |
| 582 // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0. |
| 583 if(0==labelStart || labelStart<labelLimit) { |
| 584 processLabel(dest, labelStart, labelLimit-labelStart, |
| 585 toASCII, info, errorCode); |
| 586 info.errors|=info.labelErrors; |
| 587 } |
| 588 return dest; |
| 589 } |
| 590 |
| 591 int32_t |
| 592 UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart
, |
| 593 UErrorCode &errorCode) const { |
| 594 int32_t length=dest.length(); |
| 595 UChar *s=dest.getBuffer(dest[mappingStart]==0xdf ? length+1 : length); |
| 596 if(s==NULL) { |
| 597 errorCode=U_MEMORY_ALLOCATION_ERROR; |
| 598 return length; |
| 599 } |
| 600 int32_t capacity=dest.getCapacity(); |
| 601 UBool didMapDevChars=FALSE; |
| 602 int32_t readIndex=mappingStart, writeIndex=mappingStart; |
| 603 do { |
| 604 UChar c=s[readIndex++]; |
| 605 switch(c) { |
| 606 case 0xdf: |
| 607 // Map sharp s to ss. |
| 608 didMapDevChars=TRUE; |
| 609 s[writeIndex++]=0x73; // Replace sharp s with first s. |
| 610 // Insert second s and account for possible buffer reallocation. |
| 611 if(writeIndex==readIndex) { |
| 612 if(length==capacity) { |
| 613 dest.releaseBuffer(length); |
| 614 s=dest.getBuffer(length+1); |
| 615 if(s==NULL) { |
| 616 errorCode=U_MEMORY_ALLOCATION_ERROR; |
| 617 return length; |
| 618 } |
| 619 capacity=dest.getCapacity(); |
| 620 } |
| 621 u_memmove(s+writeIndex+1, s+writeIndex, length-writeIndex); |
| 622 ++readIndex; |
| 623 } |
| 624 s[writeIndex++]=0x73; |
| 625 ++length; |
| 626 break; |
| 627 case 0x3c2: // Map final sigma to nonfinal sigma. |
| 628 didMapDevChars=TRUE; |
| 629 s[writeIndex++]=0x3c3; |
| 630 break; |
| 631 case 0x200c: // Ignore/remove ZWNJ. |
| 632 case 0x200d: // Ignore/remove ZWJ. |
| 633 didMapDevChars=TRUE; |
| 634 --length; |
| 635 break; |
| 636 default: |
| 637 // Only really necessary if writeIndex was different from readIndex. |
| 638 s[writeIndex++]=c; |
| 639 break; |
| 640 } |
| 641 } while(writeIndex<length); |
| 642 dest.releaseBuffer(length); |
| 643 if(didMapDevChars) { |
| 644 // Mapping deviation characters might have resulted in an un-NFC string. |
| 645 // We could use either the NFC or the UTS #46 normalizer. |
| 646 // By using the UTS #46 normalizer again, we avoid having to load a seco
nd .nrm data file. |
| 647 UnicodeString normalized; |
| 648 uts46Norm2.normalize(dest.tempSubString(labelStart), normalized, errorCo
de); |
| 649 if(U_SUCCESS(errorCode)) { |
| 650 dest.replace(labelStart, 0x7fffffff, normalized); |
| 651 return dest.length(); |
| 652 } |
| 653 } |
| 654 return length; |
| 655 } |
| 656 |
| 657 // Some non-ASCII characters are equivalent to sequences with |
| 658 // non-LDH ASCII characters. To find them: |
| 659 // grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt) |
| 660 static inline UBool |
| 661 isNonASCIIDisallowedSTD3Valid(UChar32 c) { |
| 662 return c==0x2260 || c==0x226E || c==0x226F; |
| 663 } |
| 664 |
| 665 // Replace the label in dest with the label string, if the label was modified. |
| 666 // If &label==&dest then the label was modified in-place and labelLength |
| 667 // is the new label length, different from label.length(). |
| 668 // If &label!=&dest then labelLength==label.length(). |
| 669 // Returns labelLength (= the new label length). |
| 670 static int32_t |
| 671 replaceLabel(UnicodeString &dest, int32_t destLabelStart, int32_t destLabelLengt
h, |
| 672 const UnicodeString &label, int32_t labelLength) { |
| 673 if(&label!=&dest) { |
| 674 dest.replace(destLabelStart, destLabelLength, label); |
| 675 } |
| 676 return labelLength; |
| 677 } |
| 678 |
| 679 int32_t |
| 680 UTS46::processLabel(UnicodeString &dest, |
| 681 int32_t labelStart, int32_t labelLength, |
| 682 UBool toASCII, |
| 683 IDNAInfo &info, UErrorCode &errorCode) const { |
| 684 UnicodeString fromPunycode; |
| 685 UnicodeString *labelString; |
| 686 const UChar *label=dest.getBuffer()+labelStart; |
| 687 int32_t destLabelStart=labelStart; |
| 688 int32_t destLabelLength=labelLength; |
| 689 UBool wasPunycode; |
| 690 if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && l
abel[3]==0x2d) { |
| 691 // Label starts with "xn--", try to un-Punycode it. |
| 692 wasPunycode=TRUE; |
| 693 UChar *unicodeBuffer=fromPunycode.getBuffer(-1); // capacity==-1: most
labels should fit |
| 694 if(unicodeBuffer==NULL) { |
| 695 // Should never occur if we used capacity==-1 which uses the interna
l buffer. |
| 696 errorCode=U_MEMORY_ALLOCATION_ERROR; |
| 697 return labelLength; |
| 698 } |
| 699 UErrorCode punycodeErrorCode=U_ZERO_ERROR; |
| 700 int32_t unicodeLength=u_strFromPunycode(label+4, labelLength-4, |
| 701 unicodeBuffer, fromPunycode.getC
apacity(), |
| 702 NULL, &punycodeErrorCode); |
| 703 if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) { |
| 704 fromPunycode.releaseBuffer(0); |
| 705 unicodeBuffer=fromPunycode.getBuffer(unicodeLength); |
| 706 if(unicodeBuffer==NULL) { |
| 707 errorCode=U_MEMORY_ALLOCATION_ERROR; |
| 708 return labelLength; |
| 709 } |
| 710 punycodeErrorCode=U_ZERO_ERROR; |
| 711 unicodeLength=u_strFromPunycode(label+4, labelLength-4, |
| 712 unicodeBuffer, fromPunycode.getCapac
ity(), |
| 713 NULL, &punycodeErrorCode); |
| 714 } |
| 715 fromPunycode.releaseBuffer(unicodeLength); |
| 716 if(U_FAILURE(punycodeErrorCode)) { |
| 717 info.labelErrors|=UIDNA_ERROR_PUNYCODE; |
| 718 return markBadACELabel(dest, labelStart, labelLength, toASCII, info)
; |
| 719 } |
| 720 // Check for NFC, and for characters that are not |
| 721 // valid or deviation characters according to the normalizer. |
| 722 // If there is something wrong, then the string will change. |
| 723 // Note that the normalizer passes through non-LDH ASCII and deviation c
haracters. |
| 724 // Deviation characters are ok in Punycode even in transitional processi
ng. |
| 725 // In the code further below, if we find non-LDH ASCII and we have UIDNA
_USE_STD3_RULES |
| 726 // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too. |
| 727 UBool isValid=uts46Norm2.isNormalized(fromPunycode, errorCode); |
| 728 if(U_FAILURE(errorCode)) { |
| 729 return labelLength; |
| 730 } |
| 731 if(!isValid) { |
| 732 info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL; |
| 733 return markBadACELabel(dest, labelStart, labelLength, toASCII, info)
; |
| 734 } |
| 735 labelString=&fromPunycode; |
| 736 label=fromPunycode.getBuffer(); |
| 737 labelStart=0; |
| 738 labelLength=fromPunycode.length(); |
| 739 } else { |
| 740 wasPunycode=FALSE; |
| 741 labelString=&dest; |
| 742 } |
| 743 // Validity check |
| 744 if(labelLength==0) { |
| 745 if(toASCII) { |
| 746 info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL; |
| 747 } |
| 748 return replaceLabel(dest, destLabelStart, destLabelLength, *labelString,
labelLength); |
| 749 } |
| 750 // labelLength>0 |
| 751 if(labelLength>=4 && label[2]==0x2d && label[3]==0x2d) { |
| 752 // label starts with "??--" |
| 753 info.labelErrors|=UIDNA_ERROR_HYPHEN_3_4; |
| 754 } |
| 755 if(label[0]==0x2d) { |
| 756 // label starts with "-" |
| 757 info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN; |
| 758 } |
| 759 if(label[labelLength-1]==0x2d) { |
| 760 // label ends with "-" |
| 761 info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN; |
| 762 } |
| 763 // If the label was not a Punycode label, then it was the result of |
| 764 // mapping, normalization and label segmentation. |
| 765 // If the label was in Punycode, then we mapped it again above |
| 766 // and checked its validity. |
| 767 // Now we handle the STD3 restriction to LDH characters (if set) |
| 768 // and we look for U+FFFD which indicates disallowed characters |
| 769 // in a non-Punycode label or U+FFFD itself in a Punycode label. |
| 770 // We also check for dots which can come from the input to a single-label fu
nction. |
| 771 // Ok to cast away const because we own the UnicodeString. |
| 772 UChar *s=(UChar *)label; |
| 773 const UChar *limit=label+labelLength; |
| 774 UChar oredChars=0; |
| 775 // If we enforce STD3 rules, then ASCII characters other than LDH and dot ar
e disallowed. |
| 776 UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0; |
| 777 do { |
| 778 UChar c=*s; |
| 779 if(c<=0x7f) { |
| 780 if(c==0x2e) { |
| 781 info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT; |
| 782 *s=0xfffd; |
| 783 } else if(disallowNonLDHDot && asciiData[c]<0) { |
| 784 info.labelErrors|=UIDNA_ERROR_DISALLOWED; |
| 785 *s=0xfffd; |
| 786 } |
| 787 } else { |
| 788 oredChars|=c; |
| 789 if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) { |
| 790 info.labelErrors|=UIDNA_ERROR_DISALLOWED; |
| 791 *s=0xfffd; |
| 792 } else if(c==0xfffd) { |
| 793 info.labelErrors|=UIDNA_ERROR_DISALLOWED; |
| 794 } |
| 795 } |
| 796 ++s; |
| 797 } while(s<limit); |
| 798 // Check for a leading combining mark after other validity checks |
| 799 // so that we don't report UIDNA_ERROR_DISALLOWED for the U+FFFD from here. |
| 800 UChar32 c; |
| 801 int32_t cpLength=0; |
| 802 // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD. |
| 803 U16_NEXT_UNSAFE(label, cpLength, c); |
| 804 if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) { |
| 805 info.labelErrors|=UIDNA_ERROR_LEADING_COMBINING_MARK; |
| 806 labelString->replace(labelStart, cpLength, (UChar)0xfffd); |
| 807 label=labelString->getBuffer()+labelStart; |
| 808 labelLength+=1-cpLength; |
| 809 if(labelString==&dest) { |
| 810 destLabelLength=labelLength; |
| 811 } |
| 812 } |
| 813 if((info.labelErrors&severeErrors)==0) { |
| 814 // Do contextual checks only if we do not have U+FFFD from a severe erro
r |
| 815 // because U+FFFD can make these checks fail. |
| 816 if((options&UIDNA_CHECK_BIDI)!=0 && (!info.isBiDi || info.isOkBiDi)) { |
| 817 checkLabelBiDi(label, labelLength, info); |
| 818 } |
| 819 if( (options&UIDNA_CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c && |
| 820 !isLabelOkContextJ(label, labelLength) |
| 821 ) { |
| 822 info.labelErrors|=UIDNA_ERROR_CONTEXTJ; |
| 823 } |
| 824 if(toASCII) { |
| 825 if(wasPunycode) { |
| 826 // Leave a Punycode label unchanged if it has no severe errors. |
| 827 if(destLabelLength>63) { |
| 828 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
| 829 } |
| 830 return destLabelLength; |
| 831 } else if(oredChars>=0x80) { |
| 832 // Contains non-ASCII characters. |
| 833 UnicodeString punycode; |
| 834 UChar *buffer=punycode.getBuffer(63); // 63==maximum DNS label
length |
| 835 if(buffer==NULL) { |
| 836 errorCode=U_MEMORY_ALLOCATION_ERROR; |
| 837 return destLabelLength; |
| 838 } |
| 839 buffer[0]=0x78; // Write "xn--". |
| 840 buffer[1]=0x6e; |
| 841 buffer[2]=0x2d; |
| 842 buffer[3]=0x2d; |
| 843 int32_t punycodeLength=u_strToPunycode(label, labelLength, |
| 844 buffer+4, punycode.getCapa
city()-4, |
| 845 NULL, &errorCode); |
| 846 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { |
| 847 errorCode=U_ZERO_ERROR; |
| 848 punycode.releaseBuffer(4); |
| 849 buffer=punycode.getBuffer(4+punycodeLength); |
| 850 if(buffer==NULL) { |
| 851 errorCode=U_MEMORY_ALLOCATION_ERROR; |
| 852 return destLabelLength; |
| 853 } |
| 854 punycodeLength=u_strToPunycode(label, labelLength, |
| 855 buffer+4, punycode.getCapacity
()-4, |
| 856 NULL, &errorCode); |
| 857 } |
| 858 punycodeLength+=4; |
| 859 punycode.releaseBuffer(punycodeLength); |
| 860 if(U_FAILURE(errorCode)) { |
| 861 return destLabelLength; |
| 862 } |
| 863 if(punycodeLength>63) { |
| 864 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
| 865 } |
| 866 return replaceLabel(dest, destLabelStart, destLabelLength, |
| 867 punycode, punycodeLength); |
| 868 } else { |
| 869 // all-ASCII label |
| 870 if(labelLength>63) { |
| 871 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
| 872 } |
| 873 } |
| 874 } |
| 875 } else { |
| 876 // If a Punycode label has severe errors, |
| 877 // then leave it but make sure it does not look valid. |
| 878 if(wasPunycode) { |
| 879 info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL; |
| 880 return markBadACELabel(dest, destLabelStart, destLabelLength, toASCI
I, info); |
| 881 } |
| 882 } |
| 883 return replaceLabel(dest, destLabelStart, destLabelLength, *labelString, lab
elLength); |
| 884 } |
| 885 |
| 886 // Make sure an ACE label does not look valid. |
| 887 // Append U+FFFD if the label has only LDH characters. |
| 888 // If UIDNA_USE_STD3_RULES, also replace disallowed ASCII characters with U+FFFD
. |
| 889 int32_t |
| 890 UTS46::markBadACELabel(UnicodeString &dest, |
| 891 int32_t labelStart, int32_t labelLength, |
| 892 UBool toASCII, IDNAInfo &info) const { |
| 893 UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0; |
| 894 UBool isASCII=TRUE; |
| 895 UBool onlyLDH=TRUE; |
| 896 const UChar *label=dest.getBuffer()+labelStart; |
| 897 // Ok to cast away const because we own the UnicodeString. |
| 898 UChar *s=(UChar *)label+4; // After the initial "xn--". |
| 899 const UChar *limit=label+labelLength; |
| 900 do { |
| 901 UChar c=*s; |
| 902 if(c<=0x7f) { |
| 903 if(c==0x2e) { |
| 904 info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT; |
| 905 *s=0xfffd; |
| 906 isASCII=onlyLDH=FALSE; |
| 907 } else if(asciiData[c]<0) { |
| 908 onlyLDH=FALSE; |
| 909 if(disallowNonLDHDot) { |
| 910 *s=0xfffd; |
| 911 isASCII=FALSE; |
| 912 } |
| 913 } |
| 914 } else { |
| 915 isASCII=onlyLDH=FALSE; |
| 916 } |
| 917 } while(++s<limit); |
| 918 if(onlyLDH) { |
| 919 dest.insert(labelStart+labelLength, (UChar)0xfffd); |
| 920 ++labelLength; |
| 921 } else { |
| 922 if(toASCII && isASCII && labelLength>63) { |
| 923 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG; |
| 924 } |
| 925 } |
| 926 return labelLength; |
| 927 } |
| 928 |
| 929 const uint32_t L_MASK=U_MASK(U_LEFT_TO_RIGHT); |
| 930 const uint32_t R_AL_MASK=U_MASK(U_RIGHT_TO_LEFT)|U_MASK(U_RIGHT_TO_LEFT_ARABIC); |
| 931 const uint32_t L_R_AL_MASK=L_MASK|R_AL_MASK; |
| 932 |
| 933 const uint32_t R_AL_AN_MASK=R_AL_MASK|U_MASK(U_ARABIC_NUMBER); |
| 934 |
| 935 const uint32_t EN_AN_MASK=U_MASK(U_EUROPEAN_NUMBER)|U_MASK(U_ARABIC_NUMBER); |
| 936 const uint32_t R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK; |
| 937 const uint32_t L_EN_MASK=L_MASK|U_MASK(U_EUROPEAN_NUMBER); |
| 938 |
| 939 const uint32_t ES_CS_ET_ON_BN_NSM_MASK= |
| 940 U_MASK(U_EUROPEAN_NUMBER_SEPARATOR)| |
| 941 U_MASK(U_COMMON_NUMBER_SEPARATOR)| |
| 942 U_MASK(U_EUROPEAN_NUMBER_TERMINATOR)| |
| 943 U_MASK(U_OTHER_NEUTRAL)| |
| 944 U_MASK(U_BOUNDARY_NEUTRAL)| |
| 945 U_MASK(U_DIR_NON_SPACING_MARK); |
| 946 const uint32_t L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK; |
| 947 const uint32_t R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_
ON_BN_NSM_MASK; |
| 948 |
| 949 // We scan the whole label and check both for whether it contains RTL characters |
| 950 // and whether it passes the BiDi Rule. |
| 951 // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find |
| 952 // that a domain name is a BiDi domain name (has an RTL label) only after |
| 953 // processing several earlier labels. |
| 954 void |
| 955 UTS46::checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) c
onst { |
| 956 // IDNA2008 BiDi rule |
| 957 // Get the directionality of the first character. |
| 958 UChar32 c; |
| 959 int32_t i=0; |
| 960 U16_NEXT_UNSAFE(label, i, c); |
| 961 uint32_t firstMask=U_MASK(u_charDirection(c)); |
| 962 // 1. The first character must be a character with BIDI property L, R |
| 963 // or AL. If it has the R or AL property, it is an RTL label; if it |
| 964 // has the L property, it is an LTR label. |
| 965 if((firstMask&~L_R_AL_MASK)!=0) { |
| 966 info.isOkBiDi=FALSE; |
| 967 } |
| 968 // Get the directionality of the last non-NSM character. |
| 969 uint32_t lastMask; |
| 970 for(;;) { |
| 971 if(i>=labelLength) { |
| 972 lastMask=firstMask; |
| 973 break; |
| 974 } |
| 975 U16_PREV_UNSAFE(label, labelLength, c); |
| 976 UCharDirection dir=u_charDirection(c); |
| 977 if(dir!=U_DIR_NON_SPACING_MARK) { |
| 978 lastMask=U_MASK(dir); |
| 979 break; |
| 980 } |
| 981 } |
| 982 // 3. In an RTL label, the end of the label must be a character with |
| 983 // BIDI property R, AL, EN or AN, followed by zero or more |
| 984 // characters with BIDI property NSM. |
| 985 // 6. In an LTR label, the end of the label must be a character with |
| 986 // BIDI property L or EN, followed by zero or more characters with |
| 987 // BIDI property NSM. |
| 988 if( (firstMask&L_MASK)!=0 ? |
| 989 (lastMask&~L_EN_MASK)!=0 : |
| 990 (lastMask&~R_AL_EN_AN_MASK)!=0 |
| 991 ) { |
| 992 info.isOkBiDi=FALSE; |
| 993 } |
| 994 // Get the directionalities of the intervening characters. |
| 995 uint32_t mask=0; |
| 996 while(i<labelLength) { |
| 997 U16_NEXT_UNSAFE(label, i, c); |
| 998 mask|=U_MASK(u_charDirection(c)); |
| 999 } |
| 1000 if(firstMask&L_MASK) { |
| 1001 // 5. In an LTR label, only characters with the BIDI properties L, EN, |
| 1002 // ES, CS, ET, ON, BN and NSM are allowed. |
| 1003 if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) { |
| 1004 info.isOkBiDi=FALSE; |
| 1005 } |
| 1006 } else { |
| 1007 // 2. In an RTL label, only characters with the BIDI properties R, AL, |
| 1008 // AN, EN, ES, CS, ET, ON, BN and NSM are allowed. |
| 1009 if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) { |
| 1010 info.isOkBiDi=FALSE; |
| 1011 } |
| 1012 // 4. In an RTL label, if an EN is present, no AN may be present, and |
| 1013 // vice versa. |
| 1014 if((mask&EN_AN_MASK)==EN_AN_MASK) { |
| 1015 info.isOkBiDi=FALSE; |
| 1016 } |
| 1017 } |
| 1018 // An RTL label is a label that contains at least one character of type |
| 1019 // R, AL or AN. [...] |
| 1020 // A "BIDI domain name" is a domain name that contains at least one RTL |
| 1021 // label. [...] |
| 1022 // The following rule, consisting of six conditions, applies to labels |
| 1023 // in BIDI domain names. |
| 1024 if(((firstMask|mask|lastMask)&R_AL_AN_MASK)!=0) { |
| 1025 info.isBiDi=TRUE; |
| 1026 } |
| 1027 } |
| 1028 |
| 1029 // Special code for the ASCII prefix of a BiDi domain name. |
| 1030 // The ASCII prefix is all-LTR. |
| 1031 |
| 1032 // IDNA2008 BiDi rule, parts relevant to ASCII labels: |
| 1033 // 1. The first character must be a character with BIDI property L [...] |
| 1034 // 5. In an LTR label, only characters with the BIDI properties L, EN, |
| 1035 // ES, CS, ET, ON, BN and NSM are allowed. |
| 1036 // 6. In an LTR label, the end of the label must be a character with |
| 1037 // BIDI property L or EN [...] |
| 1038 |
| 1039 // UTF-16 version, called for mapped ASCII prefix. |
| 1040 // Cannot contain uppercase A-Z. |
| 1041 // s[length-1] must be the trailing dot. |
| 1042 static UBool |
| 1043 isASCIIOkBiDi(const UChar *s, int32_t length) { |
| 1044 int32_t labelStart=0; |
| 1045 for(int32_t i=0; i<length; ++i) { |
| 1046 UChar c=s[i]; |
| 1047 if(c==0x2e) { // dot |
| 1048 if(i>labelStart) { |
| 1049 c=s[i-1]; |
| 1050 if(!(0x61<=c && c<=0x7a) && !(0x30<=c && c<=0x39)) { |
| 1051 // Last character in the label is not an L or EN. |
| 1052 return FALSE; |
| 1053 } |
| 1054 } |
| 1055 labelStart=i+1; |
| 1056 } else if(i==labelStart) { |
| 1057 if(!(0x61<=c && c<=0x7a)) { |
| 1058 // First character in the label is not an L. |
| 1059 return FALSE; |
| 1060 } |
| 1061 } else { |
| 1062 if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { |
| 1063 // Intermediate character in the label is a B, S or WS. |
| 1064 return FALSE; |
| 1065 } |
| 1066 } |
| 1067 } |
| 1068 return TRUE; |
| 1069 } |
| 1070 |
| 1071 // UTF-8 version, called for source ASCII prefix. |
| 1072 // Can contain uppercase A-Z. |
| 1073 // s[length-1] must be the trailing dot. |
| 1074 static UBool |
| 1075 isASCIIOkBiDi(const char *s, int32_t length) { |
| 1076 int32_t labelStart=0; |
| 1077 for(int32_t i=0; i<length; ++i) { |
| 1078 char c=s[i]; |
| 1079 if(c==0x2e) { // dot |
| 1080 if(i>labelStart) { |
| 1081 c=s[i-1]; |
| 1082 if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a) && !(0x30<=c &
& c<=0x39)) { |
| 1083 // Last character in the label is not an L or EN. |
| 1084 return FALSE; |
| 1085 } |
| 1086 } |
| 1087 labelStart=i+1; |
| 1088 } else if(i==labelStart) { |
| 1089 if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a)) { |
| 1090 // First character in the label is not an L. |
| 1091 return FALSE; |
| 1092 } |
| 1093 } else { |
| 1094 if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { |
| 1095 // Intermediate character in the label is a B, S or WS. |
| 1096 return FALSE; |
| 1097 } |
| 1098 } |
| 1099 } |
| 1100 return TRUE; |
| 1101 } |
| 1102 |
| 1103 UBool |
| 1104 UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const { |
| 1105 // [IDNA2008-Tables] |
| 1106 // 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER |
| 1107 for(int32_t i=0; i<labelLength; ++i) { |
| 1108 if(label[i]==0x200c) { |
| 1109 // Appendix A.1. ZERO WIDTH NON-JOINER |
| 1110 // Rule Set: |
| 1111 // False; |
| 1112 // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; |
| 1113 // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C |
| 1114 // (Joining_Type:T)*(Joining_Type:{R,D})) Then True; |
| 1115 if(i==0) { |
| 1116 return FALSE; |
| 1117 } |
| 1118 UChar32 c; |
| 1119 int32_t j=i; |
| 1120 U16_PREV_UNSAFE(label, j, c); |
| 1121 if(u_getCombiningClass(c)==9) { |
| 1122 continue; |
| 1123 } |
| 1124 // check precontext (Joining_Type:{L,D})(Joining_Type:T)* |
| 1125 for(;;) { |
| 1126 UJoiningType type=(UJoiningType)u_getIntPropertyValue(c, UCHAR_J
OINING_TYPE); |
| 1127 if(type==U_JT_TRANSPARENT) { |
| 1128 if(j==0) { |
| 1129 return FALSE; |
| 1130 } |
| 1131 U16_PREV_UNSAFE(label, j, c); |
| 1132 } else if(type==U_JT_LEFT_JOINING || type==U_JT_DUAL_JOINING) { |
| 1133 break; // precontext fulfilled |
| 1134 } else { |
| 1135 return FALSE; |
| 1136 } |
| 1137 } |
| 1138 // check postcontext (Joining_Type:T)*(Joining_Type:{R,D}) |
| 1139 for(j=i+1;;) { |
| 1140 if(j==labelLength) { |
| 1141 return FALSE; |
| 1142 } |
| 1143 U16_NEXT_UNSAFE(label, j, c); |
| 1144 UJoiningType type=(UJoiningType)u_getIntPropertyValue(c, UCHAR_J
OINING_TYPE); |
| 1145 if(type==U_JT_TRANSPARENT) { |
| 1146 // just skip this character |
| 1147 } else if(type==U_JT_RIGHT_JOINING || type==U_JT_DUAL_JOINING) { |
| 1148 break; // postcontext fulfilled |
| 1149 } else { |
| 1150 return FALSE; |
| 1151 } |
| 1152 } |
| 1153 } else if(label[i]==0x200d) { |
| 1154 // Appendix A.2. ZERO WIDTH JOINER (U+200D) |
| 1155 // Rule Set: |
| 1156 // False; |
| 1157 // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; |
| 1158 if(i==0) { |
| 1159 return FALSE; |
| 1160 } |
| 1161 UChar32 c; |
| 1162 int32_t j=i; |
| 1163 U16_PREV_UNSAFE(label, j, c); |
| 1164 if(u_getCombiningClass(c)!=9) { |
| 1165 return FALSE; |
| 1166 } |
| 1167 } |
| 1168 } |
| 1169 return TRUE; |
| 1170 } |
| 1171 |
| 1172 U_NAMESPACE_END |
| 1173 |
| 1174 // C API ------------------------------------------------------------------- *** |
| 1175 |
| 1176 U_NAMESPACE_USE |
| 1177 |
| 1178 U_DRAFT UIDNA * U_EXPORT2 |
| 1179 uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode) { |
| 1180 return reinterpret_cast<UIDNA *>(IDNA::createUTS46Instance(options, *pErrorC
ode)); |
| 1181 } |
| 1182 |
| 1183 U_DRAFT void U_EXPORT2 |
| 1184 uidna_close(UIDNA *idna) { |
| 1185 delete reinterpret_cast<IDNA *>(idna); |
| 1186 } |
| 1187 |
| 1188 static UBool |
| 1189 checkArgs(const void *label, int32_t length, |
| 1190 void *dest, int32_t capacity, |
| 1191 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
| 1192 if(U_FAILURE(*pErrorCode)) { |
| 1193 return FALSE; |
| 1194 } |
| 1195 // sizeof(UIDNAInfo)=16 in the first API version. |
| 1196 if(pInfo==NULL || pInfo->size<16) { |
| 1197 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 1198 return FALSE; |
| 1199 } |
| 1200 if( (label==NULL ? length!=0 : length<-1) || |
| 1201 (dest==NULL ? capacity!=0 : capacity<0) || |
| 1202 (dest==label && label!=NULL) |
| 1203 ) { |
| 1204 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
| 1205 return FALSE; |
| 1206 } |
| 1207 // Set all *pInfo bytes to 0 except for the size field itself. |
| 1208 uprv_memset(&pInfo->size+1, 0, pInfo->size-sizeof(pInfo->size)); |
| 1209 return TRUE; |
| 1210 } |
| 1211 |
| 1212 static void |
| 1213 idnaInfoToStruct(IDNAInfo &info, UIDNAInfo *pInfo) { |
| 1214 pInfo->isTransitionalDifferent=info.isTransitionalDifferent(); |
| 1215 pInfo->errors=info.getErrors(); |
| 1216 } |
| 1217 |
| 1218 U_DRAFT int32_t U_EXPORT2 |
| 1219 uidna_labelToASCII(const UIDNA *idna, |
| 1220 const UChar *label, int32_t length, |
| 1221 UChar *dest, int32_t capacity, |
| 1222 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
| 1223 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { |
| 1224 return 0; |
| 1225 } |
| 1226 UnicodeString src((UBool)(length<0), label, length); |
| 1227 UnicodeString destString(dest, 0, capacity); |
| 1228 IDNAInfo info; |
| 1229 reinterpret_cast<const IDNA *>(idna)->labelToASCII(src, destString, info, *p
ErrorCode); |
| 1230 idnaInfoToStruct(info, pInfo); |
| 1231 return destString.extract(dest, capacity, *pErrorCode); |
| 1232 } |
| 1233 |
| 1234 U_DRAFT int32_t U_EXPORT2 |
| 1235 uidna_labelToUnicode(const UIDNA *idna, |
| 1236 const UChar *label, int32_t length, |
| 1237 UChar *dest, int32_t capacity, |
| 1238 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
| 1239 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { |
| 1240 return 0; |
| 1241 } |
| 1242 UnicodeString src((UBool)(length<0), label, length); |
| 1243 UnicodeString destString(dest, 0, capacity); |
| 1244 IDNAInfo info; |
| 1245 reinterpret_cast<const IDNA *>(idna)->labelToUnicode(src, destString, info,
*pErrorCode); |
| 1246 idnaInfoToStruct(info, pInfo); |
| 1247 return destString.extract(dest, capacity, *pErrorCode); |
| 1248 } |
| 1249 |
| 1250 U_DRAFT int32_t U_EXPORT2 |
| 1251 uidna_nameToASCII(const UIDNA *idna, |
| 1252 const UChar *name, int32_t length, |
| 1253 UChar *dest, int32_t capacity, |
| 1254 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
| 1255 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { |
| 1256 return 0; |
| 1257 } |
| 1258 UnicodeString src((UBool)(length<0), name, length); |
| 1259 UnicodeString destString(dest, 0, capacity); |
| 1260 IDNAInfo info; |
| 1261 reinterpret_cast<const IDNA *>(idna)->nameToASCII(src, destString, info, *pE
rrorCode); |
| 1262 idnaInfoToStruct(info, pInfo); |
| 1263 return destString.extract(dest, capacity, *pErrorCode); |
| 1264 } |
| 1265 |
| 1266 U_DRAFT int32_t U_EXPORT2 |
| 1267 uidna_nameToUnicode(const UIDNA *idna, |
| 1268 const UChar *name, int32_t length, |
| 1269 UChar *dest, int32_t capacity, |
| 1270 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
| 1271 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { |
| 1272 return 0; |
| 1273 } |
| 1274 UnicodeString src((UBool)(length<0), name, length); |
| 1275 UnicodeString destString(dest, 0, capacity); |
| 1276 IDNAInfo info; |
| 1277 reinterpret_cast<const IDNA *>(idna)->nameToUnicode(src, destString, info, *
pErrorCode); |
| 1278 idnaInfoToStruct(info, pInfo); |
| 1279 return destString.extract(dest, capacity, *pErrorCode); |
| 1280 } |
| 1281 |
| 1282 U_DRAFT int32_t U_EXPORT2 |
| 1283 uidna_labelToASCII_UTF8(const UIDNA *idna, |
| 1284 const char *label, int32_t length, |
| 1285 char *dest, int32_t capacity, |
| 1286 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
| 1287 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { |
| 1288 return 0; |
| 1289 } |
| 1290 StringPiece src(label, length<0 ? uprv_strlen(label) : length); |
| 1291 CheckedArrayByteSink sink(dest, capacity); |
| 1292 IDNAInfo info; |
| 1293 reinterpret_cast<const IDNA *>(idna)->labelToASCII_UTF8(src, sink, info, *pE
rrorCode); |
| 1294 idnaInfoToStruct(info, pInfo); |
| 1295 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pError
Code); |
| 1296 } |
| 1297 |
| 1298 U_DRAFT int32_t U_EXPORT2 |
| 1299 uidna_labelToUnicodeUTF8(const UIDNA *idna, |
| 1300 const char *label, int32_t length, |
| 1301 char *dest, int32_t capacity, |
| 1302 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
| 1303 if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) { |
| 1304 return 0; |
| 1305 } |
| 1306 StringPiece src(label, length<0 ? uprv_strlen(label) : length); |
| 1307 CheckedArrayByteSink sink(dest, capacity); |
| 1308 IDNAInfo info; |
| 1309 reinterpret_cast<const IDNA *>(idna)->labelToUnicodeUTF8(src, sink, info, *p
ErrorCode); |
| 1310 idnaInfoToStruct(info, pInfo); |
| 1311 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pError
Code); |
| 1312 } |
| 1313 |
| 1314 U_DRAFT int32_t U_EXPORT2 |
| 1315 uidna_nameToASCII_UTF8(const UIDNA *idna, |
| 1316 const char *name, int32_t length, |
| 1317 char *dest, int32_t capacity, |
| 1318 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
| 1319 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { |
| 1320 return 0; |
| 1321 } |
| 1322 StringPiece src(name, length<0 ? uprv_strlen(name) : length); |
| 1323 CheckedArrayByteSink sink(dest, capacity); |
| 1324 IDNAInfo info; |
| 1325 reinterpret_cast<const IDNA *>(idna)->nameToASCII_UTF8(src, sink, info, *pEr
rorCode); |
| 1326 idnaInfoToStruct(info, pInfo); |
| 1327 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pError
Code); |
| 1328 } |
| 1329 |
| 1330 U_DRAFT int32_t U_EXPORT2 |
| 1331 uidna_nameToUnicodeUTF8(const UIDNA *idna, |
| 1332 const char *name, int32_t length, |
| 1333 char *dest, int32_t capacity, |
| 1334 UIDNAInfo *pInfo, UErrorCode *pErrorCode) { |
| 1335 if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) { |
| 1336 return 0; |
| 1337 } |
| 1338 StringPiece src(name, length<0 ? uprv_strlen(name) : length); |
| 1339 CheckedArrayByteSink sink(dest, capacity); |
| 1340 IDNAInfo info; |
| 1341 reinterpret_cast<const IDNA *>(idna)->nameToUnicodeUTF8(src, sink, info, *pE
rrorCode); |
| 1342 idnaInfoToStruct(info, pInfo); |
| 1343 return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pError
Code); |
| 1344 } |
| 1345 |
| 1346 #endif // UCONFIG_NO_IDNA |
OLD | NEW |