OLD | NEW |
(Empty) | |
| 1 from . import idnadata |
| 2 import bisect |
| 3 import unicodedata |
| 4 import re |
| 5 import sys |
| 6 from .intranges import intranges_contain |
| 7 |
| 8 _virama_combining_class = 9 |
| 9 _alabel_prefix = b'xn--' |
| 10 _unicode_dots_re = re.compile(u'[\u002e\u3002\uff0e\uff61]') |
| 11 |
| 12 if sys.version_info[0] == 3: |
| 13 unicode = str |
| 14 unichr = chr |
| 15 |
class IDNAError(UnicodeError):
    """Root of the exception hierarchy for every IDNA-related failure."""
| 19 |
| 20 |
class IDNABidiError(IDNAError):
    """Raised when a label violates the bidirectional-text requirements."""
| 24 |
| 25 |
class InvalidCodepoint(IDNAError):
    """Raised when a label contains a disallowed or unallocated codepoint."""
| 29 |
| 30 |
class InvalidCodepointContext(IDNAError):
    """Raised when a codepoint is not permitted in the context where it appears."""
| 34 |
| 35 |
def _combining_class(cp):
    """Return the canonical combining class of the integer code point ``cp``."""
    character = unichr(cp)
    return unicodedata.combining(character)
| 38 |
def _is_script(cp, script):
    """Tell whether the character ``cp`` lies in the ranges stored for ``script``.

    ``cp`` is a one-character string; ``script`` is a key of
    ``idnadata.scripts``.
    """
    code_point = ord(cp)
    script_ranges = idnadata.scripts[script]
    return intranges_contain(code_point, script_ranges)
| 41 |
| 42 def _punycode(s): |
| 43 return s.encode('punycode') |
| 44 |
| 45 def _unot(s): |
| 46 return 'U+{0:04X}'.format(s) |
| 47 |
| 48 |
def valid_label_length(label):
    """Return True when ``label`` is at most 63 units long, else False."""
    return not len(label) > 63
| 54 |
| 55 |
def valid_string_length(label, trailing_dot):
    """Return True when the full string ``label`` fits the length limit.

    The limit is 254 when ``trailing_dot`` is truthy and 253 otherwise.
    """
    if trailing_dot:
        limit = 254
    else:
        limit = 253
    return not len(label) > limit
| 61 |
| 62 |
def check_bidi(label, check_ltr=False):
    """Validate ``label`` against the six Bidi rules.

    The rules are only enforced when the label contains a character whose
    bidirectional class is R, AL or AN, or when ``check_ltr`` is true.

    Returns True when the label passes (or when the rules do not apply).
    Raises IDNABidiError when a rule is violated or when a character with
    an empty bidirectional class is met while scanning for RTL characters.
    """
    # Bidi rules should only be applied if string contains RTL characters
    contains_rtl = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class == '':
            # String likely comes from a newer version of Unicode
            raise IDNABidiError('Unknown directionality in label {0} at position {1}'.format(repr(label), position))
        if bidi_class in ('R', 'AL', 'AN'):
            contains_rtl = True
            break
    if not (contains_rtl or check_ltr):
        return True

    # Bidi rule 1: the first character fixes the direction of the label
    first_class = unicodedata.bidirectional(label[0])
    if first_class == 'L':
        rtl = False
    elif first_class in ('R', 'AL'):
        rtl = True
    else:
        raise IDNABidiError('First codepoint in label {0} must be directionality L, R or AL'.format(repr(label)))

    rtl_allowed = ('R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')
    ltr_allowed = ('L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')
    rtl_endings = ('R', 'AL', 'EN', 'AN')
    ltr_endings = ('L', 'EN')

    ends_validly = False
    seen_number_class = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)

        if rtl:
            # Bidi rule 2
            if bidi_class not in rtl_allowed:
                raise IDNABidiError('Invalid direction for codepoint at position {0} in a right-to-left label'.format(position))
            # Bidi rule 3: NSM characters leave the ending state untouched
            if bidi_class != 'NSM':
                ends_validly = bidi_class in rtl_endings
            # Bidi rule 4: only one kind of number (AN or EN) per label
            if bidi_class in ('AN', 'EN'):
                if not seen_number_class:
                    seen_number_class = bidi_class
                elif seen_number_class != bidi_class:
                    raise IDNABidiError('Can not mix numeral types in a right-to-left label')
        else:
            # Bidi rule 5
            if bidi_class not in ltr_allowed:
                raise IDNABidiError('Invalid direction for codepoint at position {0} in a left-to-right label'.format(position))
            # Bidi rule 6: NSM characters leave the ending state untouched
            if bidi_class != 'NSM':
                ends_validly = bidi_class in ltr_endings

    if not ends_validly:
        raise IDNABidiError('Label ends with illegal codepoint directionality')

    return True
| 122 |
| 123 |
def check_initial_combiner(label):
    """Reject labels whose first character is a combining mark.

    Returns True when the first character's general category does not start
    with ``M``; raises IDNAError otherwise.
    """
    leading_category = unicodedata.category(label[0])
    if leading_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True
| 129 |
| 130 |
def check_hyphen_ok(label):
    """Validate the placement of hyphens in ``label``.

    Raises IDNAError when the 3rd and 4th characters are both hyphens, or
    when the label starts or ends with a hyphen. Returns True otherwise.
    """
    third_and_fourth = label[2:4]
    if third_and_fourth == '--':
        raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
    if '-' in (label[0], label[-1]):
        raise IDNAError('Label must not start or end with a hyphen')
    return True
| 138 |
| 139 |
def check_nfc(label):
    """Raise IDNAError unless ``label`` is already in Normalization Form C.

    Returns None when the label is unchanged by NFC normalization.
    """
    normalized = unicodedata.normalize('NFC', label)
    if normalized != label:
        raise IDNAError('Label must be in Normalization Form C')
| 144 |
| 145 |
def _zwnj_neighbour_ok(label, positions, wanted_types):
    """Check one side of the ZERO WIDTH NON-JOINER joining-type rule.

    Walks ``positions`` (indexes into ``label``, nearest to the joiner
    first), skipping characters whose joining type is ``T``. The first
    non-transparent character decides the result: True when its joining type
    is in ``wanted_types``, False otherwise. Returns False when no such
    character exists.
    """
    for i in positions:
        joining_type = idnadata.joining_types.get(ord(label[i]))
        if joining_type == 'T':
            continue
        # Stop at the first non-transparent character. Scanning further
        # would accept labels in which an unrelated character sits between
        # the joiner and the joining character.
        return joining_type in wanted_types
    return False


def valid_contextj(label, pos):
    """Check the CONTEXTJ rule for the joiner at ``label[pos]``.

    Rules applied:

    * U+200C (ZERO WIDTH NON-JOINER) and U+200D (ZERO WIDTH JOINER) are
      both valid directly after a character whose combining class equals
      ``_virama_combining_class``.
    * U+200C is additionally valid when the label matches
      ``(L|D) T* U+200C T* (R|D)`` in terms of joining types, i.e. only
      transparent characters may separate the joiner from the joining
      characters on either side.
    * Any other code point is rejected.

    Args:
        label: The text label being validated.
        pos: Index into ``label`` of the code point to check.

    Returns:
        True when the code point is allowed in this context, else False.
    """
    cp_value = ord(label[pos])

    if cp_value not in (0x200c, 0x200d):
        return False

    # Both joiners are permitted immediately after a virama.
    if pos > 0:
        if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
            return True

    if cp_value == 0x200d:
        return False

    # ZERO WIDTH NON-JOINER: both sides must satisfy the joining-type rule.
    before = range(pos - 1, -1, -1)
    after = range(pos + 1, len(label))
    return (_zwnj_neighbour_ok(label, before, ('L', 'D')) and
            _zwnj_neighbour_ok(label, after, ('R', 'D')))
| 188 |
| 189 |
def valid_contexto(label, pos, exception=False):
    """Check the CONTEXTO rule for the code point at ``label[pos]``.

    Rules applied, by code point:

    * U+00B7: must sit between two U+006C characters.
    * U+0375: the following character must be in the Greek script.
    * U+05F3 / U+05F4: the preceding character must be in the Hebrew script.
    * U+30FB: every other character of the label must be Hiragana,
      Katakana or Han.
    * U+0660..U+0669: the label must contain no U+06F0..U+06F9 digit.
    * U+06F0..U+06F9: the label must contain no U+0660..U+0669 digit.

    Args:
        label: The text label being validated.
        pos: Index into ``label`` of the code point to check.
        exception: Unused; kept for interface compatibility.

    Returns:
        True when the code point is allowed in this context, else False.
        Code points with no rule here return False (previously an implicit
        None), matching ``valid_contextj``.
    """
    cp_value = ord(label[pos])

    if cp_value == 0x00b7:
        if 0 < pos < len(label)-1:
            if ord(label[pos - 1]) == 0x006c and ord(label[pos + 1]) == 0x006c:
                return True
        return False

    elif cp_value == 0x0375:
        if pos < len(label)-1 and len(label) > 1:
            return _is_script(label[pos + 1], 'Greek')
        return False

    elif cp_value == 0x05f3 or cp_value == 0x05f4:
        if pos > 0:
            return _is_script(label[pos - 1], 'Hebrew')
        return False

    elif cp_value == 0x30fb:
        for cp in label:
            if cp == u'\u30fb':
                continue
            if not _is_script(cp, 'Hiragana') and not _is_script(cp, 'Katakana') and not _is_script(cp, 'Han'):
                return False
        return True

    elif 0x660 <= cp_value <= 0x669:
        # Arabic-Indic digits must not be mixed with the extended ones.
        return not any(0x6f0 <= ord(cp) <= 0x06f9 for cp in label)

    elif 0x6f0 <= cp_value <= 0x6f9:
        # Extended Arabic-Indic digits must not be mixed with the plain ones.
        return not any(0x660 <= ord(cp) <= 0x0669 for cp in label)

    # No contextual rule is defined for this code point.
    return False
| 229 |
| 230 |
def check_label(label):
    """Validate a single U-label, raising on the first problem found.

    Byte input is decoded as UTF-8 first. The label must be non-empty, in
    NFC, free of misplaced hyphens and must not start with a combining mark.
    Each code point must then be PVALID, or be a CONTEXTJ/CONTEXTO code
    point that satisfies its contextual rule. The Bidi rules are checked
    last.

    Raises:
        IDNAError: empty label, or a failed NFC/hyphen/combiner check.
        InvalidCodepointContext: a contextual code point in a bad position.
        InvalidCodepoint: a code point in none of the permitted classes.
        IDNABidiError: propagated from ``check_bidi``.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode('utf-8')
    if not label:
        raise IDNAError('Empty Label')

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    classes = idnadata.codepoint_classes
    for index, char in enumerate(label):
        code = ord(char)
        if intranges_contain(code, classes['PVALID']):
            continue
        if intranges_contain(code, classes['CONTEXTJ']):
            if valid_contextj(label, index):
                continue
            raise InvalidCodepointContext('Joiner {0} not allowed at position {1} in {2}'.format(_unot(code), index+1, repr(label)))
        if intranges_contain(code, classes['CONTEXTO']):
            if valid_contexto(label, index):
                continue
            raise InvalidCodepointContext('Codepoint {0} not allowed at position {1} in {2}'.format(_unot(code), index+1, repr(label)))
        raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(code), index+1, repr(label)))

    check_bidi(label)
| 256 |
| 257 |
def alabel(label):
    """Convert a single label to its A-label (ASCII, bytes) form.

    A label that is already pure ASCII is validated with ``ulabel`` and
    returned as bytes unchanged. Any other label is validated with
    ``check_label``, punycode-encoded and prefixed with ``_alabel_prefix``.

    Args:
        label: The text label to convert.

    Returns:
        The A-label as bytes.

    Raises:
        IDNAError: the label is empty, is an invalid ASCII/A-label, fails
            ``check_label``, or is longer than 63 bytes once encoded.
    """
    if not label:
        raise IDNAError('No Input')

    # Only the ASCII conversion is guarded here. IDNAError derives from
    # UnicodeError, so a wider try block would silently swallow the
    # validation errors raised below.
    try:
        ascii_label = label.encode('ascii')
    except UnicodeError:
        pass
    else:
        try:
            ulabel(ascii_label)
        except UnicodeError:
            # Covers IDNAError and codec errors from punycode decoding.
            raise IDNAError('The label {0} is not a valid A-label'.format(ascii_label))
        if not valid_label_length(ascii_label):
            raise IDNAError('Label too long')
        return ascii_label

    label = unicode(label)
    check_label(label)
    label = _alabel_prefix + _punycode(label)

    if not valid_label_length(label):
        raise IDNAError('Label too long')

    return label
| 284 |
| 285 |
def ulabel(label):
    """Convert a single label to its U-label (Unicode text) form.

    * Text that cannot be encoded as ASCII is validated with
      ``check_label`` and returned unchanged.
    * ASCII input is lower-cased. Without the ``_alabel_prefix`` it is
      validated and returned as text.
    * With the prefix, the remainder is punycode-decoded, validated and
      returned.

    Args:
        label: The label as text, ``bytes`` or ``bytearray``.

    Returns:
        The validated label as text.

    Raises:
        IDNAError: the label fails ``check_label`` or its punycode payload
            cannot be decoded.
    """
    if not isinstance(label, (bytes, bytearray)):
        try:
            label = label.encode('ascii')
        except UnicodeError:
            check_label(label)
            return label

    label = label.lower()
    if not label.startswith(_alabel_prefix):
        check_label(label)
        return label.decode('ascii')

    payload = label[len(_alabel_prefix):]
    try:
        decoded = payload.decode('punycode')
    except UnicodeError:
        # Surface codec failures as the module's own exception type. This
        # stays compatible with callers that catch UnicodeError, because
        # IDNAError derives from it.
        raise IDNAError('Invalid A-label')
    check_label(decoded)
    return decoded
| 305 |
| 306 |
def uts46_remap(domain, std3_rules=True, transitional=False):
    """Re-map the characters in the string according to UTS46 processing.

    Each character is looked up in ``uts46data`` and handled by its status:

    * ``"V"``: kept as is.
    * ``"D"``: kept, or replaced by its mapping when ``transitional``.
    * ``"3"``: rejected when ``std3_rules`` is true; otherwise kept, or
      replaced by its mapping when the row has one.
    * ``"M"``: replaced by its mapping.
    * ``"I"``: dropped.
    * anything else: rejected.

    Args:
        domain: The text to remap.
        std3_rules: Reject code points that are only disallowed under
            STD3 ASCII rules.
        transitional: Use transitional processing for status ``"D"``.

    Returns:
        The remapped text, normalized to NFC.

    Raises:
        InvalidCodepoint: a character is not allowed, or the table lookup
            fails with IndexError.
    """
    from .uts46data import uts46data
    pieces = []
    try:
        for pos, char in enumerate(domain):
            code_point = ord(char)
            # The first 256 rows are indexed directly; later rows are
            # located by bisecting on the code point.
            uts46row = uts46data[code_point if code_point < 256 else
                bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
            status = uts46row[1]
            replacement = uts46row[2] if len(uts46row) == 3 else None
            # Status "3" is only usable when STD3 rules are NOT enforced.
            if (status == "V" or
                    (status == "D" and not transitional) or
                    (status == "3" and not std3_rules and replacement is None)):
                pieces.append(char)
            elif replacement is not None and (status == "M" or
                    (status == "3" and not std3_rules) or
                    (status == "D" and transitional)):
                pieces.append(replacement)
            elif status != "I":
                # Routed through the handler below so that a disallowed
                # character and a failed lookup produce the same error.
                raise IndexError()
        return unicodedata.normalize("NFC", u"".join(pieces))
    except IndexError:
        raise InvalidCodepoint(
            "Codepoint {0} not allowed at position {1} in {2}".format(
            _unot(code_point), pos + 1, repr(domain)))
| 333 |
| 334 |
def encode(s, strict=False, uts46=False, std3_rules=False, transitional=False):
    """Encode a domain name into its ASCII (A-label) form.

    Args:
        s: The domain as text, or as ASCII ``bytes``/``bytearray``.
        strict: Split labels on ``.`` only; otherwise any character matched
            by ``_unicode_dots_re`` separates labels.
        uts46: Run ``uts46_remap`` on the input first.
        std3_rules: Passed to ``uts46_remap``.
        transitional: Passed to ``uts46_remap``.

    Returns:
        The encoded domain as bytes; a trailing dot in the input is kept.

    Raises:
        IDNAError: the domain is empty, a label is invalid, or the result
            exceeds the length allowed by ``valid_string_length``.
    """
    if isinstance(s, (bytes, bytearray)):
        s = s.decode("ascii")
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)

    labels = s.split('.') if strict else _unicode_dots_re.split(s)

    # Discard every empty label at the front.
    first_non_empty = 0
    while first_non_empty < len(labels) and not labels[first_non_empty]:
        first_non_empty += 1
    labels = labels[first_non_empty:]
    if not labels:
        raise IDNAError('Empty domain')

    # A single empty label at the end means the domain had a trailing dot.
    trailing_dot = labels[-1] == ''
    if trailing_dot:
        labels = labels[:-1]

    encoded = [alabel(label) for label in labels]
    if trailing_dot:
        encoded.append(b'')
    s = b'.'.join(encoded)
    if not valid_string_length(s, trailing_dot):
        raise IDNAError('Domain too long')
    return s
| 362 |
| 363 |
def decode(s, strict=False, uts46=False, std3_rules=False):
    """Decode a domain name into its Unicode (U-label) form.

    Args:
        s: The domain as text, or as ASCII ``bytes``/``bytearray``.
        strict: Split labels on ``.`` only; otherwise any character matched
            by ``_unicode_dots_re`` separates labels.
        uts46: Run ``uts46_remap`` (non-transitional) on the input first.
        std3_rules: Passed to ``uts46_remap``.

    Returns:
        The decoded domain as text; a trailing dot in the input is kept.

    Raises:
        IDNAError: the domain is empty or a label is invalid.
    """
    if isinstance(s, (bytes, bytearray)):
        s = s.decode("ascii")
    if uts46:
        s = uts46_remap(s, std3_rules, False)

    labels = s.split(u'.') if strict else _unicode_dots_re.split(s)

    # Discard every empty label at the front.
    first_non_empty = 0
    while first_non_empty < len(labels) and not labels[first_non_empty]:
        first_non_empty += 1
    labels = labels[first_non_empty:]
    if not labels:
        raise IDNAError('Empty domain')

    # A single empty label at the end means the domain had a trailing dot.
    trailing_dot = not labels[-1]
    if trailing_dot:
        labels = labels[:-1]

    decoded = [ulabel(label) for label in labels]
    if trailing_dot:
        decoded.append(u'')
    return u'.'.join(decoded)
OLD | NEW |