icu46/source/common/utf_impl.c - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/common/utf_impl.c

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 ******************************************************************************

	3 *

	4 * Copyright (C) 1999-2006, International Business Machines

	5 * Corporation and others. All Rights Reserved.

	6 *

	7 ******************************************************************************

	8 * file name: utf_impl.c

	9 * encoding: US-ASCII

	10 * tab size: 8 (not used)

	11 * indentation:4

	12 *

	13 * created on: 1999sep13

	14 * created by: Markus W. Scherer

	15 *

	16 * This file provides implementation functions for macros in the utfXX.h

	17 * that would otherwise be too long as macros.

	18 */

	19

	20 /* set import/export definitions */

	21 #ifndef U_UTF8_IMPL

	22 # define U_UTF8_IMPL

	23 #endif

	24

	25 #include "unicode/utypes.h"

	26

	27 /*

	28 * This table could be replaced on many machines by

	29 * a few lines of assembler code using an

	30 * "index of first 0-bit from msb" instruction and

	31 * one or two more integer instructions.

	32 *

	33 * For example, on an i386, do something like

	34 * - MOV AL, leadByte

	35 * - NOT AL (8-bit, leave b15..b8==0..0, reverse only b7..b0)

	36 * - MOV AH, 0

	37 * - BSR BX, AX (16-bit)

	38 * - MOV AX, 6 (result)

	39 * - JZ finish (ZF==1 if leadByte==0xff)

	40 * - SUB AX, BX (result)

	41 * -finish:

	42 * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)

	43 *

	44 * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;

	45 * lead bytes above 0xf4 are illegal.

	46 * We keep them in this table for skipping long ISO 10646-UTF-8 sequences.

	47 */

	48 U_EXPORT const uint8_t

	49 utf8_countTrailBytes[256]={

	50 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	51 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	52 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	53 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	54

	55 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	56 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	57 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	58 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	59

	60 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	61 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	62 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	63 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	64

	65 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

	66 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

	67

	68 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

	69 3, 3, 3, 3, 3,

	70 3, 3, 3, /* illegal in Unicode */

	71 4, 4, 4, 4, /* illegal in Unicode */

	72 5, 5, /* illegal in Unicode */

	73 0, 0 /* illegal bytes 0xfe and 0xff */

	74 };

	75

	76 static const UChar32

	77 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };

	78

	79 static const UChar32

	80 utf8_errorValue[6]={

	81 UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,

	82 0x3ffffff, 0x7fffffff

	83 };

	84

	85 /*

	86 * Handle the non-inline part of the U8_NEXT() macro and its obsolete sibling

	87 * UTF8_NEXT_CHAR_SAFE().

	88 *

	89 * The "strict" parameter controls the error behavior:

	90 * <0 "Safe" behavior of U8_NEXT(): All illegal byte sequences yield a negative

	91 * code point result.

	92 * 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):

	93 * All illegal byte sequences yield a positive code point such that this

	94 * result code point would be encoded with the same number of bytes as

	95 * the illegal sequence.

	96 * >0 Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):

	97 * Same as the obsolete "safe" behavior, but non-characters are also treated

	98 * like illegal sequences.

	99 *

	100 * The special negative (<0) value -2 is used for lenient treatment of surrogate

	101 * code points as legal. Some implementations use this for roundtripping of

	102 * Unicode 16-bit strings that are not well-formed UTF-16, that is, they

	103 * contain unpaired surrogates.

	104 *

	105 * Note that a UBool is the same as an int8_t.

	106 */

	107 U_CAPI UChar32 U_EXPORT2

	108 utf8_nextCharSafeBody(const uint8_t s, int32_t pi, int32_t length, UChar32 c, UBool strict) {

	109 int32_t i=*pi;

	110 uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);

	111 if((i)+count<=(length)) {

	112 uint8_t trail, illegal=0;

	113

	114 UTF8_MASK_LEAD_BYTE((c), count);

	115 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */

	116 switch(count) {

	117 /* each branch falls through to the next one */

	118 case 5:

	119 case 4:

	120 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode 's UTF-8 */

	121 illegal=1;

	122 break;

	123 case 3:

	124 trail=s[(i)++];

	125 (c)=((c)<<6)\|(trail&0x3f);

	126 if(c<0x110) {

	127 illegal\|=(trail&0xc0)^0x80;

	128 } else {

	129 /* code point>0x10ffff, outside Unicode */

	130 illegal=1;

	131 break;

	132 }

	133 case 2:

	134 trail=s[(i)++];

	135 (c)=((c)<<6)\|(trail&0x3f);

	136 illegal\|=(trail&0xc0)^0x80;

	137 case 1:

	138 trail=s[(i)++];

	139 (c)=((c)<<6)\|(trail&0x3f);

	140 illegal\|=(trail&0xc0)^0x80;

	141 break;

	142 case 0:

	143 if(strict>=0) {

	144 return UTF8_ERROR_VALUE_1;

	145 } else {

	146 return U_SENTINEL;

	147 }

	148 /* no default branch to optimize switch() - all values are covered */

	149 }

	150

	151 /*

	152 * All the error handling should return a value

	153 * that needs count bytes so that UTF8_GET_CHAR_SAFE() works right.

	154 *

	155 * Starting with Unicode 3.0.1, non-shortest forms are illegal.

	156 * Starting with Unicode 3.2, surrogate code points must not be

	157 * encoded in UTF-8, and there are no irregular sequences any more.

	158 *

	159 * U8_ macros (new in ICU 2.4) return negative values for error conditio ns.

	160 */

	161

	162 /* correct sequence - all trail bytes have (b7..b6)==(10)? */

	163 /* illegal is also set if count>=4 */

	164 if(illegal \|\| (c)<utf8_minLegal[count] \|\| (UTF_IS_SURROGATE(c) && strict !=-2)) {

	165 /* error handling */

	166 uint8_t errorCount=count;

	167 /* don't go beyond this sequence */

	168 i=*pi;

	169 while(count>0 && UTF8_IS_TRAIL(s[i])) {

	170 ++(i);

	171 --count;

	172 }

	173 if(strict>=0) {

	174 c=utf8_errorValue[errorCount-count];

	175 } else {

	176 c=U_SENTINEL;

	177 }

	178 } else if((strict)>0 && UTF_IS_UNICODE_NONCHAR(c)) {

	179 /* strict: forbid non-characters like U+fffe */

	180 c=utf8_errorValue[count];

	181 }

	182 } else /* too few bytes left */ {

	183 /* error handling */

	184 int32_t i0=i;

	185 /* don't just set (i)=(length) in case there is an illegal sequence */

	186 while((i)<(length) && UTF8_IS_TRAIL(s[i])) {

	187 ++(i);

	188 }

	189 if(strict>=0) {

	190 c=utf8_errorValue[i-i0];

	191 } else {

	192 c=U_SENTINEL;

	193 }

	194 }

	195 *pi=i;

	196 return c;

	197 }

	198

	199 U_CAPI int32_t U_EXPORT2

	200 utf8_appendCharSafeBody(uint8_t s, int32_t i, int32_t length, UChar32 c, UBool pIsError) {

	201 if((uint32_t)(c)<=0x7ff) {

	202 if((i)+1<(length)) {

	203 (s)[(i)++]=(uint8_t)(((c)>>6)\|0xc0);

	204 (s)[(i)++]=(uint8_t)(((c)&0x3f)\|0x80);

	205 return i;

	206 }

	207 } else if((uint32_t)(c)<=0xffff) {

	208 /* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */

	209 if((i)+2<(length) && !U_IS_SURROGATE(c)) {

	210 (s)[(i)++]=(uint8_t)(((c)>>12)\|0xe0);

	211 (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)\|0x80);

	212 (s)[(i)++]=(uint8_t)(((c)&0x3f)\|0x80);

	213 return i;

	214 }

	215 } else if((uint32_t)(c)<=0x10ffff) {

	216 if((i)+3<(length)) {

	217 (s)[(i)++]=(uint8_t)(((c)>>18)\|0xf0);

	218 (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)\|0x80);

	219 (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)\|0x80);

	220 (s)[(i)++]=(uint8_t)(((c)&0x3f)\|0x80);

	221 return i;

	222 }

	223 }

	224 /* c>0x10ffff or not enough space, write an error value */

	225 if(pIsError!=NULL) {

	226 *pIsError=TRUE;

	227 } else {

	228 length-=i;

	229 if(length>0) {

	230 int32_t offset;

	231 if(length>3) {

	232 length=3;

	233 }

	234 s+=i;

	235 offset=0;

	236 c=utf8_errorValue[length-1];

	237 UTF8_APPEND_CHAR_UNSAFE(s, offset, c);

	238 i=i+offset;

	239 }

	240 }

	241 return i;

	242 }

	243

	244 U_CAPI UChar32 U_EXPORT2

	245 utf8_prevCharSafeBody(const uint8_t s, int32_t start, int32_t pi, UChar32 c, U Bool strict) {

	246 int32_t i=*pi;

	247 uint8_t b, count=1, shift=6;

	248

	249 /* extract value bits from the last trail byte */

	250 c&=0x3f;

	251

	252 for(;;) {

	253 if(i<=start) {

	254 /* no lead byte at all */

	255 if(strict>=0) {

	256 return UTF8_ERROR_VALUE_1;

	257 } else {

	258 return U_SENTINEL;

	259 }

	260 /break;/

	261 }

	262

	263 /* read another previous byte */

	264 b=s[--i];

	265 if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */

	266 if(b&0x40) {

	267 /* lead byte, this will always end the loop */

	268 uint8_t shouldCount=UTF8_COUNT_TRAIL_BYTES(b);

	269

	270 if(count==shouldCount) {

	271 /* set the new position */

	272 *pi=i;

	273 UTF8_MASK_LEAD_BYTE(b, count);

	274 c\|=(UChar32)b<<shift;

	275 if(count>=4 \|\| c>0x10ffff \|\| c<utf8_minLegal[count] \|\| (UTF_ IS_SURROGATE(c) && strict!=-2) \|\| (strict>0 && UTF_IS_UNICODE_NONCHAR(c))) {

	276 /* illegal sequence or (strict and non-character) */

	277 if(count>=4) {

	278 count=3;

	279 }

	280 if(strict>=0) {

	281 c=utf8_errorValue[count];

	282 } else {

	283 c=U_SENTINEL;

	284 }

	285 } else {

	286 /* exit with correct c */

	287 }

	288 } else {

	289 /* the lead byte does not match the number of trail bytes */

	290 /* only set the position to the lead byte if it would

	291 include the trail byte that we started with */

	292 if(count<shouldCount) {

	293 *pi=i;

	294 if(strict>=0) {

	295 c=utf8_errorValue[count];

	296 } else {

	297 c=U_SENTINEL;

	298 }

	299 } else {

	300 if(strict>=0) {

	301 c=UTF8_ERROR_VALUE_1;

	302 } else {

	303 c=U_SENTINEL;

	304 }

	305 }

	306 }

	307 break;

	308 } else if(count<5) {

	309 /* trail byte */

	310 c\|=(UChar32)(b&0x3f)<<shift;

	311 ++count;

	312 shift+=6;

	313 } else {

	314 /* more than 5 trail bytes is illegal */

	315 if(strict>=0) {

	316 c=UTF8_ERROR_VALUE_1;

	317 } else {

	318 c=U_SENTINEL;

	319 }

	320 break;

	321 }

	322 } else {

	323 /* single-byte character precedes trailing bytes */

	324 if(strict>=0) {

	325 c=UTF8_ERROR_VALUE_1;

	326 } else {

	327 c=U_SENTINEL;

	328 }

	329 break;

	330 }

	331 }

	332 return c;

	333 }

	334

	335 U_CAPI int32_t U_EXPORT2

	336 utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {

	337 /* i had been decremented once before the function call */

	338 int32_t I=i, Z;

	339 uint8_t b;

	340

	341 /* read at most the 6 bytes s[Z] to s[i], inclusively */

	342 if(I-5>start) {

	343 Z=I-5;

	344 } else {

	345 Z=start;

	346 }

	347

	348 /* return I if the sequence starting there is long enough to include i */

	349 do {

	350 b=s[I];

	351 if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */

	352 break;

	353 } else if(b>=0xc0) {

	354 if(UTF8_COUNT_TRAIL_BYTES(b)>=(i-I)) {

	355 return I;

	356 } else {

	357 break;

	358 }

	359 }

	360 } while(Z<=--I);

	361

	362 /* return i itself to be consistent with the FWD_1 macro */

	363 return i;

	364 }

OLD	NEW

« no previous file with comments | « icu46/source/common/utext.cpp ('k') | icu46/source/common/util.h » ('j') | no next file with comments »