third_party/libphonenumber/cpp/src/utf/rune.c - Issue 6930013: Re-committing http://codereview.chromium.org/6803005/ after fixing multi-dll build:

Side by Side Diff: third_party/libphonenumber/cpp/src/utf/rune.c

Issue 6930013: Re-committing http://codereview.chromium.org/6803005/ after fixing multi-dll build: (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: Created 9 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 * The authors of this software are Rob Pike and Ken Thompson.

	3 * Copyright (c) 2002 by Lucent Technologies.

	4 * Permission to use, copy, modify, and distribute this software for any

	5 * purpose without fee is hereby granted, provided that this entire notice

	6 * is included in all copies of any software which is or includes a copy

	7 * or modification of this software and in all copies of the supporting

	8 * documentation for such software.

	9 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED

	10 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY

	11 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY

	12 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.

	13 */

	14 #include <stdarg.h>

	15 #include <string.h>

	16 #include "utf.h"

	17 #include "utfdef.h"

	18

	19 enum

	20 {

	21 Bit1 = 7,

	22 Bitx = 6,

	23 Bit2 = 5,

	24 Bit3 = 4,

	25 Bit4 = 3,

	26 Bit5 = 2,

	27

	28 T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */

	29 Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */

	30 T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */

	31 T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */

	32 T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */

	33 T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */

	34

	35 Rune1 = (1<<(Bit1+0Bitx))-1, / 0000 0000 0111 1111 */

	36 Rune2 = (1<<(Bit2+1Bitx))-1, / 0000 0111 1111 1111 */

	37 Rune3 = (1<<(Bit3+2Bitx))-1, / 1111 1111 1111 1111 */

	38 Rune4 = (1<<(Bit4+3*Bitx))-1,

	39 /* 0001 1111 1111 1111 1111 1111 */

	40

	41 Maskx = (1<<Bitx)-1, /* 0011 1111 */

	42 Testx = Maskx ^ 0xFF, /* 1100 0000 */

	43

	44 Bad = Runeerror,

	45 };

	46

	47 /*

	48 * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24

	49 * This is a slower but "safe" version of the old chartorune

	50 * that works on strings that are not necessarily null-terminated.

	51 *

	52 * If you know for sure that your string is null-terminated,

	53 * chartorune will be a bit faster.

	54 *

	55 * It is guaranteed not to attempt to access "length"

	56 * past the incoming pointer. This is to avoid

	57 * possible access violations. If the string appears to be

	58 * well-formed but incomplete (i.e., to get the whole Rune

	59 * we'd need to read past str+length) then we'll set the Rune

	60 * to Bad and return 0.

	61 *

	62 * Note that if we have decoding problems for other

	63 * reasons, we return 1 instead of 0.

	64 */

	65 int

	66 charntorune(Rune rune, const char str, int length)

	67 {

	68 int c, c1, c2, c3;

	69 long l;

	70

	71 /* When we're not allowed to read anything */

	72 if(length <= 0) {

	73 goto badlen;

	74 }

	75

	76 /*

	77 * one character sequence (7-bit value)

	78 * 00000-0007F => T1

	79 */

	80 c = (uchar)str;

	81 if(c < Tx) {

	82 *rune = c;

	83 return 1;

	84 }

	85

	86 // If we can't read more than one character we must stop

	87 if(length <= 1) {

	88 goto badlen;

	89 }

	90

	91 /*

	92 * two character sequence (11-bit value)

	93 * 0080-07FF => T2 Tx

	94 */

	95 c1 = (uchar)(str+1) ^ Tx;

	96 if(c1 & Testx)

	97 goto bad;

	98 if(c < T3) {

	99 if(c < T2)

	100 goto bad;

	101 l = ((c << Bitx) \| c1) & Rune2;

	102 if(l <= Rune1)

	103 goto bad;

	104 *rune = l;

	105 return 2;

	106 }

	107

	108 // If we can't read more than two characters we must stop

	109 if(length <= 2) {

	110 goto badlen;

	111 }

	112

	113 /*

	114 * three character sequence (16-bit value)

	115 * 0800-FFFF => T3 Tx Tx

	116 */

	117 c2 = (uchar)(str+2) ^ Tx;

	118 if(c2 & Testx)

	119 goto bad;

	120 if(c < T4) {

	121 l = ((((c << Bitx) \| c1) << Bitx) \| c2) & Rune3;

	122 if(l <= Rune2)

	123 goto bad;

	124 *rune = l;

	125 return 3;

	126 }

	127

	128 if (length <= 3)

	129 goto badlen;

	130

	131 /*

	132 * four character sequence (21-bit value)

	133 * 10000-1FFFFF => T4 Tx Tx Tx

	134 */

	135 c3 = (uchar)(str+3) ^ Tx;

	136 if (c3 & Testx)

	137 goto bad;

	138 if (c < T5) {

	139 l = ((((((c << Bitx) \| c1) << Bitx) \| c2) << Bitx) \| c3) & Rune4 ;

	140 if (l <= Rune3)

	141 goto bad;

	142 *rune = l;

	143 return 4;

	144 }

	145

	146 // Support for 5-byte or longer UTF-8 would go here, but

	147 // since we don't have that, we'll just fall through to bad.

	148

	149 /*

	150 * bad decoding

	151 */

	152 bad:

	153 *rune = Bad;

	154 return 1;

	155 badlen:

	156 *rune = Bad;

	157 return 0;

	158

	159 }

	160

	161

	162 /*

	163 * This is the older "unsafe" version, which works fine on

	164 * null-terminated strings.

	165 */

	166 int

	167 chartorune(Rune rune, const char str)

	168 {

	169 int c, c1, c2, c3;

	170 long l;

	171

	172 /*

	173 * one character sequence

	174 * 00000-0007F => T1

	175 */

	176 c = (uchar)str;

	177 if(c < Tx) {

	178 *rune = c;

	179 return 1;

	180 }

	181

	182 /*

	183 * two character sequence

	184 * 0080-07FF => T2 Tx

	185 */

	186 c1 = (uchar)(str+1) ^ Tx;

	187 if(c1 & Testx)

	188 goto bad;

	189 if(c < T3) {

	190 if(c < T2)

	191 goto bad;

	192 l = ((c << Bitx) \| c1) & Rune2;

	193 if(l <= Rune1)

	194 goto bad;

	195 *rune = l;

	196 return 2;

	197 }

	198

	199 /*

	200 * three character sequence

	201 * 0800-FFFF => T3 Tx Tx

	202 */

	203 c2 = (uchar)(str+2) ^ Tx;

	204 if(c2 & Testx)

	205 goto bad;

	206 if(c < T4) {

	207 l = ((((c << Bitx) \| c1) << Bitx) \| c2) & Rune3;

	208 if(l <= Rune2)

	209 goto bad;

	210 *rune = l;

	211 return 3;

	212 }

	213

	214 /*

	215 * four character sequence (21-bit value)

	216 * 10000-1FFFFF => T4 Tx Tx Tx

	217 */

	218 c3 = (uchar)(str+3) ^ Tx;

	219 if (c3 & Testx)

	220 goto bad;

	221 if (c < T5) {

	222 l = ((((((c << Bitx) \| c1) << Bitx) \| c2) << Bitx) \| c3) & Rune4 ;

	223 if (l <= Rune3)

	224 goto bad;

	225 *rune = l;

	226 return 4;

	227 }

	228

	229 /*

	230 * Support for 5-byte or longer UTF-8 would go here, but

	231 * since we don't have that, we'll just fall through to bad.

	232 */

	233

	234 /*

	235 * bad decoding

	236 */

	237 bad:

	238 *rune = Bad;

	239 return 1;

	240 }

	241

	242 int

	243 isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {

	244 *consumed = charntorune(rune, str, length);

	245 return rune != Runeerror \|\| consumed == 3;

	246 }

	247

	248 int

	249 runetochar(char str, const Rune rune)

	250 {

	251 /* Runes are signed, so convert to unsigned for range check. */

	252 unsigned long c;

	253

	254 /*

	255 * one character sequence

	256 * 00000-0007F => 00-7F

	257 */

	258 c = *rune;

	259 if(c <= Rune1) {

	260 str[0] = c;

	261 return 1;

	262 }

	263

	264 /*

	265 * two character sequence

	266 * 0080-07FF => T2 Tx

	267 */

	268 if(c <= Rune2) {

	269 str[0] = T2 \| (c >> 1*Bitx);

	270 str[1] = Tx \| (c & Maskx);

	271 return 2;

	272 }

	273

	274 /*

	275 * If the Rune is out of range, convert it to the error rune.

	276 * Do this test here because the error rune encodes to three bytes.

	277 * Doing it earlier would duplicate work, since an out of range

	278 * Rune wouldn't have fit in one or two bytes.

	279 */

	280 if (c > Runemax)

	281 c = Runeerror;

	282

	283 /*

	284 * three character sequence

	285 * 0800-FFFF => T3 Tx Tx

	286 */

	287 if (c <= Rune3) {

	288 str[0] = T3 \| (c >> 2*Bitx);

	289 str[1] = Tx \| ((c >> 1*Bitx) & Maskx);

	290 str[2] = Tx \| (c & Maskx);

	291 return 3;

	292 }

	293

	294 /*

	295 * four character sequence (21-bit value)

	296 * 10000-1FFFFF => T4 Tx Tx Tx

	297 */

	298 str[0] = T4 \| (c >> 3*Bitx);

	299 str[1] = Tx \| ((c >> 2*Bitx) & Maskx);

	300 str[2] = Tx \| ((c >> 1*Bitx) & Maskx);

	301 str[3] = Tx \| (c & Maskx);

	302 return 4;

	303 }

	304

	305 int

	306 runelen(Rune rune)

	307 {

	308 char str[10];

	309

	310 return runetochar(str, &rune);

	311 }

	312

	313 int

	314 runenlen(const Rune *r, int nrune)

	315 {

	316 int nb, c;

	317

	318 nb = 0;

	319 while(nrune--) {

	320 c = *r++;

	321 if (c <= Rune1)

	322 nb++;

	323 else if (c <= Rune2)

	324 nb += 2;

	325 else if (c <= Rune3)

	326 nb += 3;

	327 else /* assert(c <= Rune4) */

	328 nb += 4;

	329 }

	330 return nb;

	331 }

	332

	333 int

	334 fullrune(const char *str, int n)

	335 {

	336 if (n > 0) {

	337 int c = (uchar)str;

	338 if (c < Tx)

	339 return 1;

	340 if (n > 1) {

	341 if (c < T3)

	342 return 1;

	343 if (n > 2) {

	344 if (c < T4 \|\| n > 3)

	345 return 1;

	346 }

	347 }

	348 }

	349 return 0;

	350 }

OLD	NEW

« no previous file with comments | « third_party/libphonenumber/cpp/src/utf/README ('k') | third_party/libphonenumber/cpp/src/utf/stringpiece.h » ('j') | no next file with comments »