icu46/source/i18n/csrutf8.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Keyboard Shortcuts

	File
u :	up to issue
j / k :	jump to file after / before current file
J / K :	jump to next file with a comment after / before current file
	Side-by-side diff
i :	toggle intra-line diffs
e :	expand all comments
c :	collapse all comments
s :	toggle showing all comments
n / p :	next / previous diff chunk or comment
N / P :	next / previous comment
<Up> / <Down> :	next / previous line

	Issue
u :	up to list of issues
j / k :	jump to patch after / before current patch
o / <Enter> :	open current patch in side-by-side view
i :	open current patch in unified diff view

	Issue List
j / k :	jump to issue after / before current issue
o / <Enter> :	open current issue

Side by Side Diff: icu46/source/i18n/csrutf8.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (C) 2005-2008, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 **********************************************************************

	6 */

	7

	8 #include "unicode/utypes.h"

	9

	10 #if !UCONFIG_NO_CONVERSION

	11

	12 #include "csrutf8.h"

	13

	14 U_NAMESPACE_BEGIN

	15

	16 CharsetRecog_UTF8::~CharsetRecog_UTF8()

	17 {

	18 // nothing to do

	19 }

	20

	21 const char *CharsetRecog_UTF8::getName() const

	22 {

	23 return "UTF-8";

	24 }

	25

	26 int32_t CharsetRecog_UTF8::match(InputText* det) {

	27 bool hasBOM = FALSE;

	28 int32_t numValid = 0;

	29 int32_t numInvalid = 0;

	30 const uint8_t *input = det->fRawInput;

	31 int32_t i;

	32 int32_t trailBytes = 0;

	33 int32_t confidence;

	34

	35 if (det->fRawLength >= 3 &&

	36 input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF) {

	37 hasBOM = TRUE;

	38 }

	39

	40 // Scan for multi-byte sequences

	41 for (i=0; i < det->fRawLength; i += 1) {

	42 int32_t b = input[i];

	43

	44 if ((b & 0x80) == 0) {

	45 continue; // ASCII

	46 }

	47

	48 // Hi bit on char found. Figure out how long the sequence should be

	49 if ((b & 0x0E0) == 0x0C0) {

	50 trailBytes = 1;

	51 } else if ((b & 0x0F0) == 0x0E0) {

	52 trailBytes = 2;

	53 } else if ((b & 0x0F8) == 0xF0) {

	54 trailBytes = 3;

	55 } else {

	56 numInvalid += 1;

	57

	58 if (numInvalid > 5) {

	59 break;

	60 }

	61

	62 trailBytes = 0;

	63 }

	64

	65 // Verify that we've got the right number of trail bytes in the sequence

	66 for (;;) {

	67 i += 1;

	68

	69 if (i >= det->fRawLength) {

	70 break;

	71 }

	72

	73 b = input[i];

	74

	75 if ((b & 0xC0) != 0x080) {

	76 numInvalid += 1;

	77 break;

	78 }

	79

	80 if (--trailBytes == 0) {

	81 numValid += 1;

	82 break;

	83 }

	84 }

	85

	86 }

	87

	88 // Cook up some sort of confidence score, based on presense of a BOM

	89 // and the existence of valid and/or invalid multi-byte sequences.

	90 confidence = 0;

	91 if (hasBOM && numInvalid == 0) {

	92 confidence = 100;

	93 } else if (hasBOM && numValid > numInvalid*10) {

	94 confidence = 80;

	95 } else if (numValid > 3 && numInvalid == 0) {

	96 confidence = 100;

	97 } else if (numValid > 0 && numInvalid == 0) {

	98 confidence = 80;

	99 } else if (numValid == 0 && numInvalid == 0) {

	100 // Plain ASCII.

	101 confidence = 10;

	102 } else if (numValid > numInvalid*10) {

	103 // Probably corruput utf-8 data. Valid sequences aren't likely by chanc e.

	104 confidence = 25;

	105 }

	106

	107 return confidence;

	108 }

	109

	110 U_NAMESPACE_END

	111 #endif

OLD	NEW

« no previous file with comments | « icu46/source/i18n/csrutf8.h ('k') | icu46/source/i18n/curramt.cpp » ('j') | no next file with comments »