icu46/source/i18n/csrutf8.cpp - Issue 5516007: Check in the pristine copy of ICU 4.6...

Keyboard Shortcuts

	File
u :	up to issue
j / k :	jump to file after / before current file
J / K :	jump to next file with a comment after / before current file
	Side-by-side diff
i :	toggle intra-line diffs
e :	expand all comments
c :	collapse all comments
s :	toggle showing all comments
n / p :	next / previous diff chunk or comment
N / P :	next / previous comment
<Up> / <Down> :	next / previous line

	Issue
u :	up to list of issues
j / k :	jump to patch after / before current patch
o / <Enter> :	open current patch in side-by-side view
i :	open current patch in unified diff view

	Issue List
j / k :	jump to issue after / before current issue
o / <Enter> :	open current issue

Unified Diff: icu46/source/i18n/csrutf8.cpp

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu46/source/i18n/csrutf8.cpp

===================================================================

--- icu46/source/i18n/csrutf8.cpp (revision 0)

+++ icu46/source/i18n/csrutf8.cpp (revision 0)

@@ -0,0 +1,111 @@

+/*

+ **********************************************************************

+ */

+#include "unicode/utypes.h"

+#if !UCONFIG_NO_CONVERSION

+#include "csrutf8.h"

+U_NAMESPACE_BEGIN

+CharsetRecog_UTF8::~CharsetRecog_UTF8()

+ // Destructor: intentionally empty, there is nothing to do here.

+const char *CharsetRecog_UTF8::getName() const // Returns the fixed charset name string for this recognizer.

+ return "UTF-8"; // string literal; the same pointer value is returned on every call

+int32_t CharsetRecog_UTF8::match(InputText* det) { // Scans det->fRawInput (det->fRawLength bytes) and returns a UTF-8 confidence score: 0, 10, 25, 80 or 100.

+ bool hasBOM = FALSE;

+ int32_t numValid = 0; // count of multi-byte sequences whose trail bytes all checked out

+ int32_t numInvalid = 0; // count of bad lead bytes and of sequences broken by a non-trail byte

+ const uint8_t *input = det->fRawInput;

+ int32_t i;

+ int32_t trailBytes = 0; // trail bytes still expected for the sequence being checked

+ int32_t confidence;

+ if (det->fRawLength >= 3 &&

+ input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF) { // input starts with the bytes EF BB BF

+ hasBOM = TRUE;

+ }

+ // Scan the raw input for multi-byte sequences, counting valid and invalid ones.

+ for (i=0; i < det->fRawLength; i += 1) {

+ int32_t b = input[i];

+ if ((b & 0x80) == 0) {

+ continue; // ASCII byte (high bit clear): contributes to neither counter

+ }

+ // High bit is set on this byte. Work out from its top bits how many trail bytes should follow.

+ if ((b & 0x0E0) == 0x0C0) { // 110xxxxx

+ trailBytes = 1;

+ } else if ((b & 0x0F0) == 0x0E0) { // 1110xxxx

+ trailBytes = 2;

+ } else if ((b & 0x0F8) == 0xF0) { // 11110xxx

+ trailBytes = 3;

+ } else { // none of the lead-byte patterns above matched

+ numInvalid += 1;

+ if (numInvalid > 5) { // stop scanning once more than 5 invalid items have been counted

+ break;

+ }

+ trailBytes = 0; // NOTE(review): with 0 here, the --trailBytes == 0 test below cannot succeed on this pass, so the check loop runs until a non-trail byte (counted invalid again) or end of input - confirm this is intended

+ }

+ // Verify that the expected number of trail bytes (10xxxxxx) follows the lead byte.

+ for (;;) {

+ i += 1;

+ if (i >= det->fRawLength) { // input ended mid-sequence: neither counter is changed

+ break;

+ }

+ b = input[i];

+ if ((b & 0xC0) != 0x080) { // not of the form 10xxxxxx, so not a trail byte

+ numInvalid += 1;

+ break;

+ }

+ if (--trailBytes == 0) { // all expected trail bytes were present

+ numValid += 1;

+ break;

+ }

+ // Cook up some sort of confidence score, based on the presence of a BOM

+ // and the existence of valid and/or invalid multi-byte sequences. The first matching rule below wins.

+ confidence = 0; // stays 0 if none of the rules below match

+ if (hasBOM && numInvalid == 0) {

+ confidence = 100;

+ } else if (hasBOM && numValid > numInvalid*10) {

+ confidence = 80;

+ } else if (numValid > 3 && numInvalid == 0) {

+ confidence = 100;

+ } else if (numValid > 0 && numInvalid == 0) {

+ confidence = 80;

+ } else if (numValid == 0 && numInvalid == 0) {

+ // Plain ASCII: no multi-byte sequences of either kind were counted.

+ confidence = 10;

+ } else if (numValid > numInvalid*10) {

+ // Probably corrupt UTF-8 data. Valid sequences aren't likely by chance.

+ confidence = 25;

+ }

+ return confidence;

+U_NAMESPACE_END

+#endif

Property changes on: icu46/source/i18n/csrutf8.cpp

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu46/source/i18n/csrutf8.h ('k') | icu46/source/i18n/curramt.cpp » ('j') | no next file with comments »