third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.cc - Issue 122007: [chromium-reviews] Add Compact Language Detection (CLD) library to Chrome. This works in Windows...

Unified Diff: third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.cc

Issue 122007: [chromium-reviews] Add Compact Language Detection (CLD) library to Chrome. This works in Windows... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 11 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.h ('k') | third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8utils.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.cc

===================================================================

--- third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.cc (revision 0)

+++ third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.cc (revision 0)

@@ -0,0 +1,229 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.h"

+// This code is copied from google3/util/utf8/internal/utf8statetable.cc and was

+// not modified (it generates a lot of lint warnings, but I decided not to fix

+// them to simplify its maintenance).

+// Return true if Tbl lies in the state0 range: [Tbl0, Tbl0 + st->state0_size)

+// The offset is cast to unsigned, so Tbl < Tbl0 wraps high and one compare checks both ends

+static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {

+ const uint8* Tbl0 = &st->state_table[st->state0]; // base of the state0 range

+ return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size); // true only for 0 <= offset < state0_size

+// Look up the property of one UTF-8 character and advance *src / *srclen over it

+// Return 0 without advancing if *srclen is zero or negative

+// Return 0 and advance one byte if the lead byte fits no 1..4-byte pattern or too few bytes remain

+uint8 UTF8GenericProperty(const UTF8PropObj* st,

+ const uint8** src,

+ int* srclen) {

+ if (*srclen <= 0) {

+ return 0;

+ }

+ const uint8* lsrc = *src;

+ const uint8* Tbl_0 = &st->state_table[st->state0]; // base for every sub-table lookup below

+ const uint8* Tbl = Tbl_0;

+ int e; // most recent table entry; the final one is the returned property

+ int eshift = st->entry_shift;

+ // Short series of tests faster than switch, optimizes 7-bit ASCII

+ unsigned char c = lsrc[0]; // lead byte

+ if (static_cast<signed char>(c) >= 0) { // one byte: high bit clear (0xxxxxxx)

+ e = Tbl[c];

+ *src += 1;

+ *srclen -= 1;

+ } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes: lead 110xxxxx

+ e = Tbl[c];

+ Tbl = &Tbl_0[e << eshift]; // entry picks the next sub-table, offset from Tbl_0

+ e = Tbl[lsrc[1]]; // trailing byte indexes the table directly; no separate check here

+ *src += 2;

+ *srclen -= 2;

+ } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes: lead 1110xxxx

+ e = Tbl[c];

+ Tbl = &Tbl_0[e << eshift];

+ e = Tbl[lsrc[1]];

+ Tbl = &Tbl_0[e << eshift];

+ e = Tbl[lsrc[2]];

+ *src += 3;

+ *srclen -= 3;

+ }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes: lead 11110xxx

+ e = Tbl[c];

+ Tbl = &Tbl_0[e << eshift];

+ e = Tbl[lsrc[1]];

+ Tbl = &Tbl_0[e << eshift];

+ e = Tbl[lsrc[2]];

+ Tbl = &Tbl_0[e << eshift];

+ e = Tbl[lsrc[3]];

+ *src += 4;

+ *srclen -= 4;

+ } else { // Ill-formed lead byte, or not enough bytes left: skip one byte

+ e = 0;

+ *src += 1;

+ *srclen -= 1;

+ }

+ return e; // int e is narrowed to the uint8 return type

+// BigOneByte versions are needed for tables > 240 states, but most

+// won't need the TwoByte versions.

+// Internally, the next-to-last offset is multiplied by 16 and the last

+// offset is relative instead of absolute.

+// Look up the property of one UTF-8 character and advance *src / *srclen over it

+// Return 0 without advancing if *srclen is zero or negative

+// Return 0 and advance one byte if the lead byte fits no 1..4-byte pattern or too few bytes remain

+uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,

+ const uint8** src,

+ int* srclen) {

+ if (*srclen <= 0) {

+ return 0;

+ }

+ const uint8* lsrc = *src;

+ const uint8* Tbl_0 = &st->state_table[st->state0]; // base for the absolute sub-table lookups

+ const uint8* Tbl = Tbl_0;

+ int e; // most recent table entry; the final one is the returned property

+ int eshift = st->entry_shift;

+ // Short series of tests faster than switch, optimizes 7-bit ASCII

+ unsigned char c = lsrc[0]; // lead byte

+ if (static_cast<signed char>(c) >= 0) { // one byte: high bit clear (0xxxxxxx)

+ e = Tbl[c];

+ *src += 1;

+ *srclen -= 1;

+ } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) { // two bytes: lead 110xxxxx

+ e = Tbl[c];

+ Tbl = &Tbl_0[e << eshift]; // plain absolute step; no 16x or relative step in this branch

+ e = Tbl[lsrc[1]];

+ *src += 2;

+ *srclen -= 2;

+ } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) { // three bytes: lead 1110xxxx

+ e = Tbl[c];

+ Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range: shift by 4 extra bits

+ e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]]; // entry is read as signed

+ Tbl = &Tbl[e << eshift]; // Relative +/-: offset from the current Tbl, not from Tbl_0

+ e = Tbl[lsrc[2]];

+ *src += 3;

+ *srclen -= 3;

+ }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) { // four bytes: lead 11110xxx

+ e = Tbl[c];

+ Tbl = &Tbl_0[e << eshift]; // first step is a plain absolute lookup

+ e = Tbl[lsrc[1]];

+ Tbl = &Tbl_0[e << (eshift + 4)]; // 16x the range: shift by 4 extra bits

+ e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]]; // entry is read as signed

+ Tbl = &Tbl[e << eshift]; // Relative +/-: offset from the current Tbl, not from Tbl_0

+ e = Tbl[lsrc[3]];

+ *src += 4;

+ *srclen -= 4;

+ } else { // Ill-formed lead byte, or not enough bytes left: skip one byte

+ e = 0;

+ *src += 1;

+ *srclen -= 1;

+ }

+ return e; // int e is narrowed to the uint8 return type

+// Scan len bytes of UTF-8 starting at str, driven by the state table in st.

+// Always scan complete UTF-8 characters: on exit src is backed up to a character start

+// Set *bytes_consumed to the number of bytes scanned. Return the reason for exiting (kExitOK or an exit code)

+int UTF8GenericScan(const UTF8ScanObj* st,

+ const uint8* str,

+ const int len,

+ int* bytes_consumed) {

+ int eshift = st->entry_shift; // 6 (space optimized) or 8

+ // int nEntries = (1 << eshift); // 64 or 256 entries per state

+ const uint8* isrc = str;

+ //reinterpret_cast<const uint8*>(str.data());

+ const uint8* src = isrc;

+ //const int len = str.length();

+ const uint8* srclimit = isrc + len;

+ const uint8* srclimit8 = srclimit - 7; // fast loop limit; NOTE(review): points before str when len < 7 - confirm this is acceptable

+ *bytes_consumed = 0;

+ if (len == 0) return kExitOK;

+ const uint8* Tbl_0 = &st->state_table[st->state0];

+DoAgain:

+ // Do state-table scan

+ int e = 0;

+ uint8 c;

+ // Do fast for groups of 8 identity bytes.

+ // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,

+ // including slowing slightly on cr/lf/ht

+ //----------------------------

+ const uint8* Tbl2 = &st->fast_state[0];

+ uint32 losub = st->losub;

+ uint32 hiadd = st->hiadd;

+ while (src < srclimit8) {

+ uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0]; // NOTE(review): 4-byte load through uint32*; assumes unaligned reads are OK on the target - confirm

+ uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];

+ src += 8;

+ // This is a fast range check for all bytes in [losub..0x80-hiadd)

+ uint32 temp = (s0123 - losub) | (s0123 + hiadd) |

+ (s4567 - losub) | (s4567 + hiadd);

+ if ((temp & 0x80808080) != 0) {

+ // We typically end up here on cr/lf/ht; src was incremented

+ int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |

+ (Tbl2[src[-6]] | Tbl2[src[-5]]);

+ if (e0123 != 0) {src -= 8; break;} // Exit on Non-interchange; rewind to the start of this group of 8

+ e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |

+ (Tbl2[src[-2]] | Tbl2[src[-1]]);

+ if (e0123 != 0) {src -= 4; break;} // Exit on Non-interchange; rewind to the second half of the group

+ // Else OK, go around again

+ }

+ //----------------------------

+ // Byte-at-a-time scan

+ //----------------------------

+ const uint8* Tbl = Tbl_0;

+ while (src < srclimit) {

+ c = *src;

+ e = Tbl[c];

+ src++;

+ if (e >= kExitIllegalStructure) {break;} // entries at or above kExitIllegalStructure stop the scan instead of naming a next state

+ Tbl = &Tbl_0[e << eshift]; // otherwise the entry picks the next state's sub-table

+ }

+ //----------------------------

+ // Exit possibilities:

+ // Some exit code, !state0, back up over last char

+ // Some exit code, state0, back up one byte exactly

+ // source consumed, !state0, back up over partial char

+ // source consumed, state0, exit OK

+ // For illegal byte in state0, avoid backing up over PREVIOUS char

+ // For truncated last char, back up to beginning of it

+ if (e >= kExitIllegalStructure) {

+ // Back up over exactly one byte of rejected/illegal UTF-8 character

+ src--;

+ // Back up more if needed

+ if (!InStateZero(st, Tbl)) {

+ do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); // stop at a non-continuation byte (not 10xxxxxx) or at isrc

+ }

+ } else if (!InStateZero(st, Tbl)) {

+ // Back up over truncated UTF-8 character

+ e = kExitIllegalStructure;

+ do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); // stop at a non-continuation byte (not 10xxxxxx) or at isrc

+ } else {

+ // Normal termination, source fully consumed

+ e = kExitOK;

+ }

+ if (e == kExitDoAgain) {

+ // Loop back up to the fast scan, resuming at the backed-up src

+ goto DoAgain;

+ }

+ *bytes_consumed = src - isrc; // pointer difference stored into an int

+ return e;

Property changes on: third_party\cld\bar\toolbar\cld\i18n\encodings\compact_lang_det\win\cld_utf8statetable.cc

___________________________________________________________________

Added: svn:eol-style

+ LF