Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(784)

Unified Diff: third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.cc

Issue 122007: [chromium-reviews] Add Compact Language Detection (CLD) library to Chrome. This works in Windows... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 11 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.cc
===================================================================
--- third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.cc (revision 0)
+++ third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.cc (revision 0)
@@ -0,0 +1,229 @@
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.h"
+
+// This code is copied from google3/util/utf8/internal/utf8statetable.cc and was
+// not modified (it generates a lot of lint warnings, but I decided not to fix
+// them to simplify its maintenance).
+
+
+// Tell whether Tbl points into the state0 region of st's state table.
+// Converting the offset to unsigned lets one comparison reject both ends.
+static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
+  const uint32 offset = static_cast<uint32>(Tbl - &st->state_table[st->state0]);
+  return offset < st->state0_size;
+}
+
+
+// Return the property byte of the UTF-8 character at *src and step past it.
+// When *srclen <= 0 nothing is consumed and the result is 0.
+// An ill-formed or truncated character consumes one byte and yields 0.
+uint8 UTF8GenericProperty(const UTF8PropObj* st,
+                          const uint8** src,
+                          int* srclen) {
+  const int avail = *srclen;
+  if (avail <= 0) {
+    return 0;
+  }
+
+  const uint8* const cur = *src;
+  const uint8* const base = &st->state_table[st->state0];
+  const int shift = st->entry_shift;
+  const unsigned char lead = cur[0];
+
+  // Character length announced by the lead byte. It stays 0 when the byte
+  // cannot start a character or when fewer than that many bytes are available.
+  int nbytes = 0;
+  if (lead < 0x80) {
+    nbytes = 1;  // 7-bit ASCII
+  } else if ((lead & 0xe0) == 0xc0) {
+    nbytes = 2;
+  } else if ((lead & 0xf0) == 0xe0) {
+    nbytes = 3;
+  } else if ((lead & 0xf8) == 0xf0) {
+    nbytes = 4;
+  }
+  if (nbytes > avail) {
+    nbytes = 0;
+  }
+
+  if (nbytes == 0) {  // Ill-formed: skip exactly one byte
+    *src = cur + 1;
+    *srclen = avail - 1;
+    return 0;
+  }
+
+  // The lead byte indexes the state0 row; every following byte indexes the
+  // row selected by the previous entry.
+  int entry = base[lead];
+  for (int i = 1; i < nbytes; ++i) {
+    entry = base[(entry << shift) + cur[i]];
+  }
+
+  *src = cur + nbytes;
+  *srclen = avail - nbytes;
+  return entry;
+}
+
+// BigOneByte versions are needed for tables with more than 240 states; most
+// tables won't need the TwoByte versions.
+// Internally, for characters of three or four bytes, the next-to-last offset
+// is multiplied by 16 and the last offset is signed and relative to the
+// current row instead of absolute.
+// Return the property byte of the UTF-8 character at *src and step past it.
+// When *srclen <= 0 nothing is consumed and the result is 0.
+// An ill-formed or truncated character consumes one byte and yields 0.
+uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
+                                    const uint8** src,
+                                    int* srclen) {
+  const int avail = *srclen;
+  if (avail <= 0) {
+    return 0;
+  }
+
+  const uint8* const cur = *src;
+  const uint8* const base = &st->state_table[st->state0];
+  const int shift = st->entry_shift;
+  const unsigned char lead = cur[0];
+
+  // Character length announced by the lead byte. It stays 0 when the byte
+  // cannot start a character or when fewer than that many bytes are available.
+  int nbytes = 0;
+  if (lead < 0x80) {
+    nbytes = 1;  // 7-bit ASCII
+  } else if ((lead & 0xe0) == 0xc0) {
+    nbytes = 2;
+  } else if ((lead & 0xf0) == 0xe0) {
+    nbytes = 3;
+  } else if ((lead & 0xf8) == 0xf0) {
+    nbytes = 4;
+  }
+  if (nbytes > avail) {
+    nbytes = 0;
+  }
+
+  int entry = 0;
+  const uint8* row = base;
+  switch (nbytes) {
+    case 1:
+      entry = base[lead];
+      break;
+    case 2:
+      row = base + (base[lead] << shift);
+      entry = row[cur[1]];
+      break;
+    case 3:
+      row = base + (base[lead] << (shift + 4));  // 16x the range
+      entry = reinterpret_cast<const int8*>(row)[cur[1]];
+      row += (entry << shift);  // Relative +/-
+      entry = row[cur[2]];
+      break;
+    case 4:
+      row = base + (base[lead] << shift);
+      row = base + (row[cur[1]] << (shift + 4));  // 16x the range
+      entry = reinterpret_cast<const int8*>(row)[cur[2]];
+      row += (entry << shift);  // Relative +/-
+      entry = row[cur[3]];
+      break;
+    default:  // Ill-formed: skip exactly one byte, entry stays 0
+      nbytes = 1;
+      break;
+  }
+
+  *src = cur + nbytes;
+  *srclen = avail - nbytes;
+  return entry;
+}
+
+// Scan the |len| bytes at |str| with the state table in |st|, stopping at the first byte the table rejects.
+// A rejected or truncated character is never partially counted; the scan backs up to where it began.
+// Stores the number of bytes accepted in |*bytes_consumed| and returns the exit reason (kExitOK on normal termination).
+int UTF8GenericScan(const UTF8ScanObj* st,
+ const uint8* str,
+ const int len,
+ int* bytes_consumed) {
+ int eshift = st->entry_shift; // 6 (space optimized) or 8
+ // int nEntries = (1 << eshift); // 64 or 256 entries per state
+
+ const uint8* isrc = str; // Start of input; bounds the backup loops and anchors the consumed count
+ //reinterpret_cast<const uint8*>(str.data());
+ const uint8* src = isrc; // Current scan position
+ //const int len = str.length();
+ const uint8* srclimit = isrc + len; // One past the last input byte
+ const uint8* srclimit8 = srclimit - 7; // src < srclimit8 means at least 8 bytes remain. NOTE(review): lies before str when len < 7 -- confirm acceptable
+ *bytes_consumed = 0;
+ if (len == 0) return kExitOK; // Empty input: nothing to scan
+
+ const uint8* Tbl_0 = &st->state_table[st->state0]; // Base that state rows are addressed from
+
+DoAgain: // Re-entry point used when the scan ends with kExitDoAgain
+ // Do state-table scan
+ int e = 0;
+ uint8 c;
+
+ // Do fast for groups of 8 identity bytes.
+ // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,
+ // including slowing slightly on cr/lf/ht
+ //----------------------------
+ const uint8* Tbl2 = &st->fast_state[0];
+ uint32 losub = st->losub;
+ uint32 hiadd = st->hiadd;
+ while (src < srclimit8) {
+ uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0]; // NOTE(review): no alignment check on src before these 32-bit loads -- confirm every target allows it
+ uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
+ src += 8;
+ // This is a fast range check for all bytes in [losub..0x80-hiadd)
+ uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
+ (s4567 - losub) | (s4567 + hiadd);
+ if ((temp & 0x80808080) != 0) { // Range check tripped; test the eight bytes through fast_state
+ // We typically end up here on cr/lf/ht; src was incremented
+ int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
+ (Tbl2[src[-6]] | Tbl2[src[-5]]);
+ if (e0123 != 0) {src -= 8; break;} // Non-interchange in the first four bytes: rewind all 8 and exit
+ e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
+ (Tbl2[src[-2]] | Tbl2[src[-1]]);
+ if (e0123 != 0) {src -= 4; break;} // Non-interchange in the last four bytes: rewind to them and exit
+ // Else OK, go around again
+ }
+ }
+ //----------------------------
+
+ // Byte-at-a-time scan
+ //----------------------------
+ const uint8* Tbl = Tbl_0;
+ while (src < srclimit) {
+ c = *src;
+ e = Tbl[c]; // Table entry for this byte: next state or an exit code
+ src++;
+ if (e >= kExitIllegalStructure) {break;} // Entries from kExitIllegalStructure up end the scan
+ Tbl = &Tbl_0[e << eshift]; // Row for state e
+ }
+ //----------------------------
+
+
+ // Exit possibilities:
+ // Some exit code, !state0, back up over last char
+ // Some exit code, state0, back up one byte exactly
+ // source consumed, !state0, back up over partial char
+ // source consumed, state0, exit OK
+ // For illegal byte in state0, avoid backing up over PREVIOUS char
+ // For truncated last char, back up to beginning of it
+
+ if (e >= kExitIllegalStructure) {
+ // Back up over exactly one byte of rejected/illegal UTF-8 character
+ src--;
+ // Back up more if needed
+ if (!InStateZero(st, Tbl)) {
+ do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); // Step back over 10xxxxxx bytes, not past isrc
+ }
+ } else if (!InStateZero(st, Tbl)) {
+ // Back up over truncated UTF-8 character
+ e = kExitIllegalStructure;
+ do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); // Step back over 10xxxxxx bytes, not past isrc
+ } else {
+ // Normal termination, source fully consumed
+ e = kExitOK;
+ }
+
+ if (e == kExitDoAgain) {
+ // Loop back up to the fast scan. NOTE(review): termination presumably depends on the table contents -- confirm
+ goto DoAgain;
+ }
+
+ *bytes_consumed = src - isrc; // Bytes accepted, measured from the start of the input
+ return e;
+}
Property changes on: third_party\cld\bar\toolbar\cld\i18n\encodings\compact_lang_det\win\cld_utf8statetable.cc
___________________________________________________________________
Added: svn:eol-style
+ LF

Powered by Google App Engine
This is Rietveld 408576698