| Index: third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.cc
 | 
| ===================================================================
 | 
| --- third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.cc	(revision 0)
 | 
| +++ third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.cc	(revision 0)
 | 
| @@ -0,0 +1,229 @@
 | 
| +// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
 | 
| +// Use of this source code is governed by a BSD-style license that can be
 | 
| +// found in the LICENSE file.
 | 
| +
 | 
| +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.h"
 | 
| +
 | 
| +// This code is copied from google3/util/utf8/internal/utf8statetable.cc and was
 | 
| +// not modified (it generates a lot of lint warnings, but I decided not to fix
 | 
| +// them to simplify its maintenance).
 | 
| +
 | 
| +
 | 
| +// Return true if Tbl points into the state0 range [Tbl0, Tbl0 + st->state0_size)
 | 
| +// Note that the unsigned compare checks both ends of the range simultaneously
 | 
| +static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
 | 
| +  const uint8* Tbl0 = &st->state_table[st->state0];  // first entry of state0
 | 
| +  return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);  // Tbl < Tbl0 gives a negative offset that converts to a huge uint32
 | 
| +}
 | 
| +
 | 
| +
 | 
| +// Look up property of one UTF-8 character and advance *src / *srclen over it
 | 
| +// Return 0 (without advancing) if *srclen is zero or negative
 | 
| +// Return 0 and advance one byte if the lead byte is ill-formed or the input is too short for it
 | 
| +uint8 UTF8GenericProperty(const UTF8PropObj* st,
 | 
| +                          const uint8** src,
 | 
| +                          int* srclen) {
 | 
| +  if (*srclen <= 0) {
 | 
| +    return 0;
 | 
| +  }
 | 
| +
 | 
| +  const uint8* lsrc = *src;  // local copy of the input cursor
 | 
| +  const uint8* Tbl_0 = &st->state_table[st->state0];  // base that all state offsets below are relative to
 | 
| +  const uint8* Tbl = Tbl_0;  // row of the current state
 | 
| +  int e;  // most recent table entry; its final value is the returned property
 | 
| +  int eshift = st->entry_shift;  // entry e selects the row at Tbl_0[e << eshift]
 | 
| +
 | 
| +  // Short series of tests faster than switch, optimizes 7-bit ASCII
 | 
| +  unsigned char c = lsrc[0];
 | 
| +  if (static_cast<signed char>(c) >= 0) {           // one byte (high bit clear)
 | 
| +    e = Tbl[c];
 | 
| +    *src += 1;
 | 
| +    *srclen -= 1;
 | 
| +  } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes (lead 110xxxxx)
 | 
| +    e = Tbl[c];
 | 
| +    Tbl = &Tbl_0[e << eshift];  // step to the row selected by the lead byte
 | 
| +    e = Tbl[lsrc[1]];
 | 
| +    *src += 2;
 | 
| +    *srclen -= 2;
 | 
| +  } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes (lead 1110xxxx)
 | 
| +    e = Tbl[c];
 | 
| +    Tbl = &Tbl_0[e << eshift];
 | 
| +    e = Tbl[lsrc[1]];
 | 
| +    Tbl = &Tbl_0[e << eshift];
 | 
| +    e = Tbl[lsrc[2]];
 | 
| +    *src += 3;
 | 
| +    *srclen -= 3;
 | 
| +  }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes (lead 11110xxx)
 | 
| +    e = Tbl[c];
 | 
| +    Tbl = &Tbl_0[e << eshift];
 | 
| +    e = Tbl[lsrc[1]];
 | 
| +    Tbl = &Tbl_0[e << eshift];
 | 
| +    e = Tbl[lsrc[2]];
 | 
| +    Tbl = &Tbl_0[e << eshift];
 | 
| +    e = Tbl[lsrc[3]];
 | 
| +    *src += 4;
 | 
| +    *srclen -= 4;
 | 
| +  } else {                                                // Ill-formed lead byte, or too few bytes left
 | 
| +    e = 0;
 | 
| +    *src += 1;
 | 
| +    *srclen -= 1;
 | 
| +  }
 | 
| +  return e;  // int implicitly converted to the uint8 return type
 | 
| +}
 | 
| +
 | 
| +// BigOneByte versions are needed for tables > 240 states, but most
 | 
| +// won't need the TwoByte versions.
 | 
| +// Internally, the next-to-last offset is multiplied by 16 and the last
 | 
| +// offset is relative (to the next-to-last row) instead of absolute.
 | 
| +// Look up property of one UTF-8 character and advance *src / *srclen over it
 | 
| +// Return 0 (without advancing) if *srclen is zero or negative
 | 
| +// Return 0 and advance one byte if the lead byte is ill-formed or the input is too short for it
 | 
| +uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
 | 
| +                          const uint8** src,
 | 
| +                          int* srclen) {
 | 
| +  if (*srclen <= 0) {
 | 
| +    return 0;
 | 
| +  }
 | 
| +
 | 
| +  const uint8* lsrc = *src;  // local copy of the input cursor
 | 
| +  const uint8* Tbl_0 = &st->state_table[st->state0];  // base that absolute state offsets are relative to
 | 
| +  const uint8* Tbl = Tbl_0;  // row of the current state
 | 
| +  int e;  // most recent table entry; its final value is the returned property
 | 
| +  int eshift = st->entry_shift;
 | 
| +
 | 
| +  // Short series of tests faster than switch, optimizes 7-bit ASCII
 | 
| +  unsigned char c = lsrc[0];
 | 
| +  if (static_cast<signed char>(c) >= 0) {           // one byte (high bit clear)
 | 
| +    e = Tbl[c];
 | 
| +    *src += 1;
 | 
| +    *srclen -= 1;
 | 
| +  } else if (((c & 0xe0) == 0xc0) && (*srclen >= 2)) {     // two bytes (lead 110xxxxx); plain absolute offset
 | 
| +    e = Tbl[c];
 | 
| +    Tbl = &Tbl_0[e << eshift];
 | 
| +    e = Tbl[lsrc[1]];
 | 
| +    *src += 2;
 | 
| +    *srclen -= 2;
 | 
| +  } else if (((c & 0xf0) == 0xe0) && (*srclen >= 3)) {     // three bytes (lead 1110xxxx)
 | 
| +    e = Tbl[c];
 | 
| +    Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
 | 
| +    e = (reinterpret_cast<const int8*>(Tbl))[lsrc[1]];  // entry read as signed int8, so it may be negative
 | 
| +    Tbl = &Tbl[e << eshift];          // Relative +/-; NOTE(review): left shift of a negative e is undefined before C++20
 | 
| +    e = Tbl[lsrc[2]];
 | 
| +    *src += 3;
 | 
| +    *srclen -= 3;
 | 
| +  }else if (((c & 0xf8) == 0xf0) && (*srclen >= 4)) {     // four bytes (lead 11110xxx)
 | 
| +    e = Tbl[c];
 | 
| +    Tbl = &Tbl_0[e << eshift];
 | 
| +    e = Tbl[lsrc[1]];
 | 
| +    Tbl = &Tbl_0[e << (eshift + 4)];  // 16x the range
 | 
| +    e = (reinterpret_cast<const int8*>(Tbl))[lsrc[2]];  // entry read as signed int8, so it may be negative
 | 
| +    Tbl = &Tbl[e << eshift];          // Relative +/-; NOTE(review): left shift of a negative e is undefined before C++20
 | 
| +    e = Tbl[lsrc[3]];
 | 
| +    *src += 4;
 | 
| +    *srclen -= 4;
 | 
| +  } else {                                                // Ill-formed lead byte, or too few bytes left
 | 
| +    e = 0;
 | 
| +    *src += 1;
 | 
| +    *srclen -= 1;
 | 
| +  }
 | 
| +  return e;  // int implicitly converted to the uint8 return type
 | 
| +}
 | 
| +
 | 
| +// Scan the UTF-8 bytes str[0..len) based on the state table in st.
 | 
| +// Always scan complete UTF-8 characters
 | 
| +// Set *bytes_consumed to the number of bytes scanned. Return reason for exiting
 | 
| +int UTF8GenericScan(const UTF8ScanObj* st,
 | 
| +                    const uint8* str,
 | 
| +                    const int len,
 | 
| +                    int* bytes_consumed) {
 | 
| +  int eshift = st->entry_shift;        // 6 (space optimized) or 8
 | 
| +  // int nEntries = (1 << eshift);       // 64 or 256 entries per state
 | 
| +
 | 
| +  const uint8* isrc = str;  // start of input; lower bound for backing up and base for the byte count
 | 
| +    //reinterpret_cast<const uint8*>(str.data());
 | 
| +  const uint8* src = isrc;  // scan cursor
 | 
| +  //const int len = str.length();
 | 
| +  const uint8* srclimit = isrc + len;  // one past the last input byte
 | 
| +  const uint8* srclimit8 = srclimit - 7;  // src < srclimit8 means 8 bytes remain; NOTE(review): points before str when len < 7
 | 
| +  *bytes_consumed = 0;
 | 
| +  if (len == 0) return kExitOK;
 | 
| +
 | 
| +  const uint8* Tbl_0 = &st->state_table[st->state0];  // base that all state offsets below are relative to
 | 
| +
 | 
| +DoAgain:
 | 
| +  // Do state-table scan
 | 
| +  int e = 0;  // last table entry read; re-initialized on every pass through DoAgain
 | 
| +  uint8 c;
 | 
| +
 | 
| +  // Do fast for groups of 8 identity bytes.
 | 
| +  // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,
 | 
| +  // including slowing slightly on cr/lf/ht
 | 
| +  //----------------------------
 | 
| +  const uint8* Tbl2 = &st->fast_state[0];  // per-byte table; a nonzero entry ends the fast scan
 | 
| +  uint32 losub = st->losub;
 | 
| +  uint32 hiadd = st->hiadd;
 | 
| +  while (src < srclimit8) {
 | 
| +    uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];  // NOTE(review): uint32 load through a cast uint8 pointer; src alignment is not checked
 | 
| +    uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
 | 
| +    src += 8;
 | 
| +    // This is a fast range check for all bytes in [losub..0x80-hiadd)
 | 
| +    uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
 | 
| +                  (s4567 - losub) | (s4567 + hiadd);
 | 
| +    if ((temp & 0x80808080) != 0) {  // some byte may be outside the fast range; recheck each one via Tbl2
 | 
| +      // We typically end up here on cr/lf/ht; src was incremented
 | 
| +      int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
 | 
| +                  (Tbl2[src[-6]] | Tbl2[src[-5]]);
 | 
| +      if (e0123 != 0) {src -= 8; break;}    // Exit on Non-interchange (back up to the first group of four)
 | 
| +      e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
 | 
| +              (Tbl2[src[-2]] | Tbl2[src[-1]]);
 | 
| +      if (e0123 != 0) {src -= 4; break;}    // Exit on Non-interchange (back up to the second group of four)
 | 
| +      // Else OK, go around again
 | 
| +    }
 | 
| +  }
 | 
| +  //----------------------------
 | 
| +
 | 
| +  // Byte-at-a-time scan
 | 
| +  //----------------------------
 | 
| +  const uint8* Tbl = Tbl_0;  // row of the current state, starting at state0
 | 
| +  while (src < srclimit) {
 | 
| +    c = *src;
 | 
| +    e = Tbl[c];
 | 
| +    src++;
 | 
| +    if (e >= kExitIllegalStructure) {break;}  // treated as an exit code rather than a next-state index
 | 
| +    Tbl = &Tbl_0[e << eshift];  // move to the row for state e
 | 
| +  }
 | 
| +  //----------------------------
 | 
| +
 | 
| +
 | 
| +  // Exit possibilities:
 | 
| +  //  Some exit code, !state0, back up over last char
 | 
| +  //  Some exit code, state0, back up one byte exactly
 | 
| +  //  source consumed, !state0, back up over partial char
 | 
| +  //  source consumed, state0, exit OK
 | 
| +  // For illegal byte in state0, avoid backing up over PREVIOUS char
 | 
| +  // For truncated last char, back up to beginning of it
 | 
| +
 | 
| +  if (e >= kExitIllegalStructure) {
 | 
| +    // Back up over exactly one byte of rejected/illegal UTF-8 character
 | 
| +    src--;
 | 
| +    // Back up more if needed
 | 
| +    if (!InStateZero(st, Tbl)) {
 | 
| +      do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));  // skip back over 10xxxxxx bytes, never past isrc
 | 
| +    }
 | 
| +  } else if (!InStateZero(st, Tbl)) {
 | 
| +    // Back up over truncated UTF-8 character
 | 
| +    e = kExitIllegalStructure;
 | 
| +    do {src--;} while ((src > isrc) && ((src[0] & 0xc0) == 0x80));  // skip back over 10xxxxxx bytes, never past isrc
 | 
| +  } else {
 | 
| +    // Normal termination, source fully consumed
 | 
| +    e = kExitOK;
 | 
| +  }
 | 
| +
 | 
| +  if (e == kExitDoAgain) {
 | 
| +    // Loop back up to the fast scan
 | 
| +    goto DoAgain;
 | 
| +  }
 | 
| +
 | 
| +  *bytes_consumed = src - isrc;  // pointer difference implicitly converted to int
 | 
| +  return e;
 | 
| +}
 | 
| 
 | 
| Property changes on: third_party\cld\bar\toolbar\cld\i18n\encodings\compact_lang_det\win\cld_utf8statetable.cc
 | 
| ___________________________________________________________________
 | 
| Added: svn:eol-style
 | 
|    + LF
 | 
| 
 | 
| 
 |