| Index: third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/subsetsequence.cc | 
| =================================================================== | 
| --- third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/subsetsequence.cc	(revision 0) | 
| +++ third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/subsetsequence.cc	(revision 0) | 
| @@ -0,0 +1,259 @@ | 
| +// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. | 
| +// Use of this source code is governed by a BSD-style license that can be | 
| +// found in the LICENSE file. | 
| + | 
| +// Remember a subset of a sequence of values, using a modest amount of memory | 
| + | 
| +/*** | 
| +  Design: | 
| +  Accumulate in powers of three, using 3-way median to collapse entries. | 
| +  At any given time, there is one most-dense (highest power of 3) range of | 
| +  entries and a series of less-dense ranges that hold 0..2 entries each. There | 
| +  is a bounded-size storage array of S cells for all the entries. | 
| + | 
| +  The overflow detect is set up so that a new higher power of 3, K+1, is | 
| +  triggered precisely when range K has 3n entries and all ranges < K have | 
| +  zero entries. | 
| + | 
| +  In general, think of the range sizes as a multi-digit base 3 number, except | 
| +  the highest digit may exceed 2: | 
| + | 
| +  3**6  3**5  3**4  3**3  3**2  3**1  3**0  K=2 | 
| +    0     0     0     0   3n-1    2     2   unused:1 | 
| + | 
| +  There are a total of 3n-1 + 2 + 2 entries in use. Assume a size limit S at | 
| +  one more than that, and we add a new 3**0 entry and "carry" by performing | 
| +  medians on any group of 3 elements: | 
| + | 
| +  3**6  3**5  3**4  3**3  3**2  3**1  3**0       K=2 | 
| +    0     0     0     0   3n-1    2     3        unused:0 | 
| +    0     0     0     0   3n-1    3     0 carry  unused:2 | 
| +    0     0     0     0    3n     0     0 carry  unused:4 | 
| + | 
| +  To accumulate 2 entries at all levels < K and 3 just before the first carry at | 
| +  level 0, we need 2*K + 1 unused cells after doing all carries, or five cells | 
| +  in this case. Since we only have 4 cells in the example above, we need to | 
| +  make room by starting a new power of three: | 
| + | 
| +  3**6  3**5  3**4  3**3  3**2  3**1  3**0 | 
| +    0     0     0     0    3n     0     0        K=2 unused:4 | 
| +    0     0     0     n     0     0     0        K=3 unused:2n+4 | 
| + | 
| +  In the code below, we don't worry about overflow from the topmost place. | 
| + | 
| + | 
| +***/ | 
| + | 
| +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/subsetsequence.h" | 
| +#include <stdio.h> | 
| + | 
| +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_logging.h" | 
| + | 
| +void DumpInts(const char* label, const int* v, int n) { | 
| +  printf("%s ", label); | 
| +  for (int i = 0; i < n; ++i) { | 
| +    printf("%d ", v[i]); | 
| +  } | 
| +  printf("\n"); | 
| +} | 
| + | 
| +void DumpUint8s(const char* label, const uint8* v, int n) { | 
| +  printf("%s ", label); | 
| +  for (int i = 0; i < n; ++i) { | 
| +    printf("%d ", v[i]); | 
| +  } | 
| +  printf("\n"); | 
| +} | 
| + | 
| +// Return median of seq_[sub] .. seq_[sub+2], favoring middle element | 
| +uint8 SubsetSequence::Median3(int sub) { | 
| +  if (seq_[sub] == seq_[sub + 1]) { | 
| +    return seq_[sub]; | 
| +  } | 
| +  if (seq_[sub] == seq_[sub + 2]) { | 
| +    return seq_[sub]; | 
| +  } | 
| +  return seq_[sub + 1]; | 
| +} | 
| + | 
| +void SubsetSequence::Init() { | 
| +  // printf("Init\n"); | 
| + | 
| +  k_ = 0; | 
| +  count_[0] = 0; | 
| +  next_e_ = 0; | 
| +  seq_[0] = 0;    // Default value if no calls to Add | 
| + | 
| +  // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3 | 
| +  int reserve = (2 * k_ + 1); | 
| +  level_limit_e_ = kMaxSeq_ - reserve; | 
| +  level_limit_e_ = (level_limit_e_ / 3) * 3;  // Round down to multiple of 3 | 
| +  limit_e_ = level_limit_e_; | 
| +} | 
| + | 
| +// Compress level k by 3x, creating level k+1 | 
| +void SubsetSequence::NewLevel() { | 
| +  // printf("NewLevel 3 ** %d\n", k_ + 1); | 
| +  //DumpUint8s("count[k]", count_, k_ + 1); | 
| +  //DumpUint8s("seq[next]", seq_, next_e_); | 
| + | 
| +  // Incoming level must be an exact multiple of three in size | 
| +  CHECK((count_[k_] % 3) == 0); | 
| +  int k_size = count_[k_]; | 
| +  int new_size = k_size / 3; | 
| + | 
| +  // Compress down by 3x, via median | 
| +  for (int j = 0; j < new_size; ++j) { | 
| +    seq_[j] = Median3(j * 3); | 
| +  } | 
| + | 
| +  // Update counts | 
| +  count_[k_] = 0; | 
| +  // Else Overflow -- just continue with 3x dense Level K | 
| +  if (k_ < (kMaxLevel_ - 1)) {++k_;} | 
| +  count_[k_] = new_size; | 
| + | 
| +  // Update limits | 
| +  next_e_ = new_size; | 
| +  limit_e_ = next_e_ + 3; | 
| + | 
| +  // Want largest <= kMaxSeq_ that allows reserve and makes count_[k_] = 0 mod 3 | 
| +  int reserve = (2 * k_ + 1); | 
| +  level_limit_e_ = kMaxSeq_ - reserve; | 
| +  level_limit_e_ = (level_limit_e_ / 3) * 3;  // Round down to multiple of 3 | 
| +                                              // | 
| +  //DumpUint8s("after: count[k]", count_, k_ + 1); | 
| +  //DumpUint8s("after: seq[next]", seq_, next_e_); | 
| +} | 
| + | 
| +void SubsetSequence::DoCarries() { | 
| +  CHECK(count_[k_] > 3);    // We depend on count_[k_] being > 3 to stop while | 
| +  // Make room by carrying | 
| + | 
| +  //DumpUint8s("DoCarries count[k]", count_, k_ + 1); | 
| +  //DumpUint8s("DoCarries seq[next]", seq_, next_e_); | 
| + | 
| +  int i = 0; | 
| +  while (count_[i] == 3) { | 
| +    next_e_ -= 3; | 
| +    seq_[next_e_] = Median3(next_e_); | 
| +    ++next_e_; | 
| +    count_[i] = 0; | 
| +    ++count_[i + 1]; | 
| +    ++i; | 
| +  } | 
| +  limit_e_ = next_e_ + 3; | 
| + | 
| +  //DumpUint8s("after: DoCarries count[k]", count_, k_ + 1); | 
| +  //DumpUint8s("after: DoCarries seq[next]", seq_, next_e_); | 
| + | 
| +  // If we just fully carried into level K, | 
| +  // Make sure there is now enough room, else start level K + 1 | 
| +  if (i >= k_) { | 
| +    CHECK(count_[k_] == next_e_); | 
| +    if (next_e_ >= level_limit_e_) { | 
| +      NewLevel(); | 
| +    } | 
| +  } | 
| +} | 
| + | 
| +void SubsetSequence::Add(uint8 e) { | 
| +  // Add an entry then carry as needed | 
| +  seq_[next_e_] = e; | 
| +  ++next_e_; | 
| +  ++count_[0]; | 
| + | 
| +  if (next_e_ >= limit_e_) { | 
| +    DoCarries(); | 
| +  } | 
| +} | 
| + | 
| + | 
| +// Collapse tail end by simple median across disparate-weight values, | 
| +// dropping or duplicating last value if need be. | 
| +// This routine is idempotent. | 
| +void SubsetSequence::Flush() { | 
| +  // printf("Flush %d\n", count_[k_]); | 
| +  int start_tail = count_[k_]; | 
| +  int size_tail = next_e_ - start_tail; | 
| +  if ((size_tail % 3) == 2) { | 
| +    seq_[next_e_] = seq_[next_e_ - 1];      // Duplicate last value | 
| +    ++size_tail; | 
| +  } | 
| + | 
| +  // Compress tail down by 3x, via median | 
| +  int new_size = size_tail / 3;             // May delete last value | 
| +  for (int j = 0; j < new_size; ++j) { | 
| +    seq_[start_tail + j] = Median3(start_tail + j * 3); | 
| +  } | 
| + | 
| +  next_e_ = start_tail + new_size; | 
| +  count_[k_] = next_e_; | 
| +} | 
| + | 
| + | 
| +// Extract representative pattern of exactly N values into dst[0..n-1] | 
| +// This routine may be called multiple times, but it may downsample as a | 
| +// side effect, causing subsequent calls with larger N to get poor answers. | 
| +void SubsetSequence::Extract(int to_n, uint8* dst) { | 
| +  // Collapse partial-carries in tail | 
| +  Flush(); | 
| + | 
| +  // Just use Bresenham to resample | 
| +  int from_n = next_e_; | 
| +  if (to_n >= from_n) { | 
| +    // Up-sample from_n => to_n | 
| +    int err = to_n - 1;          // bias toward no overshoot | 
| +    int j = 0; | 
| +    for (int i = 0; i < to_n; ++i) { | 
| +      dst[i] = seq_[j]; | 
| +      err -= from_n; | 
| +      if (err < 0) { | 
| +        ++j; | 
| +        err += to_n; | 
| +      } | 
| +    } | 
| +  } else { | 
| +    // Get to the point that the number of samples is <= 3 * to_n | 
| +    while (next_e_ > (to_n * 3)) { | 
| +      // Compress down by 3x, via median | 
| +      // printf("Extract, median %d / 3\n", next_e_); | 
| +      if ((next_e_ % 3) == 2) { | 
| +        seq_[next_e_] = seq_[next_e_ - 1];    // Duplicate last value | 
| +        ++next_e_; | 
| +      } | 
| +      int new_size = next_e_ / 3;             // May delete last value | 
| +      for (int j = 0; j < new_size; ++j) { | 
| +        seq_[j] = Median3(j * 3); | 
| +      } | 
| +      next_e_ = new_size; | 
| +      count_[k_] = next_e_; | 
| +    } | 
| +    from_n = next_e_; | 
| + | 
| +    if (to_n == from_n) { | 
| +      // Copy verbatim | 
| +      for (int i = 0; i < to_n; ++i) { | 
| +        dst[i] = seq_[i]; | 
| +      } | 
| +      return; | 
| +    } | 
| + | 
| +    // Down-sample from_n => to_n, using medians | 
| +    int err = 0;          // Bias to immediate median sample | 
| +    int j = 0; | 
| +    for (int i = 0; i < from_n; ++i) { | 
| +      err -= to_n; | 
| +      if (err < 0) { | 
| +        if (i <= (next_e_ - 2)) { | 
| +          dst[j] = Median3(i); | 
| +        } else { | 
| +          dst[j] = seq_[i]; | 
| +        } | 
| +        ++j; | 
| +        err += from_n; | 
| +      } | 
| +    } | 
| +  } | 
| + | 
| +} | 
|  | 
| Property changes on: third_party\cld\bar\toolbar\cld\i18n\encodings\compact_lang_det\subsetsequence.cc | 
| ___________________________________________________________________ | 
| Added: svn:eol-style | 
| + LF | 
|  | 
|  |