third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc - Issue 122007: [chromium-reviews] Add Compact Language Detection (CLD) library to Chrome. This works in Windows...

Unified Diff: third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc

Issue 122007: [chromium-reviews] Add Compact Language Detection (CLD) library to Chrome. This works in Windows... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 11 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.h ('k') | third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/letterscript_enum.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc

===================================================================

--- third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc (revision 0)

+++ third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc (revision 0)

@@ -0,0 +1,570 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.h"

+#include <stdio.h>

+#include <string.h>

+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/lang_enc.h"

+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8propjustletter.h"

+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8propletterscriptnum.h"

+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8scannotjustletterspecial.h"

+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_basictypes.h"

+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_commandlineflags.h"

+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_google.h"

+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_htmlutils.h"

+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unilib.h"

+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.h"

+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8utils.h"

+static const Language GRAY_LANG = (Language)254;

+static const int kMaxUpToWordBoundary = 50; // span < this make longer,

+ // else make shorter

+static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes

+ // to round to word boundary,

+ // direction above

+static const char kSpecialSymbol[256] = { // true for < > &

+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

+ 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,

+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,

+};

+#define LT 0 // <

+#define GT 1 // >

+#define EX 2 // !

+#define HY 3 // -

+#define QU 4 // "

+#define AP 5 // '

+#define SL 6 // /

+#define S_ 7

+#define C_ 8

+#define R_ 9

+#define I_ 10

+#define P_ 11

+#define T_ 12

+#define Y_ 13

+#define L_ 14

+#define E_ 15

+#define CR 16 // <cr> or <lf>

+#define NL 17 // non-letter: ASCII whitespace, digit, punctuation

+#define PL 18 // possible letter, incl. &

+#define xx 19 // <unused>

+// Map byte to one of ~20 interesting categories for cheap tag parsing

+static const uint8 kCharToSub[256] = {

+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,

+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,

+ NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,

+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,

+ PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,

+ P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,

+ PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,

+ P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,

+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,

+ PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,

+};

+#undef LT

+#undef GT

+#undef EX

+#undef HY

+#undef QU

+#undef AP

+#undef SL

+#undef S_

+#undef C_

+#undef R_

+#undef I_

+#undef P_

+#undef T_

+#undef Y_

+#undef L_

+#undef E_

+#undef CR

+#undef NL

+#undef PL

+#undef xx

+#define OK 0

+#define X_ 1

+// State machine to do cheap parse of non-letter strings incl. tags

+// advances <tag>

+// | |

+// advances <tag> ... </tag> for <script> <style>

+// | |

+// advances <!-- ... <tag> ... -->

+// | |

+// advances <tag

+// || (0)

+// advances <tag <tag2>

+// || (0)

+static const uint8 kTagParseTbl_0[] = {

+// < > ! - " ' / S C R I P T Y L E CR NL PL xx

+ 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK

+ X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error

+ 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL*

+ X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] <

+ X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <!

+ X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!-

+ 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.*

+ 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*-

+ 6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*--

+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.*

+ 10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"

+ 11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'

+ X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '

+// < > ! - " ' / S C R I P T Y L E CR NL PL xx

+ X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S

+ X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC

+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR

+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI

+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP

+ X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT

+ 20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*

+ 19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<

+ 19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</

+ 19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S

+ 19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC

+ 19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR

+ 19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI

+ 19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP

+ 19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT

+// < > ! - " ' / S C R I P T Y L E CR NL PL xx

+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST

+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY

+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL

+ X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE

+ 33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*

+ 32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<

+ 32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</

+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S

+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST

+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY

+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL

+ 32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE

+};

+#undef OK

+#undef X_

+/*

+// Convert GetTimeOfDay output to 64-bit usec

+static inline uint64 Microseconds(const struct timeval& t) {

+ // The SumReducer uses uint64, so convert to (uint64) microseconds,

+ // not (double) seconds.

+ return t.tv_sec * 1000000ULL + t.tv_usec;

+*/

+// Returns true if character is < > or &

+bool inline IsSpecial(char c) {

+ if ((c & 0xe0) == 0x20) {

+ return kSpecialSymbol[static_cast<uint8>(c)];

+ }

+ return false;

+// Quick Skip to next letter or < > & or to end of string (eos)

+// Always return is_letter for eos

+int ScanToLetterOrSpecial(const char* src, int len) {

+ int bytes_consumed;

+ cld::UTF8GenericScan(&utf8scannotjustletterspecial_obj, src, len,

+ &bytes_consumed);

+ return bytes_consumed;

+// src points to non-letter, such as tag-opening '<'

+// Return length from here to next possible letter

+// On eos, return len; on another < before >, return length just past the first <

+// advances <tag>

+// | |

+// advances <tag> ... </tag> for <script> <style>

+// | |

+// advances <!-- ... <tag> ... -->

+// | |

+// advances <tag

+// || (1)

+// advances <tag <tag2>

+// || (1)

+int ScanToPossibleLetter(const char* isrc, int len) {

+ const uint8* src = reinterpret_cast<const uint8*>(isrc);

+ const uint8* srclimit = src + len;

+ const uint8* tagParseTbl = kTagParseTbl_0;

+ int e = 0;

+ while (src < srclimit) {

+ e = tagParseTbl[kCharToSub[*src++]];

+ if ((e & ~1) == 0) {

+ // We overshot by one byte

+ --src;

+ break;

+ }

+ tagParseTbl = &kTagParseTbl_0[e * 20];

+ }

+ if (src >= srclimit) {

+ // We fell off the end of the text.

+ // It looks like the most common case for this is a truncated file, not

+ // mismatched angle brackets. So we pretend that the last char was '>'

+ return len;

+ }

+ // OK to be in state 0 or state 2 at exit

+ if ((e != 0) && (e != 2)) {

+ // Error, '<' followed by '<'

+ // We want to back up to first <, then advance by one byte past it

+ int offset = src - reinterpret_cast<const uint8*>(isrc);

+ // printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc);

+ // Backscan to first '<' and return enough length to just get past it

+ --offset; // back up over the second '<', which caused us to stop

+ while ((0 < offset) && (isrc[offset] != '<')) {

+ // Find the first '<', which is unmatched

+ --offset;

+ }

+ // skip to just beyond first '<'

+ // printf(" returning %d\n", offset + 1);

+ return offset + 1;

+ }

+ return src - reinterpret_cast<const uint8*>(isrc);

+ScriptScanner::ScriptScanner(const char* buffer,

+ int buffer_length,

+ bool is_plain_text)

+ : start_byte_(buffer),

+ next_byte_(buffer),

+ next_byte_limit_(buffer + buffer_length),

+ byte_length_(buffer_length),

+ is_plain_text_(is_plain_text) {

+ script_buffer_ = new char[getone::kMaxScriptBuffer];

+ script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer];

+ScriptScanner::~ScriptScanner() {

+ delete[] script_buffer_;

+ delete[] script_buffer_lower_;

+// Get to the first real non-tag letter or entity that is a letter

+// Sets script of that letter

+// Return len if no more letters

+int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {

+ int sc = UNKNOWN_LSCRIPT;

+ int skip = 0;

+ int tlen, plen;

+ // Do run of non-letters (tag | &NL | NL)*

+ while (skip < len) {

+ // Do fast scan to next interesting byte

+ // int oldskip = skip;

+ skip += ScanToLetterOrSpecial(src + skip, len - skip);

+ // TEMP

+ // printf("ScanToLetterOrSpecial[%d] 0x%02x => [%d] 0x%02x\n",

+ // oldskip, src[oldskip], skip, src[skip]);

+ // Check for no more letters/specials

+ if (skip >= len) {

+ // All done

+ return len;

+ }

+ // We are at a letter, nonletter, tag, or entity

+ if (IsSpecial(src[skip]) && !is_plain_text_) {

+ if (src[skip] == '<') {

+ // Beginning of tag; skip to end and go around again

+ tlen = ScanToPossibleLetter(src + skip, len - skip);

+ sc = 0;

+ // printf("<...> ");

+ } else if (src[skip] == '>') {

+ // Unexpected end of tag; skip it and go around again

+ tlen = 1; // Over the >

+ sc = 0;

+ // printf("..> ");

+ } else if (src[skip] == '&') {

+ // Expand entity, no advance

+ char temp[4];

+ EntityToBuffer(src + skip, len - skip,

+ temp, &tlen, &plen);

+ sc = getone::GetUTF8LetterScriptNum(temp);

+ // printf("#(%02x%02x)=%d ", temp[0], temp[1], sc);

+ }

+ } else {

+ // Update 1..4 bytes

+ tlen = cld_UniLib::OneCharLen(src + skip);

+ sc = getone::GetUTF8LetterScriptNum(src + skip);

+ // printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc);

+ }

+ // TEMP

+ // printf("sc=%d ", sc);

+ if (sc != 0) {break;} // Letter found

+ skip += tlen; // Advance

+ }

+ *script = sc;

+ return skip;

+// Copy next run of same-script non-tag letters to buffer [NUL terminated]

+// Buffer has leading space and all text is lowercased

+bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) {

+ span->text = script_buffer_;

+ span->text_bytes = 0;

+ span->offset = next_byte_ - start_byte_;

+ span->script = UNKNOWN_LSCRIPT;

+ span->lang = UNKNOWN_LANGUAGE;

+ span->truncated = false;

+ // printf("GetOneScriptSpan[[ ");

+ // struct timeval script_start, script_mid, script_end;

+ int spanscript; // The script of this span

+ int sc = UNKNOWN_LSCRIPT; // The script of next character

+ int tlen, plen;

+ script_buffer_[0] = ' '; // Always a space at front of output

+ script_buffer_[1] = '\0';

+ int take = 0;

+ int put = 1; // Start after the initial space

+ // gettimeofday(&script_start, NULL);

+ // Get to the first real non-tag letter or entity that is a letter

+ int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);

+ next_byte_ += skip;

+ byte_length_ -= skip;

+ if (byte_length_ <= 0) {

+ // printf("]]\n");

+ return false; // No more letters to be found

+ }

+ // gettimeofday(&script_mid, NULL);

+ // There is at least one letter, so we know the script for this span

+ // printf("{%d} ", spanscript);

+ span->script = (UnicodeLScript)spanscript;

+ // Go over alternating spans of same-script letters and non-letters,

+ // copying letters to buffer with single spaces for each run of non-letters

+ while (take < byte_length_) {

+ // Copy run of letters in same script (&LS | LS)*

+ int letter_count = 0; // Keep track of word length

+ bool need_break = false;

+ while (take < byte_length_) {

+ // We are at a letter, nonletter, tag, or entity

+ if (IsSpecial(next_byte_[take]) && !is_plain_text_) {

+ // printf("\"%c\" ", next_byte_[take]);

+ if (next_byte_[take] == '<') {

+ // Beginning of tag

+ sc = 0;

+ break;

+ } else if (next_byte_[take] == '>') {

+ // Unexpected end of tag

+ sc = 0;

+ break;

+ } else if (next_byte_[take] == '&') {

+ // Copy entity, no advance

+ EntityToBuffer(next_byte_ + take, byte_length_ - take,

+ script_buffer_ + put, &tlen, &plen);

+ sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);

+ }

+ } else {

+ // Real letter, safely copy up to 4 bytes, increment by 1..4

+ // Will update by 1..4 bytes at Advance, below

+ tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take);

+ if (take < (byte_length_ - 3)) {

+ // Fast case

+ *reinterpret_cast<uint32*>(script_buffer_ + put) =

+ *reinterpret_cast<const uint32*>(next_byte_ + take);

+ } else {

+ // Slow case, happens 1-3 times per input document

+ memcpy(script_buffer_ + put, next_byte_ + take, plen);

+ }

+ sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);

+ }

+ // printf("sc(%c)=%d ", next_byte_[take], sc);

+ // char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen);

+ // xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc);

+ // Allow continue across a single letter in a different script:

+ // A B D = three scripts, c = common script, i = inherited script,

+ // - = don't care, ( = take position before the += below

+ // AAA(A- continue

+ //

+ // AAA(BA continue

+ // AAA(BB break

+ // AAA(Bc continue (breaks after B)

+ // AAA(BD break

+ // AAA(Bi break

+ //

+ // AAA(c- break

+ //

+ // AAA(i- continue

+ //

+ if ((sc != spanscript) && (sc != ULScript_Inherited)) {

+ // Might need to break this script span

+ if (sc == ULScript_Common) {

+ need_break = true;

+ } else {

+ // Look at next following character, ignoring entity as Common

+ int sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen);

+ if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {

+ need_break = true;

+ }

+ if (need_break) {break;} // Non-letter or letter in wrong script

+ take += tlen; // Advance

+ put += plen; // Advance

+ ++letter_count;

+ if (put >= getone::kMaxScriptBytes) {

+ // Buffer is full

+ span->truncated = true;

+ break;

+ }

+ } // End while letters

+ // Do run of non-letters (tag | &NL | NL)*

+ while (take < byte_length_) {

+ // Do fast scan to next interesting byte

+ take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);

+ // Check for no more letters/specials

+ if (take >= byte_length_) {

+ take = byte_length_;

+ break;

+ }

+ // We are at a letter, nonletter, tag, or entity

+ if (IsSpecial(next_byte_[take]) && !is_plain_text_) {

+ // printf("\"%c\" ", next_byte_[take]);

+ if (next_byte_[take] == '<') {

+ // Beginning of tag; skip to end and go around again

+ tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take);

+ sc = 0;

+ // printf("<...> ");

+ } else if (next_byte_[take] == '>') {

+ // Unexpected end of tag; skip it and go around again

+ tlen = 1; // Over the >

+ sc = 0;

+ // printf("..> ");

+ } else if (next_byte_[take] == '&') {

+ // Expand entity, no advance

+ EntityToBuffer(next_byte_ + take, byte_length_ - take,

+ script_buffer_ + put, &tlen, &plen);

+ sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put);

+ }

+ } else {

+ // Update 1..4

+ tlen = cld_UniLib::OneCharLen(next_byte_ + take);

+ sc = getone::GetUTF8LetterScriptNum(next_byte_ + take);

+ }

+ // printf("sc[%c]=%d ", next_byte_[take], sc);

+ if (sc != 0) {break;} // Letter found

+ take += tlen; // Advance

+ } // End while not-letters

+ script_buffer_[put++] = ' ';

+ // We are at a letter again (or eos), after letter* not-letter*

+ if (sc != spanscript) {break;} // Letter in wrong script

+ if (put >= getone::kMaxScriptBytes - 8) {

+ // Buffer is almost full

+ span->truncated = true;

+ break;

+ }

+ // Update input position

+ next_byte_ += take;

+ byte_length_ -= take;

+ // Put four more spaces/NUL. Worst case is abcd _ _ _ \0

+ // kMaxScriptBytes | | put

+ script_buffer_[put + 0] = ' ';

+ script_buffer_[put + 1] = ' ';

+ script_buffer_[put + 2] = ' ';

+ script_buffer_[put + 3] = '\0';

+ span->text_bytes = put; // Does not include the last four chars above

+ // printf(" %d]]\n\n", put);

+ return true;

+// Force Latin, Cyrillic, Greek scripts to be lowercase

+void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) {

+ // On Windows, text is lowercased beforehand, so no need to do anything here.

+#if !defined(CLD_WINDOWS)

+ // If needed, lowercase all the text. If we do it sooner, might miss

+ // lowercasing an entity such as &Aacute;

+ // We only need to do this for Latin, Cyrillic, and Greek scripts

+ if ((span->script == ULScript_Latin) ||

+ (span->script == ULScript_Cyrillic) ||

+ (span->script == ULScript_Greek)) {

+ // Full Unicode lowercase of the entire buffer, including

+ // four pad bytes off the end

+ int consumed, filled;

+ UniLib::ToLower(span->text, span->text_bytes + 4,

+ script_buffer_lower_, getone::kMaxScriptLowerBuffer,

+ &consumed, &filled);

+ span->text = script_buffer_lower_;

+ span->text_bytes = filled - 4;

+ }

+#endif

+// Copy next run of same-script non-tag letters to buffer [NUL terminated]

+// Force Latin, Cyrillic, and Greek scripts to be lowercase

+bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) {

+ bool ok = GetOneScriptSpan(span);

+ LowerScriptSpan(span);

+ return ok;

+// Gets lscript number for letters; always returns

+// 0 (common script) for non-letters

+int getone::GetUTF8LetterScriptNum(const char* src) {

+ int srclen = cld_UniLib::OneCharLen(src);

+ const uint8* usrc = reinterpret_cast<const uint8*>(src);

+ return UTF8GenericProperty(&utf8propletterscriptnum_obj, &usrc, &srclen);