Index: third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc |
=================================================================== |
--- third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc (revision 0) |
+++ third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.cc (revision 0) |
@@ -0,0 +1,570 @@ |
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/getonescriptspan.h" |
+#include <stdio.h> |
+#include <string.h> |
+ |
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/lang_enc.h" |
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8propjustletter.h" |
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8propletterscriptnum.h" |
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/utf8scannotjustletterspecial.h" |
+ |
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_basictypes.h" |
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_commandlineflags.h" |
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_google.h" |
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_htmlutils.h" |
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unilib.h" |
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8statetable.h" |
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_utf8utils.h" |
+ |
+static const Language GRAY_LANG = (Language)254; |
+// Word-boundary rounding limits; neither is used elsewhere in this file |
+static const int kMaxUpToWordBoundary = 50; // span < this make longer, |
+ // else make shorter |
+static const int kMaxAdvanceToWordBoundary = 10; // +/- this many bytes |
+ // to round to word boundary, |
+ // direction above |
+// Per-byte flag table read by IsSpecial(), indexed by the unsigned byte value |
+static const char kSpecialSymbol[256] = { // 1 (true) only for < > & |
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
+ 0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0, |
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
+ // Bytes 0x80..0xff: never special |
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
+}; |
+ |
+ |
+ |
+#define LT 0 // < |
+#define GT 1 // > |
+#define EX 2 // ! |
+#define HY 3 // - |
+#define QU 4 // " |
+#define AP 5 // ' |
+#define SL 6 // / |
+#define S_ 7 |
+#define C_ 8 |
+#define R_ 9 |
+#define I_ 10 |
+#define P_ 11 |
+#define T_ 12 |
+#define Y_ 13 |
+#define L_ 14 |
+#define E_ 15 |
+#define CR 16 // <cr> or <lf> |
+#define NL 17 // non-letter: ASCII whitespace, digit, punctuation |
+#define PL 18 // possible letter, incl. & |
+#define xx 19 // <unused> |
+// Map byte to one of the 20 categories above for cheap tag parsing; the |
+// category is the column index that ScanToPossibleLetter uses in kTagParseTbl_0 |
+static const uint8 kCharToSub[256] = { |
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL, |
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, |
+ NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL, |
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL, |
+ // 0x40..0x7f: the upper-case and lower-case rows are identical |
+ PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, |
+ P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, |
+ PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL, |
+ P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL, |
+ // 0x80..0xbf: all NL |
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, |
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, |
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, |
+ NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, |
+ // 0xc0..0xff: all PL |
+ PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, |
+ PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, |
+ PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, |
+ PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, |
+}; |
+ |
+#undef LT |
+#undef GT |
+#undef EX |
+#undef HY |
+#undef QU |
+#undef AP |
+#undef SL |
+#undef S_ |
+#undef C_ |
+#undef R_ |
+#undef I_ |
+#undef P_ |
+#undef T_ |
+#undef Y_ |
+#undef L_ |
+#undef E_ |
+#undef CR |
+#undef NL |
+#undef PL |
+#undef xx |
+ |
+ |
+#define OK 0 |
+#define X_ 1 |
+// State machine to do cheap parse of non-letter strings incl. tags. |
+// Row = state, column = kCharToSub category, entry = next state (OK/X_ stop) |
+// advances <tag> |
+// | | |
+// advances <tag> ... </tag> for <script> <style> |
+// | | |
+// advances <!-- ... <tag> ... --> |
+// | | |
+// advances <tag |
+// || (0) |
+// advances <tag <tag2> |
+// || (0) |
+static const uint8 kTagParseTbl_0[] = { |
+// < > ! - " ' / S C R I P T Y L E CR NL PL xx |
+ 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [0] OK |
+ X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error |
+ 3, 2, 2, 2, 2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK, 2, 2,OK,X_, // [2] NL* |
+ X_, 2, 4, 9, 10,11, 9,13, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [3] < |
+ X_, 2, 9, 5, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [4] <! |
+ X_, 2, 9, 6, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [5] <!- |
+ 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [6] <!--.* |
+ 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [7] <!--.*- |
+ 6, 2, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,X_, // [8] <!--.*-- |
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [9] <.* |
+ 10,10,10,10, 9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*" |
+ 11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*' |
+ X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " ' |
+ // States 13..27: <SCRIPT ...> up to its closing </SCRIPT> |
+// < > ! - " ' / S C R I P T Y L E CR NL PL xx |
+ X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9, 9, 9, 9,X_, // [13] <S |
+ X_, 2, 9, 9, 10,11, 9, 9, 9,15, 9, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [14] <SC |
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9,16, 9, 9, 9, 9, 9, 9, 9, 9,X_, // [15] <SCR |
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9,17, 9, 9, 9, 9, 9, 9, 9,X_, // [16] <SCRI |
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 18, 9, 9, 9, 9, 9, 9,X_, // [17] <SCRIP |
+ X_,19, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT |
+ 20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .* |
+ 19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*< |
+ 19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [21] <SCRIPT .*</ |
+ 19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S |
+ 19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC |
+ 19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR |
+ 19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI |
+ 19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP |
+ 19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT |
+ // States 28..39: <STYLE ...> up to its closing </STYLE> |
+// < > ! - " ' / S C R I P T Y L E CR NL PL xx |
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9,29, 9, 9, 9, 9, 9,X_, // [28] <ST |
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9,30, 9, 9, 9, 9,X_, // [29] <STY |
+ X_, 2, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9,31, 9, 9, 9,X_, // [30] <STYL |
+ X_,32, 9, 9, 10,11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE |
+ 33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .* |
+ 32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*< |
+ 32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [34] <STYLE .*</ |
+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S |
+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST |
+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY |
+ 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL |
+ 32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE |
+}; |
+ |
+#undef OK |
+#undef X_ |
+ |
+ |
+/* |
+// Convert GetTimeOfDay output to 64-bit usec |
+static inline uint64 Microseconds(const struct timeval& t) { |
+ // The SumReducer uses uint64, so convert to (uint64) microseconds, |
+ // not (double) seconds. |
+ return t.tv_sec * 1000000ULL + t.tv_usec; |
+} |
+*/ |
+ |
+ |
+// True only for '<', '>' and '&'. The mask test admits just bytes 0x20..0x3f, |
+// so any other byte is rejected without consulting kSpecialSymbol |
+bool inline IsSpecial(char c) { |
+  const bool in_range = ((c & 0xe0) == 0x20); |
+  if (!in_range) {return false;} |
+  return kSpecialSymbol[static_cast<uint8>(c)]; |
+} |
+ |
+// Quick skip to the next letter, to one of < > &, or to end of string (eos). |
+// Returns the byte count that UTF8GenericScan stores through its last argument |
+int ScanToLetterOrSpecial(const char* src, int len) { |
+  int skipped; |
+  cld::UTF8GenericScan( |
+      &utf8scannotjustletterspecial_obj, src, len, &skipped); |
+  return skipped; |
+} |
+ |
+ |
+ |
+// isrc points to non-letter, such as tag-opening '<'; len is the bytes left |
+// Return length from here to next possible letter |
+// On eos return len; on another < before >, return just past the first < |
+// advances <tag> |
+// | | |
+// advances <tag> ... </tag> for <script> <style> |
+// | | |
+// advances <!-- ... <tag> ... --> |
+// | | |
+// advances <tag<eos> |
+// | | (len, as if the text had ended with '>') |
+// advances <tag <tag2> |
+// || (1) |
+int ScanToPossibleLetter(const char* isrc, int len) { |
+ const uint8* src = reinterpret_cast<const uint8*>(isrc); |
+ const uint8* srclimit = src + len; |
+ const uint8* tagParseTbl = kTagParseTbl_0; |
+ int e = 0; |
+ while (src < srclimit) { |
+ e = tagParseTbl[kCharToSub[*src++]]; |
+ if ((e & ~1) == 0) { |
+ // State 0 (OK) or 1 (error) ends the scan; we overshot by one byte |
+ --src; |
+ break; |
+ } |
+ tagParseTbl = &kTagParseTbl_0[e * 20]; // Row for state e, 20 columns |
+ } |
+ |
+ if (src >= srclimit) { |
+ // We fell off the end of the text. |
+ // It looks like the most common case for this is a truncated file, not |
+ // mismatched angle brackets. So we pretend that the last char was '>' |
+ return len; |
+ } |
+ |
+ // OK to be in state 0 or state 2 at exit; the break above leaves e as 0 or 1 |
+ if ((e != 0) && (e != 2)) { |
+ // Error, '<' followed by '<' |
+ // We want to back up to first <, then advance by one byte past it |
+ int offset = src - reinterpret_cast<const uint8*>(isrc); |
+ // printf("ScanToPossibleLetter error at %d[%d] in '%s'\n",offset,e,isrc); |
+ |
+ // Backscan to first '<' and return enough length to just get past it |
+ --offset; // back up over the second '<', which caused us to stop |
+ while ((0 < offset) && (isrc[offset] != '<')) { |
+ // Find the first '<', which is unmatched |
+ --offset; |
+ } |
+ // skip to just beyond first '<' |
+ // printf(" returning %d\n", offset + 1); |
+ return offset + 1; |
+ } |
+ |
+ return src - reinterpret_cast<const uint8*>(isrc); |
+} |
+ |
+// Scanner over buffer[0 .. buffer_length); only pointers to the text are kept. |
+// Allocates the two output buffers, which the destructor frees |
+ScriptScanner::ScriptScanner(const char* buffer, |
+ int buffer_length, |
+ bool is_plain_text) |
+ : start_byte_(buffer), |
+ next_byte_(buffer), |
+ next_byte_limit_(buffer + buffer_length), |
+ byte_length_(buffer_length), |
+ is_plain_text_(is_plain_text) { |
+ script_buffer_ = new char[getone::kMaxScriptBuffer]; |
+ script_buffer_lower_ = new char[getone::kMaxScriptLowerBuffer]; |
+} |
+// Releases the two buffers allocated by the constructor |
+ScriptScanner::~ScriptScanner() { |
+ delete[] script_buffer_; |
+ delete[] script_buffer_lower_; |
+} |
+ |
+ |
+ |
+ |
+// Get to the first real non-tag letter or entity that is a letter. |
+//   src, len: the text still to be scanned |
+//   script:   receives the script number found for that letter |
+// Returns the byte offset of that letter, or a value >= len if there are no |
+// more letters (on the early "return len" path *script is left unwritten) |
+int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) { |
+  int sc = UNKNOWN_LSCRIPT; |
+  int skip = 0; |
+  int tlen, plen; |
+ |
+  // Do run of non-letters (tag | &NL | NL)* |
+  while (skip < len) { |
+    // Do fast scan to next interesting byte |
+    skip += ScanToLetterOrSpecial(src + skip, len - skip); |
+ |
+    // Check for no more letters/specials |
+    if (skip >= len) { |
+      // All done |
+      return len; |
+    } |
+ |
+    // We are at a letter, nonletter, tag, or entity |
+    if (IsSpecial(src[skip]) && !is_plain_text_) { |
+      if (src[skip] == '<') { |
+        // Beginning of tag; skip to end and go around again |
+        tlen = ScanToPossibleLetter(src + skip, len - skip); |
+        sc = 0; |
+        // printf("<...> "); |
+      } else if (src[skip] == '>') { |
+        // Unexpected end of tag; skip it and go around again |
+        tlen = 1;         // Over the > |
+        sc = 0; |
+        // printf("..> "); |
+      } else if (src[skip] == '&') { |
+        // Expand entity into a scratch buffer, no advance. The buffer is |
+        // zero-filled so the script lookup below never reads indeterminate |
+        // bytes should EntityToBuffer store nothing (it is defined elsewhere; |
+        // whether that can happen is not visible here) |
+        char temp[4] = {0, 0, 0, 0}; |
+        EntityToBuffer(src + skip, len - skip, |
+                       temp, &tlen, &plen); |
+        sc = getone::GetUTF8LetterScriptNum(temp); |
+        // printf("#(%02x%02x)=%d ", temp[0], temp[1], sc); |
+      } |
+    } else { |
+      // Update 1..4 bytes |
+      tlen = cld_UniLib::OneCharLen(src + skip); |
+      sc = getone::GetUTF8LetterScriptNum(src + skip); |
+      // printf("#(%02x%02x)=%d ", src[skip], src[skip+1], sc); |
+    } |
+    // printf("sc=%d ", sc); |
+    if (sc != 0) {break;}           // Letter found |
+    skip += tlen;                   // Advance |
+  } |
+ |
+  *script = sc; |
+  return skip; |
+} |
+ |
+ |
+ |
+// Copy next run of same-script non-tag letters to buffer [NUL terminated]. |
+// Buffer has a leading space and one space per run of non-letters; no case |
+// conversion happens in this function (see LowerScriptSpan). |
+// Fills in *span (offset = scanner position on entry) and advances the scanner. |
+// Returns false, with span->text_bytes == 0, when no more letters are found |
+bool ScriptScanner::GetOneScriptSpan(getone::LangSpan* span) { |
+  span->text = script_buffer_; |
+  span->text_bytes = 0; |
+  span->offset = next_byte_ - start_byte_; |
+  span->script = UNKNOWN_LSCRIPT; |
+  span->lang = UNKNOWN_LANGUAGE; |
+  span->truncated = false; |
+ |
+  // printf("GetOneScriptSpan[[ "); |
+ |
+  int spanscript;           // The script of this span |
+  int sc = UNKNOWN_LSCRIPT;  // The script of next character |
+  int tlen, plen; |
+ |
+  script_buffer_[0] = ' ';  // Always a space at front of output |
+  script_buffer_[1] = '\0'; |
+  int take = 0; |
+  int put = 1;              // Start after the initial space |
+ |
+  // Get to the first real non-tag letter or entity that is a letter |
+  int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript); |
+  next_byte_ += skip; |
+  byte_length_ -= skip; |
+  if (byte_length_ <= 0) { |
+    // printf("]]\n"); |
+    return false;               // No more letters to be found |
+  } |
+ |
+  // There is at least one letter, so we know the script for this span |
+  // printf("{%d} ", spanscript); |
+  span->script = (UnicodeLScript)spanscript; |
+ |
+  // Go over alternating spans of same-script letters and non-letters, |
+  // copying letters to buffer with single spaces for each run of non-letters |
+  while (take < byte_length_) { |
+    // Copy run of letters in same script (&LS | LS)* |
+    bool need_break = false; |
+    while (take < byte_length_) { |
+      // We are at a letter, nonletter, tag, or entity |
+      if (IsSpecial(next_byte_[take]) && !is_plain_text_) { |
+        // printf("\"%c\" ", next_byte_[take]); |
+        if (next_byte_[take] == '<') { |
+          // Beginning of tag |
+          sc = 0; |
+          break; |
+        } else if (next_byte_[take] == '>') { |
+          // Unexpected end of tag |
+          sc = 0; |
+          break; |
+        } else if (next_byte_[take] == '&') { |
+          // Copy entity, no advance |
+          EntityToBuffer(next_byte_ + take, byte_length_ - take, |
+                         script_buffer_ + put, &tlen, &plen); |
+          sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put); |
+        } |
+      } else { |
+        // Real letter, safely copy up to 4 bytes, increment by 1..4 |
+        // Will update by 1..4 bytes at Advance, below |
+        tlen = plen = cld_UniLib::OneCharLen(next_byte_ + take); |
+        if (take < (byte_length_ - 3)) { |
+          // Fast case: always copy 4 bytes. memcpy keeps this well defined |
+          // where a uint32 load/store could be misaligned or type-punned |
+          memcpy(script_buffer_ + put, next_byte_ + take, 4); |
+        } else { |
+          // Slow case, happens 1-3 times per input document |
+          memcpy(script_buffer_ + put, next_byte_ + take, plen); |
+        } |
+        sc = getone::GetUTF8LetterScriptNum(next_byte_ + take); |
+      } |
+      // printf("sc(%c)=%d ", next_byte_[take], sc); |
+      // char xtmp[8]; memcpy(xtmp,script_buffer_ + put, plen); |
+      // xtmp[plen] = '\0'; printf("'%s'{%d} ", xtmp, sc); |
+ |
+      // Allow continue across a single letter in a different script: |
+      // A B D = three scripts, c = common script, i = inherited script, |
+      // - = don't care, ( = take position before the += below |
+      // AAA(A- continue |
+      // |
+      // AAA(BA continue |
+      // AAA(BB break |
+      // AAA(Bc continue (breaks after B) |
+      // AAA(BD break |
+      // AAA(Bi break |
+      // |
+      // AAA(c- break |
+      // |
+      // AAA(i- continue |
+      // |
+ |
+      if ((sc != spanscript) && (sc != ULScript_Inherited)) { |
+        // Might need to break this script span |
+        if (sc == ULScript_Common) { |
+          need_break = true; |
+        } else { |
+          // Look at next following character, ignoring entity as Common. |
+          // At the end of the input there is no following character; treat |
+          // that as Common instead of reading beyond byte_length_ |
+          int sc2 = ULScript_Common; |
+          if ((take + tlen) < byte_length_) { |
+            sc2 = getone::GetUTF8LetterScriptNum(next_byte_ + take + tlen); |
+          } |
+          if ((sc2 != ULScript_Common) && (sc2 != spanscript)) { |
+            need_break = true; |
+          } |
+        } |
+      } |
+      if (need_break) {break;}  // Non-letter or letter in wrong script |
+ |
+      take += tlen;                   // Advance |
+      put += plen;                    // Advance |
+      if (put >= getone::kMaxScriptBytes) { |
+        // Buffer is full |
+        span->truncated = true; |
+        break; |
+      } |
+    }     // End while letters |
+ |
+    // Do run of non-letters (tag | &NL | NL)* |
+    while (take < byte_length_) { |
+      // Do fast scan to next interesting byte |
+      take += ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take); |
+ |
+      // Check for no more letters/specials |
+      if (take >= byte_length_) { |
+        take = byte_length_; |
+        break; |
+      } |
+ |
+      // We are at a letter, nonletter, tag, or entity |
+      if (IsSpecial(next_byte_[take]) && !is_plain_text_) { |
+        // printf("\"%c\" ", next_byte_[take]); |
+        if (next_byte_[take] == '<') { |
+          // Beginning of tag; skip to end and go around again |
+          tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take); |
+          sc = 0; |
+          // printf("<...> "); |
+        } else if (next_byte_[take] == '>') { |
+          // Unexpected end of tag; skip it and go around again |
+          tlen = 1;         // Over the > |
+          sc = 0; |
+          // printf("..> "); |
+        } else if (next_byte_[take] == '&') { |
+          // Expand entity, no advance |
+          EntityToBuffer(next_byte_ + take, byte_length_ - take, |
+                         script_buffer_ + put, &tlen, &plen); |
+          sc = getone::GetUTF8LetterScriptNum(script_buffer_ + put); |
+        } |
+      } else { |
+        // Update 1..4 |
+        tlen = cld_UniLib::OneCharLen(next_byte_ + take); |
+        sc = getone::GetUTF8LetterScriptNum(next_byte_ + take); |
+      } |
+      // printf("sc[%c]=%d ", next_byte_[take], sc); |
+      if (sc != 0) {break;}           // Letter found |
+      take += tlen;                   // Advance |
+    }     // End while not-letters |
+ |
+    script_buffer_[put++] = ' '; |
+ |
+    // We are at a letter again (or eos), after letter* not-letter* |
+    if (sc != spanscript) {break;}    // Letter in wrong script |
+    if (put >= getone::kMaxScriptBytes - 8) { |
+      // Buffer is almost full |
+      span->truncated = true; |
+      break; |
+    } |
+  } |
+ |
+  // Update input position |
+  next_byte_ += take; |
+  byte_length_ -= take; |
+ |
+  // Put four more spaces/NUL. Worst case is abcd _ _ _ \0 |
+  // kMaxScriptBytes | | put |
+  script_buffer_[put + 0] = ' '; |
+  script_buffer_[put + 1] = ' '; |
+  script_buffer_[put + 2] = ' '; |
+  script_buffer_[put + 3] = '\0'; |
+ |
+  span->text_bytes = put;       // Does not include the last four chars above |
+ |
+  // printf(" %d]]\n\n", put); |
+  return true; |
+} |
+ |
+// Force Latin, Cyrillic, Greek scripts to be lowercase; other spans are left as is |
+void ScriptScanner::LowerScriptSpan(getone::LangSpan* span) { |
+ // On Windows, text is lowercased beforehand, so no need to do anything here. |
+#if !defined(CLD_WINDOWS) |
+ // If needed, lowercase all the text. If we do it sooner, might miss |
+ // lowercasing an entity such as &Aacute; |
+ // We only need to do this for Latin, Cyrillic and Greek scripts |
+ if ((span->script == ULScript_Latin) || |
+ (span->script == ULScript_Cyrillic) || |
+ (span->script == ULScript_Greek)) { |
+ // Full Unicode lowercase of the entire buffer, including |
+ // four pad bytes off the end; the span is then repointed at the lowered copy |
+ int consumed, filled; |
+ UniLib::ToLower(span->text, span->text_bytes + 4, |
+ script_buffer_lower_, getone::kMaxScriptLowerBuffer, |
+ &consumed, &filled); |
+ span->text = script_buffer_lower_; |
+ span->text_bytes = filled - 4; |
+ } |
+#endif |
+} |
+ |
+// Fetches the next same-script span via GetOneScriptSpan, then passes it |
+// through LowerScriptSpan (even if no span was found). Returns the fetch result |
+bool ScriptScanner::GetOneScriptSpanLower(getone::LangSpan* span) { |
+  const bool found = GetOneScriptSpan(span); |
+  LowerScriptSpan(span); |
+  return found; |
+} |
+ |
+// Looks up the lscript number of the single UTF-8 character at src. |
+// Always returns 0 (common script) for non-letters |
+int getone::GetUTF8LetterScriptNum(const char* src) { |
+  const uint8* ptr = reinterpret_cast<const uint8*>(src); |
+  int char_len = cld_UniLib::OneCharLen(src); |
+  return UTF8GenericProperty(&utf8propletterscriptnum_obj, &ptr, &char_len); |
+} |