Index: chrome/third_party/hunspell/src/hunspell/csutil.cxx |
=================================================================== |
--- chrome/third_party/hunspell/src/hunspell/csutil.cxx (revision 21721) |
+++ chrome/third_party/hunspell/src/hunspell/csutil.cxx (working copy) |
@@ -5,10 +5,12 @@ |
#include <cstdlib> |
#include <cstring> |
#include <cstdio> |
+#include <cctype> |
#else |
#include <stdlib.h> |
#include <string.h> |
#include <stdio.h> |
+#include <ctype.h> |
#endif |
#include "csutil.hxx" |
@@ -43,17 +45,18 @@ |
using namespace std; |
#endif |
#else |
-#ifndef W32 |
+#ifndef WIN32 |
using namespace std; |
#endif |
#endif |
-struct unicode_info2 * utf_tbl = NULL; |
+static struct unicode_info2 * utf_tbl = NULL; |
+static int utf_tbl_count = 0; // utf_tbl can be used by multiple Hunspell instances |
/* only UTF-16 (BMP) implementation */ |
char * u16_u8(char * dest, int size, const w_char * src, int srclen) { |
- char * u8 = dest; |
- char * u8_max = u8 + size; |
+ signed char * u8 = (signed char *)dest; |
+ signed char * u8_max = (signed char *)(u8 + size); |
const w_char * u2 = src; |
const w_char * u2_max = src + srclen; |
while ((u2 < u2_max) && (u8 < u8_max)) { |
@@ -100,12 +103,12 @@ |
/* only UTF-16 (BMP) implementation */ |
int u8_u16(w_char * dest, int size, const char * src) { |
- const char * u8 = src; |
+ const signed char * u8 = (const signed char *)src; |
w_char * u2 = dest; |
w_char * u2_max = u2 + size; |
while ((u2 < u2_max) && *u8) { |
- switch ((*u8) & 0xf0) { |
+ switch ((*u8) & 0xf0) { |
case 0x00: |
case 0x10: |
case 0x20: |
@@ -122,7 +125,7 @@ |
case 0x90: |
case 0xa0: |
case 0xb0: { |
- HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Unexpected continuation bytes in %d. character position\n%s\n", u8 - src, src); |
+ HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Unexpected continuation bytes in %ld. character position\n%s\n", static_cast<long>(u8 - (signed char *)src), src); |
u2->h = 0xff; |
u2->l = 0xfd; |
break; |
@@ -134,7 +137,7 @@ |
u2->l = (*u8 << 6) + (*(u8+1) & 0x3f); |
u8++; |
} else { |
- HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", u8 - src, src); |
+ HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - (signed char *)src), src); |
u2->h = 0xff; |
u2->l = 0xfd; |
} |
@@ -148,12 +151,12 @@ |
u2->l = (*u8 << 6) + (*(u8+1) & 0x3f); |
u8++; |
} else { |
- HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", u8 - src, src); |
+ HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - (signed char *)src), src); |
u2->h = 0xff; |
u2->l = 0xfd; |
} |
} else { |
- HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %d. character position:\n%s\n", u8 - src, src); |
+ HUNSPELL_WARNING(stderr, "UTF-8 encoding error. Missing continuation byte in %ld. character position:\n%s\n", static_cast<long>(u8 - (signed char *)src), src); |
u2->h = 0xff; |
u2->l = 0xfd; |
} |
@@ -218,13 +221,11 @@ |
char * mystrsep(char ** stringp, const char delim) |
{ |
- char * rv = NULL; |
char * mp = *stringp; |
- int n = strlen(mp); |
- if (n > 0) { |
+ if (*mp != '\0') { |
char * dp; |
if (delim) { |
- dp = (char *)memchr(mp,(int)((unsigned char)delim),n); |
+ dp = strchr(mp, delim); |
} else { |
// don't use isspace() here, the string can be in some random charset |
// that's way different than the locale's |
@@ -234,22 +235,16 @@ |
if (dp) { |
*stringp = dp+1; |
int nc = (int)((unsigned long)dp - (unsigned long)mp); |
- rv = (char *) malloc(nc+1); |
- memcpy(rv,mp,nc); |
- *(rv+nc) = '\0'; |
- return rv; |
+ *(mp+nc) = '\0'; |
+ return mp; |
} else { |
- rv = (char *) malloc(n+1); |
- memcpy(rv, mp, n); |
- *(rv+n) = '\0'; |
- *stringp = mp + n; |
- return rv; |
+ *stringp = mp + strlen(mp); |
+ return mp; |
} |
} |
return NULL; |
} |
- |
// replaces strdup with ansi version |
char * mystrdup(const char * s) |
{ |
@@ -257,12 +252,27 @@ |
if (s) { |
int sl = strlen(s); |
d = (char *) malloc(((sl+1) * sizeof(char))); |
- if (d) memcpy(d,s,((sl+1)*sizeof(char))); |
+ if (d) { |
+ memcpy(d,s,((sl+1)*sizeof(char))); |
+ return d; |
+ } |
+ HUNSPELL_WARNING(stderr, "Can't allocate memory.\n"); |
} |
return d; |
} |
- |
- |
+ |
+ // strcat for limited length destination string |
+ char * mystrcat(char * dest, const char * st, int max) { |
+ int len; |
+ int len2; |
+ if (dest == NULL || st == NULL) return dest; |
+ len = strlen(dest); |
+ len2 = strlen(st); |
+ if (len + len2 + 1 > max) return dest; |
+ strcpy(dest + len, st); |
+ return dest; |
+ } |
+ |
// remove cross-platform text line end characters |
void mychomp(char * s) |
{ |
@@ -289,113 +299,259 @@ |
return d; |
} |
-#ifdef HUNSPELL_EXPERIMENTAL |
- // append s to ends of every lines in text |
- void strlinecat(char * dest, const char * s) |
- { |
- char * dup = mystrdup(dest); |
- char * source = dup; |
- int len = strlen(s); |
- while (*source) { |
- if (*source == '\n') { |
- strncpy(dest, s, len); |
- dest += len; |
- } |
- *dest = *source; |
- source++; dest++; |
- } |
- strcpy(dest, s); |
- free(dup); |
- } |
- |
// break text to lines |
// return number of lines |
-int line_tok(const char * text, char *** lines) { |
+int line_tok(const char * text, char *** lines, char breakchar) { |
int linenum = 0; |
char * dup = mystrdup(text); |
- char * p = strchr(dup, '\n'); |
+ char * p = strchr(dup, breakchar); |
while (p) { |
linenum++; |
*p = '\0'; |
p++; |
- p = strchr(p, '\n'); |
+ p = strchr(p, breakchar); |
} |
- *lines = (char **) calloc(linenum + 1, sizeof(char *)); |
- if (!(*lines)) return -1; |
+ linenum++; |
+// fprintf(stderr, "LINEN:%d %p %p\n", linenum, lines, *lines); |
+ *lines = (char **) malloc(linenum * sizeof(char *)); |
+// fprintf(stderr, "hello\n"); |
+ if (!(*lines)) { |
+ free(dup); |
+ return 0; |
+ } |
- p = dup; |
- for (int i = 0; i < linenum + 1; i++) { |
- (*lines)[i] = mystrdup(p); |
+ p = dup; |
+ int l = 0; |
+ for (int i = 0; i < linenum; i++) { |
+ if (*p != '\0') { |
+ (*lines)[l] = mystrdup(p); |
+ if (!(*lines)[l]) { |
+ for (i = 0; i < l; i++) free((*lines)[i]); |
+ free(dup); |
+ return 0; |
+ } |
+ l++; |
+ } |
p += strlen(p) + 1; |
} |
free(dup); |
- return linenum; |
+ if (!l) free(*lines); |
+ return l; |
} |
// uniq line in place |
-char * line_uniq(char * text) { |
+char * line_uniq(char * text, char breakchar) { |
char ** lines; |
- int linenum = line_tok(text, &lines); |
+ int linenum = line_tok(text, &lines, breakchar); |
int i; |
strcpy(text, lines[0]); |
- for ( i = 1; i<=linenum; i++ ) { |
+ for ( i = 1; i < linenum; i++ ) { |
int dup = 0; |
for (int j = 0; j < i; j++) { |
if (strcmp(lines[i], lines[j]) == 0) dup = 1; |
} |
if (!dup) { |
- if ((i > 1) || (*(lines[0]) != '\0')) strcat(text, "\n"); |
+ if ((i > 1) || (*(lines[0]) != '\0')) { |
+ sprintf(text + strlen(text), "%c", breakchar); |
+ } |
strcat(text, lines[i]); |
} |
} |
- for ( i = 0; i<=linenum; i++ ) { |
+ for ( i = 0; i < linenum; i++ ) { |
if (lines[i]) free(lines[i]); |
} |
if (lines) free(lines); |
return text; |
} |
+// uniq and boundary for compound analysis: "\1\n\2\n\1" -> " ( \1 | \2 ) " |
+char * line_uniq_app(char ** text, char breakchar) { |
+ if (!strchr(*text, breakchar)) { |
+ return *text; |
+ } |
+ |
+ char ** lines; |
+ int i; |
+ int linenum = line_tok(*text, &lines, breakchar); |
+ int dup = 0; |
+ for (i = 0; i < linenum; i++) { |
+ for (int j = 0; j < (i - 1); j++) { |
+ if (strcmp(lines[i], lines[j]) == 0) { |
+ *(lines[i]) = '\0'; |
+ dup++; |
+ break; |
+ } |
+ } |
+ } |
+ if ((linenum - dup) == 1) { |
+ strcpy(*text, lines[0]); |
+ freelist(&lines, linenum); |
+ return *text; |
+ } |
+ char * newtext = (char *) malloc(strlen(*text) + 2 * linenum + 3 + 1); |
+ if (newtext) { |
+ free(*text); |
+ *text = newtext; |
+ } else { |
+ freelist(&lines, linenum); |
+ return *text; |
+ } |
+ strcpy(*text," ( "); |
+ for (i = 0; i < linenum; i++) if (*(lines[i])) { |
+ sprintf(*text + strlen(*text), "%s%s", lines[i], " | "); |
+ } |
+ (*text)[strlen(*text) - 2] = ')'; // " ) " |
+ freelist(&lines, linenum); |
+ return *text; |
+} |
+ |
+ // append s to the end of every line in text |
+ void strlinecat(char * dest, const char * s) |
+ { |
+ char * dup = mystrdup(dest); |
+ char * source = dup; |
+ int len = strlen(s); |
+ if (dup) { |
+ while (*source) { |
+ if (*source == '\n') { |
+ strncpy(dest, s, len); |
+ dest += len; |
+ } |
+ *dest = *source; |
+ source++; dest++; |
+ } |
+ strcpy(dest, s); |
+ free(dup); |
+ } |
+ } |
+ |
// change \n to char c |
-char * line_join(char * text, char c) { |
+char * tr(char * text, char oldc, char newc) { |
char * p; |
- for (p = text; *p; p++) if (*p == '\n') *p = c; |
+ for (p = text; *p; p++) if (*p == oldc) *p = newc; |
return text; |
} |
-// leave only last {[^}]*} substring for handling zero morphemes |
-char * delete_zeros(char * morphout) { |
- char * p = morphout; |
- char * q = p; |
- char * q2 = NULL; |
- int suffix = 0; |
- |
- for (;*p && *(p+1);) { |
- switch (*p) { |
- case '{': |
- q2 = q; |
- q--; |
- break; |
- case '}': |
- if (q2) { |
- suffix = 1; |
- q--; |
- } |
- break; |
- default: |
- if (suffix) { |
- q = q2; |
- } |
- suffix = 0; |
- *q = *p; |
+// morphcmp(): compare MORPH_DERI_SFX, MORPH_INFL_SFX and MORPH_TERM_SFX fields |
+// in the first line of the inputs |
+// return 0, if inputs equal |
+// return 1, if inputs may equal with a secondary suffix |
+// otherwise return -1 |
+int morphcmp(const char * s, const char * t) |
+{ |
+ int se = 0; |
+ int te = 0; |
+ const char * sl; |
+ const char * tl; |
+ const char * olds; |
+ const char * oldt; |
+ if (!s || !t) return 1; |
+ olds = s; |
+ sl = strchr(s, '\n'); |
+ s = strstr(s, MORPH_DERI_SFX); |
+ if (!s || (sl && sl < s)) s = strstr(olds, MORPH_INFL_SFX); |
+ if (!s || (sl && sl < s)) { |
+ s= strstr(olds, MORPH_TERM_SFX); |
+ olds = NULL; |
+ } |
+ oldt = t; |
+ tl = strchr(t, '\n'); |
+ t = strstr(t, MORPH_DERI_SFX); |
+ if (!t || (tl && tl < t)) t = strstr(oldt, MORPH_INFL_SFX); |
+ if (!t || (tl && tl < t)) { |
+ t = strstr(oldt, MORPH_TERM_SFX); |
+ oldt = NULL; |
+ } |
+ while (s && t && (!sl || sl > s) && (!tl || tl > t)) { |
+ s += MORPH_TAG_LEN; |
+ t += MORPH_TAG_LEN; |
+ se = 0; |
+ te = 0; |
+ while ((*s == *t) && !se && !te) { |
+ s++; |
+ t++; |
+ switch(*s) { |
+ case ' ': |
+ case '\n': |
+ case '\t': |
+ case '\0': se = 1; |
+ } |
+ switch(*t) { |
+ case ' ': |
+ case '\n': |
+ case '\t': |
+ case '\0': te = 1; |
+ } |
} |
- p++; |
- q++; |
+ if (!se || !te) { |
+ // not terminal suffix difference |
+ if (olds) return -1; |
+ return 1; |
+ } |
+ olds = s; |
+ s = strstr(s, MORPH_DERI_SFX); |
+ if (!s || (sl && sl < s)) s = strstr(olds, MORPH_INFL_SFX); |
+ if (!s || (sl && sl < s)) { |
+ s = strstr(olds, MORPH_TERM_SFX); |
+ olds = NULL; |
+ } |
+ oldt = t; |
+ t = strstr(t, MORPH_DERI_SFX); |
+ if (!t || (tl && tl < t)) t = strstr(oldt, MORPH_INFL_SFX); |
+ if (!t || (tl && tl < t)) { |
+ t = strstr(oldt, MORPH_TERM_SFX); |
+ oldt = NULL; |
+ } |
} |
- *q = '\0'; |
- return morphout; |
+ if (!s && !t && se && te) return 0; |
+ return 1; |
} |
-#endif // END OF HUNSPELL_EXPERIMENTAL CODE |
+int get_sfxcount(const char * morph) |
+{ |
+ if (!morph || !*morph) return 0; |
+ int n = 0; |
+ const char * old = morph; |
+ morph = strstr(morph, MORPH_DERI_SFX); |
+ if (!morph) morph = strstr(old, MORPH_INFL_SFX); |
+ if (!morph) morph = strstr(old, MORPH_TERM_SFX); |
+ while (morph) { |
+ n++; |
+ old = morph; |
+ morph = strstr(morph + 1, MORPH_DERI_SFX); |
+ if (!morph) morph = strstr(old + 1, MORPH_INFL_SFX); |
+ if (!morph) morph = strstr(old + 1, MORPH_TERM_SFX); |
+ } |
+ return n; |
+} |
+ |
+ |
+int fieldlen(const char * r) |
+{ |
+ int n = 0; |
+ while (r && *r != '\t' && *r != '\0' && *r != '\n' && *r != ' ') { |
+ r++; |
+ n++; |
+ } |
+ return n; |
+} |
+ |
+char * copy_field(char * dest, const char * morph, const char * var) |
+{ |
+ if (!morph) return NULL; |
+ const char * beg = strstr(morph, var); |
+ if (beg) { |
+ char * d = dest; |
+ for (beg += MORPH_TAG_LEN; *beg != ' ' && *beg != '\t' && |
+ *beg != '\n' && *beg != '\0'; d++, beg++) { |
+ *d = *beg; |
+ } |
+ *d = '\0'; |
+ return dest; |
+ } |
+ return NULL; |
+} |
+ |
char * mystrrep(char * word, const char * pat, const char * rep) { |
char * pos = strstr(word, pat); |
if (pos) { |
@@ -445,7 +601,35 @@ |
u16_u8(word, MAXWORDUTF8LEN, w, l); |
return 0; |
} |
+ |
+ int uniqlist(char ** list, int n) { |
+ int i; |
+ if (n < 2) return n; |
+ for (i = 0; i < n; i++) { |
+ for (int j = 0; j < i; j++) { |
+ if (list[j] && list[i] && (strcmp(list[j], list[i]) == 0)) { |
+ free(list[i]); |
+ list[i] = NULL; |
+ break; |
+ } |
+ } |
+ } |
+ int m = 1; |
+ for (i = 1; i < n; i++) if (list[i]) { |
+ list[m] = list[i]; |
+ m++; |
+ } |
+ return m; |
+ } |
+ void freelist(char *** list, int n) { |
+ if (list && *list && n > 0) { |
+ for (int i = 0; i < n; i++) if ((*list)[i]) free((*list)[i]); |
+ free(*list); |
+ *list = NULL; |
+ } |
+ } |
+ |
// convert null terminated string to all caps |
void mkallcap(char * p, const struct cs_info * csconv) |
{ |
@@ -478,8 +662,8 @@ |
for (int i = 0; i < nc; i++) { |
unsigned short idx = (u[i].h << 8) + u[i].l; |
if (idx != unicodetoupper(idx, langnum)) { |
- u[i].h = (unsigned char) (unicodetolower(idx, langnum) >> 8); |
- u[i].l = (unsigned char) (unicodetolower(idx, langnum) & 0x00FF); |
+ u[i].h = (unsigned char) (unicodetoupper(idx, langnum) >> 8); |
+ u[i].l = (unsigned char) (unicodetoupper(idx, langnum) & 0x00FF); |
} |
} |
} |
@@ -490,6 +674,20 @@ |
if (*p != '\0') *p = csconv[((unsigned char)*p)].cupper; |
} |
+ // conversion function for protected memory |
+ void store_pointer(char * dest, char * source) |
+ { |
+ memcpy(dest, &source, sizeof(char *)); |
+ } |
+ |
+ // conversion function for protected memory |
+ char * get_stored_pointer(char * s) |
+ { |
+ char * p; |
+ memcpy(&p, s, sizeof(char *)); |
+ return p; |
+ } |
+ |
#ifndef MOZILLA_CLIENT |
// convert null terminated string to all caps using encoding |
void enmkallcap(char * d, const char * p, const char * encoding) |
@@ -782,7 +980,7 @@ |
{ 0x00, 0xfc, 0xdc }, |
{ 0x00, 0xfd, 0xdd }, |
{ 0x00, 0xfe, 0xde }, |
-{ 0x00, 0xff, 0xff }, |
+{ 0x00, 0xff, 0xff } |
}; |
@@ -1042,7 +1240,7 @@ |
{ 0x00, 0xfc, 0xdc }, |
{ 0x00, 0xfd, 0xdd }, |
{ 0x00, 0xfe, 0xde }, |
-{ 0x00, 0xff, 0xff }, |
+{ 0x00, 0xff, 0xff } |
}; |
@@ -1302,7 +1500,7 @@ |
{ 0x00, 0xfc, 0xdc }, |
{ 0x00, 0xfd, 0xdd }, |
{ 0x00, 0xfe, 0xde }, |
-{ 0x00, 0xff, 0xff }, |
+{ 0x00, 0xff, 0xff } |
}; |
struct cs_info iso4_tbl[] = { |
@@ -1561,7 +1759,7 @@ |
{ 0x00, 0xfc, 0xdc }, |
{ 0x00, 0xfd, 0xdd }, |
{ 0x00, 0xfe, 0xde }, |
-{ 0x00, 0xff, 0xff }, |
+{ 0x00, 0xff, 0xff } |
}; |
struct cs_info iso5_tbl[] = { |
@@ -1820,7 +2018,7 @@ |
{ 0x00, 0xfc, 0xac }, |
{ 0x00, 0xfd, 0xfd }, |
{ 0x00, 0xfe, 0xae }, |
-{ 0x00, 0xff, 0xaf }, |
+{ 0x00, 0xff, 0xaf } |
}; |
struct cs_info iso6_tbl[] = { |
@@ -2079,7 +2277,7 @@ |
{ 0x00, 0xfc, 0xfc }, |
{ 0x00, 0xfd, 0xfd }, |
{ 0x00, 0xfe, 0xfe }, |
-{ 0x00, 0xff, 0xff }, |
+{ 0x00, 0xff, 0xff } |
}; |
struct cs_info iso7_tbl[] = { |
@@ -2338,7 +2536,7 @@ |
{ 0x00, 0xfc, 0xbc }, |
{ 0x00, 0xfd, 0xbe }, |
{ 0x00, 0xfe, 0xbf }, |
-{ 0x00, 0xff, 0xff }, |
+{ 0x00, 0xff, 0xff } |
}; |
struct cs_info iso8_tbl[] = { |
@@ -2597,7 +2795,7 @@ |
{ 0x00, 0xfc, 0xfc }, |
{ 0x00, 0xfd, 0xfd }, |
{ 0x00, 0xfe, 0xfe }, |
-{ 0x00, 0xff, 0xff }, |
+{ 0x00, 0xff, 0xff } |
}; |
struct cs_info iso9_tbl[] = { |
@@ -2856,7 +3054,7 @@ |
{ 0x00, 0xfc, 0xdc }, |
{ 0x00, 0xfd, 0x49 }, |
{ 0x00, 0xfe, 0xde }, |
-{ 0x00, 0xff, 0xff }, |
+{ 0x00, 0xff, 0xff } |
}; |
struct cs_info iso10_tbl[] = { |
@@ -3115,7 +3313,7 @@ |
{ 0x00, 0xfc, 0xfc }, |
{ 0x00, 0xfd, 0xfd }, |
{ 0x00, 0xfe, 0xfe }, |
-{ 0x00, 0xff, 0xff }, |
+{ 0x00, 0xff, 0xff } |
}; |
struct cs_info koi8r_tbl[] = { |
@@ -3374,7 +3572,7 @@ |
{ 0x01, 0xdc, 0xfc }, |
{ 0x01, 0xdd, 0xfd }, |
{ 0x01, 0xde, 0xfe }, |
-{ 0x01, 0xdf, 0xff }, |
+{ 0x01, 0xdf, 0xff } |
}; |
struct cs_info koi8u_tbl[] = { |
@@ -3633,7 +3831,7 @@ |
{ 0x01, 0xdc, 0xfc }, |
{ 0x01, 0xdd, 0xfd }, |
{ 0x01, 0xde, 0xfe }, |
-{ 0x01, 0xdf, 0xff }, |
+{ 0x01, 0xdf, 0xff } |
}; |
struct cs_info cp1251_tbl[] = { |
@@ -3892,7 +4090,7 @@ |
{ 0x00, 0xfc, 0xdc }, |
{ 0x00, 0xfd, 0xdd }, |
{ 0x00, 0xfe, 0xde }, |
-{ 0x00, 0xff, 0xdf }, |
+{ 0x00, 0xff, 0xdf } |
}; |
struct cs_info iso13_tbl[] = { |
@@ -4151,7 +4349,7 @@ |
{ 0x00, 0xFC, 0xDC }, |
{ 0x00, 0xFD, 0xDD }, |
{ 0x00, 0xFE, 0xDE }, |
-{ 0x00, 0xFF, 0xFF }, |
+{ 0x00, 0xFF, 0xFF } |
}; |
@@ -4411,7 +4609,7 @@ |
{ 0x00, 0xfc, 0xdc }, |
{ 0x00, 0xfd, 0xdd }, |
{ 0x00, 0xfe, 0xde }, |
-{ 0x00, 0xff, 0xff }, |
+{ 0x00, 0xff, 0xff } |
}; |
struct cs_info iso15_tbl[] = { |
@@ -4670,7 +4868,7 @@ |
{ 0x00, 0xfc, 0xdc }, |
{ 0x00, 0xfd, 0xdd }, |
{ 0x00, 0xfe, 0xde }, |
-{ 0x00, 0xff, 0xbe }, |
+{ 0x00, 0xff, 0xbe } |
}; |
struct cs_info iscii_devanagari_tbl[] = { |
@@ -4929,10 +5127,10 @@ |
{ 0x00, 0xfc, 0xfc }, |
{ 0x00, 0xfd, 0xfd }, |
{ 0x00, 0xfe, 0xfe }, |
-{ 0x00, 0xff, 0xff }, |
+{ 0x00, 0xff, 0xff } |
}; |
-struct enc_entry encds[] = { |
+static struct enc_entry encds[] = { |
{"ISO8859-1",iso1_tbl}, |
{"ISO8859-2",iso2_tbl}, |
{"ISO8859-3",iso3_tbl}, |
@@ -4949,7 +5147,7 @@ |
{"ISO8859-13", iso13_tbl}, |
{"ISO8859-14", iso14_tbl}, |
{"ISO8859-15", iso15_tbl}, |
-{"ISCII-DEVANAGARI", iscii_devanagari_tbl}, |
+{"ISCII-DEVANAGARI", iscii_devanagari_tbl} |
}; |
struct cs_info * get_current_cs(const char * es) { |
@@ -4958,6 +5156,7 @@ |
for (int i = 0; i < n; i++) { |
if (strcmp(es,encds[i].enc_name) == 0) { |
ccs = encds[i].cs_table; |
+ break; |
} |
} |
return ccs; |
@@ -5038,6 +5237,26 @@ |
} |
#endif |
+// primitive isalpha() replacement for tokenization |
+char * get_casechars(const char * enc) { |
+ struct cs_info * csconv = get_current_cs(enc); |
+ char expw[MAXLNLEN]; |
+ char * p = expw; |
+ for (int i = 0; i <= 255; i++) { |
+ if ((csconv[i].cupper != csconv[i].clower)) { |
+ *p = (char) i; |
+ p++; |
+ } |
+ } |
+ *p = '\0'; |
+#ifdef MOZILLA_CLIENT |
+ delete csconv; |
+#endif |
+ return mystrdup(expw); |
+} |
+ |
+ |
+ |
struct lang_map lang2enc[] = { |
{"ar", "UTF-8", LANG_ar}, |
{"az", "UTF-8", LANG_az}, |
@@ -5090,6 +5309,8 @@ |
#ifndef OPENOFFICEORG |
#ifndef MOZILLA_CLIENT |
int initialize_utf_tbl() { |
+ utf_tbl_count++; |
+ if (utf_tbl) return 0; |
utf_tbl = (unicode_info2 *) malloc(CONTSIZE * sizeof(unicode_info2)); |
if (utf_tbl) { |
int j; |
@@ -5110,7 +5331,11 @@ |
#endif |
void free_utf_tbl() { |
- if (utf_tbl) free(utf_tbl); |
+ if (utf_tbl_count > 0) utf_tbl_count--; |
+ if (utf_tbl && (utf_tbl_count == 0)) { |
+ free(utf_tbl); |
+ utf_tbl = NULL; |
+ } |
} |
#ifdef MOZILLA_CLIENT |
@@ -5133,11 +5358,11 @@ |
return u_toupper(c); |
#else |
#ifdef MOZILLA_CLIENT |
- unsigned short ret(c); |
- getcaseConv()->ToUpper(c, &ret); |
- return ret; |
+ PRUnichar ch2; |
+ getcaseConv()->ToUpper((PRUnichar) c, &ch2); |
+ return ch2; |
#else |
- return utf_tbl[c].cupper; |
+ return (utf_tbl) ? utf_tbl[c].cupper : c; |
#endif |
#endif |
} |
@@ -5153,11 +5378,11 @@ |
return u_tolower(c); |
#else |
#ifdef MOZILLA_CLIENT |
- unsigned short ret(c); |
- getcaseConv()->ToLower(c, &ret); |
- return ret; |
+ PRUnichar ch2; |
+ getcaseConv()->ToLower((PRUnichar) c, &ch2); |
+ return ch2; |
#else |
- return utf_tbl[c].clower; |
+ return (utf_tbl) ? utf_tbl[c].clower : c; |
#endif |
#endif |
} |
@@ -5167,10 +5392,72 @@ |
#ifdef OPENOFFICEORG |
return u_isalpha(c); |
#else |
- return utf_tbl[c].cletter; |
+ return (utf_tbl) ? utf_tbl[c].cletter : 0; |
#endif |
} |
+/* get type of capitalization */ |
+int get_captype(char * word, int nl, cs_info * csconv) { |
+ // now determine the capitalization type (scans up to the terminating NUL; nl is only compared against the counts) |
+ int ncap = 0; |
+ int nneutral = 0; |
+ int firstcap = 0; |
+ if (csconv == NULL) return NOCAP; |
+ for (char * q = word; *q != '\0'; q++) { |
+ if (csconv[*((unsigned char *)q)].ccase) ncap++; |
+ if (csconv[*((unsigned char *)q)].cupper == csconv[*((unsigned char *)q)].clower) nneutral++; |
+ } |
+ if (ncap) { |
+ firstcap = csconv[*((unsigned char *) word)].ccase; |
+ } |
+ |
+ // now finally set the captype |
+ if (ncap == 0) { |
+ return NOCAP; |
+ } else if ((ncap == 1) && firstcap) { |
+ return INITCAP; |
+ } else if ((ncap == nl) || ((ncap + nneutral) == nl)) { |
+ return ALLCAP; |
+ } else if ((ncap > 1) && firstcap) { |
+ return HUHINITCAP; |
+ } |
+ return HUHCAP; |
+} |
+ |
+int get_captype_utf8(w_char * word, int nl, int langnum) { |
+ // now determine the capitalization type of the first nl letters |
+ int ncap = 0; |
+ int nneutral = 0; |
+ int firstcap = 0; |
+ unsigned short idx; |
+ // don't check too long words |
+ if (nl >= MAXWORDLEN) return 0; |
+ // big Unicode character (non BMP area) |
+ if (nl == -1) return NOCAP; |
+ for (int i = 0; i < nl; i++) { |
+ idx = (word[i].h << 8) + word[i].l; |
+ if (idx != unicodetolower(idx, langnum)) ncap++; |
+ if (unicodetoupper(idx, langnum) == unicodetolower(idx, langnum)) nneutral++; |
+ } |
+ if (ncap) { |
+ idx = (word[0].h << 8) + word[0].l; |
+ firstcap = (idx != unicodetolower(idx, langnum)); |
+ } |
+ |
+ // now finally set the captype |
+ if (ncap == 0) { |
+ return NOCAP; |
+ } else if ((ncap == 1) && firstcap) { |
+ return INITCAP; |
+ } else if ((ncap == nl) || ((ncap + nneutral) == nl)) { |
+ return ALLCAP; |
+ } else if ((ncap > 1) && firstcap) { |
+ return HUHINITCAP; |
+ } |
+ return HUHCAP; |
+} |
+ |
+ |
// strip all ignored characters in the string |
void remove_ignored_chars_utf(char * word, unsigned short ignored_chars[], int ignored_len) |
{ |
@@ -5200,14 +5487,14 @@ |
*word = '\0'; |
} |
-int parse_string(char * line, char ** out, const char * name) |
+int parse_string(char * line, char ** out, int ln) |
{ |
char * tp = line; |
char * piece; |
int i = 0; |
int np = 0; |
if (*out) { |
- HUNSPELL_WARNING(stderr, "error: duplicate %s line\n", name); |
+ HUNSPELL_WARNING(stderr, "error: line %d: multiple definitions\n", ln); |
return 1; |
} |
piece = mystrsep(&tp, 0); |
@@ -5217,6 +5504,7 @@ |
case 0: { np++; break; } |
case 1: { |
*out = mystrdup(piece); |
+ if (!*out) return 1; |
np++; |
break; |
} |
@@ -5224,19 +5512,19 @@ |
} |
i++; |
} |
- free(piece); |
+ // free(piece); |
piece = mystrsep(&tp, 0); |
} |
if (np != 2) { |
- HUNSPELL_WARNING(stderr, "error: missing %s information\n", name); |
+ HUNSPELL_WARNING(stderr, "error: line %d: missing data\n", ln); |
return 1; |
} |
return 0; |
} |
-int parse_array(char * line, char ** out, |
- unsigned short ** out_utf16, int * out_utf16_len, const char * name, int utf8) { |
- if (parse_string(line, out, name)) return 1; |
+int parse_array(char * line, char ** out, unsigned short ** out_utf16, |
+ int * out_utf16_len, int utf8, int ln) { |
+ if (parse_string(line, out, ln)) return 1; |
if (utf8) { |
w_char w[MAXWORDLEN]; |
int n = u8_u16(w, MAXWORDLEN, *out); |
Property changes on: chrome\third_party\hunspell\src\hunspell\csutil.cxx |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |