Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1449)

Unified Diff: third_party/hunspell/src/parsers/xmlparser.cxx

Issue 2544793003: [spellcheck] Updated Hunspell to 1.5.4 (Closed)
Patch Set: Test Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/hunspell/src/parsers/xmlparser.hxx ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/hunspell/src/parsers/xmlparser.cxx
diff --git a/third_party/hunspell/src/parsers/xmlparser.cxx b/third_party/hunspell/src/parsers/xmlparser.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..30e6f2f2b43c00b911b19e8b6e72d83743361f85
--- /dev/null
+++ b/third_party/hunspell/src/parsers/xmlparser.cxx
@@ -0,0 +1,215 @@
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is Hunspell, based on MySpell.
+ *
+ * The Initial Developers of the Original Code are
+ * Kevin Hendricks (MySpell) and Németh László (Hunspell).
+ * Portions created by the Initial Developers are Copyright (C) 2002-2005
+ * the Initial Developers. All Rights Reserved.
+ *
+ * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
+ * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
+ * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
+ * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
+ * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#include <cstdlib>
+#include <cstring>
+#include <cstdio>
+#include <ctype.h>
+
+#include "../hunspell/csutil.hxx"
+#include "xmlparser.hxx"
+
+#ifndef W32
+using namespace std;
+#endif
+
+enum { ST_NON_WORD, ST_WORD, ST_TAG, ST_CHAR_ENTITY, ST_OTHER_TAG, ST_ATTRIB };
+
+static const char* __PATTERN__[][2] = {{"<!--", "-->"},
+ {"<[cdata[", "]]>"}, // XML comment
+ {"<", ">"}};
+
+#define __PATTERN_LEN__ (sizeof(__PATTERN__) / (sizeof(char*) * 2))
+
+static const char* (*__PATTERN2__)[2] = NULL;
+
+#define __PATTERN_LEN2__ 0
+
+#define ENTITY_APOS "&apos;"
+#define UTF8_APOS "\xe2\x80\x99"
+#define APOSTROPHE "'"
+
+XMLParser::XMLParser(const char* wordchars)
+ : TextParser(wordchars)
+ , pattern_num(0), pattern2_num(0), prevstate(0), checkattr(0), quotmark(0) {
+}
+
+XMLParser::XMLParser(const w_char* wordchars, int len)
+ : TextParser(wordchars, len)
+ , pattern_num(0), pattern2_num(0), prevstate(0), checkattr(0), quotmark(0) {
+}
+
+XMLParser::~XMLParser() {}
+
+int XMLParser::look_pattern(const char* p[][2], unsigned int len, int column) {
+ for (unsigned int i = 0; i < len; i++) {
+ const char* j = line[actual].c_str() + head;
+ const char* k = p[i][column];
+ while ((*k != '\0') && (tolower(*j) == *k)) {
+ j++;
+ k++;
+ }
+ if (*k == '\0')
+ return i;
+ }
+ return -1;
+}
+
+/*
+ * XML parser
+ *
+ */
+
+bool XMLParser::next_token(const char* PATTERN[][2],
+ unsigned int PATTERN_LEN,
+ const char* PATTERN2[][2],
+ unsigned int PATTERN_LEN2,
+ std::string& t) {
+ t.clear();
+ const char* latin1;
+
+ for (;;) {
+ switch (state) {
+ case ST_NON_WORD: // non word chars
+ prevstate = ST_NON_WORD;
+ if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)) != -1) {
+ checkattr = 0;
+ if ((pattern2_num = look_pattern(PATTERN2, PATTERN_LEN2, 0)) != -1) {
+ checkattr = 1;
+ }
+ state = ST_TAG;
+ } else if (is_wordchar(line[actual].c_str() + head)) {
+ state = ST_WORD;
+ token = head;
+ } else if ((latin1 = get_latin1(line[actual].c_str() + head))) {
+ state = ST_WORD;
+ token = head;
+ head += strlen(latin1);
+ } else if (line[actual][head] == '&') {
+ state = ST_CHAR_ENTITY;
+ }
+ break;
+ case ST_WORD: // wordchar
+ if ((latin1 = get_latin1(line[actual].c_str() + head))) {
+ head += strlen(latin1);
+ } else if ((is_wordchar((char*)APOSTROPHE) ||
+ (is_utf8() && is_wordchar((char*)UTF8_APOS))) &&
+ strncmp(line[actual].c_str() + head, ENTITY_APOS,
+ strlen(ENTITY_APOS)) == 0 &&
+ is_wordchar(line[actual].c_str() + head + strlen(ENTITY_APOS))) {
+ head += strlen(ENTITY_APOS) - 1;
+ } else if (is_utf8() &&
+ is_wordchar((char*)APOSTROPHE) && // add Unicode apostrophe
+ // to the WORDCHARS, if
+ // needed
+ strncmp(line[actual].c_str() + head, UTF8_APOS, strlen(UTF8_APOS)) ==
+ 0 &&
+ is_wordchar(line[actual].c_str() + head + strlen(UTF8_APOS))) {
+ head += strlen(UTF8_APOS) - 1;
+ } else if (!is_wordchar(line[actual].c_str() + head)) {
+ state = prevstate;
+ if (alloc_token(token, &head, t))
+ return true;
+ }
+ break;
+ case ST_TAG: // comment, labels, etc
+ int i;
+ if ((checkattr == 1) &&
+ ((i = look_pattern(PATTERN2, PATTERN_LEN2, 1)) != -1) &&
+ (strcmp(PATTERN2[i][0], PATTERN2[pattern2_num][0]) == 0)) {
+ checkattr = 2;
+ } else if ((checkattr > 0) && (line[actual][head] == '>')) {
+ state = ST_NON_WORD;
+ } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) &&
+ (strcmp(PATTERN[i][1], PATTERN[pattern_num][1]) == 0)) {
+ state = ST_NON_WORD;
+ head += strlen(PATTERN[pattern_num][1]) - 1;
+ } else if ((strcmp(PATTERN[pattern_num][0], "<") == 0) &&
+ ((line[actual][head] == '"') ||
+ (line[actual][head] == '\''))) {
+ quotmark = line[actual][head];
+ state = ST_ATTRIB;
+ }
+ break;
+ case ST_ATTRIB: // non word chars
+ prevstate = ST_ATTRIB;
+ if (line[actual][head] == quotmark) {
+ state = ST_TAG;
+ if (checkattr == 2)
+ checkattr = 1;
+ // for IMG ALT
+ } else if (is_wordchar(line[actual].c_str() + head) && (checkattr == 2)) {
+ state = ST_WORD;
+ token = head;
+ } else if (line[actual][head] == '&') {
+ state = ST_CHAR_ENTITY;
+ }
+ break;
+ case ST_CHAR_ENTITY: // SGML element
+ if ((tolower(line[actual][head]) == ';')) {
+ state = prevstate;
+ head--;
+ }
+ }
+ if (next_char(line[actual].c_str(), &head))
+ return false;
+ }
+}
+
+bool XMLParser::next_token(std::string& t) {
+ return next_token(__PATTERN__, __PATTERN_LEN__, __PATTERN2__,
+ __PATTERN_LEN2__, t);
+}
+
+int XMLParser::change_token(const char* word) {
+ if (strstr(word, APOSTROPHE) != NULL || strchr(word, '"') != NULL ||
+ strchr(word, '&') != NULL || strchr(word, '<') != NULL ||
+ strchr(word, '>') != NULL) {
+ std::string r(word);
+ mystrrep(r, "&", "__namp;__");
+ mystrrep(r, "__namp;__", "&amp;");
+ mystrrep(r, APOSTROPHE, ENTITY_APOS);
+ mystrrep(r, "\"", "&quot;");
+ mystrrep(r, ">", "&gt;");
+ mystrrep(r, "<", "&lt;");
+ return TextParser::change_token(r.c_str());
+ }
+ return TextParser::change_token(word);
+}
« no previous file with comments | « third_party/hunspell/src/parsers/xmlparser.hxx ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698