| OLD | NEW |
| 1 /* | 1 /* |
| 2 * parser classes for MySpell | 2 * parser classes for MySpell |
| 3 * | 3 * |
| 4 * implemented: text, HTML, TeX | 4 * implemented: text, HTML, TeX |
| 5 * | 5 * |
| 6 * Copyright (C) 2002, Laszlo Nemeth | 6 * Copyright (C) 2002, Laszlo Nemeth |
| 7 * | 7 * |
| 8 */ | 8 */ |
| 9 /* ***** BEGIN LICENSE BLOCK ***** |
| 10 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
| 11 * |
| 12 * The contents of this file are subject to the Mozilla Public License Version |
| 13 * 1.1 (the "License"); you may not use this file except in compliance with |
| 14 * the License. You may obtain a copy of the License at |
| 15 * http://www.mozilla.org/MPL/ |
| 16 * |
| 17 * Software distributed under the License is distributed on an "AS IS" basis, |
| 18 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
| 19 * for the specific language governing rights and limitations under the |
| 20 * License. |
| 21 * |
| 22 * The Original Code is Hunspell, based on MySpell. |
| 23 * |
| 24 * The Initial Developers of the Original Code are |
| 25 * Kevin Hendricks (MySpell) and Németh László (Hunspell). |
| 26 * Portions created by the Initial Developers are Copyright (C) 2002-2005 |
| 27 * the Initial Developers. All Rights Reserved. |
| 28 * |
| 29 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, |
| 30 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, |
| 31 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, |
| 32 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, |
| 33 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen |
| 34 * |
| 35 * Alternatively, the contents of this file may be used under the terms of |
| 36 * either the GNU General Public License Version 2 or later (the "GPL"), or |
| 37 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
| 38 * in which case the provisions of the GPL or the LGPL are applicable instead |
| 39 * of those above. If you wish to allow use of your version of this file only |
| 40 * under the terms of either the GPL or the LGPL, and not to allow others to |
| 41 * use your version of this file under the terms of the MPL, indicate your |
| 42 * decision by deleting the provisions above and replace them with the notice |
| 43 * and other provisions required by the GPL or the LGPL. If you do not delete |
| 44 * the provisions above, a recipient may use your version of this file under |
| 45 * the terms of any one of the MPL, the GPL or the LGPL. |
| 46 * |
| 47 * ***** END LICENSE BLOCK ***** */ |
| 9 | 48 |
| 10 #ifndef _TEXTPARSER_HXX_ | 49 #ifndef TEXTPARSER_HXX_ |
| 11 #define _TEXTPARSER_HXX_ | 50 #define TEXTPARSER_HXX_ |
| 12 | 51 |
| 13 // set sum of actual and previous lines | 52 // set sum of actual and previous lines |
| 14 #define MAXPREVLINE 4 | 53 #define MAXPREVLINE 4 |
| 15 | 54 |
| 16 #ifndef MAXLNLEN | 55 #ifndef MAXLNLEN |
| 17 #define MAXLNLEN 8192 | 56 #define MAXLNLEN 8192 |
| 18 #endif | 57 #endif |
| 19 | 58 |
| 59 #include "../hunspell/w_char.hxx" |
| 60 |
| 61 #include <vector> |
| 62 |
| 20 /* | 63 /* |
| 21 * Base Text Parser | 64 * Base Text Parser |
| 22 * | 65 * |
| 23 */ | 66 */ |
| 24 | 67 |
| 25 class TextParser | 68 class TextParser { |
| 26 { | 69 protected: |
| 70 int wordcharacters[256]; // for detection of the word boundaries |
| 71 std::string line[MAXPREVLINE]; // parsed and previous lines |
| 72 std::vector<bool> urlline; // mask for url detection |
| 73 int checkurl; |
| 74 int actual; // actual line |
| 75 size_t head; // head position |
| 76 size_t token;// begin of token |
| 77 int state; // state of automata |
| 78 int utf8; // UTF-8 character encoding |
| 79 int next_char(const char* line, size_t* pos); |
| 80 const w_char* wordchars_utf16; |
| 81 int wclen; |
| 27 | 82 |
| 28 protected: | 83 public: |
| 29 void init(const char *); | 84 TextParser(const w_char* wordchars, int len); |
| 30 void init(unsigned short * wordchars, int len); | 85 explicit TextParser(const char* wc); |
| 31 int wordcharacters[256]; // for detection of the word boundaries | |
| 32 char line[MAXPREVLINE][MAXLNLEN]; // parsed and previous lines | |
| 33 char urlline[MAXLNLEN]; // mask for url detection | |
| 34 int checkurl; | |
| 35 int actual; // actual line | |
| 36 int head; // head position | |
| 37 int token; // begin of token | |
| 38 int state; // state of automata | |
| 39 int utf8; // UTF-8 character encoding | |
| 40 int next_char(char * line, int * pos); | |
| 41 unsigned short * wordchars_utf16; | |
| 42 int wclen; | |
| 43 | |
| 44 public: | |
| 45 | |
| 46 TextParser(); | |
| 47 TextParser(unsigned short * wordchars, int len); | |
| 48 TextParser(const char * wc); | |
| 49 virtual ~TextParser(); | 86 virtual ~TextParser(); |
| 50 | 87 |
| 51 void put_line(char * line); | 88 void put_line(const char* line); |
| 52 char * get_line(); | 89 std::string get_line() const; |
| 53 char * get_prevline(int n); | 90 std::string get_prevline(int n) const; |
| 54 virtual char * next_token(); | 91 virtual bool next_token(std::string&); |
| 55 int change_token(const char * word); | 92 virtual int change_token(const char* word); |
| 56 void set_url_checking(int check); | 93 void set_url_checking(int check); |
| 57 | 94 |
| 58 int get_tokenpos(); | 95 size_t get_tokenpos(); |
| 59 int is_wordchar(char * w); | 96 int is_wordchar(const char* w); |
| 60 const char * get_latin1(char * s); | 97 inline int is_utf8() { return utf8; } |
| 61 char * next_char(); | 98 const char* get_latin1(const char* s); |
| 62 int tokenize_urls(); | 99 char* next_char(); |
| 63 void check_urls(); | 100 int tokenize_urls(); |
| 64 int get_url(int token_pos, int * head); | 101 void check_urls(); |
| 65 char * alloc_token(int token, int * head); | 102 int get_url(size_t token_pos, size_t* head); |
| 103 bool alloc_token(size_t token, size_t* head, std::string& out); |
| 104 private: |
| 105 void init(const char*); |
| 106 void init(const w_char* wordchars, int len); |
| 66 }; | 107 }; |
| 67 | 108 |
| 68 #endif | 109 #endif |
| 69 | |
| OLD | NEW |