third_party/sqlite/sqlite-src-3070603/ext/fts1/fts1_tokenizer1.c - Issue 949043002: Add //third_party/sqlite to dirs_to_snapshot, remove net_sql.patch

Side by Side Diff: third_party/sqlite/sqlite-src-3070603/ext/fts1/fts1_tokenizer1.c

Issue 949043002: Add //third_party/sqlite to dirs_to_snapshot, remove net_sql.patch (Closed) Base URL: git@github.com:domokit/mojo.git@master

Patch Set: Created 5 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « third_party/sqlite/sqlite-src-3070603/ext/fts1/fts1_tokenizer.h ('k') | third_party/sqlite/sqlite-src-3070603/ext/fts1/fulltext.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 /*

	2 ** The author disclaims copyright to this source code.

	3 **

	4 *************************************************************************

	5 ** Implementation of the "simple" full-text-search tokenizer.

	6 */

	7

	8 /*

	9 ** The code in this file is only compiled if:

	10 **

	11 ** * The FTS1 module is being built as an extension

	12 ** (in which case SQLITE_CORE is not defined), or

	13 **

	14 ** * The FTS1 module is being built into the core of

	15 ** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).

	16 */

	17 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)

	18

	19

	20 #include <assert.h>

	21 #include <stdlib.h>

	22 #include <stdio.h>

	23 #include <string.h>

	24 #include <ctype.h>

	25

	26 #include "fts1_tokenizer.h"

	27

	28 typedef struct simple_tokenizer {

	29 sqlite3_tokenizer base;

	30 char delim[128]; /* flag ASCII delimiters */

	31 } simple_tokenizer;

	32

	33 typedef struct simple_tokenizer_cursor {

	34 sqlite3_tokenizer_cursor base;

	35 const char pInput; / input we are tokenizing */

	36 int nBytes; /* size of the input */

	37 int iOffset; /* current position in pInput */

	38 int iToken; /* index of next token to be returned */

	39 char pToken; / storage for current token */

	40 int nTokenAllocated; /* space allocated to zToken buffer */

	41 } simple_tokenizer_cursor;

	42

	43

	44 /* Forward declaration; the initialized definition appears near the end of this file. */

	45 static const sqlite3_tokenizer_module simpleTokenizerModule;

	46

	47 static int isDelim(simple_tokenizer *t, unsigned char c){

	48 return c<0x80 && t->delim[c];

	49 }

	50

	51 /*

	52 ** Create a new tokenizer instance.

	53 */

	54 static int simpleCreate(

	55 int argc, const char * const *argv,

	56 sqlite3_tokenizer **ppTokenizer

	57 ){

	58 simple_tokenizer *t;

	59

	60 t = (simple_tokenizer ) calloc(sizeof(t), 1);

	61 if( t==NULL ) return SQLITE_NOMEM;

	62

	63 /* TODO(shess) Delimiters need to remain the same from run to run,

	64 ** else we need to reindex. One solution would be a meta-table to

	65 ** track such information in the database, then we'd only want this

	66 ** information on the initial create.

	67 */

	68 if( argc>1 ){

	69 int i, n = strlen(argv[1]);

	70 for(i=0; i<n; i++){

	71 unsigned char ch = argv[1][i];

	72 /* We explicitly don't support UTF-8 delimiters for now. */

	73 if( ch>=0x80 ){

	74 free(t);

	75 return SQLITE_ERROR;

	76 }

	77 t->delim[ch] = 1;

	78 }

	79 } else {

	80 /* Mark non-alphanumeric ASCII characters as delimiters */

	81 int i;

	82 for(i=1; i<0x80; i++){

	83 t->delim[i] = !isalnum(i);

	84 }

	85 }

	86

	87 *ppTokenizer = &t->base;

	88 return SQLITE_OK;

	89 }

	90

	91 /*

	92 ** Destroy a tokenizer

	93 */

	94 static int simpleDestroy(sqlite3_tokenizer *pTokenizer){

	95 free(pTokenizer);

	96 return SQLITE_OK;

	97 }

	98

	99 /*

	100 ** Prepare to begin tokenizing a particular string. The input

	101 ** string to be tokenized is pInput[0..nBytes-1]. A cursor

	102 ** used to incrementally tokenize this string is returned in

	103 ** *ppCursor.

	104 */

	105 static int simpleOpen(

	106 sqlite3_tokenizer pTokenizer, / The tokenizer */

	107 const char pInput, int nBytes, / String to be tokenized */

	108 sqlite3_tokenizer_cursor *ppCursor / OUT: Tokenization cursor */

	109 ){

	110 simple_tokenizer_cursor *c;

	111

	112 c = (simple_tokenizer_cursor ) malloc(sizeof(c));

	113 if( c==NULL ) return SQLITE_NOMEM;

	114

	115 c->pInput = pInput;

	116 if( pInput==0 ){

	117 c->nBytes = 0;

	118 }else if( nBytes<0 ){

	119 c->nBytes = (int)strlen(pInput);

	120 }else{

	121 c->nBytes = nBytes;

	122 }

	123 c->iOffset = 0; /* start tokenizing at the beginning */

	124 c->iToken = 0;

	125 c->pToken = NULL; /* no space allocated, yet. */

	126 c->nTokenAllocated = 0;

	127

	128 *ppCursor = &c->base;

	129 return SQLITE_OK;

	130 }

	131

	132 /*

	133 ** Close a tokenization cursor previously opened by a call to

	134 ** simpleOpen() above.

	135 */

	136 static int simpleClose(sqlite3_tokenizer_cursor *pCursor){

	137 simple_tokenizer_cursor c = (simple_tokenizer_cursor ) pCursor;

	138 free(c->pToken);

	139 free(c);

	140 return SQLITE_OK;

	141 }

	142

	143 /*

	144 ** Extract the next token from a tokenization cursor. The cursor must

	145 ** have been opened by a prior call to simpleOpen().

	146 */

	147 static int simpleNext(

	148 sqlite3_tokenizer_cursor pCursor, / Cursor returned by simpleOpen */

	149 const char *ppToken, / OUT: ppToken is the token text /

	150 int pnBytes, / OUT: Number of bytes in token */

	151 int piStartOffset, / OUT: Starting offset of token */

	152 int piEndOffset, / OUT: Ending offset of token */

	153 int piPosition / OUT: Position integer of token */

	154 ){

	155 simple_tokenizer_cursor c = (simple_tokenizer_cursor ) pCursor;

	156 simple_tokenizer t = (simple_tokenizer ) pCursor->pTokenizer;

	157 unsigned char p = (unsigned char )c->pInput;

	158

	159 while( c->iOffset<c->nBytes ){

	160 int iStartOffset;

	161

	162 /* Scan past delimiter characters */

	163 while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){

	164 c->iOffset++;

	165 }

	166

	167 /* Count non-delimiter characters. */

	168 iStartOffset = c->iOffset;

	169 while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){

	170 c->iOffset++;

	171 }

	172

	173 if( c->iOffset>iStartOffset ){

	174 int i, n = c->iOffset-iStartOffset;

	175 if( n>c->nTokenAllocated ){

	176 c->nTokenAllocated = n+20;

	177 c->pToken = realloc(c->pToken, c->nTokenAllocated);

	178 if( c->pToken==NULL ) return SQLITE_NOMEM;

	179 }

	180 for(i=0; i<n; i++){

	181 /* TODO(shess) This needs expansion to handle UTF-8

	182 ** case-insensitivity.

	183 */

	184 unsigned char ch = p[iStartOffset+i];

	185 c->pToken[i] = ch<0x80 ? tolower(ch) : ch;

	186 }

	187 *ppToken = c->pToken;

	188 *pnBytes = n;

	189 *piStartOffset = iStartOffset;

	190 *piEndOffset = c->iOffset;

	191 *piPosition = c->iToken++;

	192

	193 return SQLITE_OK;

	194 }

	195 }

	196 return SQLITE_DONE;

	197 }

	198

	199 /*

	200 ** The set of routines that implement the simple tokenizer

	201 */

	202 static const sqlite3_tokenizer_module simpleTokenizerModule = {

	203 0,

	204 simpleCreate,

	205 simpleDestroy,

	206 simpleOpen,

	207 simpleClose,

	208 simpleNext,

	209 };

	210

	211 /*

	212 ** Allocate a new simple tokenizer. Return a pointer to the new

	213 ** tokenizer in *ppModule

	214 */

	215 void sqlite3Fts1SimpleTokenizerModule(

	216 sqlite3_tokenizer_module const**ppModule

	217 ){

	218 *ppModule = &simpleTokenizerModule;

	219 }

	220

	221 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */

OLD	NEW