third_party/sqlite/sqlite-src-3080704/ext/fts1/simple_tokenizer.c - Issue 949043002: Add //third_party/sqlite to dirs_to_snapshot, remove net_sql.patch

Unified Diff: third_party/sqlite/sqlite-src-3080704/ext/fts1/simple_tokenizer.c

Issue 949043002: Add //third_party/sqlite to dirs_to_snapshot, remove net_sql.patch (Closed) Base URL: git@github.com:domokit/mojo.git@master

Patch Set: Created 5 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « third_party/sqlite/sqlite-src-3080704/ext/fts1/fulltext.c ('k') | third_party/sqlite/sqlite-src-3080704/ext/fts1/tokenizer.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: third_party/sqlite/sqlite-src-3080704/ext/fts1/simple_tokenizer.c

diff --git a/third_party/sqlite/sqlite-src-3080704/ext/fts1/simple_tokenizer.c b/third_party/sqlite/sqlite-src-3080704/ext/fts1/simple_tokenizer.c

new file mode 100644

index 0000000000000000000000000000000000000000..d00a77089d871ccbd6528a943ab79045b0445a3a

--- /dev/null

+++ b/third_party/sqlite/sqlite-src-3080704/ext/fts1/simple_tokenizer.c

@@ -0,0 +1,174 @@

+/*

+** The author disclaims copyright to this source code.

+**

+*************************************************************************

+** Implementation of the "simple" full-text-search tokenizer.

+*/

+#include <assert.h>

+#if !defined(__APPLE__)

+#include <malloc.h>

+#else

+#include <stdlib.h>

+#endif

+#include <stdio.h>

+#include <string.h>

+#include <ctype.h>

+#include "tokenizer.h"

/* Duplicate a NUL-terminated string into freshly malloc()ed storage.
**
** Returns the copy, or NULL if the allocation fails.  The caller owns
** the returned buffer and must free() it.
**
** (We don't use strdup() since it's not part of the standard C library
** and may not be available everywhere.)
*/
/* TODO(shess) Copied from fulltext.c, consider util.c for such
** things. */
static char *string_dup(const char *s){
  size_t nByte = strlen(s) + 1;   /* +1 for the terminating NUL */
  char *str = malloc(nByte);
  if( str!=NULL ){
    /* Length is already known, so copy it (NUL included) in one go. */
    memcpy(str, s, nByte);
  }
  return str;
}

+typedef struct simple_tokenizer {

+ sqlite3_tokenizer base;

+ const char *zDelim; /* token delimiters */

+} simple_tokenizer;

+typedef struct simple_tokenizer_cursor {

+ sqlite3_tokenizer_cursor base;

+ const char *pInput; /* input we are tokenizing */

+ int nBytes; /* size of the input */

+ const char *pCurrent; /* current position in pInput */

+ int iToken; /* index of next token to be returned */

+ char *zToken; /* storage for current token */

+ int nTokenBytes; /* actual size of current token */

+ int nTokenAllocated; /* space allocated to zToken buffer */

+} simple_tokenizer_cursor;

+static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */

+static int simpleCreate(

+ int argc, const char **argv,

+ sqlite3_tokenizer **ppTokenizer

+){

+ simple_tokenizer *t;

+ t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer));

+ /* TODO(shess) Delimiters need to remain the same from run to run,

+ ** else we need to reindex. One solution would be a meta-table to

+ ** track such information in the database, then we'd only want this

+ ** information on the initial create.

+ */

+ if( argc>1 ){

+ t->zDelim = string_dup(argv[1]);

+ } else {

+ /* Build a string excluding alphanumeric ASCII characters */

+ char zDelim[0x80]; /* nul-terminated, so nul not a member */

+ int i, j;

+ for(i=1, j=0; i<0x80; i++){

+ if( !isalnum(i) ){

+ zDelim[j++] = i;

+ }

+ zDelim[j++] = '\0';

+ assert( j<=sizeof(zDelim) );

+ t->zDelim = string_dup(zDelim);

+ }

+ *ppTokenizer = &t->base;

+ return SQLITE_OK;

+static int simpleDestroy(sqlite3_tokenizer *pTokenizer){

+ simple_tokenizer *t = (simple_tokenizer *) pTokenizer;

+ free((void *) t->zDelim);

+ free(t);

+ return SQLITE_OK;

+static int simpleOpen(

+ sqlite3_tokenizer *pTokenizer,

+ const char *pInput, int nBytes,

+ sqlite3_tokenizer_cursor **ppCursor

+){

+ simple_tokenizer_cursor *c;

+ c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor));

+ c->pInput = pInput;

+ c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes;

+ c->pCurrent = c->pInput; /* start tokenizing at the beginning */

+ c->iToken = 0;

+ c->zToken = NULL; /* no space allocated, yet. */

+ c->nTokenBytes = 0;

+ c->nTokenAllocated = 0;

+ *ppCursor = &c->base;

+ return SQLITE_OK;

+static int simpleClose(sqlite3_tokenizer_cursor *pCursor){

+ simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;

+ if( NULL!=c->zToken ){

+ free(c->zToken);

+ }

+ free(c);

+ return SQLITE_OK;

+static int simpleNext(

+ sqlite3_tokenizer_cursor *pCursor,

+ const char **ppToken, int *pnBytes,

+ int *piStartOffset, int *piEndOffset, int *piPosition

+){

+ simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;

+ simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;

+ int ii;

+ while( c->pCurrent-c->pInput<c->nBytes ){

+ int n = (int) strcspn(c->pCurrent, t->zDelim);

+ if( n>0 ){

+ if( n+1>c->nTokenAllocated ){

+ c->zToken = realloc(c->zToken, n+1);

+ }

+ for(ii=0; ii<n; ii++){

+ /* TODO(shess) This needs expansion to handle UTF-8

+ ** case-insensitivity.

+ */

+ char ch = c->pCurrent[ii];

+ c->zToken[ii] = (unsigned char)ch<0x80 ? tolower(ch) : ch;

+ }

+ c->zToken[n] = '\0';

+ *ppToken = c->zToken;

+ *pnBytes = n;

+ *piStartOffset = (int) (c->pCurrent-c->pInput);

+ *piEndOffset = *piStartOffset+n;

+ *piPosition = c->iToken++;

+ c->pCurrent += n + 1;

+ return SQLITE_OK;

+ }

+ c->pCurrent += n + 1;

+ /* TODO(shess) could strspn() to skip delimiters en masse. Needs

+ ** to happen in two places, though, which is annoying.

+ */

+ }

+ return SQLITE_DONE;

+static sqlite3_tokenizer_module simpleTokenizerModule = {

+ 0,

+ simpleCreate,

+ simpleDestroy,

+ simpleOpen,

+ simpleClose,

+ simpleNext,

+};

+void get_simple_tokenizer_module(

+ sqlite3_tokenizer_module **ppModule

+){

+ *ppModule = &simpleTokenizerModule;