third_party/sqlite/sqlite-src-3100200/ext/fts1/simple_tokenizer.c - Issue 2846743003: [sql] Remove SQLite 3.10.2 reference directory.

Side by Side Diff: third_party/sqlite/sqlite-src-3100200/ext/fts1/simple_tokenizer.c

Issue 2846743003: [sql] Remove SQLite 3.10.2 reference directory. (Closed)

Patch Set: Created 3 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 /*

2 ** The author disclaims copyright to this source code.

3 **

4 *************************************************************************

5 ** Implementation of the "simple" full-text-search tokenizer.

6 */

7

8 #include <assert.h>

9 #if !defined(__APPLE__)

10 #include <malloc.h>

11 #else

12 #include <stdlib.h>

13 #endif

14 #include <stdio.h>

15 #include <string.h>

16 #include <ctype.h>

17

18 #include "tokenizer.h"

19

/*
** Return a heap-allocated copy of the nul-terminated string s, or NULL
** if the allocation fails.  The caller owns the result and must free() it.
** (strdup() is not used since it's not part of the standard C library and
** may not be available everywhere.)
*/
/* TODO(shess) Copied from fulltext.c, consider util.c for such
** things. */
static char *string_dup(const char *s){
  size_t nByte = strlen(s) + 1;   /* +1 for the nul terminator */
  char *str = malloc(nByte);
  if( str!=NULL ){
    memcpy(str, s, nByte);
  }
  return str;
}

30

31 typedef struct simple_tokenizer {

32 sqlite3_tokenizer base;

33 const char zDelim; / token delimiters */

34 } simple_tokenizer;

35

36 typedef struct simple_tokenizer_cursor {

37 sqlite3_tokenizer_cursor base;

38 const char pInput; / input we are tokenizing */

39 int nBytes; /* size of the input */

40 const char pCurrent; / current position in pInput */

41 int iToken; /* index of next token to be returned */

42 char zToken; / storage for current token */

43 int nTokenBytes; /* actual size of current token */

44 int nTokenAllocated; /* space allocated to zToken buffer */

45 } simple_tokenizer_cursor;

46

47 static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration; the initialized definition appears further down in this file */

48

49 static int simpleCreate(

50 int argc, const char **argv,

51 sqlite3_tokenizer **ppTokenizer

52 ){

53 simple_tokenizer *t;

54

55 t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer));

56 /* TODO(shess) Delimiters need to remain the same from run to run,

57 ** else we need to reindex. One solution would be a meta-table to

58 ** track such information in the database, then we'd only want this

59 ** information on the initial create.

60 */

61 if( argc>1 ){

62 t->zDelim = string_dup(argv[1]);

63 } else {

64 /* Build a string excluding alphanumeric ASCII characters */

65 char zDelim[0x80]; /* nul-terminated, so nul not a member */

66 int i, j;

67 for(i=1, j=0; i<0x80; i++){

68 if( !isalnum(i) ){

69 zDelim[j++] = i;

70 }

71 }

72 zDelim[j++] = '\0';

73 assert( j<=sizeof(zDelim) );

74 t->zDelim = string_dup(zDelim);

75 }

76

77 *ppTokenizer = &t->base;

78 return SQLITE_OK;

79 }

80

81 static int simpleDestroy(sqlite3_tokenizer *pTokenizer){

82 simple_tokenizer t = (simple_tokenizer ) pTokenizer;

83

84 free((void *) t->zDelim);

85 free(t);

86

87 return SQLITE_OK;

88 }

89

90 static int simpleOpen(

91 sqlite3_tokenizer *pTokenizer,

92 const char *pInput, int nBytes,

93 sqlite3_tokenizer_cursor **ppCursor

94 ){

95 simple_tokenizer_cursor *c;

96

97 c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor));

98 c->pInput = pInput;

99 c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes;

100 c->pCurrent = c->pInput; /* start tokenizing at the beginning */

101 c->iToken = 0;

102 c->zToken = NULL; /* no space allocated, yet. */

103 c->nTokenBytes = 0;

104 c->nTokenAllocated = 0;

105

106 *ppCursor = &c->base;

107 return SQLITE_OK;

108 }

109

110 static int simpleClose(sqlite3_tokenizer_cursor *pCursor){

111 simple_tokenizer_cursor c = (simple_tokenizer_cursor ) pCursor;

112

113 if( NULL!=c->zToken ){

114 free(c->zToken);

115 }

116 free(c);

117

118 return SQLITE_OK;

119 }

120

121 static int simpleNext(

122 sqlite3_tokenizer_cursor *pCursor,

123 const char *ppToken, int pnBytes,

124 int piStartOffset, int piEndOffset, int *piPosition

125 ){

126 simple_tokenizer_cursor c = (simple_tokenizer_cursor ) pCursor;

127 simple_tokenizer t = (simple_tokenizer ) pCursor->pTokenizer;

128 int ii;

129

130 while( c->pCurrent-c->pInput<c->nBytes ){

131 int n = (int) strcspn(c->pCurrent, t->zDelim);

132 if( n>0 ){

133 if( n+1>c->nTokenAllocated ){

134 c->zToken = realloc(c->zToken, n+1);

135 }

136 for(ii=0; ii<n; ii++){

137 /* TODO(shess) This needs expansion to handle UTF-8

138 ** case-insensitivity.

139 */

140 char ch = c->pCurrent[ii];

141 c->zToken[ii] = (unsigned char)ch<0x80 ? tolower((unsigned char)ch):ch;

142 }

143 c->zToken[n] = '\0';

144 *ppToken = c->zToken;

145 *pnBytes = n;

146 *piStartOffset = (int) (c->pCurrent-c->pInput);

147 piEndOffset = piStartOffset+n;

148 *piPosition = c->iToken++;

149 c->pCurrent += n + 1;

150

151 return SQLITE_OK;

152 }

153 c->pCurrent += n + 1;

154 /* TODO(shess) could strspn() to skip delimiters en masse. Needs

155 ** to happen in two places, though, which is annoying.

156 */

157 }

158 return SQLITE_DONE;

159 }

160

161 static sqlite3_tokenizer_module simpleTokenizerModule = {

162 0,

163 simpleCreate,

164 simpleDestroy,

165 simpleOpen,

166 simpleClose,

167 simpleNext,

168 };

169

170 void get_simple_tokenizer_module(

171 sqlite3_tokenizer_module **ppModule

172 ){

173 *ppModule = &simpleTokenizerModule;

174 }

OLD	NEW