Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: third_party/sqlite/sqlite-src-3170000/ext/fts3/unicode/mkunicode.tcl

Issue 2747283002: [sql] Import reference version of SQLite 3.17.. (Closed)
Patch Set: Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1
2 source [file join [file dirname [info script]] parseunicode.tcl]
3
4 proc print_rd {map} {
5 global tl_lookup_table
6 set aChar [list]
7 set lRange [list]
8
9 set nRange 1
10 set iFirst [lindex $map 0 0]
11 set cPrev [lindex $map 0 1]
12
13 foreach m [lrange $map 1 end] {
14 foreach {i c} $m {}
15
16 if {$cPrev == $c} {
17 for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} {
18 if {[info exists tl_lookup_table($j)]==0} break
19 }
20
21 if {$j==$i} {
22 set nNew [expr {(1 + $i - $iFirst)}]
23 if {$nNew<=8} {
24 set nRange $nNew
25 continue
26 }
27 }
28 }
29
30 lappend lRange [list $iFirst $nRange]
31 lappend aChar $cPrev
32
33 set iFirst $i
34 set cPrev $c
35 set nRange 1
36 }
37 lappend lRange [list $iFirst $nRange]
38 lappend aChar $cPrev
39
40 puts "/*"
41 puts "** If the argument is a codepoint corresponding to a lowercase letter"
42 puts "** in the ASCII range with a diacritic added, return the codepoint"
43 puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
44 puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER"
45 puts "** E\"). The resuls of passing a codepoint that corresponds to an"
46 puts "** uppercase letter are undefined."
47 puts "*/"
48 puts "static int ${::remove_diacritic}(int c)\{"
49 puts " unsigned short aDia\[\] = \{"
50 puts -nonewline " 0, "
51 set i 1
52 foreach r $lRange {
53 foreach {iCode nRange} $r {}
54 if {($i % 8)==0} {puts "" ; puts -nonewline " " }
55 incr i
56
57 puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]]
58 puts -nonewline ", "
59 }
60 puts ""
61 puts " \};"
62 puts " char aChar\[\] = \{"
63 puts -nonewline " '\\0', "
64 set i 1
65 foreach c $aChar {
66 set str "'$c', "
67 if {$c == ""} { set str "'\\0', " }
68
69 if {($i % 12)==0} {puts "" ; puts -nonewline " " }
70 incr i
71 puts -nonewline "$str"
72 }
73 puts ""
74 puts " \};"
75 puts {
76 unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
77 int iRes = 0;
78 int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
79 int iLo = 0;
80 while( iHi>=iLo ){
81 int iTest = (iHi + iLo) / 2;
82 if( key >= aDia[iTest] ){
83 iRes = iTest;
84 iLo = iTest+1;
85 }else{
86 iHi = iTest-1;
87 }
88 }
89 assert( key>=aDia[iRes] );
90 return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);}
91 puts "\}"
92 }
93
94 proc print_isdiacritic {zFunc map} {
95
96 set lCode [list]
97 foreach m $map {
98 foreach {code char} $m {}
99 if {$code && $char == ""} { lappend lCode $code }
100 }
101 set lCode [lsort -integer $lCode]
102 set iFirst [lindex $lCode 0]
103 set iLast [lindex $lCode end]
104
105 set i1 0
106 set i2 0
107
108 foreach c $lCode {
109 set i [expr $c - $iFirst]
110 if {$i < 32} {
111 set i1 [expr {$i1 | (1<<$i)}]
112 } else {
113 set i2 [expr {$i2 | (1<<($i-32))}]
114 }
115 }
116
117 puts "/*"
118 puts "** Return true if the argument interpreted as a unicode codepoint"
119 puts "** is a diacritical modifier character."
120 puts "*/"
121 puts "int ${zFunc}\(int c)\{"
122 puts " unsigned int mask0 = [format "0x%08X" $i1];"
123 puts " unsigned int mask1 = [format "0x%08X" $i2];"
124
125 puts " if( c<$iFirst || c>$iLast ) return 0;"
126 puts " return (c < $iFirst+32) ?"
127 puts " (mask0 & (1 << (c-$iFirst))) :"
128 puts " (mask1 & (1 << (c-$iFirst-32)));"
129 puts "\}"
130 }
131
132
133 #-------------------------------------------------------------------------
134
135 proc an_load_separator_ranges {} {
136 global unicodedata.txt
137 set lSep [an_load_unicodedata_text ${unicodedata.txt}]
138 unset -nocomplain iFirst
139 unset -nocomplain nRange
140 set lRange [list]
141 foreach sep $lSep {
142 if {0==[info exists iFirst]} {
143 set iFirst $sep
144 set nRange 1
145 } elseif { $sep == ($iFirst+$nRange) } {
146 incr nRange
147 } else {
148 lappend lRange [list $iFirst $nRange]
149 set iFirst $sep
150 set nRange 1
151 }
152 }
153 lappend lRange [list $iFirst $nRange]
154 set lRange
155 }
156
157 proc an_print_range_array {lRange} {
158 set iFirstMax 0
159 set nRangeMax 0
160 foreach range $lRange {
161 foreach {iFirst nRange} $range {}
162 if {$iFirst > $iFirstMax} {set iFirstMax $iFirst}
163 if {$nRange > $nRangeMax} {set nRangeMax $nRange}
164 }
165 if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"}
166 if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"}
167
168 puts -nonewline " "
169 puts [string trim {
170 /* Each unsigned integer in the following array corresponds to a contiguous
171 ** range of unicode codepoints that are not either letters or numbers (i.e.
172 ** codepoints for which this function should return 0).
173 **
174 ** The most significant 22 bits in each 32-bit value contain the first
175 ** codepoint in the range. The least significant 10 bits are used to store
176 ** the size of the range (always at least 1). In other words, the value
177 ** ((C<<22) + N) represents a range of N codepoints starting with codepoint
178 ** C. It is not possible to represent a range larger than 1023 codepoints
179 ** using this format.
180 */
181 }]
182 puts -nonewline " static const unsigned int aEntry\[\] = \{"
183 set i 0
184 foreach range $lRange {
185 foreach {iFirst nRange} $range {}
186 set u32 [format "0x%08X" [expr ($iFirst<<10) + $nRange]]
187
188 if {($i % 5)==0} {puts "" ; puts -nonewline " "}
189 puts -nonewline " $u32,"
190 incr i
191 }
192 puts ""
193 puts " \};"
194 }
195
196 proc an_print_ascii_bitmap {lRange} {
197 foreach range $lRange {
198 foreach {iFirst nRange} $range {}
199 for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} {
200 if {$i<=127} { set a($i) 1 }
201 }
202 }
203
204 set aAscii [list 0 0 0 0]
205 foreach key [array names a] {
206 set idx [expr $key >> 5]
207 lset aAscii $idx [expr [lindex $aAscii $idx] | (1 << ($key&0x001F))]
208 }
209
210 puts " static const unsigned int aAscii\[4\] = \{"
211 puts -nonewline " "
212 foreach v $aAscii { puts -nonewline [format " 0x%08X," $v] }
213 puts ""
214 puts " \};"
215 }
216
217 proc print_isalnum {zFunc lRange} {
218 puts "/*"
219 puts "** Return true if the argument corresponds to a unicode codepoint"
220 puts "** classified as either a letter or a number. Otherwise false."
221 puts "**"
222 puts "** The results are undefined if the value passed to this function"
223 puts "** is less than zero."
224 puts "*/"
225 puts "int ${zFunc}\(int c)\{"
226 an_print_range_array $lRange
227 an_print_ascii_bitmap $lRange
228 puts {
229 if( (unsigned int)c<128 ){
230 return ( (aAscii[c >> 5] & (1 << (c & 0x001F)))==0 );
231 }else if( (unsigned int)c<(1<<22) ){
232 unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
233 int iRes = 0;
234 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
235 int iLo = 0;
236 while( iHi>=iLo ){
237 int iTest = (iHi + iLo) / 2;
238 if( key >= aEntry[iTest] ){
239 iRes = iTest;
240 iLo = iTest+1;
241 }else{
242 iHi = iTest-1;
243 }
244 }
245 assert( aEntry[0]<key );
246 assert( key>=aEntry[iRes] );
247 return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
248 }
249 return 1;}
250 puts "\}"
251 }
252
253 proc print_test_isalnum {zFunc lRange} {
254 foreach range $lRange {
255 foreach {iFirst nRange} $range {}
256 for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { set a($i) 1 }
257 }
258
259 puts "static int isalnum_test(int *piCode)\{"
260 puts -nonewline " unsigned char aAlnum\[\] = \{"
261 for {set i 0} {$i < 70000} {incr i} {
262 if {($i % 32)==0} { puts "" ; puts -nonewline " " }
263 set bFlag [expr ![info exists a($i)]]
264 puts -nonewline "${bFlag},"
265 }
266 puts ""
267 puts " \};"
268
269 puts -nonewline " int aLargeSep\[\] = \{"
270 set i 0
271 foreach iSep [lsort -integer [array names a]] {
272 if {$iSep<70000} continue
273 if {($i % 8)==0} { puts "" ; puts -nonewline " " }
274 puts -nonewline " $iSep,"
275 incr i
276 }
277 puts ""
278 puts " \};"
279 puts -nonewline " int aLargeOther\[\] = \{"
280 set i 0
281 foreach iSep [lsort -integer [array names a]] {
282 if {$iSep<70000} continue
283 if {[info exists a([expr $iSep-1])]==0} {
284 if {($i % 8)==0} { puts "" ; puts -nonewline " " }
285 puts -nonewline " [expr $iSep-1],"
286 incr i
287 }
288 if {[info exists a([expr $iSep+1])]==0} {
289 if {($i % 8)==0} { puts "" ; puts -nonewline " " }
290 puts -nonewline " [expr $iSep+1],"
291 incr i
292 }
293 }
294 puts ""
295 puts " \};"
296
297 puts [subst -nocommands {
298 int i;
299 for(i=0; i<sizeof(aAlnum)/sizeof(aAlnum[0]); i++){
300 if( ${zFunc}(i)!=aAlnum[i] ){
301 *piCode = i;
302 return 1;
303 }
304 }
305 for(i=0; i<sizeof(aLargeSep)/sizeof(aLargeSep[0]); i++){
306 if( ${zFunc}(aLargeSep[i])!=0 ){
307 *piCode = aLargeSep[i];
308 return 1;
309 }
310 }
311 for(i=0; i<sizeof(aLargeOther)/sizeof(aLargeOther[0]); i++){
312 if( ${zFunc}(aLargeOther[i])!=1 ){
313 *piCode = aLargeOther[i];
314 return 1;
315 }
316 }
317 }]
318 puts " return 0;"
319 puts "\}"
320 }
321
322 #-------------------------------------------------------------------------
323
324 proc tl_create_records {} {
325 global tl_lookup_table
326
327 set iFirst ""
328 set nOff 0
329 set nRange 0
330 set nIncr 0
331
332 set lRecord [list]
333 foreach code [lsort -integer [array names tl_lookup_table]] {
334 set mapping $tl_lookup_table($code)
335 if {$iFirst == ""} {
336 set iFirst $code
337 set nOff [expr $mapping - $code]
338 set nRange 1
339 set nIncr 1
340 } else {
341 set diff [expr $code - ($iFirst + ($nIncr * ($nRange - 1)))]
342 if { $nRange==1 && ($diff==1 || $diff==2) } {
343 set nIncr $diff
344 }
345
346 if {$diff != $nIncr || ($mapping - $code)!=$nOff} {
347 if { $nRange==1 } {set nIncr 1}
348 lappend lRecord [list $iFirst $nIncr $nRange $nOff]
349 set iFirst $code
350 set nOff [expr $mapping - $code]
351 set nRange 1
352 set nIncr 1
353 } else {
354 incr nRange
355 }
356 }
357 }
358
359 lappend lRecord [list $iFirst $nIncr $nRange $nOff]
360
361 set lRecord
362 }
363
364 proc tl_print_table_header {} {
365 puts -nonewline " "
366 puts [string trim {
367 /* Each entry in the following array defines a rule for folding a range
368 ** of codepoints to lower case. The rule applies to a range of nRange
369 ** codepoints starting at codepoint iCode.
370 **
371 ** If the least significant bit in flags is clear, then the rule applies
372 ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
373 ** need to be folded). Or, if it is set, then the rule only applies to
374 ** every second codepoint in the range, starting with codepoint C.
375 **
376 ** The 7 most significant bits in flags are an index into the aiOff[]
377 ** array. If a specific codepoint C does require folding, then its lower
378 ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
379 **
380 ** The contents of this array are generated by parsing the CaseFolding.txt
381 ** file distributed as part of the "Unicode Character Database". See
382 ** http://www.unicode.org for details.
383 */
384 }]
385 puts " static const struct TableEntry \{"
386 puts " unsigned short iCode;"
387 puts " unsigned char flags;"
388 puts " unsigned char nRange;"
389 puts " \} aEntry\[\] = \{"
390 }
391
392 proc tl_print_table_entry {togglevar entry liOff} {
393 upvar $togglevar t
394 foreach {iFirst nIncr nRange nOff} $entry {}
395
396 if {$iFirst > (1<<16)} { return 1 }
397
398 if {[info exists t]==0} {set t 0}
399 if {$t==0} { puts -nonewline " " }
400
401 set flags 0
402 if {$nIncr==2} { set flags 1 ; set nRange [expr $nRange * 2]}
403 if {$nOff<0} { incr nOff [expr (1<<16)] }
404
405 set idx [lsearch $liOff $nOff]
406 if {$idx<0} {error "malfunction generating aiOff"}
407 set flags [expr $flags + $idx*2]
408
409 set txt "{$iFirst, $flags, $nRange},"
410 if {$t==2} {
411 puts $txt
412 } else {
413 puts -nonewline [format "% -23s" $txt]
414 }
415 set t [expr ($t+1)%3]
416
417 return 0
418 }
419
420 proc tl_print_table_footer {togglevar} {
421 upvar $togglevar t
422 if {$t!=0} {puts ""}
423 puts " \};"
424 }
425
426 proc tl_print_if_entry {entry} {
427 foreach {iFirst nIncr nRange nOff} $entry {}
428 if {$nIncr==2} {error "tl_print_if_entry needs improvement!"}
429
430 puts " else if( c>=$iFirst && c<[expr $iFirst+$nRange] )\{"
431 puts " ret = c + $nOff;"
432 puts " \}"
433 }
434
435 proc tl_generate_ioff_table {lRecord} {
436 foreach entry $lRecord {
437 foreach {iFirst nIncr nRange iOff} $entry {}
438 if {$iOff<0} { incr iOff [expr (1<<16)] }
439 if {[info exists a($iOff)]} continue
440 set a($iOff) 1
441 }
442
443 set liOff [lsort -integer [array names a]]
444 if {[llength $liOff]>128} { error "Too many distinct ioffs" }
445 return $liOff
446 }
447
448 proc tl_print_ioff_table {liOff} {
449 puts -nonewline " static const unsigned short aiOff\[\] = \{"
450 set i 0
451 foreach off $liOff {
452 if {($i % 8)==0} {puts "" ; puts -nonewline " "}
453 puts -nonewline [format "% -7s" "$off,"]
454 incr i
455 }
456 puts ""
457 puts " \};"
458
459 }
460
461 proc print_fold {zFunc} {
462
463 set lRecord [tl_create_records]
464
465 set lHigh [list]
466 puts "/*"
467 puts "** Interpret the argument as a unicode codepoint. If the codepoint"
468 puts "** is an upper case character that has a lower case equivalent,"
469 puts "** return the codepoint corresponding to the lower case version."
470 puts "** Otherwise, return a copy of the argument."
471 puts "**"
472 puts "** The results are undefined if the value passed to this function"
473 puts "** is less than zero."
474 puts "*/"
475 puts "int ${zFunc}\(int c, int bRemoveDiacritic)\{"
476
477 set liOff [tl_generate_ioff_table $lRecord]
478 tl_print_table_header
479 foreach entry $lRecord {
480 if {[tl_print_table_entry toggle $entry $liOff]} {
481 lappend lHigh $entry
482 }
483 }
484 tl_print_table_footer toggle
485 tl_print_ioff_table $liOff
486
487 puts [subst -nocommands {
488 int ret = c;
489
490 assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
491
492 if( c<128 ){
493 if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
494 }else if( c<65536 ){
495 const struct TableEntry *p;
496 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
497 int iLo = 0;
498 int iRes = -1;
499
500 assert( c>aEntry[0].iCode );
501 while( iHi>=iLo ){
502 int iTest = (iHi + iLo) / 2;
503 int cmp = (c - aEntry[iTest].iCode);
504 if( cmp>=0 ){
505 iRes = iTest;
506 iLo = iTest+1;
507 }else{
508 iHi = iTest-1;
509 }
510 }
511
512 assert( iRes>=0 && c>=aEntry[iRes].iCode );
513 p = &aEntry[iRes];
514 if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
515 ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
516 assert( ret>0 );
517 }
518
519 if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret);
520 }
521 }]
522
523 foreach entry $lHigh {
524 tl_print_if_entry $entry
525 }
526
527 puts ""
528 puts " return ret;"
529 puts "\}"
530 }
531
532 proc print_fold_test {zFunc mappings} {
533 global tl_lookup_table
534
535 foreach m $mappings {
536 set c [lindex $m 1]
537 if {$c == ""} {
538 set extra([lindex $m 0]) 0
539 } else {
540 scan $c %c i
541 set extra([lindex $m 0]) $i
542 }
543 }
544
545 puts "static int fold_test(int *piCode)\{"
546 puts -nonewline " static int aLookup\[\] = \{"
547 for {set i 0} {$i < 70000} {incr i} {
548
549 set expected $i
550 catch { set expected $tl_lookup_table($i) }
551 set expected2 $expected
552 catch { set expected2 $extra($expected2) }
553
554 if {($i % 4)==0} { puts "" ; puts -nonewline " " }
555 puts -nonewline "$expected, $expected2, "
556 }
557 puts " \};"
558 puts " int i;"
559 puts " for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{"
560 puts " int iCode = (i/2);"
561 puts " int bFlag = i & 0x0001;"
562 puts " if( ${zFunc}\(iCode, bFlag)!=aLookup\[i\] )\{"
563 puts " *piCode = iCode;"
564 puts " return 1;"
565 puts " \}"
566 puts " \}"
567 puts " return 0;"
568 puts "\}"
569 }
570
571
572 proc print_fileheader {} {
573 puts [string trim {
574 /*
575 ** 2012 May 25
576 **
577 ** The author disclaims copyright to this source code. In place of
578 ** a legal notice, here is a blessing:
579 **
580 ** May you do good and not evil.
581 ** May you find forgiveness for yourself and forgive others.
582 ** May you share freely, never taking more than you give.
583 **
584 ******************************************************************************
585 */
586
587 /*
588 ** DO NOT EDIT THIS MACHINE GENERATED FILE.
589 */
590 }]
591 puts ""
592 if {$::generate_fts5_code} {
593 # no-op
594 } else {
595 puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE"
596 puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)"
597 }
598 puts ""
599 puts "#include <assert.h>"
600 puts ""
601 }
602
603 proc print_test_main {} {
604 puts ""
605 puts "#include <stdio.h>"
606 puts ""
607 puts "int main(int argc, char **argv)\{"
608 puts " int r1, r2;"
609 puts " int code;"
610 puts " r1 = isalnum_test(&code);"
611 puts " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);"
612 puts " else printf(\"isalnum(): test passed\\n\");"
613 puts " r2 = fold_test(&code);"
614 puts " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);"
615 puts " else printf(\"fold(): test passed\\n\");"
616 puts " return (r1 || r2);"
617 puts "\}"
618 }
619
620 # Proces the command line arguments. Exit early if they are not to
621 # our liking.
622 #
623 proc usage {} {
624 puts -nonewline stderr "Usage: $::argv0 ?-test? ?-fts5? "
625 puts stderr "<CaseFolding.txt file> <UnicodeData.txt file>"
626 exit 1
627 }
628 if {[llength $argv]<2} usage
629 set unicodedata.txt [lindex $argv end]
630 set casefolding.txt [lindex $argv end-1]
631
632 set remove_diacritic remove_diacritic
633 set generate_test_code 0
634 set generate_fts5_code 0
635 set function_prefix "sqlite3Fts"
636 for {set i 0} {$i < [llength $argv]-2} {incr i} {
637 switch -- [lindex $argv $i] {
638 -test {
639 set generate_test_code 1
640 }
641 -fts5 {
642 set function_prefix sqlite3Fts5
643 set generate_fts5_code 1
644 set remove_diacritic fts5_remove_diacritic
645 }
646 default {
647 usage
648 }
649 }
650 }
651
652 print_fileheader
653
654 # Print the isalnum() function to stdout.
655 #
656 set lRange [an_load_separator_ranges]
657 print_isalnum ${function_prefix}UnicodeIsalnum $lRange
658
659 # Leave a gap between the two generated C functions.
660 #
661 puts ""
662 puts ""
663
664 # Load the fold data. This is used by the [rd_XXX] commands
665 # as well as [print_fold].
666 tl_load_casefolding_txt ${casefolding.txt}
667
668 set mappings [rd_load_unicodedata_text ${unicodedata.txt}]
669 print_rd $mappings
670 puts ""
671 puts ""
672 print_isdiacritic ${function_prefix}UnicodeIsdiacritic $mappings
673 puts ""
674 puts ""
675
676 # Print the fold() function to stdout.
677 #
678 print_fold ${function_prefix}UnicodeFold
679
680 # Print the test routines and main() function to stdout, if -test
681 # was specified.
682 #
683 if {$::generate_test_code} {
684 print_test_isalnum ${function_prefix}UnicodeIsalnum $lRange
685 print_fold_test ${function_prefix}UnicodeFold $mappings
686 print_test_main
687 }
688
689 if {$generate_fts5_code} {
690 # no-op
691 } else {
692 puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
693 puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */"
694 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698