Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(46)

Side by Side Diff: third_party/sqlite/src/ext/fts3/unicode/mkunicode.tcl

Issue 1610963002: Import SQLite 3.10.2. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD NEW
1 1
2 # 2 source [file join [file dirname [info script]] parseunicode.tcl]
3 # Parameter $zName must be a path to the file UnicodeData.txt. This command
4 # reads the file and returns a list of mappings required to remove all
5 # diacritical marks from a unicode string. Each mapping is itself a list
6 # consisting of two elements - the unicode codepoint and the single ASCII
7 # character that it should be replaced with, or an empty string if the
8 # codepoint should simply be removed from the input. Examples:
9 #
10 # { 224 a } (replace codepoint 224 with "a")
11 # { 769 "" } (remove codepoint 769 from input)
12 #
13 # Mappings are only returned for non-upper case codepoints. It is assumed
14 # that the input has already been folded to lower case.
15 #
16 proc rd_load_unicodedata_text {zName} {
17 global tl_lookup_table
18
19 set fd [open $zName]
20 set lField {
21 code
22 character_name
23 general_category
24 canonical_combining_classes
25 bidirectional_category
26 character_decomposition_mapping
27 decimal_digit_value
28 digit_value
29 numeric_value
30 mirrored
31 unicode_1_name
32 iso10646_comment_field
33 uppercase_mapping
34 lowercase_mapping
35 titlecase_mapping
36 }
37 set lRet [list]
38
39 while { ![eof $fd] } {
40 set line [gets $fd]
41 if {$line == ""} continue
42
43 set fields [split $line ";"]
44 if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
45 foreach $lField $fields {}
46 if { [llength $character_decomposition_mapping]!=2
47 || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
48 } {
49 continue
50 }
51
52 set iCode [expr "0x$code"]
53 set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
54 set iDia [expr "0x[lindex $character_decomposition_mapping 1]"]
55
56 if {[info exists tl_lookup_table($iCode)]} continue
57
58 if { ($iAscii >= 97 && $iAscii <= 122)
59 || ($iAscii >= 65 && $iAscii <= 90)
60 } {
61 lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
62 set dia($iDia) 1
63 }
64 }
65
66 foreach d [array names dia] {
67 lappend lRet [list $d ""]
68 }
69 set lRet [lsort -integer -index 0 $lRet]
70
71 close $fd
72 set lRet
73 }
74
75 3
76 proc print_rd {map} { 4 proc print_rd {map} {
77 global tl_lookup_table 5 global tl_lookup_table
78 set aChar [list] 6 set aChar [list]
79 set lRange [list] 7 set lRange [list]
80 8
81 set nRange 1 9 set nRange 1
82 set iFirst [lindex $map 0 0] 10 set iFirst [lindex $map 0 0]
83 set cPrev [lindex $map 0 1] 11 set cPrev [lindex $map 0 1]
84 12
(...skipping 25 matching lines...) Expand all
110 lappend aChar $cPrev 38 lappend aChar $cPrev
111 39
112 puts "/*" 40 puts "/*"
113 puts "** If the argument is a codepoint corresponding to a lowercase letter" 41 puts "** If the argument is a codepoint corresponding to a lowercase letter"
114 puts "** in the ASCII range with a diacritic added, return the codepoint" 42 puts "** in the ASCII range with a diacritic added, return the codepoint"
115 puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN" 43 puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
116 puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER" 44 puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER"
117 puts "** E\"). The resuls of passing a codepoint that corresponds to an" 45 puts "** E\"). The resuls of passing a codepoint that corresponds to an"
118 puts "** uppercase letter are undefined." 46 puts "** uppercase letter are undefined."
119 puts "*/" 47 puts "*/"
120 puts "static int remove_diacritic(int c)\{" 48 puts "static int ${::remove_diacritic}(int c)\{"
121 puts " unsigned short aDia\[\] = \{" 49 puts " unsigned short aDia\[\] = \{"
122 puts -nonewline " 0, " 50 puts -nonewline " 0, "
123 set i 1 51 set i 1
124 foreach r $lRange { 52 foreach r $lRange {
125 foreach {iCode nRange} $r {} 53 foreach {iCode nRange} $r {}
126 if {($i % 8)==0} {puts "" ; puts -nonewline " " } 54 if {($i % 8)==0} {puts "" ; puts -nonewline " " }
127 incr i 55 incr i
128 56
129 puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]] 57 puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]]
130 puts -nonewline ", " 58 puts -nonewline ", "
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after
197 puts " if( c<$iFirst || c>$iLast ) return 0;" 125 puts " if( c<$iFirst || c>$iLast ) return 0;"
198 puts " return (c < $iFirst+32) ?" 126 puts " return (c < $iFirst+32) ?"
199 puts " (mask0 & (1 << (c-$iFirst))) :" 127 puts " (mask0 & (1 << (c-$iFirst))) :"
200 puts " (mask1 & (1 << (c-$iFirst-32)));" 128 puts " (mask1 & (1 << (c-$iFirst-32)));"
201 puts "\}" 129 puts "\}"
202 } 130 }
203 131
204 132
205 #------------------------------------------------------------------------- 133 #-------------------------------------------------------------------------
206 134
207 # Parameter $zName must be a path to the file UnicodeData.txt. This command
208 # reads the file and returns a list of codepoints (integers). The list
209 # contains all codepoints in the UnicodeData.txt assigned to any "General
210 # Category" that is not a "Letter", a "Number" or "Co" (private use).
211 #
212 proc an_load_unicodedata_text {zName} {
213 set fd [open $zName]
214 set lField {
215 code
216 character_name
217 general_category
218 canonical_combining_classes
219 bidirectional_category
220 character_decomposition_mapping
221 decimal_digit_value
222 digit_value
223 numeric_value
224 mirrored
225 unicode_1_name
226 iso10646_comment_field
227 uppercase_mapping
228 lowercase_mapping
229 titlecase_mapping
230 }
231 set lRet [list]
232
233 while { ![eof $fd] } {
234 set line [gets $fd]
235 if {$line == ""} continue
236
237 set fields [split $line ";"]
238 if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
239 foreach $lField $fields {}
240
241 set iCode [expr "0x$code"]
242 set bAlnum [expr {
243 [lsearch {L N} [string range $general_category 0 0]] >= 0
244 || $general_category=="Co"
245 }]
246
247 if { !$bAlnum } { lappend lRet $iCode }
248 }
249
250 close $fd
251 set lRet
252 }
253
254 proc an_load_separator_ranges {} { 135 proc an_load_separator_ranges {} {
255 global unicodedata.txt 136 global unicodedata.txt
256 set lSep [an_load_unicodedata_text ${unicodedata.txt}] 137 set lSep [an_load_unicodedata_text ${unicodedata.txt}]
257 unset -nocomplain iFirst 138 unset -nocomplain iFirst
258 unset -nocomplain nRange 139 unset -nocomplain nRange
259 set lRange [list] 140 set lRange [list]
260 foreach sep $lSep { 141 foreach sep $lSep {
261 if {0==[info exists iFirst]} { 142 if {0==[info exists iFirst]} {
262 set iFirst $sep 143 set iFirst $sep
263 set nRange 1 144 set nRange 1
(...skipping 169 matching lines...) Expand 10 before | Expand all | Expand 10 after
433 return 1; 314 return 1;
434 } 315 }
435 } 316 }
436 }] 317 }]
437 puts " return 0;" 318 puts " return 0;"
438 puts "\}" 319 puts "\}"
439 } 320 }
440 321
441 #------------------------------------------------------------------------- 322 #-------------------------------------------------------------------------
442 323
443 proc tl_load_casefolding_txt {zName} {
444 global tl_lookup_table
445
446 set fd [open $zName]
447 while { ![eof $fd] } {
448 set line [gets $fd]
449 if {[string range $line 0 0] == "#"} continue
450 if {$line == ""} continue
451
452 foreach x {a b c d} {unset -nocomplain $x}
453 foreach {a b c d} [split $line ";"] {}
454
455 set a2 [list]
456 set c2 [list]
457 foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
458 foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
459 set b [string trim $b]
460 set d [string trim $d]
461
462 if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
463 }
464 }
465
466 proc tl_create_records {} { 324 proc tl_create_records {} {
467 global tl_lookup_table 325 global tl_lookup_table
468 326
469 set iFirst "" 327 set iFirst ""
470 set nOff 0 328 set nOff 0
471 set nRange 0 329 set nRange 0
472 set nIncr 0 330 set nIncr 0
473 331
474 set lRecord [list] 332 set lRecord [list]
475 foreach code [lsort -integer [array names tl_lookup_table]] { 333 foreach code [lsort -integer [array names tl_lookup_table]] {
(...skipping 143 matching lines...) Expand 10 before | Expand all | Expand 10 after
619 set liOff [tl_generate_ioff_table $lRecord] 477 set liOff [tl_generate_ioff_table $lRecord]
620 tl_print_table_header 478 tl_print_table_header
621 foreach entry $lRecord { 479 foreach entry $lRecord {
622 if {[tl_print_table_entry toggle $entry $liOff]} { 480 if {[tl_print_table_entry toggle $entry $liOff]} {
623 lappend lHigh $entry 481 lappend lHigh $entry
624 } 482 }
625 } 483 }
626 tl_print_table_footer toggle 484 tl_print_table_footer toggle
627 tl_print_ioff_table $liOff 485 tl_print_ioff_table $liOff
628 486
629 puts { 487 puts [subst -nocommands {
630 int ret = c; 488 int ret = c;
631 489
632 assert( c>=0 );
633 assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 ); 490 assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
634 491
635 if( c<128 ){ 492 if( c<128 ){
636 if( c>='A' && c<='Z' ) ret = c + ('a' - 'A'); 493 if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
637 }else if( c<65536 ){ 494 }else if( c<65536 ){
495 const struct TableEntry *p;
638 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; 496 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
639 int iLo = 0; 497 int iLo = 0;
640 int iRes = -1; 498 int iRes = -1;
641 499
500 assert( c>aEntry[0].iCode );
642 while( iHi>=iLo ){ 501 while( iHi>=iLo ){
643 int iTest = (iHi + iLo) / 2; 502 int iTest = (iHi + iLo) / 2;
644 int cmp = (c - aEntry[iTest].iCode); 503 int cmp = (c - aEntry[iTest].iCode);
645 if( cmp>=0 ){ 504 if( cmp>=0 ){
646 iRes = iTest; 505 iRes = iTest;
647 iLo = iTest+1; 506 iLo = iTest+1;
648 }else{ 507 }else{
649 iHi = iTest-1; 508 iHi = iTest-1;
650 } 509 }
651 } 510 }
652 assert( iRes<0 || c>=aEntry[iRes].iCode );
653 511
654 if( iRes>=0 ){ 512 assert( iRes>=0 && c>=aEntry[iRes].iCode );
655 const struct TableEntry *p = &aEntry[iRes]; 513 p = &aEntry[iRes];
656 if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ 514 if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
657 ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; 515 ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
658 assert( ret>0 ); 516 assert( ret>0 );
659 }
660 } 517 }
661 518
662 if( bRemoveDiacritic ) ret = remove_diacritic(ret); 519 if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret);
663 } 520 }
664 } 521 }]
665 522
666 foreach entry $lHigh { 523 foreach entry $lHigh {
667 tl_print_if_entry $entry 524 tl_print_if_entry $entry
668 } 525 }
669 526
670 puts "" 527 puts ""
671 puts " return ret;" 528 puts " return ret;"
672 puts "\}" 529 puts "\}"
673 } 530 }
674 531
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
725 ** May you share freely, never taking more than you give. 582 ** May you share freely, never taking more than you give.
726 ** 583 **
727 ****************************************************************************** 584 ******************************************************************************
728 */ 585 */
729 586
730 /* 587 /*
731 ** DO NOT EDIT THIS MACHINE GENERATED FILE. 588 ** DO NOT EDIT THIS MACHINE GENERATED FILE.
732 */ 589 */
733 }] 590 }]
734 puts "" 591 puts ""
735 puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE" 592 if {$::generate_fts5_code} {
736 puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)" 593 # no-op
594 } else {
595 puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE"
596 puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)"
597 }
737 puts "" 598 puts ""
738 puts "#include <assert.h>" 599 puts "#include <assert.h>"
739 puts "" 600 puts ""
740 } 601 }
741 602
742 proc print_test_main {} { 603 proc print_test_main {} {
743 puts "" 604 puts ""
744 puts "#include <stdio.h>" 605 puts "#include <stdio.h>"
745 puts "" 606 puts ""
746 puts "int main(int argc, char **argv)\{" 607 puts "int main(int argc, char **argv)\{"
747 puts " int r1, r2;" 608 puts " int r1, r2;"
748 puts " int code;" 609 puts " int code;"
749 puts " r1 = isalnum_test(&code);" 610 puts " r1 = isalnum_test(&code);"
750 puts " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);" 611 puts " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);"
751 puts " else printf(\"isalnum(): test passed\\n\");" 612 puts " else printf(\"isalnum(): test passed\\n\");"
752 puts " r2 = fold_test(&code);" 613 puts " r2 = fold_test(&code);"
753 puts " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);" 614 puts " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);"
754 puts " else printf(\"fold(): test passed\\n\");" 615 puts " else printf(\"fold(): test passed\\n\");"
755 puts " return (r1 || r2);" 616 puts " return (r1 || r2);"
756 puts "\}" 617 puts "\}"
757 } 618 }
758 619
759 # Process the command line arguments. Exit early if they are not to 620 # Process the command line arguments. Exit early if they are not to
760 # our liking. 621 # our liking.
761 # 622 #
762 proc usage {} { 623 proc usage {} {
763 puts -nonewline stderr "Usage: $::argv0 ?-test? " 624 puts -nonewline stderr "Usage: $::argv0 ?-test? ?-fts5? "
764 puts stderr "<CaseFolding.txt file> <UnicodeData.txt file>" 625 puts stderr "<CaseFolding.txt file> <UnicodeData.txt file>"
765 exit 1 626 exit 1
766 } 627 }
767 if {[llength $argv]!=2 && [llength $argv]!=3} usage 628 if {[llength $argv]<2} usage
768 if {[llength $argv]==3 && [lindex $argv 0]!="-test"} usage
769 set unicodedata.txt [lindex $argv end] 629 set unicodedata.txt [lindex $argv end]
770 set casefolding.txt [lindex $argv end-1] 630 set casefolding.txt [lindex $argv end-1]
771 set generate_test_code [expr {[llength $argv]==3}] 631
632 set remove_diacritic remove_diacritic
633 set generate_test_code 0
634 set generate_fts5_code 0
635 set function_prefix "sqlite3Fts"
636 for {set i 0} {$i < [llength $argv]-2} {incr i} {
637 switch -- [lindex $argv $i] {
638 -test {
639 set generate_test_code 1
640 }
641 -fts5 {
642 set function_prefix sqlite3Fts5
643 set generate_fts5_code 1
644 set remove_diacritic fts5_remove_diacritic
645 }
646 default {
647 usage
648 }
649 }
650 }
772 651
773 print_fileheader 652 print_fileheader
774 653
775 # Print the isalnum() function to stdout. 654 # Print the isalnum() function to stdout.
776 # 655 #
777 set lRange [an_load_separator_ranges] 656 set lRange [an_load_separator_ranges]
778 print_isalnum sqlite3FtsUnicodeIsalnum $lRange 657 print_isalnum ${function_prefix}UnicodeIsalnum $lRange
779 658
780 # Leave a gap between the two generated C functions. 659 # Leave a gap between the two generated C functions.
781 # 660 #
782 puts "" 661 puts ""
783 puts "" 662 puts ""
784 663
785 # Load the fold data. This is used by the [rd_XXX] commands 664 # Load the fold data. This is used by the [rd_XXX] commands
786 # as well as [print_fold]. 665 # as well as [print_fold].
787 tl_load_casefolding_txt ${casefolding.txt} 666 tl_load_casefolding_txt ${casefolding.txt}
788 667
789 set mappings [rd_load_unicodedata_text ${unicodedata.txt}] 668 set mappings [rd_load_unicodedata_text ${unicodedata.txt}]
790 print_rd $mappings 669 print_rd $mappings
791 puts "" 670 puts ""
792 puts "" 671 puts ""
793 print_isdiacritic sqlite3FtsUnicodeIsdiacritic $mappings 672 print_isdiacritic ${function_prefix}UnicodeIsdiacritic $mappings
794 puts "" 673 puts ""
795 puts "" 674 puts ""
796 675
797 # Print the fold() function to stdout. 676 # Print the fold() function to stdout.
798 # 677 #
799 print_fold sqlite3FtsUnicodeFold 678 print_fold ${function_prefix}UnicodeFold
800 679
801 # Print the test routines and main() function to stdout, if -test 680 # Print the test routines and main() function to stdout, if -test
802 # was specified. 681 # was specified.
803 # 682 #
804 if {$::generate_test_code} { 683 if {$::generate_test_code} {
805 print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange 684 print_test_isalnum ${function_prefix}UnicodeIsalnum $lRange
806 print_fold_test sqlite3FtsUnicodeFold $mappings 685 print_fold_test ${function_prefix}UnicodeFold $mappings
807 print_test_main 686 print_test_main
808 } 687 }
809 688
810 puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */" 689 if {$generate_fts5_code} {
811 puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */" 690 # no-op
691 } else {
692 puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
693 puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */"
694 }
OLD NEW
« no previous file with comments | « third_party/sqlite/src/ext/fts3/tool/fts3view.c ('k') | third_party/sqlite/src/ext/fts3/unicode/parseunicode.tcl » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698