| OLD | NEW |
| 1 | 1 |
| 2 # | 2 source [file join [file dirname [info script]] parseunicode.tcl] |
| 3 # Parameter $zName must be a path to the file UnicodeData.txt. This command | |
| 4 # reads the file and returns a list of mappings required to remove all | |
| 5 # diacritical marks from a unicode string. Each mapping is itself a list | |
| 6 # consisting of two elements - the unicode codepoint and the single ASCII | |
| 7 # character that it should be replaced with, or an empty string if the | |
| 8 # codepoint should simply be removed from the input. Examples: | |
| 9 # | |
| 10 # { 224 a } (replace codepoint 224 with "a") | |
| 11 # { 769 "" } (remove codepoint 769 from input) | |
| 12 # | |
| 13 # Mappings are only returned for non-upper case codepoints. It is assumed | |
| 14 # that the input has already been folded to lower case. | |
| 15 # | |
| 16 proc rd_load_unicodedata_text {zName} { | |
| 17 global tl_lookup_table | |
| 18 | |
| 19 set fd [open $zName] | |
| 20 set lField { | |
| 21 code | |
| 22 character_name | |
| 23 general_category | |
| 24 canonical_combining_classes | |
| 25 bidirectional_category | |
| 26 character_decomposition_mapping | |
| 27 decimal_digit_value | |
| 28 digit_value | |
| 29 numeric_value | |
| 30 mirrored | |
| 31 unicode_1_name | |
| 32 iso10646_comment_field | |
| 33 uppercase_mapping | |
| 34 lowercase_mapping | |
| 35 titlecase_mapping | |
| 36 } | |
| 37 set lRet [list] | |
| 38 | |
| 39 while { ![eof $fd] } { | |
| 40 set line [gets $fd] | |
| 41 if {$line == ""} continue | |
| 42 | |
| 43 set fields [split $line ";"] | |
| 44 if {[llength $fields] != [llength $lField]} { error "parse error: $line" } | |
| 45 foreach $lField $fields {} | |
| 46 if { [llength $character_decomposition_mapping]!=2 | |
| 47 || [string is xdigit [lindex $character_decomposition_mapping 0]]==0 | |
| 48 } { | |
| 49 continue | |
| 50 } | |
| 51 | |
| 52 set iCode [expr "0x$code"] | |
| 53 set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"] | |
| 54 set iDia [expr "0x[lindex $character_decomposition_mapping 1]"] | |
| 55 | |
| 56 if {[info exists tl_lookup_table($iCode)]} continue | |
| 57 | |
| 58 if { ($iAscii >= 97 && $iAscii <= 122) | |
| 59 || ($iAscii >= 65 && $iAscii <= 90) | |
| 60 } { | |
| 61 lappend lRet [list $iCode [string tolower [format %c $iAscii]]] | |
| 62 set dia($iDia) 1 | |
| 63 } | |
| 64 } | |
| 65 | |
| 66 foreach d [array names dia] { | |
| 67 lappend lRet [list $d ""] | |
| 68 } | |
| 69 set lRet [lsort -integer -index 0 $lRet] | |
| 70 | |
| 71 close $fd | |
| 72 set lRet | |
| 73 } | |
| 74 | |
| 75 | 3 |
| 76 proc print_rd {map} { | 4 proc print_rd {map} { |
| 77 global tl_lookup_table | 5 global tl_lookup_table |
| 78 set aChar [list] | 6 set aChar [list] |
| 79 set lRange [list] | 7 set lRange [list] |
| 80 | 8 |
| 81 set nRange 1 | 9 set nRange 1 |
| 82 set iFirst [lindex $map 0 0] | 10 set iFirst [lindex $map 0 0] |
| 83 set cPrev [lindex $map 0 1] | 11 set cPrev [lindex $map 0 1] |
| 84 | 12 |
| (...skipping 25 matching lines...) Expand all Loading... |
| 110 lappend aChar $cPrev | 38 lappend aChar $cPrev |
| 111 | 39 |
| 112 puts "/*" | 40 puts "/*" |
| 113 puts "** If the argument is a codepoint corresponding to a lowercase letter" | 41 puts "** If the argument is a codepoint corresponding to a lowercase letter" |
| 114 puts "** in the ASCII range with a diacritic added, return the codepoint" | 42 puts "** in the ASCII range with a diacritic added, return the codepoint" |
| 115 puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN" | 43 puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN" |
| 116 puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER" | 44 puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER" |
| 117 puts "** E\"). The resuls of passing a codepoint that corresponds to an" | 45 puts "** E\"). The resuls of passing a codepoint that corresponds to an" |
| 118 puts "** uppercase letter are undefined." | 46 puts "** uppercase letter are undefined." |
| 119 puts "*/" | 47 puts "*/" |
| 120 puts "static int remove_diacritic(int c)\{" | 48 puts "static int ${::remove_diacritic}(int c)\{" |
| 121 puts " unsigned short aDia\[\] = \{" | 49 puts " unsigned short aDia\[\] = \{" |
| 122 puts -nonewline " 0, " | 50 puts -nonewline " 0, " |
| 123 set i 1 | 51 set i 1 |
| 124 foreach r $lRange { | 52 foreach r $lRange { |
| 125 foreach {iCode nRange} $r {} | 53 foreach {iCode nRange} $r {} |
| 126 if {($i % 8)==0} {puts "" ; puts -nonewline " " } | 54 if {($i % 8)==0} {puts "" ; puts -nonewline " " } |
| 127 incr i | 55 incr i |
| 128 | 56 |
| 129 puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]] | 57 puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]] |
| 130 puts -nonewline ", " | 58 puts -nonewline ", " |
| (...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 197 puts " if( c<$iFirst || c>$iLast ) return 0;" | 125 puts " if( c<$iFirst || c>$iLast ) return 0;" |
| 198 puts " return (c < $iFirst+32) ?" | 126 puts " return (c < $iFirst+32) ?" |
| 199 puts " (mask0 & (1 << (c-$iFirst))) :" | 127 puts " (mask0 & (1 << (c-$iFirst))) :" |
| 200 puts " (mask1 & (1 << (c-$iFirst-32)));" | 128 puts " (mask1 & (1 << (c-$iFirst-32)));" |
| 201 puts "\}" | 129 puts "\}" |
| 202 } | 130 } |
| 203 | 131 |
| 204 | 132 |
| 205 #------------------------------------------------------------------------- | 133 #------------------------------------------------------------------------- |
| 206 | 134 |
| 207 # Parameter $zName must be a path to the file UnicodeData.txt. This command | |
| 208 # reads the file and returns a list of codepoints (integers). The list | |
| 209 # contains all codepoints in the UnicodeData.txt assigned to any "General | |
| 210 # Category" that is not a "Letter" or "Number". | |
| 211 # | |
| 212 proc an_load_unicodedata_text {zName} { | |
| 213 set fd [open $zName] | |
| 214 set lField { | |
| 215 code | |
| 216 character_name | |
| 217 general_category | |
| 218 canonical_combining_classes | |
| 219 bidirectional_category | |
| 220 character_decomposition_mapping | |
| 221 decimal_digit_value | |
| 222 digit_value | |
| 223 numeric_value | |
| 224 mirrored | |
| 225 unicode_1_name | |
| 226 iso10646_comment_field | |
| 227 uppercase_mapping | |
| 228 lowercase_mapping | |
| 229 titlecase_mapping | |
| 230 } | |
| 231 set lRet [list] | |
| 232 | |
| 233 while { ![eof $fd] } { | |
| 234 set line [gets $fd] | |
| 235 if {$line == ""} continue | |
| 236 | |
| 237 set fields [split $line ";"] | |
| 238 if {[llength $fields] != [llength $lField]} { error "parse error: $line" } | |
| 239 foreach $lField $fields {} | |
| 240 | |
| 241 set iCode [expr "0x$code"] | |
| 242 set bAlnum [expr { | |
| 243 [lsearch {L N} [string range $general_category 0 0]] >= 0 | |
| 244 || $general_category=="Co" | |
| 245 }] | |
| 246 | |
| 247 if { !$bAlnum } { lappend lRet $iCode } | |
| 248 } | |
| 249 | |
| 250 close $fd | |
| 251 set lRet | |
| 252 } | |
| 253 | |
| 254 proc an_load_separator_ranges {} { | 135 proc an_load_separator_ranges {} { |
| 255 global unicodedata.txt | 136 global unicodedata.txt |
| 256 set lSep [an_load_unicodedata_text ${unicodedata.txt}] | 137 set lSep [an_load_unicodedata_text ${unicodedata.txt}] |
| 257 unset -nocomplain iFirst | 138 unset -nocomplain iFirst |
| 258 unset -nocomplain nRange | 139 unset -nocomplain nRange |
| 259 set lRange [list] | 140 set lRange [list] |
| 260 foreach sep $lSep { | 141 foreach sep $lSep { |
| 261 if {0==[info exists iFirst]} { | 142 if {0==[info exists iFirst]} { |
| 262 set iFirst $sep | 143 set iFirst $sep |
| 263 set nRange 1 | 144 set nRange 1 |
| (...skipping 169 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 433 return 1; | 314 return 1; |
| 434 } | 315 } |
| 435 } | 316 } |
| 436 }] | 317 }] |
| 437 puts " return 0;" | 318 puts " return 0;" |
| 438 puts "\}" | 319 puts "\}" |
| 439 } | 320 } |
| 440 | 321 |
| 441 #------------------------------------------------------------------------- | 322 #------------------------------------------------------------------------- |
| 442 | 323 |
| 443 proc tl_load_casefolding_txt {zName} { | |
| 444 global tl_lookup_table | |
| 445 | |
| 446 set fd [open $zName] | |
| 447 while { ![eof $fd] } { | |
| 448 set line [gets $fd] | |
| 449 if {[string range $line 0 0] == "#"} continue | |
| 450 if {$line == ""} continue | |
| 451 | |
| 452 foreach x {a b c d} {unset -nocomplain $x} | |
| 453 foreach {a b c d} [split $line ";"] {} | |
| 454 | |
| 455 set a2 [list] | |
| 456 set c2 [list] | |
| 457 foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] } | |
| 458 foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] } | |
| 459 set b [string trim $b] | |
| 460 set d [string trim $d] | |
| 461 | |
| 462 if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 } | |
| 463 } | |
| 464 } | |
| 465 | |
| 466 proc tl_create_records {} { | 324 proc tl_create_records {} { |
| 467 global tl_lookup_table | 325 global tl_lookup_table |
| 468 | 326 |
| 469 set iFirst "" | 327 set iFirst "" |
| 470 set nOff 0 | 328 set nOff 0 |
| 471 set nRange 0 | 329 set nRange 0 |
| 472 set nIncr 0 | 330 set nIncr 0 |
| 473 | 331 |
| 474 set lRecord [list] | 332 set lRecord [list] |
| 475 foreach code [lsort -integer [array names tl_lookup_table]] { | 333 foreach code [lsort -integer [array names tl_lookup_table]] { |
| (...skipping 143 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 619 set liOff [tl_generate_ioff_table $lRecord] | 477 set liOff [tl_generate_ioff_table $lRecord] |
| 620 tl_print_table_header | 478 tl_print_table_header |
| 621 foreach entry $lRecord { | 479 foreach entry $lRecord { |
| 622 if {[tl_print_table_entry toggle $entry $liOff]} { | 480 if {[tl_print_table_entry toggle $entry $liOff]} { |
| 623 lappend lHigh $entry | 481 lappend lHigh $entry |
| 624 } | 482 } |
| 625 } | 483 } |
| 626 tl_print_table_footer toggle | 484 tl_print_table_footer toggle |
| 627 tl_print_ioff_table $liOff | 485 tl_print_ioff_table $liOff |
| 628 | 486 |
| 629 puts { | 487 puts [subst -nocommands { |
| 630 int ret = c; | 488 int ret = c; |
| 631 | 489 |
| 632 assert( c>=0 ); | |
| 633 assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 ); | 490 assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 ); |
| 634 | 491 |
| 635 if( c<128 ){ | 492 if( c<128 ){ |
| 636 if( c>='A' && c<='Z' ) ret = c + ('a' - 'A'); | 493 if( c>='A' && c<='Z' ) ret = c + ('a' - 'A'); |
| 637 }else if( c<65536 ){ | 494 }else if( c<65536 ){ |
| 495 const struct TableEntry *p; |
| 638 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; | 496 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; |
| 639 int iLo = 0; | 497 int iLo = 0; |
| 640 int iRes = -1; | 498 int iRes = -1; |
| 641 | 499 |
| 500 assert( c>aEntry[0].iCode ); |
| 642 while( iHi>=iLo ){ | 501 while( iHi>=iLo ){ |
| 643 int iTest = (iHi + iLo) / 2; | 502 int iTest = (iHi + iLo) / 2; |
| 644 int cmp = (c - aEntry[iTest].iCode); | 503 int cmp = (c - aEntry[iTest].iCode); |
| 645 if( cmp>=0 ){ | 504 if( cmp>=0 ){ |
| 646 iRes = iTest; | 505 iRes = iTest; |
| 647 iLo = iTest+1; | 506 iLo = iTest+1; |
| 648 }else{ | 507 }else{ |
| 649 iHi = iTest-1; | 508 iHi = iTest-1; |
| 650 } | 509 } |
| 651 } | 510 } |
| 652 assert( iRes<0 || c>=aEntry[iRes].iCode ); | |
| 653 | 511 |
| 654 if( iRes>=0 ){ | 512 assert( iRes>=0 && c>=aEntry[iRes].iCode ); |
| 655 const struct TableEntry *p = &aEntry[iRes]; | 513 p = &aEntry[iRes]; |
| 656 if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ | 514 if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ |
| 657 ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; | 515 ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; |
| 658 assert( ret>0 ); | 516 assert( ret>0 ); |
| 659 } | |
| 660 } | 517 } |
| 661 | 518 |
| 662 if( bRemoveDiacritic ) ret = remove_diacritic(ret); | 519 if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret); |
| 663 } | 520 } |
| 664 } | 521 }] |
| 665 | 522 |
| 666 foreach entry $lHigh { | 523 foreach entry $lHigh { |
| 667 tl_print_if_entry $entry | 524 tl_print_if_entry $entry |
| 668 } | 525 } |
| 669 | 526 |
| 670 puts "" | 527 puts "" |
| 671 puts " return ret;" | 528 puts " return ret;" |
| 672 puts "\}" | 529 puts "\}" |
| 673 } | 530 } |
| 674 | 531 |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 725 ** May you share freely, never taking more than you give. | 582 ** May you share freely, never taking more than you give. |
| 726 ** | 583 ** |
| 727 ****************************************************************************** | 584 ****************************************************************************** |
| 728 */ | 585 */ |
| 729 | 586 |
| 730 /* | 587 /* |
| 731 ** DO NOT EDIT THIS MACHINE GENERATED FILE. | 588 ** DO NOT EDIT THIS MACHINE GENERATED FILE. |
| 732 */ | 589 */ |
| 733 }] | 590 }] |
| 734 puts "" | 591 puts "" |
| 735 puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE" | 592 if {$::generate_fts5_code} { |
| 736 puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)" | 593 # no-op |
| 594 } else { |
| 595 puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE" |
| 596 puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)" |
| 597 } |
| 737 puts "" | 598 puts "" |
| 738 puts "#include <assert.h>" | 599 puts "#include <assert.h>" |
| 739 puts "" | 600 puts "" |
| 740 } | 601 } |
| 741 | 602 |
| 742 proc print_test_main {} { | 603 proc print_test_main {} { |
| 743 puts "" | 604 puts "" |
| 744 puts "#include <stdio.h>" | 605 puts "#include <stdio.h>" |
| 745 puts "" | 606 puts "" |
| 746 puts "int main(int argc, char **argv)\{" | 607 puts "int main(int argc, char **argv)\{" |
| 747 puts " int r1, r2;" | 608 puts " int r1, r2;" |
| 748 puts " int code;" | 609 puts " int code;" |
| 749 puts " r1 = isalnum_test(&code);" | 610 puts " r1 = isalnum_test(&code);" |
| 750 puts " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);" | 611 puts " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);" |
| 751 puts " else printf(\"isalnum(): test passed\\n\");" | 612 puts " else printf(\"isalnum(): test passed\\n\");" |
| 752 puts " r2 = fold_test(&code);" | 613 puts " r2 = fold_test(&code);" |
| 753 puts " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);" | 614 puts " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);" |
| 754 puts " else printf(\"fold(): test passed\\n\");" | 615 puts " else printf(\"fold(): test passed\\n\");" |
| 755 puts " return (r1 || r2);" | 616 puts " return (r1 || r2);" |
| 756 puts "\}" | 617 puts "\}" |
| 757 } | 618 } |
| 758 | 619 |
| 759 # Process the command line arguments. Exit early if they are not to | 620 # Process the command line arguments. Exit early if they are not to |
| 760 # our liking. | 621 # our liking. |
| 761 # | 622 # |
| 762 proc usage {} { | 623 proc usage {} { |
| 763 puts -nonewline stderr "Usage: $::argv0 ?-test? " | 624 puts -nonewline stderr "Usage: $::argv0 ?-test? ?-fts5? " |
| 764 puts stderr "<CaseFolding.txt file> <UnicodeData.txt file>" | 625 puts stderr "<CaseFolding.txt file> <UnicodeData.txt file>" |
| 765 exit 1 | 626 exit 1 |
| 766 } | 627 } |
| 767 if {[llength $argv]!=2 && [llength $argv]!=3} usage | 628 if {[llength $argv]<2} usage |
| 768 if {[llength $argv]==3 && [lindex $argv 0]!="-test"} usage | |
| 769 set unicodedata.txt [lindex $argv end] | 629 set unicodedata.txt [lindex $argv end] |
| 770 set casefolding.txt [lindex $argv end-1] | 630 set casefolding.txt [lindex $argv end-1] |
| 771 set generate_test_code [expr {[llength $argv]==3}] | 631 |
| 632 set remove_diacritic remove_diacritic |
| 633 set generate_test_code 0 |
| 634 set generate_fts5_code 0 |
| 635 set function_prefix "sqlite3Fts" |
| 636 for {set i 0} {$i < [llength $argv]-2} {incr i} { |
| 637 switch -- [lindex $argv $i] { |
| 638 -test { |
| 639 set generate_test_code 1 |
| 640 } |
| 641 -fts5 { |
| 642 set function_prefix sqlite3Fts5 |
| 643 set generate_fts5_code 1 |
| 644 set remove_diacritic fts5_remove_diacritic |
| 645 } |
| 646 default { |
| 647 usage |
| 648 } |
| 649 } |
| 650 } |
| 772 | 651 |
| 773 print_fileheader | 652 print_fileheader |
| 774 | 653 |
| 775 # Print the isalnum() function to stdout. | 654 # Print the isalnum() function to stdout. |
| 776 # | 655 # |
| 777 set lRange [an_load_separator_ranges] | 656 set lRange [an_load_separator_ranges] |
| 778 print_isalnum sqlite3FtsUnicodeIsalnum $lRange | 657 print_isalnum ${function_prefix}UnicodeIsalnum $lRange |
| 779 | 658 |
| 780 # Leave a gap between the two generated C functions. | 659 # Leave a gap between the two generated C functions. |
| 781 # | 660 # |
| 782 puts "" | 661 puts "" |
| 783 puts "" | 662 puts "" |
| 784 | 663 |
| 785 # Load the fold data. This is used by the [rd_XXX] commands | 664 # Load the fold data. This is used by the [rd_XXX] commands |
| 786 # as well as [print_fold]. | 665 # as well as [print_fold]. |
| 787 tl_load_casefolding_txt ${casefolding.txt} | 666 tl_load_casefolding_txt ${casefolding.txt} |
| 788 | 667 |
| 789 set mappings [rd_load_unicodedata_text ${unicodedata.txt}] | 668 set mappings [rd_load_unicodedata_text ${unicodedata.txt}] |
| 790 print_rd $mappings | 669 print_rd $mappings |
| 791 puts "" | 670 puts "" |
| 792 puts "" | 671 puts "" |
| 793 print_isdiacritic sqlite3FtsUnicodeIsdiacritic $mappings | 672 print_isdiacritic ${function_prefix}UnicodeIsdiacritic $mappings |
| 794 puts "" | 673 puts "" |
| 795 puts "" | 674 puts "" |
| 796 | 675 |
| 797 # Print the fold() function to stdout. | 676 # Print the fold() function to stdout. |
| 798 # | 677 # |
| 799 print_fold sqlite3FtsUnicodeFold | 678 print_fold ${function_prefix}UnicodeFold |
| 800 | 679 |
| 801 # Print the test routines and main() function to stdout, if -test | 680 # Print the test routines and main() function to stdout, if -test |
| 802 # was specified. | 681 # was specified. |
| 803 # | 682 # |
| 804 if {$::generate_test_code} { | 683 if {$::generate_test_code} { |
| 805 print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange | 684 print_test_isalnum ${function_prefix}UnicodeIsalnum $lRange |
| 806 print_fold_test sqlite3FtsUnicodeFold $mappings | 685 print_fold_test ${function_prefix}UnicodeFold $mappings |
| 807 print_test_main | 686 print_test_main |
| 808 } | 687 } |
| 809 | 688 |
| 810 puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */" | 689 if {$generate_fts5_code} { |
| 811 puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */" | 690 # no-op |
| 691 } else { |
| 692 puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */" |
| 693 puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */" |
| 694 } |
| OLD | NEW |