third_party/sqlite/src/ext/fts3/unicode/mkunicode.tcl - Issue 1610963002: Import SQLite 3.10.2.

Side by Side Diff: third_party/sqlite/src/ext/fts3/unicode/mkunicode.tcl

Issue 1610963002: Import SQLite 3.10.2. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1	1

2 #	2 source [file join [file dirname [info script]] parseunicode.tcl]

3 # Parameter $zName must be a path to the file UnicodeData.txt. This command

4 # reads the file and returns a list of mappings required to remove all

5 # diacritical marks from a unicode string. Each mapping is itself a list

6 # consisting of two elements - the unicode codepoint and the single ASCII

7 # character that it should be replaced with, or an empty string if the

8 # codepoint should simply be removed from the input. Examples:

9 #

10 # { 224 a } (replace codepoint 224 to "a")

11 # { 769 "" } (remove codepoint 769 from input)

12 #

13 # Mappings are only returned for non-upper case codepoints. It is assumed

14 # that the input has already been folded to lower case.

15 #

16 proc rd_load_unicodedata_text {zName} {

17 global tl_lookup_table

18

19 set fd [open $zName]

20 set lField {

21 code

22 character_name

23 general_category

24 canonical_combining_classes

25 bidirectional_category

26 character_decomposition_mapping

27 decimal_digit_value

28 digit_value

29 numeric_value

30 mirrored

31 unicode_1_name

32 iso10646_comment_field

33 uppercase_mapping

34 lowercase_mapping

35 titlecase_mapping

36 }

37 set lRet [list]

38

39 while { ![eof $fd] } {

40 set line [gets $fd]

41 if {$line == ""} continue

42

43 set fields [split $line ";"]

44 if {[llength $fields] != [llength $lField]} { error "parse error: $line" }

45 foreach $lField $fields {}

46 if { [llength $character_decomposition_mapping]!=2

47 || [string is xdigit [lindex $character_decomposition_mapping 0]]==0

48 } {

49 continue

50 }

51

52 set iCode [expr "0x$code"]

53 set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]

54 set iDia [expr "0x[lindex $character_decomposition_mapping 1]"]

55

56 if {[info exists tl_lookup_table($iCode)]} continue

57

58 if { ($iAscii >= 97 && $iAscii <= 122)

59 || ($iAscii >= 65 && $iAscii <= 90)

60 } {

61 lappend lRet [list $iCode [string tolower [format %c $iAscii]]]

62 set dia($iDia) 1

63 }

64 }

65

66 foreach d [array names dia] {

67 lappend lRet [list $d ""]

68 }

69 set lRet [lsort -integer -index 0 $lRet]

70

71 close $fd

72 set lRet

73 }

74

75	3

76 proc print_rd {map} {	4 proc print_rd {map} {

77 global tl_lookup_table	5 global tl_lookup_table

78 set aChar [list]	6 set aChar [list]

79 set lRange [list]	7 set lRange [list]

80	8

81 set nRange 1	9 set nRange 1

82 set iFirst [lindex $map 0 0]	10 set iFirst [lindex $map 0 0]

83 set cPrev [lindex $map 0 1]	11 set cPrev [lindex $map 0 1]

84	12

(...skipping 25 matching lines...) Expand all Loading...
110 lappend aChar $cPrev	38 lappend aChar $cPrev

111	39

112 puts "/*"	40 puts "/*"

113 puts "** If the argument is a codepoint corresponding to a lowercase letter"	41 puts "** If the argument is a codepoint corresponding to a lowercase letter"

114 puts "** in the ASCII range with a diacritic added, return the codepoint"	42 puts "** in the ASCII range with a diacritic added, return the codepoint"

115 puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"	43 puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"

116 puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER"	44 puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER"

117 puts "** E\"). The resuls of passing a codepoint that corresponds to an"	45 puts "** E\"). The resuls of passing a codepoint that corresponds to an"

118 puts "** uppercase letter are undefined."	46 puts "** uppercase letter are undefined."

119 puts "*/"	47 puts "*/"

120 puts "static int remove_diacritic(int c)\{"	48 puts "static int ${::remove_diacritic}(int c)\{"

121 puts " unsigned short aDia\[\] = \{"	49 puts " unsigned short aDia\[\] = \{"

122 puts -nonewline " 0, "	50 puts -nonewline " 0, "

123 set i 1	51 set i 1

124 foreach r $lRange {	52 foreach r $lRange {

125 foreach {iCode nRange} $r {}	53 foreach {iCode nRange} $r {}

126 if {($i % 8)==0} {puts "" ; puts -nonewline " " }	54 if {($i % 8)==0} {puts "" ; puts -nonewline " " }

127 incr i	55 incr i

128	56

129 puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]]	57 puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]]

130 puts -nonewline ", "	58 puts -nonewline ", "

(...skipping 66 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
197 puts " if( c<$iFirst || c>$iLast ) return 0;"	125 puts " if( c<$iFirst || c>$iLast ) return 0;"

198 puts " return (c < $iFirst+32) ?"	126 puts " return (c < $iFirst+32) ?"

199 puts " (mask0 & (1 << (c-$iFirst))) :"	127 puts " (mask0 & (1 << (c-$iFirst))) :"

200 puts " (mask1 & (1 << (c-$iFirst-32)));"	128 puts " (mask1 & (1 << (c-$iFirst-32)));"

201 puts "\}"	129 puts "\}"

202 }	130 }

203	131

204	132

205 #-------------------------------------------------------------------------	133 #-------------------------------------------------------------------------

206	134

207 # Parameter $zName must be a path to the file UnicodeData.txt. This command

208 # reads the file and returns a list of codepoints (integers). The list

209 # contains all codepoints in the UnicodeData.txt assigned to any "General

210 # Category" that is not a "Letter" or "Number".

211 #

212 proc an_load_unicodedata_text {zName} {

213 set fd [open $zName]

214 set lField {

215 code

216 character_name

217 general_category

218 canonical_combining_classes

219 bidirectional_category

220 character_decomposition_mapping

221 decimal_digit_value

222 digit_value

223 numeric_value

224 mirrored

225 unicode_1_name

226 iso10646_comment_field

227 uppercase_mapping

228 lowercase_mapping

229 titlecase_mapping

230 }

231 set lRet [list]

232

233 while { ![eof $fd] } {

234 set line [gets $fd]

235 if {$line == ""} continue

236

237 set fields [split $line ";"]

238 if {[llength $fields] != [llength $lField]} { error "parse error: $line" }

239 foreach $lField $fields {}

240

241 set iCode [expr "0x$code"]

242 set bAlnum [expr {

243 [lsearch {L N} [string range $general_category 0 0]] >= 0

244 || $general_category=="Co"

245 }]

246

247 if { !$bAlnum } { lappend lRet $iCode }

248 }

249

250 close $fd

251 set lRet

252 }

253

254 proc an_load_separator_ranges {} {	135 proc an_load_separator_ranges {} {

255 global unicodedata.txt	136 global unicodedata.txt

256 set lSep [an_load_unicodedata_text ${unicodedata.txt}]	137 set lSep [an_load_unicodedata_text ${unicodedata.txt}]

257 unset -nocomplain iFirst	138 unset -nocomplain iFirst

258 unset -nocomplain nRange	139 unset -nocomplain nRange

259 set lRange [list]	140 set lRange [list]

260 foreach sep $lSep {	141 foreach sep $lSep {

261 if {0==[info exists iFirst]} {	142 if {0==[info exists iFirst]} {

262 set iFirst $sep	143 set iFirst $sep

263 set nRange 1	144 set nRange 1

(...skipping 169 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
433 return 1;	314 return 1;

434 }	315 }

435 }	316 }

436 }]	317 }]

437 puts " return 0;"	318 puts " return 0;"

438 puts "\}"	319 puts "\}"

439 }	320 }

440	321

441 #-------------------------------------------------------------------------	322 #-------------------------------------------------------------------------

442	323

443 proc tl_load_casefolding_txt {zName} {

444 global tl_lookup_table

445

446 set fd [open $zName]

447 while { ![eof $fd] } {

448 set line [gets $fd]

449 if {[string range $line 0 0] == "#"} continue

450 if {$line == ""} continue

451

452 foreach x {a b c d} {unset -nocomplain $x}

453 foreach {a b c d} [split $line ";"] {}

454

455 set a2 [list]

456 set c2 [list]

457 foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }

458 foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }

459 set b [string trim $b]

460 set d [string trim $d]

461

462 if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }

463 }

464 }

465

466 proc tl_create_records {} {	324 proc tl_create_records {} {

467 global tl_lookup_table	325 global tl_lookup_table

468	326

469 set iFirst ""	327 set iFirst ""

470 set nOff 0	328 set nOff 0

471 set nRange 0	329 set nRange 0

472 set nIncr 0	330 set nIncr 0

473	331

474 set lRecord [list]	332 set lRecord [list]

475 foreach code [lsort -integer [array names tl_lookup_table]] {	333 foreach code [lsort -integer [array names tl_lookup_table]] {

(...skipping 143 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
619 set liOff [tl_generate_ioff_table $lRecord]	477 set liOff [tl_generate_ioff_table $lRecord]

620 tl_print_table_header	478 tl_print_table_header

621 foreach entry $lRecord {	479 foreach entry $lRecord {

622 if {[tl_print_table_entry toggle $entry $liOff]} {	480 if {[tl_print_table_entry toggle $entry $liOff]} {

623 lappend lHigh $entry	481 lappend lHigh $entry

624 }	482 }

625 }	483 }

626 tl_print_table_footer toggle	484 tl_print_table_footer toggle

627 tl_print_ioff_table $liOff	485 tl_print_ioff_table $liOff

628	486

629 puts {	487 puts [subst -nocommands {

630 int ret = c;	488 int ret = c;

631	489

632 assert( c>=0 );

633 assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );	490 assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

634	491

635 if( c<128 ){	492 if( c<128 ){

636 if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');	493 if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');

637 }else if( c<65536 ){	494 }else if( c<65536 ){

	495 const struct TableEntry *p;

638 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;	496 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;

639 int iLo = 0;	497 int iLo = 0;

640 int iRes = -1;	498 int iRes = -1;

641	499

	500 assert( c>aEntry[0].iCode );

642 while( iHi>=iLo ){	501 while( iHi>=iLo ){

643 int iTest = (iHi + iLo) / 2;	502 int iTest = (iHi + iLo) / 2;

644 int cmp = (c - aEntry[iTest].iCode);	503 int cmp = (c - aEntry[iTest].iCode);

645 if( cmp>=0 ){	504 if( cmp>=0 ){

646 iRes = iTest;	505 iRes = iTest;

647 iLo = iTest+1;	506 iLo = iTest+1;

648 }else{	507 }else{

649 iHi = iTest-1;	508 iHi = iTest-1;

650 }	509 }

651 }	510 }

652 assert( iRes<0 || c>=aEntry[iRes].iCode );

653	511

654 if( iRes>=0 ){	512 assert( iRes>=0 && c>=aEntry[iRes].iCode );

655 const struct TableEntry *p = &aEntry[iRes];	513 p = &aEntry[iRes];

656 if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){	514 if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){

657 ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;	515 ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;

658 assert( ret>0 );	516 assert( ret>0 );

659 }

660 }	517 }

661	518

662 if( bRemoveDiacritic ) ret = remove_diacritic(ret);	519 if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret);

663 }	520 }

664 }	521 }]

665	522

666 foreach entry $lHigh {	523 foreach entry $lHigh {

667 tl_print_if_entry $entry	524 tl_print_if_entry $entry

668 }	525 }

669	526

670 puts ""	527 puts ""

671 puts " return ret;"	528 puts " return ret;"

672 puts "\}"	529 puts "\}"

673 }	530 }

674	531

(...skipping 50 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
725 ** May you share freely, never taking more than you give.	582 ** May you share freely, never taking more than you give.

726 **	583 **

727 ******************************************************************************	584 ******************************************************************************

728 */	585 */

729	586

730 /*	587 /*

731 ** DO NOT EDIT THIS MACHINE GENERATED FILE.	588 ** DO NOT EDIT THIS MACHINE GENERATED FILE.

732 */	589 */

733 }]	590 }]

734 puts ""	591 puts ""

735 puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE"	592 if {$::generate_fts5_code} {

736 puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)"	593 # no-op

	594 } else {

	595 puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE"

	596 puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)"

	597 }

737 puts ""	598 puts ""

738 puts "#include <assert.h>"	599 puts "#include <assert.h>"

739 puts ""	600 puts ""

740 }	601 }

741	602

742 proc print_test_main {} {	603 proc print_test_main {} {

743 puts ""	604 puts ""

744 puts "#include <stdio.h>"	605 puts "#include <stdio.h>"

745 puts ""	606 puts ""

746 puts "int main(int argc, char **argv)\{"	607 puts "int main(int argc, char **argv)\{"

747 puts " int r1, r2;"	608 puts " int r1, r2;"

748 puts " int code;"	609 puts " int code;"

749 puts " r1 = isalnum_test(&code);"	610 puts " r1 = isalnum_test(&code);"

750 puts " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);"	611 puts " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);"

751 puts " else printf(\"isalnum(): test passed\\n\");"	612 puts " else printf(\"isalnum(): test passed\\n\");"

752 puts " r2 = fold_test(&code);"	613 puts " r2 = fold_test(&code);"

753 puts " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);"	614 puts " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);"

754 puts " else printf(\"fold(): test passed\\n\");"	615 puts " else printf(\"fold(): test passed\\n\");"

755 puts " return (r1 || r2);"	616 puts " return (r1 || r2);"

756 puts "\}"	617 puts "\}"

757 }	618 }

758	619

759 # Process the command line arguments. Exit early if they are not to	620 # Process the command line arguments. Exit early if they are not to

760 # our liking.	621 # our liking.

761 #	622 #

762 proc usage {} {	623 proc usage {} {

763 puts -nonewline stderr "Usage: $::argv0 ?-test? "	624 puts -nonewline stderr "Usage: $::argv0 ?-test? ?-fts5? "

764 puts stderr "<CaseFolding.txt file> <UnicodeData.txt file>"	625 puts stderr "<CaseFolding.txt file> <UnicodeData.txt file>"

765 exit 1	626 exit 1

766 }	627 }

767 if {[llength $argv]!=2 && [llength $argv]!=3} usage	628 if {[llength $argv]<2} usage

768 if {[llength $argv]==3 && [lindex $argv 0]!="-test"} usage

769 set unicodedata.txt [lindex $argv end]	629 set unicodedata.txt [lindex $argv end]

770 set casefolding.txt [lindex $argv end-1]	630 set casefolding.txt [lindex $argv end-1]

771 set generate_test_code [expr {[llength $argv]==3}]	631

	632 set remove_diacritic remove_diacritic

	633 set generate_test_code 0

	634 set generate_fts5_code 0

	635 set function_prefix "sqlite3Fts"

	636 for {set i 0} {$i < [llength $argv]-2} {incr i} {

	637 switch -- [lindex $argv $i] {

	638 -test {

	639 set generate_test_code 1

	640 }

	641 -fts5 {

	642 set function_prefix sqlite3Fts5

	643 set generate_fts5_code 1

	644 set remove_diacritic fts5_remove_diacritic

	645 }

	646 default {

	647 usage

	648 }

	649 }

	650 }

772	651

773 print_fileheader	652 print_fileheader

774	653

775 # Print the isalnum() function to stdout.	654 # Print the isalnum() function to stdout.

776 #	655 #

777 set lRange [an_load_separator_ranges]	656 set lRange [an_load_separator_ranges]

778 print_isalnum sqlite3FtsUnicodeIsalnum $lRange	657 print_isalnum ${function_prefix}UnicodeIsalnum $lRange

779	658

780 # Leave a gap between the two generated C functions.	659 # Leave a gap between the two generated C functions.

781 #	660 #

782 puts ""	661 puts ""

783 puts ""	662 puts ""

784	663

785 # Load the fold data. This is used by the [rd_XXX] commands	664 # Load the fold data. This is used by the [rd_XXX] commands

786 # as well as [print_fold].	665 # as well as [print_fold].

787 tl_load_casefolding_txt ${casefolding.txt}	666 tl_load_casefolding_txt ${casefolding.txt}

788	667

789 set mappings [rd_load_unicodedata_text ${unicodedata.txt}]	668 set mappings [rd_load_unicodedata_text ${unicodedata.txt}]

790 print_rd $mappings	669 print_rd $mappings

791 puts ""	670 puts ""

792 puts ""	671 puts ""

793 print_isdiacritic sqlite3FtsUnicodeIsdiacritic $mappings	672 print_isdiacritic ${function_prefix}UnicodeIsdiacritic $mappings

794 puts ""	673 puts ""

795 puts ""	674 puts ""

796	675

797 # Print the fold() function to stdout.	676 # Print the fold() function to stdout.

798 #	677 #

799 print_fold sqlite3FtsUnicodeFold	678 print_fold ${function_prefix}UnicodeFold

800	679

801 # Print the test routines and main() function to stdout, if -test	680 # Print the test routines and main() function to stdout, if -test

802 # was specified.	681 # was specified.

803 #	682 #

804 if {$::generate_test_code} {	683 if {$::generate_test_code} {

805 print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange	684 print_test_isalnum ${function_prefix}UnicodeIsalnum $lRange

806 print_fold_test sqlite3FtsUnicodeFold $mappings	685 print_fold_test ${function_prefix}UnicodeFold $mappings

807 print_test_main	686 print_test_main

808 }	687 }

809	688

810 puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"	689 if {$generate_fts5_code} {

811 puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */"	690 # no-op

	691 } else {

	692 puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"

	693 puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */"

	694 }

OLD	NEW

« no previous file with comments | « third_party/sqlite/src/ext/fts3/tool/fts3view.c ('k') | third_party/sqlite/src/ext/fts3/unicode/parseunicode.tcl » ('j') | no next file with comments »