1
0

mkunicode.tcl 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811
  #
  # Parameter $zName must be a path to the file UnicodeData.txt. This command
  # reads the file and returns a list of mappings required to remove all
  # diacritical marks from a unicode string. Each mapping is itself a list
  # consisting of two elements - the unicode codepoint and the single ASCII
  # character that it should be replaced with, or an empty string if the
  # codepoint should simply be removed from the input. Examples:
  #
  #    { 224 a  }     (replace codepoint 224 with "a")
  #    { 769 "" }     (remove codepoint 769 from input)
  #
  # Mappings are only returned for codepoints that have no entry in the
  # global tl_lookup_table array (i.e. codepoints that are not themselves
  # folded to something else). It is assumed that the input has already been
  # folded to lower case.
  #
  # The returned list is sorted by codepoint. An error is raised if a line
  # does not have the expected number of fields or contains a malformed
  # hexadecimal value; the file is closed before the error propagates.
  #
  proc rd_load_unicodedata_text {zName} {
    global tl_lookup_table

    set lField {
      code
      character_name
      general_category
      canonical_combining_classes
      bidirectional_category
      character_decomposition_mapping
      decimal_digit_value
      digit_value
      numeric_value
      mirrored
      unicode_1_name
      iso10646_comment_field
      uppercase_mapping
      lowercase_mapping
      titlecase_mapping
    }
    set nField [llength $lField]
    set lRet [list]

    set fd [open $zName]
    set bFailed [catch {
      while {[gets $fd line] >= 0} {
        if {$line eq ""} continue
        set fields [split $line ";"]
        if {[llength $fields] != $nField} { error "parse error: $line" }
        foreach $lField $fields {}

        # Only two-element canonical decompositions (base + mark) are of
        # interest. Tagged decompositions such as "<compat> 0020" fail the
        # hex-digit test on the first element and are skipped.
        if {[llength $character_decomposition_mapping] != 2} continue
        foreach {zBase zMark} $character_decomposition_mapping break
        if {![string is xdigit -strict $zBase]} continue

        # Convert with [scan] rather than an unbraced [expr] so that text
        # read from the file is never evaluated as an expression.
        if {![string is xdigit -strict $code]
         || ![string is xdigit -strict $zMark]
        } {
          error "parse error: $line"
        }
        scan $code  %x iCode
        scan $zBase %x iAscii
        scan $zMark %x iDia

        if {[info exists tl_lookup_table($iCode)]} continue

        # Keep the mapping only if the base character is an ASCII letter
        # (a-z or A-Z), and remember the combining mark as a diacritic.
        if { ($iAscii >= 97 && $iAscii <= 122)
          || ($iAscii >= 65 && $iAscii <= 90)
        } {
          lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
          set dia($iDia) 1
        }
      }
    } zErr]
    close $fd
    if {$bFailed} { error $zErr $::errorInfo $::errorCode }

    # Each diacritic seen above maps to the empty string (i.e. is removed).
    foreach d [array names dia] {
      lappend lRet [list $d ""]
    }
    return [lsort -integer -index 0 $lRet]
  }
  # Print to stdout the C function remove_diacritic(), built from $map.
  #
  # $map is a list of {codepoint char} pairs (char may be the empty string).
  # Adjacent entries that share the same replacement character are merged
  # into a single range of at most 8 codepoints, provided every codepoint in
  # the gap between them has an entry in the global tl_lookup_table array.
  # Each range is emitted as ((first<<3) + size-1) in aDia[], with the
  # replacement character at the same index in aChar[].
  #
  # An error is raised if a range start does not fit in the 13 bits left
  # over in an unsigned short once 3 bits are used for the range size.
  #
  proc print_rd {map} {
    global tl_lookup_table
    set aChar [list]
    set lRange [list]

    if {[llength $map] > 0} {
      set nRange 1
      set iFirst [lindex $map 0 0]
      set cPrev  [lindex $map 0 1]

      foreach m [lrange $map 1 end] {
        foreach {i c} $m {}

        if {$cPrev eq $c} {
          # The current range may be extended to $i only if each codepoint
          # in the gap is present in tl_lookup_table.
          for {set j [expr {$iFirst+$nRange}]} {$j<$i} {incr j} {
            if {![info exists tl_lookup_table($j)]} break
          }
          if {$j==$i} {
            set nNew [expr {1 + $i - $iFirst}]
            if {$nNew<=8} {
              set nRange $nNew
              continue
            }
          }
        }

        lappend lRange [list $iFirst $nRange]
        lappend aChar $cPrev

        set iFirst $i
        set cPrev $c
        set nRange 1
      }
      lappend lRange [list $iFirst $nRange]
      lappend aChar $cPrev
    }

    # Validate before printing anything so that no partial output is
    # produced for a table that cannot be encoded.
    foreach r $lRange {
      foreach {iCode nRange} $r {}
      if {$iCode < 0 || $iCode >= (1<<13)} {
        error "codepoint $iCode is too large for the aDia\[\] format"
      }
    }

    puts "/*"
    puts "** If the argument is a codepoint corresponding to a lowercase letter"
    puts "** in the ASCII range with a diacritic added, return the codepoint"
    puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
    puts "** SMALL LETTER E WITH DIAERESIS\" - return 101 (\"LATIN SMALL LETTER"
    puts "** E\"). The results of passing a codepoint that corresponds to an"
    puts "** uppercase letter are undefined."
    puts "*/"
    puts "static int remove_diacritic(int c)\{"
    puts " unsigned short aDia\[\] = \{"
    puts -nonewline " 0, "
    set i 1
    foreach r $lRange {
      foreach {iCode nRange} $r {}
      if {($i % 8)==0} {puts "" ; puts -nonewline " " }
      incr i
      puts -nonewline [format "%5d" [expr {($iCode<<3) + $nRange-1}]]
      puts -nonewline ", "
    }
    puts ""
    puts " \};"
    puts " char aChar\[\] = \{"
    puts -nonewline " '\\0', "
    set i 1
    foreach c $aChar {
      set str "'$c', "
      if {$c eq ""} { set str "'\\0', " }
      if {($i % 12)==0} {puts "" ; puts -nonewline " " }
      incr i
      puts -nonewline "$str"
    }
    puts ""
    puts " \};"
    puts {
unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
int iRes = 0;
int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
int iLo = 0;
while( iHi>=iLo ){
int iTest = (iHi + iLo) / 2;
if( key >= aDia[iTest] ){
iRes = iTest;
iLo = iTest+1;
}else{
iHi = iTest-1;
}
}
assert( key>=aDia[iRes] );
return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);}
    # Close the C function. No trailing ';' - a stray semicolon at file
    # scope is not valid ISO C.
    puts "\}"
  }
  # Print to stdout a C function named $zFunc that returns non-zero if its
  # argument is one of the diacritic codepoints in $map.
  #
  # $map is a list of {codepoint char} pairs. The non-zero codepoints whose
  # char is the empty string are encoded as two 32-bit bitmasks, relative to
  # the smallest such codepoint. An error is raised if there are no such
  # codepoints, or if they span 64 or more values and so cannot be
  # represented by the two masks.
  #
  proc print_isdiacritic {zFunc map} {
    set lCode [list]
    foreach m $map {
      foreach {code char} $m {}
      if {$code && $char eq ""} { lappend lCode $code }
    }
    if {[llength $lCode]==0} {
      error "no diacritic codepoints in map"
    }
    set lCode [lsort -integer $lCode]
    set iFirst [lindex $lCode 0]
    set iLast [lindex $lCode end]
    if {($iLast - $iFirst) >= 64} {
      error "diacritic codepoints do not fit in two 32-bit masks"
    }

    set i1 0
    set i2 0
    foreach c $lCode {
      set i [expr {$c - $iFirst}]
      if {$i < 32} {
        set i1 [expr {$i1 | (1<<$i)}]
      } else {
        set i2 [expr {$i2 | (1<<($i-32))}]
      }
    }

    puts "/*"
    puts "** Return true if the argument interpreted as a unicode codepoint"
    puts "** is a diacritical modifier character."
    puts "*/"
    puts "int ${zFunc}\(int c)\{"
    puts " unsigned int mask0 = [format "0x%08X" $i1];"
    puts " unsigned int mask1 = [format "0x%08X" $i2];"

    puts " if( c<$iFirst || c>$iLast ) return 0;"
    puts " return (c < $iFirst+32) ?"
    # Shift an unsigned 1: shifting a signed int into bit 31 is undefined
    # behavior in C.
    puts " (mask0 & ((unsigned int)1 << (c-$iFirst))) :"
    puts " (mask1 & ((unsigned int)1 << (c-$iFirst-32)));"
    puts "\}"
  }
  #-------------------------------------------------------------------------
  # Parameter $zName must be a path to the file UnicodeData.txt. This command
  # reads the file and returns a list of codepoints (integers), in file
  # order. The list contains all codepoints listed in UnicodeData.txt whose
  # "General Category" neither begins with "L" (letter) or "N" (number) nor
  # is "Co" (private use).
  #
  # An error is raised if a line does not have the expected number of fields
  # or its codepoint is not hexadecimal; the file is closed before the error
  # propagates.
  #
  proc an_load_unicodedata_text {zName} {
    set lField {
      code
      character_name
      general_category
      canonical_combining_classes
      bidirectional_category
      character_decomposition_mapping
      decimal_digit_value
      digit_value
      numeric_value
      mirrored
      unicode_1_name
      iso10646_comment_field
      uppercase_mapping
      lowercase_mapping
      titlecase_mapping
    }
    set nField [llength $lField]
    set lRet [list]

    set fd [open $zName]
    set bFailed [catch {
      while {[gets $fd line] >= 0} {
        if {$line eq ""} continue
        set fields [split $line ";"]
        if {[llength $fields] != $nField} { error "parse error: $line" }
        foreach $lField $fields {}

        # Convert with [scan] rather than an unbraced [expr] so that text
        # read from the file is never evaluated as an expression.
        if {![string is xdigit -strict $code]} { error "parse error: $line" }
        scan $code %x iCode

        # -exact: the category letter must not be treated as a glob pattern.
        set zMajor [string index $general_category 0]
        set bAlnum [expr {
             [lsearch -exact {L N} $zMajor] >= 0
          || $general_category eq "Co"
        }]
        if { !$bAlnum } { lappend lRet $iCode }
      }
    } zErr]
    close $fd
    if {$bFailed} { error $zErr $::errorInfo $::errorCode }

    return $lRet
  }
  # Load the separator (non-alphanumeric) codepoints from the file named by
  # the global variable "unicodedata.txt" and collapse runs of consecutive
  # codepoints into ranges.
  #
  # Returns a list of {iFirst nRange} pairs, where nRange is the number of
  # consecutive codepoints starting at iFirst. Returns an empty list if no
  # separator codepoints were loaded.
  #
  proc an_load_separator_ranges {} {
    global unicodedata.txt
    set lSep [an_load_unicodedata_text ${unicodedata.txt}]

    set lRange [list]
    foreach sep $lSep {
      if {![info exists iFirst]} {
        # First codepoint seen: open the first range.
        set iFirst $sep
        set nRange 1
      } elseif { $sep == ($iFirst+$nRange) } {
        # Immediately follows the open range: extend it.
        incr nRange
      } else {
        # Gap: flush the open range and start a new one.
        lappend lRange [list $iFirst $nRange]
        set iFirst $sep
        set nRange 1
      }
    }

    # Flush the final range, if any codepoints were seen at all.
    if {[info exists iFirst]} {
      lappend lRange [list $iFirst $nRange]
    }
    return $lRange
  }
  # Print to stdout the C array aEntry[] that encodes the separator ranges
  # in $lRange, a list of {iFirst nRange} pairs.
  #
  # Each range is packed into one 32-bit value as ((iFirst<<10) + nRange).
  # An error is raised if any iFirst needs more than 22 bits or any nRange
  # needs more than 10 bits.
  #
  proc an_print_range_array {lRange} {
    set iFirstMax 0
    set nRangeMax 0
    foreach range $lRange {
      foreach {iFirst nRange} $range {}
      if {$iFirst > $iFirstMax} {set iFirstMax $iFirst}
      if {$nRange > $nRangeMax} {set nRangeMax $nRange}
    }
    if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"}
    if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"}

    puts -nonewline " "
    puts [string trim {
/* Each unsigned integer in the following array corresponds to a contiguous
** range of unicode codepoints that are not either letters or numbers (i.e.
** codepoints for which this function should return 0).
**
** The most significant 22 bits in each 32-bit value contain the first
** codepoint in the range. The least significant 10 bits are used to store
** the size of the range (always at least 1). In other words, the value
** ((C<<10) + N) represents a range of N codepoints starting with codepoint
** C. It is not possible to represent a range larger than 1023 codepoints
** using this format.
*/
    }]
    # "static const" rather than "const static": C expects the storage
    # class specifier to come first.
    puts -nonewline " static const unsigned int aEntry\[\] = \{"
    set i 0
    foreach range $lRange {
      foreach {iFirst nRange} $range {}
      set u32 [format "0x%08X" [expr {($iFirst<<10) + $nRange}]]
      if {($i % 5)==0} {puts "" ; puts -nonewline " "}
      puts -nonewline " $u32,"
      incr i
    }
    puts ""
    puts " \};"
  }
  # Print to stdout the C array aAscii[4], a 128-bit bitmap with one bit set
  # for each codepoint <= 127 that is covered by a range in $lRange (a list
  # of {iFirst nRange} pairs). Word index is (codepoint >> 5) and the bit
  # within the word is (codepoint & 0x1F).
  #
  proc an_print_ascii_bitmap {lRange} {
    # Collect the set of covered codepoints that lie in the ASCII range.
    foreach span $lRange {
      foreach {iStart nCount} $span {}
      set iStop [expr {$iStart + $nCount}]
      for {set cp $iStart} {$cp < $iStop} {incr cp} {
        if {$cp <= 127} { set seen($cp) 1 }
      }
    }

    # Fold the set into four 32-bit words.
    set lWord [list 0 0 0 0]
    foreach cp [array names seen] {
      set iWord [expr {$cp >> 5}]
      set iBit  [expr {$cp & 0x001F}]
      set uOld  [lindex $lWord $iWord]
      lset lWord $iWord [expr {$uOld | (1 << $iBit)}]
    }

    puts " static const unsigned int aAscii\[4\] = \{"
    set zRow " "
    foreach uWord $lWord {
      append zRow [format " 0x%08X," $uWord]
    }
    puts $zRow
    puts " \};"
  }
  # Print to stdout a C function named $zFunc that reports whether a
  # codepoint is classified as a letter or number.
  #
  # $lRange is the list of {iFirst nRange} separator ranges; it is passed to
  # [an_print_range_array] and [an_print_ascii_bitmap], which emit the
  # aEntry[] and aAscii[] tables used by the generated function body.
  #
  proc print_isalnum {zFunc lRange} {
    puts "/*"
    puts "** Return true if the argument corresponds to a unicode codepoint"
    puts "** classified as either a letter or a number. Otherwise false."
    puts "**"
    puts "** The results are undefined if the value passed to this function"
    puts "** is less than zero."
    puts "*/"
    puts "int ${zFunc}\(int c)\{"
    an_print_range_array $lRange
    an_print_ascii_bitmap $lRange
    # In the generated C below:
    #   * the bitmap test shifts an unsigned 1, since shifting a signed int
    #     into bit 31 is undefined behavior;
    #   * iRes is initialized so that it is never read uninitialized when
    #     the asserts are compiled out.
    puts {
if( c<128 ){
return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 );
}else if( c<(1<<22) ){
unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
int iRes = 0;
int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
int iLo = 0;
while( iHi>=iLo ){
int iTest = (iHi + iLo) / 2;
if( key >= aEntry[iTest] ){
iRes = iTest;
iLo = iTest+1;
}else{
iHi = iTest-1;
}
}
assert( aEntry[0]<key );
assert( key>=aEntry[iRes] );
return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
}
return 1;}
    puts "\}"
  }
  # Print to stdout the C function isalnum_test(), a self-test for the
  # generated function named $zFunc.
  #
  # $lRange is the list of {iFirst nRange} separator ranges. The emitted test
  # contains three tables:
  #   aAlnum[]      - expected result for each codepoint below 70000
  #   aLargeSep[]   - separator codepoints >= 70000 (expected result 0)
  #   aLargeOther[] - non-separator neighbours of those (expected result 1)
  #
  proc print_test_isalnum {zFunc lRange} {
    # Expand the ranges into a set of separator codepoints.
    foreach span $lRange {
      foreach {iStart nCount} $span {}
      set iStop [expr {$iStart + $nCount}]
      for {set cp $iStart} {$cp < $iStop} {incr cp} { set a($cp) 1 }
    }

    # Separators at or above 70000, in ascending order.
    set lLarge [list]
    foreach cp [lsort -integer [array names a]] {
      if {$cp >= 70000} { lappend lLarge $cp }
    }

    puts "static int isalnum_test(int *piCode)\{"
    puts -nonewline " unsigned char aAlnum\[\] = \{"
    for {set cp 0} {$cp < 70000} {incr cp} {
      if {($cp % 32)==0} { puts "" ; puts -nonewline " " }
      puts -nonewline "[expr {![info exists a($cp)]}],"
    }
    puts ""
    puts " \};"

    puts -nonewline " int aLargeSep\[\] = \{"
    set nOut 0
    foreach cp $lLarge {
      if {($nOut % 8)==0} { puts "" ; puts -nonewline " " }
      puts -nonewline " $cp,"
      incr nOut
    }
    puts ""
    puts " \};"

    puts -nonewline " int aLargeOther\[\] = \{"
    set nOut 0
    foreach cp $lLarge {
      # Emit the codepoint just below, then the one just above, whenever
      # that neighbour is not itself a separator.
      foreach iNear [list [expr {$cp-1}] [expr {$cp+1}]] {
        if {[info exists a($iNear)]} continue
        if {($nOut % 8)==0} { puts "" ; puts -nonewline " " }
        puts -nonewline " $iNear,"
        incr nOut
      }
    }
    puts ""
    puts " \};"

    puts [subst -nocommands {
int i;
for(i=0; i<sizeof(aAlnum)/sizeof(aAlnum[0]); i++){
if( ${zFunc}(i)!=aAlnum[i] ){
*piCode = i;
return 1;
}
}
for(i=0; i<sizeof(aLargeSep)/sizeof(aLargeSep[0]); i++){
if( ${zFunc}(aLargeSep[i])!=0 ){
*piCode = aLargeSep[i];
return 1;
}
}
for(i=0; i<sizeof(aLargeOther)/sizeof(aLargeOther[0]); i++){
if( ${zFunc}(aLargeOther[i])!=1 ){
*piCode = aLargeOther[i];
return 1;
}
}
}]
    puts " return 0;"
    puts "\}"
  }
  394. #-------------------------------------------------------------------------
  # Parameter $zName must be a path to the file CaseFolding.txt. This command
  # reads the file and populates the global array tl_lookup_table, mapping
  # each codepoint (integer) that has a "C" or "S" status entry to the
  # codepoint(s) it folds to.
  #
  # Blank lines and lines beginning with "#" are ignored. An error is raised
  # if a code or mapping field contains something other than hexadecimal
  # values; the file is closed before the error propagates.
  #
  proc tl_load_casefolding_txt {zName} {
    global tl_lookup_table

    set fd [open $zName]
    set bFailed [catch {
      while {[gets $fd line] >= 0} {
        if {$line eq "" || [string index $line 0] eq "#"} continue

        # Fields: <code>; <status>; <mapping>; # <name>. Only the first four
        # are consulted, so a stray ";" in the trailing comment cannot
        # clobber them.
        foreach {a b c d} [lrange [split $line ";"] 0 3] break

        # Convert with [scan] rather than an unbraced [expr] so that text
        # read from the file is never evaluated as an expression.
        set a2 [list]
        foreach elem $a {
          if {![string is xdigit -strict $elem]} { error "parse error: $line" }
          scan $elem %x iVal
          lappend a2 $iVal
        }
        set c2 [list]
        foreach elem $c {
          if {![string is xdigit -strict $elem]} { error "parse error: $line" }
          scan $elem %x iVal
          lappend c2 $iVal
        }

        set b [string trim $b]
        if {$b eq "C" || $b eq "S"} { set tl_lookup_table($a2) $c2 }
      }
    } zErr]
    close $fd
    if {$bFailed} { error $zErr $::errorInfo $::errorCode }
    return
  }
  # Compress the global tl_lookup_table array into a list of records.
  #
  # Each record is a list {iFirst nIncr nRange nOff} describing nRange
  # codepoints starting at iFirst and spaced nIncr (1 or 2) apart, each of
  # which maps to (codepoint + nOff). Codepoints are visited in ascending
  # order and a new record is started whenever the spacing or the offset
  # changes.
  #
  proc tl_create_records {} {
    global tl_lookup_table

    set lRecord [list]
    set iFirst ""
    set nOff 0
    set nRange 0
    set nIncr 0

    foreach code [lsort -integer [array names tl_lookup_table]] {
      set delta [expr {$tl_lookup_table($code) - $code}]

      if {$iFirst ne ""} {
        # Distance from the last codepoint of the open record.
        set iLast [expr {$iFirst + ($nIncr * ($nRange - 1))}]
        set diff [expr {$code - $iLast}]

        # A single-entry record may still settle on a stride of 1 or 2.
        if { $nRange==1 && ($diff==1 || $diff==2) } {
          set nIncr $diff
        }

        if {$diff == $nIncr && $delta == $nOff} {
          incr nRange
          continue
        }

        # The codepoint does not fit: flush the open record.
        if { $nRange==1 } { set nIncr 1 }
        lappend lRecord [list $iFirst $nIncr $nRange $nOff]
      }

      # Open a new record at this codepoint.
      set iFirst $code
      set nOff $delta
      set nRange 1
      set nIncr 1
    }

    lappend lRecord [list $iFirst $nIncr $nRange $nOff]
    return $lRecord
  }
  # Print to stdout the explanatory C comment and the opening lines of the
  # declaration of the aEntry[] case-folding table (struct TableEntry).
  #
  proc tl_print_table_header {} {
    set zComment [string trim {
/* Each entry in the following array defines a rule for folding a range
** of codepoints to lower case. The rule applies to a range of nRange
** codepoints starting at codepoint iCode.
**
** If the least significant bit in flags is clear, then the rule applies
** to all nRange codepoints (i.e. all nRange codepoints are upper case and
** need to be folded). Or, if it is set, then the rule only applies to
** every second codepoint in the range, starting with codepoint C.
**
** The 7 most significant bits in flags are an index into the aiOff[]
** array. If a specific codepoint C does require folding, then its lower
** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
**
** The contents of this array are generated by parsing the CaseFolding.txt
** file distributed as part of the "Unicode Character Database". See
** http://www.unicode.org for details.
*/
    }]
    puts " $zComment"

    set lDecl [list]
    lappend lDecl " static const struct TableEntry \{"
    lappend lDecl " unsigned short iCode;"
    lappend lDecl " unsigned char flags;"
    lappend lDecl " unsigned char nRange;"
    lappend lDecl " \} aEntry\[\] = \{"
    puts [join $lDecl "\n"]
  }
  # Print one initializer of the aEntry[] table to stdout, three per line.
  #
  #   togglevar  name of a variable in the caller's scope that tracks the
  #              column (0, 1 or 2) of the next entry; created on first use
  #   entry      a record {iFirst nIncr nRange nOff}
  #   liOff      list of distinct offsets; the index of nOff within it is
  #              stored in the upper bits of the emitted flags value
  #
  # Returns 1, printing nothing, if iFirst does not fit in the table's
  # unsigned short iCode field. Otherwise prints the entry and returns 0.
  # An error is raised if the offset is missing from liOff, or if nRange or
  # flags would not fit in an unsigned char.
  #
  proc tl_print_table_entry {togglevar entry liOff} {
    upvar $togglevar t
    foreach {iFirst nIncr nRange nOff} $entry {}

    # 65536 itself does not fit in 16 bits, hence ">=" rather than ">".
    if {$iFirst >= (1<<16)} { return 1 }

    if {![info exists t]} {set t 0}

    set flags 0
    if {$nIncr==2} { set flags 1 ; set nRange [expr {$nRange * 2}] }

    # Negative offsets are stored modulo 2^16.
    if {$nOff<0} { incr nOff [expr {1<<16}] }
    set idx [lsearch -exact $liOff $nOff]
    if {$idx<0} {error "malfunction generating aiOff"}
    set flags [expr {$flags + $idx*2}]

    if {$nRange > 255 || $flags > 255} {
      error "entry for codepoint $iFirst does not fit in struct TableEntry"
    }

    if {$t==0} { puts -nonewline " " }
    set txt "{$iFirst, $flags, $nRange},"
    if {$t==2} {
      puts $txt
    } else {
      puts -nonewline [format "% -23s" $txt]
    }
    set t [expr {($t+1)%3}]

    return 0
  }
  # Print the closing lines of the aEntry[] table to stdout. $togglevar names
  # the caller's column-tracking variable; if it is non-zero the current
  # output line is unfinished and is terminated first.
  #
  proc tl_print_table_footer {togglevar} {
    upvar $togglevar iColumn
    if {$iColumn != 0} {
      puts ""
    }
    puts " \};"
  }
  # Print to stdout a C "else if" clause that folds the codepoints described
  # by record $entry ({iFirst nIncr nRange nOff}) by adding nOff. Records
  # with a stride of 2 are not supported and raise an error.
  #
  proc tl_print_if_entry {entry} {
    foreach {iStart nStride nCount nDelta} $entry {}
    if {$nStride==2} {error "tl_print_if_entry needs improvement!"}

    set iStop [expr {$iStart + $nCount}]
    puts " else if( c>=$iStart && c<$iStop )\{"
    puts " ret = c + $nDelta;"
    puts " \}"
  }
  # Return the sorted list of distinct offsets used by the records in
  # $lRecord (each a list {iFirst nIncr nRange iOff}). Negative offsets are
  # first increased by 65536. An error is raised if there are more than 128
  # distinct offsets.
  #
  proc tl_generate_ioff_table {lRecord} {
    set lAll [list]
    foreach rec $lRecord {
      set nDelta [lindex $rec 3]
      if {$nDelta < 0} { set nDelta [expr {$nDelta + (1<<16)}] }
      lappend lAll $nDelta
    }

    set liOff [lsort -integer -unique $lAll]
    if {[llength $liOff]>128} { error "Too many distinct ioffs" }
    return $liOff
  }
  # Print to stdout the C array aiOff[] holding the offsets in $liOff, eight
  # values per line.
  #
  proc tl_print_ioff_table {liOff} {
    puts -nonewline " static const unsigned short aiOff\[\] = \{"
    set nOff [llength $liOff]
    for {set k 0} {$k < $nOff} {incr k} {
      if {($k % 8)==0} {
        puts ""
        puts -nonewline " "
      }
      set zCell "[lindex $liOff $k],"
      puts -nonewline [format "% -7s" $zCell]
    }
    puts ""
    puts " \};"
  }
  # Print to stdout the C case-folding function named $zFunc.
  #
  # Records are obtained from [tl_create_records]. Those accepted by
  # [tl_print_table_entry] go into the aEntry[] lookup table; those it
  # rejects (return value 1) are emitted afterwards as explicit "else if"
  # clauses via [tl_print_if_entry].
  #
  proc print_fold {zFunc} {
    set lRecord [tl_create_records]

    set lDoc [list]
    lappend lDoc "/*"
    lappend lDoc "** Interpret the argument as a unicode codepoint. If the codepoint"
    lappend lDoc "** is an upper case character that has a lower case equivalent,"
    lappend lDoc "** return the codepoint corresponding to the lower case version."
    lappend lDoc "** Otherwise, return a copy of the argument."
    lappend lDoc "**"
    lappend lDoc "** The results are undefined if the value passed to this function"
    lappend lDoc "** is less than zero."
    lappend lDoc "*/"
    lappend lDoc "int ${zFunc}\(int c, int bRemoveDiacritic)\{"
    puts [join $lDoc "\n"]

    set liOff [tl_generate_ioff_table $lRecord]

    # Emit the lookup table, setting aside the records it cannot hold.
    set lHigh [list]
    tl_print_table_header
    foreach rec $lRecord {
      set bRejected [tl_print_table_entry toggle $rec $liOff]
      if {$bRejected} {
        lappend lHigh $rec
      }
    }
    tl_print_table_footer toggle
    tl_print_ioff_table $liOff

    puts {
int ret = c;
assert( c>=0 );
assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
if( c<128 ){
if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
}else if( c<65536 ){
int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
int iLo = 0;
int iRes = -1;
while( iHi>=iLo ){
int iTest = (iHi + iLo) / 2;
int cmp = (c - aEntry[iTest].iCode);
if( cmp>=0 ){
iRes = iTest;
iLo = iTest+1;
}else{
iHi = iTest-1;
}
}
assert( iRes<0 || c>=aEntry[iRes].iCode );
if( iRes>=0 ){
const struct TableEntry *p = &aEntry[iRes];
if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
assert( ret>0 );
}
}
if( bRemoveDiacritic ) ret = remove_diacritic(ret);
}
}

    # Records that did not go into the table continue the if/else chain.
    foreach rec $lHigh {
      tl_print_if_entry $rec
    }

    puts ""
    puts " return ret;"
    puts "\}"
  }
  # Print to stdout the C function fold_test(), a self-test for the generated
  # folding function named $zFunc.
  #
  # For each codepoint below 70000 the emitted aLookup[] table holds two
  # expected values: the folded codepoint taken from the global
  # tl_lookup_table array (or the codepoint itself if it has no entry), and
  # that value after applying $mappings, a list of {codepoint char} pairs in
  # which an empty char means 0.
  #
  proc print_fold_test {zFunc mappings} {
    global tl_lookup_table

    # Index the mappings by codepoint, storing the replacement as an integer.
    foreach pair $mappings {
      foreach {iFrom zTo} $pair break
      if {$zTo == ""} {
        set extra($iFrom) 0
      } else {
        scan $zTo %c iTo
        set extra($iFrom) $iTo
      }
    }

    puts "static int fold_test(int *piCode)\{"
    puts -nonewline " static int aLookup\[\] = \{"
    for {set cp 0} {$cp < 70000} {incr cp} {
      if {[info exists tl_lookup_table($cp)]} {
        set iFolded $tl_lookup_table($cp)
      } else {
        set iFolded $cp
      }
      if {[info exists extra($iFolded)]} {
        set iPlain $extra($iFolded)
      } else {
        set iPlain $iFolded
      }

      if {($cp % 4)==0} { puts "" ; puts -nonewline " " }
      puts -nonewline "$iFolded, $iPlain, "
    }
    puts " \};"

    set lBody [list]
    lappend lBody " int i;"
    lappend lBody " for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{"
    lappend lBody " int iCode = (i/2);"
    lappend lBody " int bFlag = i & 0x0001;"
    lappend lBody " if( ${zFunc}\(iCode, bFlag)!=aLookup\[i\] )\{"
    lappend lBody " *piCode = iCode;"
    lappend lBody " return 1;"
    lappend lBody " \}"
    lappend lBody " \}"
    lappend lBody " return 0;"
    lappend lBody "\}"
    puts [join $lBody "\n"]
  }
  # Print to stdout the header of the generated C file: the legal notice, the
  # "machine generated" warning, the opening preprocessor guards and the
  # #include of <assert.h>.
  #
  proc print_fileheader {} {
    set zNotice [string trim {
/*
** 2012 May 25
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
******************************************************************************
*/
/*
** DO NOT EDIT THIS MACHINE GENERATED FILE.
*/
    }]
    puts $zNotice

    set lPrologue [list]
    lappend lPrologue ""
    lappend lPrologue "#if defined(SQLITE_ENABLE_FTS4_UNICODE61)"
    lappend lPrologue "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)"
    lappend lPrologue ""
    lappend lPrologue "#include <assert.h>"
    lappend lPrologue ""
    puts [join $lPrologue "\n"]
  }
  # Print to stdout a C main() function that calls isalnum_test() and
  # fold_test(), reports each result with printf(), and returns non-zero if
  # either test reported a problem.
  #
  proc print_test_main {} {
    set lMain [list]
    lappend lMain ""
    lappend lMain "#include <stdio.h>"
    lappend lMain ""
    lappend lMain "int main(int argc, char **argv)\{"
    lappend lMain " int r1, r2;"
    lappend lMain " int code;"
    lappend lMain " r1 = isalnum_test(&code);"
    lappend lMain " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);"
    lappend lMain " else printf(\"isalnum(): test passed\\n\");"
    lappend lMain " r2 = fold_test(&code);"
    lappend lMain " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);"
    lappend lMain " else printf(\"fold(): test passed\\n\");"
    lappend lMain " return (r1 || r2);"
    lappend lMain "\}"
    puts [join $lMain "\n"]
  }
  # Process the command line arguments. Exit early if they are not to
  # our liking.
  #
  # [usage] writes the usage message to stderr and exits with status 1.
  #
  proc usage {} {
    set zArgs "<CaseFolding.txt file> <UnicodeData.txt file>"
    puts stderr "Usage: $::argv0 ?-test? $zArgs"
    exit 1
  }
  # Accepted invocations:
  #
  #     mkunicode.tcl        <CaseFolding.txt file> <UnicodeData.txt file>
  #     mkunicode.tcl -test  <CaseFolding.txt file> <UnicodeData.txt file>
  #
  if {[llength $argv]!=2 && [llength $argv]!=3} usage
  if {[llength $argv]==3 && [lindex $argv 0]!="-test"} usage
  set unicodedata.txt [lindex $argv end]
  set casefolding.txt [lindex $argv end-1]
  set generate_test_code [expr {[llength $argv]==3}]

  print_fileheader

  # Print the isalnum() function to stdout.
  #
  set lRange [an_load_separator_ranges]
  print_isalnum sqlite3FtsUnicodeIsalnum $lRange

  # Leave a gap between the two generated C functions.
  #
  puts ""
  puts ""

  # Load the fold data. This is used by the [rd_XXX] commands
  # as well as [print_fold].
  tl_load_casefolding_txt ${casefolding.txt}

  set mappings [rd_load_unicodedata_text ${unicodedata.txt}]
  print_rd $mappings
  puts ""
  puts ""
  print_isdiacritic sqlite3FtsUnicodeIsdiacritic $mappings
  puts ""
  puts ""

  # Print the fold() function to stdout.
  #
  print_fold sqlite3FtsUnicodeFold

  # Print the test routines and main() function to stdout, if -test
  # was specified.
  #
  if {$::generate_test_code} {
    print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange
    print_fold_test sqlite3FtsUnicodeFold $mappings
    print_test_main
  }

  # Close the two guards opened by [print_fileheader], innermost first. The
  # trailing comments repeat the conditions of the matching #if lines.
  puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
  puts "#endif /* defined(SQLITE_ENABLE_FTS4_UNICODE61) */"