"vocab:unicode/UCA/allkeys.txt" parse-ducet ducet set-global
-! Fix up table for long contractions
-: help-one ( assoc key -- )
- ! Need to be more general? Not for DUCET, apparently
- 2 head 2dup swap key? [ 2drop ] [
- [ [ 1string of ] with { } map-as concat ]
- [ swap set-at ] 2bi
- ] if ;
-
+! https://www.unicode.org/reports/tr10/tr10-41.html#Well_Formed_DUCET
: fixup-ducet ( -- )
{
{
}
} ducet get-global '[ swap >string _ set-at ] assoc-each ;
-! Add a few missing ducet values
+! Add a few missing ducet values for Tibetan
+! https://www.unicode.org/reports/tr10/tr10-41.html#Well_Formed_DUCET
fixup-ducet
: tangut-block? ( char -- ? )
! Answers the last element of the last sequence held in the `building`
! variable, or 0 when `building` holds an empty sequence.
: building-last ( -- char )
    building get dup empty?
    [ drop 0 ] [ last last ] if ;
+! https://www.unicode.org/reports/tr10/tr10-41.html#Collation_Graphemes
: blocked? ( char -- ? )
combining-class dup { 0 f } member?
[ drop building-last non-starter? ]
! Memoized hashtable that maps every entry of `categories` to its
! position within that sequence.
MEMO: categories-map ( -- hashtable )
    categories dup length <iota> H{ } zip-as ;
! Length of the byte-array table that process-category allocates.
-CONSTANT: num-chars 0x2FA1E
+CONSTANT: NUM-CHARS 0x2FA1E
PRIVATE>
! Loading data from UnicodeData.txt
-: load-data ( -- data )
+: load-unicode-data ( -- data )
! Passes the vocabulary-relative UnicodeData.txt path to load-data-file
! and returns whatever that word produces.
"vocab:unicode/UCD/UnicodeData.txt" load-data-file ;
: (process-data) ( index data -- newdata )
] assoc-each table ;
! Builds the category listing: a byte-array in which slot `char` holds
! the value that categories-map associates with that character's category.
:: process-category ( data -- category-listing )
- num-chars <byte-array> :> table
+ NUM-CHARS <byte-array> :> table
! (process-data) is asked for field 2 of the data; presumably that is the
! general-category column of UnicodeData.txt -- TODO confirm.
2 data (process-data) [| char cat |
! Translate the category through categories-map, then store it at index char.
cat categories-map at char table ?set-nth
! The populated table is handed to fill-ranges, whose result is returned.
] assoc-each table fill-ranges ;
[ length 5 = ] filter
[ [ set-code-point ] each ] H{ } make ;
-load-data {
+load-unicode-data {
[ process-names name-map swap assoc-union! drop ]
[ 13 swap process-data simple-lower swap assoc-union! drop ]
[ 12 swap process-data simple-upper swap assoc-union! drop ]