basis/unicode/unicode.factor

   1 USING: accessors arrays assocs combinators.short-circuit fry
   2 hints interval-maps kernel math math.order sequences sorting
   3 strings unicode.breaks.private unicode.case.private
   4 unicode.categories unicode.collation unicode.collation.private
   5 unicode.data unicode.data.private unicode.normalize.private
   6 unicode.script locals ranges ;
   7 IN: unicode
   8
   9 CATEGORY: blank Zs Zl Zp | "\r\n\t" member? ;
  10
  11 CATEGORY: letter Ll | "Other_Lowercase" property? ;
  12
  13 CATEGORY: LETTER Lu | "Other_Uppercase" property? ;
  14
  15 CATEGORY: Letter Lu Ll Lt Lm Lo Nl ;
  16
  17 CATEGORY: digit Nd Nl No ;
  18
  19 CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
  20
  21 CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No | "Other_Alphabetic" property? ;
  22
  23 CATEGORY: control Cc ;
  24
  25 CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ;
  26
  27 CATEGORY-NOT: character Cn ;
  28
  29 CATEGORY: math Sm | "Other_Math" property? ;
  30
  31 : script-of ( char -- script )
  32     script-table interval-at ;
  33
  34 : name>char ( name -- char ) name-map at ; inline
  35
  36 : char>name ( char -- name ) name-map value-at ; inline
  37
  38 : ch>lower ( ch -- lower ) simple-lower ?at drop ; inline
  39
  40 : ch>upper ( ch -- upper ) simple-upper ?at drop ; inline
  41
  42 : ch>title ( ch -- title ) simple-title ?at drop ; inline
  43
  44 :: first-grapheme ( entire-str start -- i )
  45     start :> pos!
  46     entire-str length :> str-len
  47     0 pos 1 + entire-str <slice> grapheme-class
  48     pos 1 + str-len 1 - min pos!
  49     pos str-len 1 - [a..b] [
  50         1 + 0 swap entire-str <slice> grapheme-class
  51         dup rot swap grapheme-break?
  52     ] find drop nip
  53     [ 1 + ] [ str-len start - ] if* ;
  54
  55 :: first-grapheme-from ( start str -- i )
  56     str start first-grapheme start + ;
  57
  58 :: last-grapheme ( str -- i )
  59     str length 1 - :> pos!
  60     pos 0 = [ 0 ] [
  61         str grapheme-class
  62         pos 1 - 0 max pos!
  63         0 pos [a..b] [
  64             0 swap 1 + str <slice> grapheme-class
  65             dup rot grapheme-break?
  66         ] find-last drop ?1+ nip
  67     ] if ;
  68
  69 : last-grapheme-from ( end str -- i )
  70      swap head-slice last-grapheme ;
  71
  72 <PRIVATE
  73
  74 : >pieces ( str quot: ( str -- i ) -- graphemes )
  75     [ dup empty? not ] swap '[ dup @ cut-slice swap ] produce nip ; inline
  76
  77 PRIVATE>
  78
  79 :: >graphemes ( str -- graphemes )
  80     str length :> str-len
  81     0 :> pos! 0 :> old-pos!
  82     [ f ! dummy
  83       pos old-pos! old-pos str-len < [
  84           str pos first-grapheme pos + pos! pos str-len <=
  85       ] [ f ] if ]
  86     [ drop old-pos pos str <slice> ] produce nip ;
  87
  88 : count-graphemes ( str -- n ) >graphemes length ; inline
  89
  90 : string-reverse ( str -- rts )
  91     >graphemes reverse! concat ;
  92
  93 : first-word ( str -- i )
  94     [ [ length ] [ first word-break-prop ] bi ] keep
  95     1 swap dup '[ _ word-break-next ] find-index-from
  96     drop nip swap or ;
  97
  98 : >words ( str -- words )
  99     [ first-word ] >pieces ;
 100
 101 <PRIVATE
 102
 103 : nth-next ( i str -- str[i-1] str[i] )
 104     [ [ 1 - ] keep ] dip '[ _ nth ] bi@ ;
 105
 106 PRIVATE>
 107
 108 : word-break-at? ( i str -- ? )
 109     {
 110         [ drop zero? ]
 111         [ length = ]
 112         [
 113             [ nth-next [ word-break-prop ] dip ] 2keep
 114             word-break-next nip
 115         ]
 116     } 2|| ;
 117
 118 : first-word-from ( start str -- i )
 119     over tail-slice first-word + ;
 120
 121 : last-word ( str -- i )
 122     [ length <iota> ] keep '[ _ word-break-at? ] find-last drop 0 or ;
 123
 124 : last-word-from ( end str -- i )
 125     swap head-slice last-word ;
 126
 127 : >lower ( string -- lower )
 128     locale>lower final-sigma
 129     [ lower>> ] [ ch>lower ] map-case ;
 130
 131 HINTS: >lower string ;
 132
 133 : >upper ( string -- upper )
 134     locale>upper
 135     [ upper>> ] [ ch>upper ] map-case ;
 136
 137 HINTS: >upper string ;
 138
 139 <PRIVATE
 140
 141 : (>title) ( string -- title )
 142     locale>upper
 143     [ title>> ] [ ch>title ] map-case ; inline
 144
 145 PRIVATE>
 146
 147 : capitalize ( string -- title )
 148     unclip-slice 1string [ >lower ] [ (>title) ] bi*
 149     "" prepend-as ; inline
 150
 151 : >title ( string -- title )
 152     final-sigma >words [ capitalize ] map! concat ;
 153
 154 HINTS: >title string ;
 155
 156 : >case-fold ( string -- fold )
 157     >upper >lower ;
 158
 159 : lower? ( string -- ? ) dup >lower sequence= ;
 160
 161 : upper? ( string -- ? ) dup >upper sequence= ;
 162
 163 : title? ( string -- ? ) dup >title sequence= ;
 164
 165 : case-fold? ( string -- ? ) dup >case-fold sequence= ;
 166
 167 : nfd ( string -- nfd )
 168     [ (nfd) ] with-string ;
 169
 170 : nfkd ( string -- nfkd )
 171     [ (nfkd) ] with-string ;
 172
 173 : string-append ( s1 s2 -- string )
 174     [ append ] keep
 175     0 over ?nth non-starter?
 176     [ length dupd reorder-back ] [ drop ] if ;
 177
 178 HINTS: string-append string string ;
 179
 180 : nfc ( string -- nfc )
 181     [ (nfd) combine ] with-string ;
 182
 183 : nfkc ( string -- nfkc )
 184     [ (nfkd) combine ] with-string ;
 185
 186 : collation-key/nfd ( string -- key nfd )
 187     nfd [
 188         string>graphemes graphemes>weights
 189         filter-ignorable weights>bytes
 190     ] keep ;
 191
 192 <PRIVATE
 193
 194 : insensitive= ( str1 str2 levels-removed -- ? )
 195     [
 196         [ collation-key/nfd drop ] dip
 197         [ [ 0 = not ] trim-tail but-last ] times
 198     ] curry same? ;
 199
 200 PRIVATE>
 201
 202 : primary= ( str1 str2 -- ? )
 203     3 insensitive= ;
 204
 205 : secondary= ( str1 str2 -- ? )
 206     2 insensitive= ;
 207
 208 : tertiary= ( str1 str2 -- ? )
 209     1 insensitive= ;
 210
 211 : quaternary= ( str1 str2 -- ? )
 212     0 insensitive= ;
 213
 214 : sort-strings ( strings -- sorted )
 215     [ collation-key/nfd 2array ] map natural-sort values ;
 216
 217 : string<=> ( str1 str2 -- <=> )
 218     [ collation-key/nfd 2array ] compare ;
 219
 220 : upper-surrogate? ( ch -- ? ) 0xD800 0xDBFF between? ; inline
 221
 222 : under-surrogate? ( ch -- ? ) 0xDC00 0xDFFF between? ; inline
 223
 224 CONSTANT: unicode-supported {
 225     "collation"
 226 }
 227
 228 CONSTANT: unicode-unsupported {
 229     "bidi"
 230 }
 231
 232 CONSTANT: unicode-version "14.0.0"