basis/unicode/unicode.factor

   1 USING: accessors arrays assocs combinators.short-circuit fry
   2 hints interval-maps kernel math math.order sequences sorting
   3 strings unicode.breaks.private unicode.case.private
   4 unicode.categories unicode.collation unicode.collation.private
   5 unicode.data unicode.data.private unicode.normalize.private
   6 unicode.script ;
   7 IN: unicode
   8
   9 CATEGORY: blank Zs Zl Zp | "\r\n\t" member? ;
  10
  11 CATEGORY: letter Ll | "Other_Lowercase" property? ;
  12
  13 CATEGORY: LETTER Lu | "Other_Uppercase" property? ;
  14
  15 CATEGORY: Letter Lu Ll Lt Lm Lo Nl ;
  16
  17 CATEGORY: digit Nd Nl No ;
  18
  19 CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
  20
  21 CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No | "Other_Alphabetic" property? ;
  22
  23 CATEGORY: control Cc ;
  24
  25 CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ;
  26
  27 CATEGORY-NOT: character Cn ;
  28
  29 CATEGORY: math Sm | "Other_Math" property? ;
  30
  31 : script-of ( char -- script )
  32     script-table interval-at ;
  33
  34 : name>char ( name -- char ) name-map at ; inline
  35
  36 : char>name ( char -- name ) name-map value-at ; inline
  37
  38 : ch>lower ( ch -- lower ) simple-lower ?at drop ; inline
  39
  40 : ch>upper ( ch -- upper ) simple-upper ?at drop ; inline
  41
  42 : ch>title ( ch -- title ) simple-title ?at drop ; inline
  43
  44 : first-grapheme ( str -- i )
  45     unclip-slice grapheme-class over
  46     [ grapheme-class [ nip ] [ grapheme-break? ] 2bi ] find drop
  47     nip swap length or 1 + ;
  48
  49 : first-grapheme-from ( start str -- i )
  50     over tail-slice first-grapheme + ;
  51
  52 : last-grapheme ( str -- i )
  53     unclip-last-slice grapheme-class swap
  54     [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
  55
  56 : last-grapheme-from ( end str -- i )
  57     swap head-slice last-grapheme ;
  58
  59 <PRIVATE
  60
  61 : >pieces ( str quot: ( str -- i ) -- graphemes )
  62     [ dup empty? not ] swap '[ dup @ cut-slice swap ] produce nip ; inline
  63
  64 PRIVATE>
  65
  66 : >graphemes ( str -- graphemes )
  67     [ first-grapheme ] >pieces ;
  68
  69 : string-reverse ( str -- rts )
  70     >graphemes reverse! concat ;
  71
  72 : first-word ( str -- i )
  73     [ [ length ] [ first word-break-prop ] bi ] keep
  74     1 swap dup '[ _ word-break-next ] find-index-from
  75     drop nip swap or ;
  76
  77 : >words ( str -- words )
  78     [ first-word ] >pieces ;
  79
  80 <PRIVATE
  81
  82 : nth-next ( i str -- str[i-1] str[i] )
  83     [ [ 1 - ] keep ] dip '[ _ nth ] bi@ ;
  84
  85 PRIVATE>
  86
  87 : word-break-at? ( i str -- ? )
  88     {
  89         [ drop zero? ]
  90         [ length = ]
  91         [
  92             [ nth-next [ word-break-prop ] dip ] 2keep
  93             word-break-next nip
  94         ]
  95     } 2|| ;
  96
  97 : first-word-from ( start str -- i )
  98     over tail-slice first-word + ;
  99
 100 : last-word ( str -- i )
 101     [ length <iota> ] keep '[ _ word-break-at? ] find-last drop 0 or ;
 102
 103 : last-word-from ( end str -- i )
 104     swap head-slice last-word ;
 105
 106 : >lower ( string -- lower )
 107     locale>lower final-sigma
 108     [ lower>> ] [ ch>lower ] map-case ;
 109
 110 HINTS: >lower string ;
 111
 112 : >upper ( string -- upper )
 113     locale>upper
 114     [ upper>> ] [ ch>upper ] map-case ;
 115
 116 HINTS: >upper string ;
 117
 118 <PRIVATE
 119
 120 : (>title) ( string -- title )
 121     locale>upper
 122     [ title>> ] [ ch>title ] map-case ; inline
 123
 124 PRIVATE>
 125
 126 : capitalize ( string -- title )
 127     unclip-slice 1string [ >lower ] [ (>title) ] bi*
 128     "" prepend-as ; inline
 129
 130 : >title ( string -- title )
 131     final-sigma >words [ capitalize ] map! concat ;
 132
 133 HINTS: >title string ;
 134
 135 : >case-fold ( string -- fold )
 136     >upper >lower ;
 137
 138 : lower? ( string -- ? ) dup >lower sequence= ;
 139
 140 : upper? ( string -- ? ) dup >upper sequence= ;
 141
 142 : title? ( string -- ? ) dup >title sequence= ;
 143
 144 : case-fold? ( string -- ? ) dup >case-fold sequence= ;
 145
 146 : nfd ( string -- nfd )
 147     [ (nfd) ] with-string ;
 148
 149 : nfkd ( string -- nfkd )
 150     [ (nfkd) ] with-string ;
 151
 152 : string-append ( s1 s2 -- string )
 153     [ append ] keep
 154     0 over ?nth non-starter?
 155     [ length dupd reorder-back ] [ drop ] if ;
 156
 157 HINTS: string-append string string ;
 158
 159 : nfc ( string -- nfc )
 160     [ (nfd) combine ] with-string ;
 161
 162 : nfkc ( string -- nfkc )
 163     [ (nfkd) combine ] with-string ;
 164
 165 : collation-key/nfd ( string -- key nfd )
 166     nfd [
 167         string>graphemes graphemes>weights
 168         filter-ignorable weights>bytes
 169     ] keep ;
 170
 171 <PRIVATE
 172
 173 : insensitive= ( str1 str2 levels-removed -- ? )
 174     [
 175         [ collation-key/nfd drop ] dip
 176         [ [ 0 = not ] trim-tail but-last ] times
 177     ] curry same? ;
 178
 179 PRIVATE>
 180
 181 : primary= ( str1 str2 -- ? )
 182     3 insensitive= ;
 183
 184 : secondary= ( str1 str2 -- ? )
 185     2 insensitive= ;
 186
 187 : tertiary= ( str1 str2 -- ? )
 188     1 insensitive= ;
 189
 190 : quaternary= ( str1 str2 -- ? )
 191     0 insensitive= ;
 192
 193 : sort-strings ( strings -- sorted )
 194     [ collation-key/nfd 2array ] map natural-sort values ;
 195
 196 : string<=> ( str1 str2 -- <=> )
 197     [ collation-key/nfd 2array ] compare ;
 198
 199 CONSTANT: unicode-supported {
 200     "collation"
 201 }
 202
 203 CONSTANT: unicode-unsupported {
 204     "bidi"
 205 }
 206
 207 CONSTANT: unicode-version "10.0"