basis/unicode/unicode.factor

   1
   2 USING: accessors arrays assocs combinators.short-circuit fry
   3 hints interval-maps kernel math math.order sequences sorting
   4 strings unicode.breaks.private unicode.case.private
   5 unicode.categories unicode.collation unicode.collation.private
   6 unicode.data unicode.data.private unicode.normalize.private
   7 unicode.script ;
   8
   9 IN: unicode
  10
  11 CATEGORY: blank Zs Zl Zp | "\r\n\t" member? ;
  12
  13 CATEGORY: letter Ll | "Other_Lowercase" property? ;
  14
  15 CATEGORY: LETTER Lu | "Other_Uppercase" property? ;
  16
  17 CATEGORY: Letter Lu Ll Lt Lm Lo Nl ;
  18
  19 CATEGORY: digit Nd Nl No ;
  20
  21 CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
  22
  23 CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No | "Other_Alphabetic" property? ;
  24
  25 CATEGORY: control Cc ;
  26
  27 CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ;
  28
  29 CATEGORY-NOT: character Cn ;
  30
  31 CATEGORY: math Sm | "Other_Math" property? ;
  32
  33 : script-of ( char -- script )
  34     script-table interval-at ;
  35
  36 : name>char ( name -- char ) name-map at ; inline
  37
  38 : char>name ( char -- name ) name-map value-at ; inline
  39
  40 : ch>lower ( ch -- lower ) simple-lower ?at drop ; inline
  41
  42 : ch>upper ( ch -- upper ) simple-upper ?at drop ; inline
  43
  44 : ch>title ( ch -- title ) simple-title ?at drop ; inline
  45
  46 : first-grapheme ( str -- i )
  47     unclip-slice grapheme-class over
  48     [ grapheme-class [ nip ] [ grapheme-break? ] 2bi ] find drop
  49     nip swap length or 1 + ;
  50
  51 : first-grapheme-from ( start str -- i )
  52     over tail-slice first-grapheme + ;
  53
  54 : last-grapheme ( str -- i )
  55     unclip-last-slice grapheme-class swap
  56     [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
  57
  58 : last-grapheme-from ( end str -- i )
  59     swap head-slice last-grapheme ;
  60
  61 <PRIVATE
  62
  63 : >pieces ( str quot: ( str -- i ) -- graphemes )
  64     [ dup empty? not ] swap '[ dup @ cut-slice swap ] produce nip ; inline
  65
  66 PRIVATE>
  67
  68 : >graphemes ( str -- graphemes )
  69     [ first-grapheme ] >pieces ;
  70
  71 : string-reverse ( str -- rts )
  72     >graphemes reverse! concat ;
  73
  74 : first-word ( str -- i )
  75     [ [ length ] [ first word-break-prop ] bi ] keep
  76     1 swap dup '[ _ word-break-next ] find-index-from
  77     drop nip swap or ;
  78
  79 : >words ( str -- words )
  80     [ first-word ] >pieces ;
  81
  82 <PRIVATE
  83
  84 : nth-next ( i str -- str[i-1] str[i] )
  85     [ [ 1 - ] keep ] dip '[ _ nth ] bi@ ;
  86
  87 PRIVATE>
  88
  89 : word-break-at? ( i str -- ? )
  90     {
  91         [ drop zero? ]
  92         [ length = ]
  93         [
  94             [ nth-next [ word-break-prop ] dip ] 2keep
  95             word-break-next nip
  96         ]
  97     } 2|| ;
  98
  99 : first-word-from ( start str -- i )
 100     over tail-slice first-word + ;
 101
 102 : last-word ( str -- i )
 103     [ length <iota> ] keep '[ _ word-break-at? ] find-last drop 0 or ;
 104
 105 : last-word-from ( end str -- i )
 106     swap head-slice last-word ;
 107
 108 : >lower ( string -- lower )
 109     locale>lower final-sigma
 110     [ lower>> ] [ ch>lower ] map-case ;
 111
 112 HINTS: >lower string ;
 113
 114 : >upper ( string -- upper )
 115     locale>upper
 116     [ upper>> ] [ ch>upper ] map-case ;
 117
 118 HINTS: >upper string ;
 119
 120 <PRIVATE
 121
 122 : (>title) ( string -- title )
 123     locale>upper
 124     [ title>> ] [ ch>title ] map-case ; inline
 125
 126 PRIVATE>
 127
 128 : capitalize ( string -- title )
 129     unclip-slice 1string [ >lower ] [ (>title) ] bi*
 130     "" prepend-as ; inline
 131
 132 : >title ( string -- title )
 133     final-sigma >words [ capitalize ] map! concat ;
 134
 135 HINTS: >title string ;
 136
 137 : >case-fold ( string -- fold )
 138     >upper >lower ;
 139
 140 : lower? ( string -- ? ) dup >lower sequence= ;
 141
 142 : upper? ( string -- ? ) dup >upper sequence= ;
 143
 144 : title? ( string -- ? ) dup >title sequence= ;
 145
 146 : case-fold? ( string -- ? ) dup >case-fold sequence= ;
 147
 148 : nfd ( string -- nfd )
 149     [ (nfd) ] with-string ;
 150
 151 : nfkd ( string -- nfkd )
 152     [ (nfkd) ] with-string ;
 153
 154 : string-append ( s1 s2 -- string )
 155     [ append ] keep
 156     0 over ?nth non-starter?
 157     [ length dupd reorder-back ] [ drop ] if ;
 158
 159 HINTS: string-append string string ;
 160
 161 : nfc ( string -- nfc )
 162     [ (nfd) combine ] with-string ;
 163
 164 : nfkc ( string -- nfkc )
 165     [ (nfkd) combine ] with-string ;
 166
 167 : collation-key ( string -- key )
 168     nfd string>graphemes graphemes>weights
 169     filter-ignorable weights>bytes ;
 170
 171 <PRIVATE
 172
 173 : insensitive= ( str1 str2 levels-removed -- ? )
 174     [
 175         [ collation-key ] dip
 176         [ [ 0 = not ] trim-tail but-last ] times
 177     ] curry same? ;
 178
 179 PRIVATE>
 180
 181 : primary= ( str1 str2 -- ? )
 182     3 insensitive= ;
 183
 184 : secondary= ( str1 str2 -- ? )
 185     2 insensitive= ;
 186
 187 : tertiary= ( str1 str2 -- ? )
 188     1 insensitive= ;
 189
 190 : quaternary= ( str1 str2 -- ? )
 191     0 insensitive= ;
 192
 193 : w/collation-key ( str -- {str,key} )
 194     [ collation-key ] keep 2array ;
 195
 196 : sort-strings ( strings -- sorted )
 197     [ w/collation-key ] map natural-sort values ;
 198
 199 : string<=> ( str1 str2 -- <=> )
 200     [ w/collation-key ] compare ;