1 USING: accessors arrays assocs combinators.short-circuit fry
2 hints interval-maps kernel math math.order sequences sorting
3 strings unicode.breaks.private unicode.case.private
4 unicode.categories unicode.collation unicode.collation.private
5 unicode.data unicode.data.private unicode.normalize.private
6 unicode.script locals ranges ;
! Character classes, each declared as a list of two-letter Unicode
! general-category codes. Where a `|` appears, the tokens after it are an
! additional test written as ordinary code.
! NOTE(review): CATEGORY: and CATEGORY-NOT: are parsing words defined
! outside this file (presumably unicode.categories). CATEGORY-NOT:
! presumably matches every category EXCEPT the ones listed -- confirm.

! blank: the three separator categories, plus CR, LF and TAB.
9 CATEGORY: blank Zs Zl Zp | "\r\n\t" member? ;
! letter: lowercase letters, plus anything with the Other_Lowercase property.
11 CATEGORY: letter Ll | "Other_Lowercase" property? ;
! LETTER: uppercase letters, plus anything with the Other_Uppercase property.
13 CATEGORY: LETTER Lu | "Other_Uppercase" property? ;
! Letter: every letter category (Lu Ll Lt Lm Lo) plus letter-numbers (Nl).
15 CATEGORY: Letter Lu Ll Lt Lm Lo Nl ;
! digit: all three number categories, not only decimal digits (Nd).
17 CATEGORY: digit Nd Nl No ;
! printable: lists Cc Cf Cs Co Cn under CATEGORY-NOT:.
19 CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
! alpha: letters and numbers, plus the Other_Alphabetic property.
21 CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No | "Other_Alphabetic" property? ;
! control: the Cc category only.
23 CATEGORY: control Cc ;
! uncased: lists Lu Ll Lt Lm Mn Me under CATEGORY-NOT:.
25 CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ;
! character: lists Cn (unassigned) under CATEGORY-NOT:.
27 CATEGORY-NOT: character Cn ;
! math: math symbols (Sm), plus anything with the Other_Math property.
29 CATEGORY: math Sm | "Other_Math" property? ;
! Looks up char in script-table with interval-at and returns what it finds.
! NOTE(review): script-table is defined outside this file; the result for
! a char that falls in no interval is whatever interval-at yields
! (presumably f) -- confirm.
31 : script-of ( char -- script )
32 script-table interval-at ;
! Forward lookup: uses name as a key into name-map and returns the stored
! value. `at` yields f when the key is absent.
34 : name>char ( name -- char ) name-map at ; inline
! Reverse lookup: searches name-map for an entry whose VALUE is char and
! returns that entry's key (value-at), i.e. the inverse direction of a
! plain `at` on the same table.
36 : char>name ( char -- name ) name-map value-at ; inline
! Maps one character through the simple-lower table. `?at` leaves the key
! itself on the stack when there is no entry, and the found/not-found flag
! is dropped, so characters without a mapping come back unchanged.
38 : ch>lower ( ch -- lower ) simple-lower ?at drop ; inline
! Maps one character through the simple-upper table. `?at` leaves the key
! itself on the stack when there is no entry, and the found/not-found flag
! is dropped, so characters without a mapping come back unchanged.
40 : ch>upper ( ch -- upper ) simple-upper ?at drop ; inline
! Maps one character through the simple-title table. `?at` leaves the key
! itself on the stack when there is no entry, and the found/not-found flag
! is dropped, so characters without a mapping come back unchanged.
42 : ch>title ( ch -- title ) simple-title ?at drop ; inline
! Stack effect ( entire-str start -- i ): takes a string and a start index
! and returns a single number.
! The final line uses if*: a non-f value on the stack is incremented by 1,
! otherwise the result is str-len minus start.
! NOTE(review): `pos` is read and reassigned with `pos!` below, yet no
! `:> pos!` binding appears in this block, and the quotation opened with
! `[` after [a..b] has no visible closing `]`. Lines appear to be missing
! from this view -- confirm against the full file before editing.
44 :: first-grapheme ( entire-str start -- i )
46 entire-str length :> str-len
47 0 pos 1 + entire-str <slice> grapheme-class
! Advance pos by one, clamped to the last valid index.
48 pos 1 + str-len 1 - min pos!
49 pos str-len 1 - [a..b] [
! Class of the prefix ending at this index, compared with the previous one.
50 1 + 0 swap entire-str <slice> grapheme-class
51 dup rot swap grapheme-break?
53 [ 1 + ] [ str-len start - ] if* ;
! Argument-swapped wrapper around first-grapheme: calls it with str and
! start, then adds start to the result, turning the returned offset into
! an index measured from the beginning of str.
55 :: first-grapheme-from ( start str -- i )
56 str start first-grapheme start + ;
58 :: last-grapheme ( str -- i )
59 str length 1 - :> pos!
64 0 swap 1 + str <slice> grapheme-class
65 dup rot grapheme-break?
66 ] find-last drop ?1+ nip
! Applies last-grapheme to the prefix of str made of its first `end`
! elements (head-slice), so the search never looks at or past index end.
69 : last-grapheme-from ( end str -- i )
70 swap head-slice last-grapheme ;
! Splits str into consecutive slices. While the remainder is non-empty,
! quot is called on it to get a cut index i; cut-slice splits there, the
! leading slice is collected by produce and the trailing slice becomes the
! new remainder. The final (empty) remainder is discarded with nip.
! If quot ever returns 0 the remainder does not shrink, so quot is
! expected to return a positive index.
74 : >pieces ( str quot: ( str -- i ) -- graphemes )
75 [ dup empty? not ] swap '[ dup @ cut-slice swap ] produce nip ; inline
! Stack effect ( str -- graphemes ). Visible logic: pos and old-pos both
! start at 0; each step saves pos into old-pos, advances pos by the result
! of first-grapheme, and collects the slice old-pos..pos of str via produce.
! NOTE(review): `str-len` is used but never bound in this block, and the
! bracket structure around the produce call is incomplete. Lines appear to
! be missing from this view -- confirm against the full file before editing.
79 :: >graphemes ( str -- graphemes )
81 0 :> pos! 0 :> old-pos!
83 pos old-pos! old-pos str-len < [
84 str pos first-grapheme pos + pos! pos str-len <=
86 [ drop old-pos pos str <slice> ] produce nip ;
! Number of elements >graphemes produces for str. The full list of pieces
! is built first and then measured.
88 : count-graphemes ( str -- n ) >graphemes length ; inline
! Reverses a string piecewise: splits it with >graphemes, reverses that
! list in place (reverse!), and concatenates the pieces. The order inside
! each piece is left untouched.
90 : string-reverse ( str -- rts )
91 >graphemes reverse! concat ;
93 : first-word ( str -- i )
94 [ [ length ] [ first word-break-prop ] bi ] keep
95 1 swap dup '[ _ word-break-next ] find-index-from
! Splits str into consecutive pieces by handing first-word to >pieces as
! the cut-index function.
98 : >words ( str -- words )
99 [ first-word ] >pieces ;
! Fetches the two neighbouring elements str[i-1] and str[i], in that order.
! Index i-1 is passed to nth unchecked, so callers are expected to supply
! i >= 1 and i < length.
103 : nth-next ( i str -- str[i-1] str[i] )
104 [ [ 1 - ] keep ] dip '[ _ nth ] bi@ ;
108 : word-break-at? ( i str -- ? )
113 [ nth-next [ word-break-prop ] dip ] 2keep
! Runs first-word on the tail of str beginning at start (tail-slice), then
! adds start back so the result is an index into the whole of str.
118 : first-word-from ( start str -- i )
119 over tail-slice first-word + ;
! Scans the indices 0 .. length-1 from the end and returns the last index
! for which word-break-at? is true. find-last yields f when nothing
! matches; `0 or` turns that into 0.
121 : last-word ( str -- i )
122 [ length <iota> ] keep '[ _ word-break-at? ] find-last drop 0 or ;
! Applies last-word to the prefix of str made of its first `end` elements
! (head-slice), so the search never looks at or past index end.
124 : last-word-from ( end str -- i )
125 swap head-slice last-word ;
! Lowercases a whole string in three steps: locale>lower, then
! final-sigma, then map-case with the lower>> accessor and ch>lower.
! NOTE(review): locale>lower, final-sigma and map-case are defined outside
! this file (presumably unicode.case.private). map-case presumably uses
! the first quotation for characters with a special-casing entry and the
! second for all others -- confirm.
127 : >lower ( string -- lower )
128 locale>lower final-sigma
129 [ lower>> ] [ ch>lower ] map-case ;
! Compiler hint: specialise >lower for string input.
131 HINTS: >lower string ;
! Uppercases a whole string via map-case with the upper>> accessor and
! ch>upper.
! NOTE(review): the embedded line numbering skips a line between the
! header and the body, and >lower in this file runs a locale> step before
! map-case. A pre-processing step may be missing from this view --
! confirm against the full file.
133 : >upper ( string -- upper )
135 [ upper>> ] [ ch>upper ] map-case ;
! Compiler hint: specialise >upper for string input.
137 HINTS: >upper string ;
! Internal helper for title-casing: map-case with the title>> accessor
! and ch>title.
! NOTE(review): the embedded line numbering skips a line between the
! header and the body; a pre-processing step may be missing from this
! view -- confirm against the full file.
141 : (>title) ( string -- title )
143 [ title>> ] [ ch>title ] map-case ; inline
! Title-cases the first element and lowercases everything after it.
! unclip-slice separates the first element from the rest; the first is
! wrapped into a one-character string and sent through (>title), the rest
! through >lower; prepend-as then joins them (first, then rest) into a
! string.
! Empty input is not guarded here: unclip-slice is applied directly.
147 : capitalize ( string -- title )
148 unclip-slice 1string [ >lower ] [ (>title) ] bi*
149 "" prepend-as ; inline
! Title-cases a string: runs final-sigma, splits the result with >words,
! applies capitalize to every piece in place (map!), and concatenates.
151 : >title ( string -- title )
152 final-sigma >words [ capitalize ] map! concat ;
! Compiler hint: specialise >title for string input.
154 HINTS: >title string ;
156 : >case-fold ( string -- fold )
! True when string is unchanged by >lower (element-wise comparison with
! sequence=).
159 : lower? ( string -- ? ) dup >lower sequence= ;
! True when string is unchanged by >upper (element-wise comparison with
! sequence=).
161 : upper? ( string -- ? ) dup >upper sequence= ;
! True when string is unchanged by >title (element-wise comparison with
! sequence=).
163 : title? ( string -- ? ) dup >title sequence= ;
! True when string is unchanged by >case-fold (element-wise comparison
! with sequence=).
165 : case-fold? ( string -- ? ) dup >case-fold sequence= ;
! Runs (nfd) on the string through the with-string wrapper.
! NOTE(review): (nfd) and with-string are defined outside this file
! (presumably unicode.normalize.private); what with-string does around
! the quotation is not visible here -- confirm.
167 : nfd ( string -- nfd )
168 [ (nfd) ] with-string ;
! Runs (nfkd) on the string through the with-string wrapper.
! NOTE(review): (nfkd) and with-string are defined outside this file
! (presumably unicode.normalize.private); what with-string does around
! the quotation is not visible here -- confirm.
170 : nfkd ( string -- nfkd )
171 [ (nfkd) ] with-string ;
! Stack effect ( s1 s2 -- string ). Visible logic: checks whether element
! 0 of the top sequence is a non-starter (?nth tolerates an empty
! sequence); if so, calls reorder-back with that sequence's length,
! otherwise just drops it.
! NOTE(review): the embedded line numbering skips a line between the
! header and the body, and nothing visible actually joins s1 and s2. A
! step appears to be missing from this view -- confirm against the full
! file before editing.
173 : string-append ( s1 s2 -- string )
175 0 over ?nth non-starter?
176 [ length dupd reorder-back ] [ drop ] if ;
! Compiler hint: specialise string-append for two string inputs.
178 HINTS: string-append string string ;
! Runs (nfd) followed by combine on the string, through the with-string
! wrapper: decompose first, then recombine.
! NOTE(review): (nfd), combine and with-string are defined outside this
! file (presumably unicode.normalize.private) -- confirm their contracts.
180 : nfc ( string -- nfc )
181 [ (nfd) combine ] with-string ;
! Runs (nfkd) followed by combine on the string, through the with-string
! wrapper: decompose first, then recombine.
! NOTE(review): (nfkd), combine and with-string are defined outside this
! file (presumably unicode.normalize.private) -- confirm their contracts.
183 : nfkc ( string -- nfkc )
184 [ (nfkd) combine ] with-string ;
186 : collation-key/nfd ( string -- key nfd )
188 string>graphemes graphemes>weights
189 filter-ignorable weights>bytes
194 : insensitive= ( str1 str2 levels-removed -- ? )
196 [ collation-key/nfd drop ] dip
197 [ [ 0 = not ] trim-tail but-last ] times
202 : primary= ( str1 str2 -- ? )
205 : secondary= ( str1 str2 -- ? )
208 : tertiary= ( str1 str2 -- ? )
211 : quaternary= ( str1 str2 -- ? )
! Sorts strings by collation key. Each string is turned into a
! { key nfd } pair by collation-key/nfd, the pairs are sorted with
! natural-sort, and `values` extracts the second element of each pair.
! Note that the result therefore holds the nfd output of
! collation-key/nfd for each input, not the original string objects.
214 : sort-strings ( strings -- sorted )
215 [ collation-key/nfd 2array ] map natural-sort values ;
! Three-way comparison of two strings: `compare` applies the quotation to
! both, producing a { key nfd } pair for each via collation-key/nfd, and
! orders the two pairs. The key comes first in each pair, with nfd second.
217 : string<=> ( str1 str2 -- <=> )
218 [ collation-key/nfd 2array ] compare ;
! True when ch lies in 0xD800..0xDBFF inclusive (the UTF-16 high/lead
! surrogate range).
220 : upper-surrogate? ( ch -- ? ) 0xD800 0xDBFF between? ; inline
! True when ch lies in 0xDC00..0xDFFF inclusive (the UTF-16 low/trail
! surrogate range).
222 : under-surrogate? ( ch -- ? ) 0xDC00 0xDFFF between? ; inline
224 CONSTANT: unicode-supported {
228 CONSTANT: unicode-unsupported {
! Version string for the Unicode data this vocabulary is tied to.
! NOTE(review): presumably must be bumped together with the data tables
! loaded by unicode.data -- confirm.
232 CONSTANT: unicode-version "14.0.0"