factor: trim some using lists

[factor.git] / basis / unicode / unicode.factor
diff --git a/basis/unicode/unicode.factor b/basis/unicode/unicode.factor

index 32adb961d4e156a391b5c16ff366a0bc4efac0ac..e67b3a5366d3199fc797e7d81b62e818fa2ef3e8 100644 (file)
--- a/basis/unicode/unicode.factor
+++ b/basis/unicode/unicode.factor
@@ -1 +1,232 @@
+USING: accessors arrays assocs combinators.short-circuit fry
+hints interval-maps kernel math math.order sequences sorting
+strings unicode.breaks.private unicode.case.private
+unicode.categories unicode.collation unicode.collation.private
+unicode.data unicode.data.private unicode.normalize.private
+unicode.script ranges ;
  IN: unicode
+
+CATEGORY: blank Zs Zl Zp | "\r\n\t" member? ;
+
+CATEGORY: letter Ll | "Other_Lowercase" property? ;
+
+CATEGORY: LETTER Lu | "Other_Uppercase" property? ;
+
+CATEGORY: Letter Lu Ll Lt Lm Lo Nl ;
+
+CATEGORY: digit Nd Nl No ;
+
+CATEGORY-NOT: printable Cc Cf Cs Co Cn ;
+
+CATEGORY: alpha Lu Ll Lt Lm Lo Nd Nl No | "Other_Alphabetic" property? ;
+
+CATEGORY: control Cc ;
+
+CATEGORY-NOT: uncased Lu Ll Lt Lm Mn Me ;
+
+CATEGORY-NOT: character Cn ;
+
+CATEGORY: math Sm | "Other_Math" property? ;
+
+: script-of ( char -- script )
+    script-table interval-at ;
+
+: name>char ( name -- char ) name-map at ; inline
+
+: char>name ( char -- name ) name-map value-at ; inline
+
+: ch>lower ( ch -- lower ) simple-lower ?at drop ; inline
+
+: ch>upper ( ch -- upper ) simple-upper ?at drop ; inline
+
+: ch>title ( ch -- title ) simple-title ?at drop ; inline
+
+:: first-grapheme ( entire-str start -- i )
+    start :> pos!
+    entire-str length :> str-len
+    0 pos 1 + entire-str <slice> grapheme-class
+    pos 1 + str-len 1 - min pos!
+    pos str-len 1 - [a..b] [
+        1 + 0 swap entire-str <slice> grapheme-class
+        dup rot swap grapheme-break?
+    ] find drop nip
+    [ 1 + ] [ str-len start - ] if* ;
+
+:: first-grapheme-from ( start str -- i )
+    str start first-grapheme start + ;
+
+:: last-grapheme ( str -- i )
+    str length 1 - :> pos!
+    pos 0 = [ 0 ] [
+        str grapheme-class
+        pos 1 - 0 max pos!
+        0 pos [a..b] [
+            0 swap 1 + str <slice> grapheme-class
+            dup rot grapheme-break?
+        ] find-last drop ?1+ nip
+    ] if ;
+
+: last-grapheme-from ( end str -- i )
+     swap head-slice last-grapheme ;
+
+<PRIVATE
+
+: >pieces ( str quot: ( str -- i ) -- graphemes )
+    [ dup empty? not ] swap '[ dup @ cut-slice swap ] produce nip ; inline
+
+PRIVATE>
+
+:: >graphemes ( str -- graphemes )
+    str length :> str-len
+    0 :> pos! 0 :> old-pos!
+    [ f ! dummy
+      pos old-pos! old-pos str-len < [
+          str pos first-grapheme pos + pos! pos str-len <=
+      ] [ f ] if ]
+    [ drop old-pos pos str <slice> ] produce nip ;
+
+: count-graphemes ( str -- n ) >graphemes length ; inline
+
+: string-reverse ( str -- rts )
+    >graphemes reverse! concat ;
+
+: first-word ( str -- i )
+    [ [ length ] [ first word-break-prop ] bi ] keep
+    1 swap dup '[ _ word-break-next ] find-index-from
+    drop nip swap or ;
+
+: >words ( str -- words )
+    [ first-word ] >pieces ;
+
+<PRIVATE
+
+: nth-next ( i str -- str[i-1] str[i] )
+    [ [ 1 - ] keep ] dip '[ _ nth ] bi@ ;
+
+PRIVATE>
+
+: word-break-at? ( i str -- ? )
+    {
+        [ drop zero? ]
+        [ length = ]
+        [
+            [ nth-next [ word-break-prop ] dip ] 2keep
+            word-break-next nip
+        ]
+    } 2|| ;
+
+: first-word-from ( start str -- i )
+    over tail-slice first-word + ;
+
+: last-word ( str -- i )
+    [ length <iota> ] keep '[ _ word-break-at? ] find-last drop 0 or ;
+
+: last-word-from ( end str -- i )
+    swap head-slice last-word ;
+
+: >lower ( string -- lower )
+    locale>lower final-sigma
+    [ lower>> ] [ ch>lower ] map-case ;
+
+HINTS: >lower string ;
+
+: >upper ( string -- upper )
+    locale>upper
+    [ upper>> ] [ ch>upper ] map-case ;
+
+HINTS: >upper string ;
+
+<PRIVATE
+
+: (>title) ( string -- title )
+    locale>upper
+    [ title>> ] [ ch>title ] map-case ; inline
+
+PRIVATE>
+
+: capitalize ( string -- title )
+    unclip-slice 1string [ >lower ] [ (>title) ] bi*
+    "" prepend-as ; inline
+
+: >title ( string -- title )
+    final-sigma >words [ capitalize ] map! concat ;
+
+HINTS: >title string ;
+
+: >case-fold ( string -- fold )
+    >upper >lower ;
+
+: lower? ( string -- ? ) dup >lower sequence= ;
+
+: upper? ( string -- ? ) dup >upper sequence= ;
+
+: title? ( string -- ? ) dup >title sequence= ;
+
+: case-fold? ( string -- ? ) dup >case-fold sequence= ;
+
+: nfd ( string -- nfd )
+    [ (nfd) ] with-string ;
+
+: nfkd ( string -- nfkd )
+    [ (nfkd) ] with-string ;
+
+: string-append ( s1 s2 -- string )
+    [ append ] keep
+    0 over ?nth non-starter?
+    [ length dupd reorder-back ] [ drop ] if ;
+
+HINTS: string-append string string ;
+
+: nfc ( string -- nfc )
+    [ (nfd) combine ] with-string ;
+
+: nfkc ( string -- nfkc )
+    [ (nfkd) combine ] with-string ;
+
+: collation-key/nfd ( string -- key nfd )
+    nfd [
+        string>graphemes graphemes>weights
+        filter-ignorable weights>bytes
+    ] keep ;
+
+<PRIVATE
+
+: insensitive= ( str1 str2 levels-removed -- ? )
+    [
+        [ collation-key/nfd drop ] dip
+        [ [ 0 = not ] trim-tail but-last ] times
+    ] curry same? ;
+
+PRIVATE>
+
+: primary= ( str1 str2 -- ? )
+    3 insensitive= ;
+
+: secondary= ( str1 str2 -- ? )
+    2 insensitive= ;
+
+: tertiary= ( str1 str2 -- ? )
+    1 insensitive= ;
+
+: quaternary= ( str1 str2 -- ? )
+    0 insensitive= ;
+
+: sort-strings ( strings -- sorted )
+    [ collation-key/nfd 2array ] map natural-sort values ;
+
+: string<=> ( str1 str2 -- <=> )
+    [ collation-key/nfd 2array ] compare ;
+
+: upper-surrogate? ( ch -- ? ) 0xD800 0xDBFF between? ; inline
+
+: under-surrogate? ( ch -- ? ) 0xDC00 0xDFFF between? ; inline
+
+CONSTANT: unicode-supported {
+    "collation"
+}
+
+CONSTANT: unicode-unsupported {
+    "bidi"
+}
+
+CONSTANT: unicode-version "14.0.0"