! Copyright (C) 2008 Daniel Ehrenberg. ! See http://factorcode.org/license.txt for BSD license. USING: combinators.short-circuit unicode.categories kernel math combinators splitting sequences math.parser io.files io assocs arrays namespaces make math.ranges unicode.normalize unicode.normalize.private values io.encodings.ascii unicode.data compiler.units fry unicode.categories.syntax alien.syntax sets accessors interval-maps memoize locals words simple-flat-file ; IN: unicode.breaks ] curry replicate ; SYMBOL: table : finish-table ( -- table ) table get [ [ 1 = ] map ] map ; : eval-seq ( seq -- seq ) [ ?execute ] map ; : (set-table) ( class1 class2 val -- ) [ table get nth ] dip '[ _ or ] change-nth ; : set-table ( classes1 classes2 val -- ) [ [ eval-seq ] bi@ ] dip [ [ (set-table) ] curry with each ] 2curry each ; : connect ( class1 class2 -- ) 1 set-table ; : disconnect ( class1 class2 -- ) 0 set-table ; : make-grapheme-table ( -- ) { CR } { LF } connect { Control CR LF } graphemes disconnect graphemes { Control CR LF } disconnect { L } { L V LV LVT } connect { LV V } { V T } connect { LVT T } { T } connect graphemes { Extend } connect graphemes { SpacingMark } connect { Prepend } graphemes connect ; VALUE: grapheme-table : grapheme-break? ( class1 class2 -- ? ) grapheme-table nth nth not ; PRIVATE> : first-grapheme ( str -- i ) unclip-slice grapheme-class over [ grapheme-class [ nip ] [ grapheme-break? ] 2bi ] find drop nip swap length or 1+ ; : first-grapheme-from ( start str -- i ) over tail-slice first-grapheme + ; : last-grapheme ( str -- i ) unclip-last-slice grapheme-class swap [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ; : last-grapheme-from ( end str -- i ) swap head-slice last-grapheme ; pieces ( str quot: ( str -- i ) -- graphemes ) [ dup empty? not ] swap '[ dup @ cut-slice swap ] produce nip ; inline PRIVATE> : >graphemes ( str -- graphemes ) [ first-grapheme ] >pieces ; : string-reverse ( str -- rts ) >graphemes reverse concat ; : first-word ( str -- i ) [ unclip-slice word-break-prop over ] keep '[ swap _ word-break-next ] assoc-find 2drop nip swap length or 1+ ; : >words ( str -- words ) [ first-word ] >pieces ; : word-break-at? ( i str -- ? ) { [ drop zero? ] [ length = ] [ [ nth-next [ word-break-prop ] dip ] 2keep word-break-next nip ] } 2|| ; : first-word-from ( start str -- i ) over tail-slice first-word + ; : last-word ( str -- i ) [ length ] keep '[ _ word-break-at? ] find-last drop 0 or ; : last-word-from ( end str -- i ) swap head-slice last-word ;