! See http://factorcode.org/license.txt for BSD license.
USING: combinators.short-circuit unicode.categories kernel math
combinators splitting sequences math.parser io.files io assocs
-arrays namespaces make math.ranges unicode.normalize.private values
-io.encodings.ascii unicode.syntax unicode.data compiler.units fry
-alien.syntax sets accessors interval-maps memoize locals words ;
+arrays namespaces make math.ranges unicode.normalize
+unicode.normalize.private values io.encodings.ascii
+unicode.data compiler.units fry unicode.categories.syntax
+alien.syntax sets accessors interval-maps memoize locals words
+simple-flat-file ;
IN: unicode.breaks
<PRIVATE
[ drop Control ]
} case ;
-CATEGORY: (extend) Me Mn ;
-: extend? ( ch -- ? )
- { [ (extend)? ] [ "Other_Grapheme_Extend" property? ] } 1|| ;
+CATEGORY: extend
+ Me Mn |
+ "Other_Grapheme_Extend" property? ;
: loe? ( ch -- ? )
"Logical_Order_Exception" property? ;
: finish-table ( -- table )
table get [ [ 1 = ] map ] map ;
-: eval-seq ( seq -- seq ) [ dup word? [ execute ] when ] map ;
+: eval-seq ( seq -- seq ) [ ?execute ] map ;
: (set-table) ( class1 class2 val -- )
[ table get nth ] dip '[ _ or ] change-nth ;
: connect ( class1 class2 -- ) 1 set-table ;
: disconnect ( class1 class2 -- ) 0 set-table ;
-: break-around ( classes1 classes2 -- )
- [ disconnect ] [ swap disconnect ] 2bi ;
-
: make-grapheme-table ( -- )
{ CR } { LF } connect
{ Control CR LF } graphemes disconnect
: grapheme-break? ( class1 class2 -- ? )
grapheme-table nth nth not ;
-: chars ( i str n -- str[i] str[i+n] )
- swap [ dupd + ] dip [ ?nth ] curry bi@ ;
-
PRIVATE>
: first-grapheme ( str -- i )
unclip-slice grapheme-class over
[ grapheme-class [ nip ] [ grapheme-break? ] 2bi ] find drop
- nip swap length or 1+ ;
+ nip swap length or 1 + ;
+
+: first-grapheme-from ( start str -- i )
+ over tail-slice first-grapheme + ;
+
+: last-grapheme ( str -- i )
+ unclip-last-slice grapheme-class swap
+ [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
+
+: last-grapheme-from ( end str -- i )
+ swap head-slice last-grapheme ;
<PRIVATE
: string-reverse ( str -- rts )
>graphemes reverse concat ;
-: last-grapheme ( str -- i )
- unclip-last-slice grapheme-class swap
- [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
-
<PRIVATE
graphemes init-table table
VALUE: word-break-table
-"vocab:unicode/data/WordBreakProperty.txt" load-script
+"vocab:unicode/data/WordBreakProperty.txt" load-interval-file
to: word-break-table
C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
swap [ format/extended? not ] find-from drop ;
: walk-up ( str i -- j )
- dupd 1+ (walk-up) [ 1+ (walk-up) ] [ drop f ] if* ;
+ dupd 1 + (walk-up) [ 1 + (walk-up) ] [ drop f ] if* ;
: (walk-down) ( str i -- j )
swap [ format/extended? not ] find-last-from drop ;
: walk-down ( str i -- j )
- dupd (walk-down) [ 1- (walk-down) ] [ drop f ] if* ;
+ dupd (walk-down) [ 1 - (walk-down) ] [ drop f ] if* ;
: word-break? ( str i table-entry -- ? )
{
: first-word ( str -- i )
[ unclip-slice word-break-prop over <enum> ] keep
'[ swap _ word-break-next ] assoc-find 2drop
- nip swap length or 1+ ;
+ nip swap length or 1 + ;
: >words ( str -- words )
[ first-word ] >pieces ;
<PRIVATE
: nth-next ( i str -- str[i-1] str[i] )
- [ [ 1- ] keep ] dip '[ _ nth ] bi@ ;
+ [ [ 1 - ] keep ] dip '[ _ nth ] bi@ ;
PRIVATE>
word-break-next nip
]
} 2|| ;
+
+: first-word-from ( start str -- i )
+ over tail-slice first-word + ;
+
+: last-word ( str -- i )
+ [ length ] keep '[ _ word-break-at? ] find-last drop 0 or ;
+
+: last-word-from ( end str -- i )
+ swap head-slice last-word ;