str [
dup quot call cut-slice
swap , quot (>pieces)
- ] unless-empty ;
+ ] unless-empty ; inline recursive
: >pieces ( str quot -- graphemes )
- [ (>pieces) ] { } make ;
+ [ (>pieces) ] { } make ; inline
: >graphemes ( str -- graphemes )
[ first-grapheme ] >pieces ;
C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
wMidNum wMidNumLet wNumeric wExtendNumLet words ;
-MEMO: word-break-classes ( -- table )
+: word-break-classes ( -- table ) ! Is there a way to avoid this?
H{
- { "Other" wOther } { "CR" wCR } { "LF" wLF } { "Newline" wNewline }
- { "Extend" wExtend } { "Format" wFormat } { "Katakana" wKatakana }
- { "ALetter" wALetter } { "MidLetter" wMidLetter }
- { "MidNum" wMidNum } { "MidNumLet" wMidNumLet } { "Numeric" wNumeric }
- { "ExtendNumLet" wExtendNumLet }
- } [ execute ] assoc-map ;
+ { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
+ { "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
+ { "ALetter" 7 } { "MidLetter" 8 }
+ { "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
+ { "ExtendNumLet" 12 }
+ } ;
: word-break-prop ( char -- word-break-prop )
word-break-table interval-at
[ make-word-table finish-word-table ] with-variable
to: word-table
-: word-break? ( class1 class2 -- ? )
- word-table nth nth not ;
-
-: skip? ( char -- ? )
- word-break-prop { 4 5 } member? ; ! wExtend or wFormat
+: word-table-nth ( class1 class2 -- ? )
+ word-table nth nth ;
+
+: property-not= ( i str property -- ? )
+ pick [
+ [ ?nth ] dip swap
+ [ word-break-prop = not ] [ drop f ] if*
+ ] [ 3drop t ] if ;
+
+: format/extended? ( ch -- ? )
+ word-break-prop { 4 5 } member? ;
+
+:: walk-up ( str i -- j )
+ i 1 + str [ format/extended? not ] find-from drop
+ 1+ str [ format/extended? not ] find-from drop ; ! possible bounds error?
+
+:: walk-down ( str i -- j )
+ i str [ format/extended? not ] find-last-from drop
+ 1- str [ format/extended? not ] find-last-from drop ; ! possible bounds error?
+
+:: word-break? ( table-entry i str -- ? )
+ table-entry {
+ { t [ f ] }
+ { f [ t ] }
+ { check-letter-after
+ [ str i walk-up str wALetter property-not= ] }
+ { check-letter-before
+ [ str i walk-down str wALetter property-not= ] }
+ { check-number-after
+ [ str i walk-up str wNumeric property-not= ] }
+ { check-number-before
+ [ str i walk-down str wNumeric property-not= ] }
+ } case ;
-: word-break-next ( old-class new-char -- next-class ? )
- word-break-prop dup { 4 5 } member?
- [ drop f ] [ tuck word-break? ] if ;
+:: word-break-next ( old-class new-char i str -- next-class ? )
+ new-char word-break-prop dup { 4 5 } member?
+ [ drop old-class dup { 1 2 3 } member? ]
+ [ old-class over word-table-nth i str word-break? ] if ;
-: first-word ( str -- i )
- unclip-slice word-break-prop over
- [ word-break-next ] find-index
+:: first-word ( str -- i )
+ str unclip-slice word-break-prop over <enum>
+ [ swap str word-break-next ] assoc-find 2drop
nip swap length or 1+ ;
-! This must be changed to ignore format/extended chars and
-! handle symbols in the table specially
: >words ( str -- words )
[ first-word ] >pieces ;