1 ! Copyright (C) 2008 Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: combinators.short-circuit unicode.categories kernel math
4 combinators splitting sequences math.parser io.files io assocs
5 arrays namespaces make math.ranges unicode.normalize.private values
6 io.encodings.ascii unicode.syntax unicode.data compiler.units fry
7 alien.syntax sets accessors interval-maps memoize locals words ;
13 C-ENUM: Any L V T LV LVT Extend Control CR LF
14 SpacingMark Prepend graphemes ;
16 : jamo-class ( ch -- class )
17 dup initial? [ drop L ]
18 [ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
20 : hangul-class ( ch -- class )
21 hangul-base - HEX: 1C mod zero? LV LVT ? ;
23 CATEGORY: grapheme-control Zl Zp Cc Cf ;
24 : control-class ( ch -- class )
28 { HEX: 200C [ Extend ] }
29 { HEX: 200D [ Extend ] }
33 CATEGORY: (extend) Me Mn ;
35 { [ (extend)? ] [ "Other_Grapheme_Extend" property? ] } 1|| ;
38 "Logical_Order_Exception" property? ;
40 CATEGORY: spacing Mc ;
42 : grapheme-class ( ch -- class )
44 { [ dup jamo? ] [ jamo-class ] }
45 { [ dup hangul? ] [ hangul-class ] }
46 { [ dup grapheme-control? ] [ control-class ] }
47 { [ dup extend? ] [ drop Extend ] }
48 { [ dup spacing? ] [ drop SpacingMark ] }
49 { [ loe? ] [ Prepend ] }
53 : init-table ( size -- table )
54 dup [ f <array> ] curry replicate ;
58 : finish-table ( -- table )
59 table get [ [ 1 = ] map ] map ;
61 : eval-seq ( seq -- seq ) [ dup word? [ execute ] when ] map ;
63 : (set-table) ( class1 class2 val -- )
64 [ table get nth ] dip '[ _ or ] change-nth ;
66 : set-table ( classes1 classes2 val -- )
67 [ [ eval-seq ] bi@ ] dip
68 [ [ (set-table) ] curry with each ] 2curry each ;
70 : connect ( class1 class2 -- ) 1 set-table ;
71 : disconnect ( class1 class2 -- ) 0 set-table ;
73 : break-around ( classes1 classes2 -- )
74 [ disconnect ] [ swap disconnect ] 2bi ;
76 : make-grapheme-table ( -- )
78 { Control CR LF } graphemes disconnect
79 graphemes { Control CR LF } disconnect
80 { L } { L V LV LVT } connect
81 { LV V } { V T } connect
82 { LVT T } { T } connect
83 graphemes { Extend } connect
84 graphemes { SpacingMark } connect
85 { Prepend } graphemes connect ;
89 : grapheme-break? ( class1 class2 -- ? )
90 grapheme-table nth nth not ;
92 : chars ( i str n -- str[i] str[i+n] )
93 swap [ dupd + ] dip [ ?nth ] curry bi@ ;
97 : first-grapheme ( str -- i )
98 unclip-slice grapheme-class over
99 [ grapheme-class [ nip ] [ grapheme-break? ] 2bi ] find drop
100 nip swap length or 1+ ;
104 : >pieces ( str quot: ( str -- i ) -- graphemes )
105 [ dup empty? not ] swap '[ dup @ cut-slice swap ]
106 [ ] produce nip ; inline
110 : >graphemes ( str -- graphemes )
111 [ first-grapheme ] >pieces ;
113 : string-reverse ( str -- rts )
114 >graphemes reverse concat ;
116 : last-grapheme ( str -- i )
117 unclip-last-slice grapheme-class swap
118 [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
122 graphemes init-table table
123 [ make-grapheme-table finish-table ] with-variable
128 VALUE: word-break-table
130 "resource:basis/unicode/data/WordBreakProperty.txt" load-script
133 C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
134 wMidNum wMidNumLet wNumeric wExtendNumLet words ;
136 : word-break-classes ( -- table ) ! Is there a way to avoid this?
138 { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
139 { "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
140 { "ALetter" 7 } { "MidLetter" 8 }
141 { "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
142 { "ExtendNumLet" 12 }
145 : word-break-prop ( char -- word-break-prop )
146 word-break-table interval-at
147 word-break-classes at [ wOther ] unless* ;
149 SYMBOL: check-letter-before
150 SYMBOL: check-letter-after
151 SYMBOL: check-number-before
152 SYMBOL: check-number-after
154 : make-word-table ( -- )
155 { wCR } { wLF } connect
156 { wNewline wCR wLF } words disconnect
157 words { wNewline wCR wLF } disconnect
158 { wALetter } { wMidLetter wMidNumLet } check-letter-after set-table
159 { wMidLetter wMidNumLet } { wALetter } check-letter-before set-table
160 { wNumeric wALetter } { wNumeric wALetter } connect
161 { wNumeric } { wMidNum wMidNumLet } check-number-after set-table
162 { wMidNum wMidNumLet } { wNumeric } check-number-before set-table
163 { wKatakana } { wKatakana } connect
164 { wALetter wNumeric wKatakana wExtendNumLet } { wExtendNumLet }
165 [ connect ] [ swap connect ] 2bi ;
169 : finish-word-table ( -- table )
171 [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
174 words init-table table
175 [ make-word-table finish-word-table ] with-variable
178 : word-table-nth ( class1 class2 -- ? )
181 :: property-not= ( str i property -- ? )
183 i str ?nth [ word-break-prop property = not ]
187 : format/extended? ( ch -- ? )
188 word-break-prop { 4 5 } member? ;
190 : (walk-up) ( str i -- j )
191 swap [ format/extended? not ] find-from drop ;
193 : walk-up ( str i -- j )
194 dupd 1+ (walk-up) [ 1+ (walk-up) ] [ drop f ] if* ;
196 : (walk-down) ( str i -- j )
197 swap [ format/extended? not ] find-last-from drop ;
199 : walk-down ( str i -- j )
200 dupd (walk-down) [ 1- (walk-down) ] [ drop f ] if* ;
202 : word-break? ( str i table-entry -- ? )
207 [ dupd walk-up wALetter property-not= ] }
208 { check-letter-before
209 [ dupd walk-down wALetter property-not= ] }
211 [ dupd walk-up wNumeric property-not= ] }
212 { check-number-before
213 [ dupd walk-down wNumeric property-not= ] }
216 :: word-break-next ( old-class new-char i str -- next-class ? )
217 new-char format/extended?
218 [ old-class dup { 1 2 3 } member? ] [
219 new-char word-break-prop old-class over word-table-nth
220 [ str i ] dip word-break?
225 : first-word ( str -- i )
226 [ unclip-slice word-break-prop over <enum> ] keep
227 '[ swap _ word-break-next ] assoc-find 2drop
228 nip swap length or 1+ ;
230 : >words ( str -- words )
231 [ first-word ] >pieces ;