1 ! Copyright (C) 2008 Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: combinators.short-circuit unicode.categories kernel math
4 combinators splitting sequences math.parser io.files io assocs
5 arrays namespaces make math.ranges unicode.normalize
6 unicode.normalize.private values io.encodings.ascii
7 unicode.data compiler.units fry unicode.categories.syntax
8 alien.syntax sets accessors interval-maps memoize locals words
25 CONSTANT: SpacingMark 10
27 CONSTANT: graphemes 12
29 : jamo-class ( ch -- class )
30 dup initial? [ drop L ]
31 [ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
33 : hangul-class ( ch -- class )
34 hangul-base - HEX: 1C mod zero? LV LVT ? ;
36 CATEGORY: grapheme-control Zl Zp Cc Cf ;
37 : control-class ( ch -- class )
41 { HEX: 200C [ Extend ] }
42 { HEX: 200D [ Extend ] }
48 "Other_Grapheme_Extend" property? ;
51 "Logical_Order_Exception" property? ;
53 CATEGORY: spacing Mc ;
55 : grapheme-class ( ch -- class )
57 { [ dup jamo? ] [ jamo-class ] }
58 { [ dup hangul? ] [ hangul-class ] }
59 { [ dup grapheme-control? ] [ control-class ] }
60 { [ dup extend? ] [ drop Extend ] }
61 { [ dup spacing? ] [ drop SpacingMark ] }
62 { [ loe? ] [ Prepend ] }
66 : init-table ( size -- table )
67 dup [ f <array> ] curry replicate ;
71 : finish-table ( -- table )
72 table get [ [ 1 = ] map ] map ;
74 : eval-seq ( seq -- seq ) [ ?execute ] map ;
76 : (set-table) ( class1 class2 val -- )
77 [ table get nth ] dip '[ _ or ] change-nth ;
79 : set-table ( classes1 classes2 val -- )
80 [ [ eval-seq ] bi@ ] dip
81 [ [ (set-table) ] curry with each ] 2curry each ;
83 : connect ( class1 class2 -- ) 1 set-table ;
84 : disconnect ( class1 class2 -- ) 0 set-table ;
86 : make-grapheme-table ( -- )
88 { Control CR LF } graphemes iota disconnect
89 graphemes iota { Control CR LF } disconnect
90 { L } { L V LV LVT } connect
91 { LV V } { V T } connect
92 { LVT T } { T } connect
93 graphemes iota { Extend } connect
94 graphemes iota { SpacingMark } connect
95 { Prepend } graphemes iota connect ;
99 : grapheme-break? ( class1 class2 -- ? )
100 grapheme-table nth nth not ;
104 : first-grapheme ( str -- i )
105 unclip-slice grapheme-class over
106 [ grapheme-class [ nip ] [ grapheme-break? ] 2bi ] find drop
107 nip swap length or 1 + ;
109 : first-grapheme-from ( start str -- i )
110 over tail-slice first-grapheme + ;
112 : last-grapheme ( str -- i )
113 unclip-last-slice grapheme-class swap
114 [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
116 : last-grapheme-from ( end str -- i )
117 swap head-slice last-grapheme ;
121 : >pieces ( str quot: ( str -- i ) -- graphemes )
122 [ dup empty? not ] swap '[ dup @ cut-slice swap ] produce nip ; inline
126 : >graphemes ( str -- graphemes )
127 [ first-grapheme ] >pieces ;
129 : string-reverse ( str -- rts )
130 >graphemes reverse concat ;
134 graphemes init-table table
135 [ make-grapheme-table finish-table ] with-variable
140 VALUE: word-break-table
142 "vocab:unicode/data/WordBreakProperty.txt" load-interval-file
151 CONSTANT: wKatakana 6
153 CONSTANT: wMidLetter 8
155 CONSTANT: wMidNumLet 10
156 CONSTANT: wNumeric 11
157 CONSTANT: wExtendNumLet 12
160 : word-break-classes ( -- table ) ! Is there a way to avoid this?
162 { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
163 { "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
164 { "ALetter" 7 } { "MidLetter" 8 }
165 { "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
166 { "ExtendNumLet" 12 }
169 : word-break-prop ( char -- word-break-prop )
170 word-break-table interval-at
171 word-break-classes at [ wOther ] unless* ;
173 SYMBOL: check-letter-before
174 SYMBOL: check-letter-after
175 SYMBOL: check-number-before
176 SYMBOL: check-number-after
178 : make-word-table ( -- )
179 { wCR } { wLF } connect
180 { wNewline wCR wLF } words iota disconnect
181 words iota { wNewline wCR wLF } disconnect
182 { wALetter } { wMidLetter wMidNumLet } check-letter-after set-table
183 { wMidLetter wMidNumLet } { wALetter } check-letter-before set-table
184 { wNumeric wALetter } { wNumeric wALetter } connect
185 { wNumeric } { wMidNum wMidNumLet } check-number-after set-table
186 { wMidNum wMidNumLet } { wNumeric } check-number-before set-table
187 { wKatakana } { wKatakana } connect
188 { wALetter wNumeric wKatakana wExtendNumLet } { wExtendNumLet }
189 [ connect ] [ swap connect ] 2bi ;
193 : finish-word-table ( -- table )
195 [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
198 words init-table table
199 [ make-word-table finish-word-table ] with-variable
202 : word-table-nth ( class1 class2 -- ? )
205 :: property-not= ( str i property -- ? )
207 i str ?nth [ word-break-prop property = not ]
211 : format/extended? ( ch -- ? )
212 word-break-prop { 4 5 } member? ;
214 : (walk-up) ( str i -- j )
215 swap [ format/extended? not ] find-from drop ;
217 : walk-up ( str i -- j )
218 dupd 1 + (walk-up) [ 1 + (walk-up) ] [ drop f ] if* ;
220 : (walk-down) ( str i -- j )
221 swap [ format/extended? not ] find-last-from drop ;
223 : walk-down ( str i -- j )
224 dupd (walk-down) [ 1 - (walk-down) ] [ drop f ] if* ;
226 : word-break? ( str i table-entry -- ? )
231 [ dupd walk-up wALetter property-not= ] }
232 { check-letter-before
233 [ dupd walk-down wALetter property-not= ] }
235 [ dupd walk-up wNumeric property-not= ] }
236 { check-number-before
237 [ dupd walk-down wNumeric property-not= ] }
240 :: word-break-next ( old-class new-char i str -- next-class ? )
241 new-char format/extended?
242 [ old-class dup { 1 2 3 } member? ] [
243 new-char word-break-prop old-class over word-table-nth
244 [ str i ] dip word-break?
249 : first-word ( str -- i )
250 [ unclip-slice word-break-prop over <enum> ] keep
251 '[ swap _ word-break-next ] assoc-find 2drop
252 nip swap length or 1 + ;
254 : >words ( str -- words )
255 [ first-word ] >pieces ;
259 : nth-next ( i str -- str[i-1] str[i] )
260 [ [ 1 - ] keep ] dip '[ _ nth ] bi@ ;
264 : word-break-at? ( i str -- ? )
269 [ nth-next [ word-break-prop ] dip ] 2keep
274 : first-word-from ( start str -- i )
275 over tail-slice first-word + ;
277 : last-word ( str -- i )
278 [ length iota ] keep '[ _ word-break-at? ] find-last drop 0 or ;
280 : last-word-from ( end str -- i )
281 swap head-slice last-word ;