1 ! Copyright (C) 2008 Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors alien.syntax arrays assocs combinators
4 combinators.short-circuit compiler.units fry interval-maps io
5 io.encodings.ascii io.files kernel literals locals make math
6 math.parser math.ranges memoize namespaces sequences
7 sets simple-flat-file splitting unicode.categories
8 unicode.categories.syntax unicode.data unicode.normalize
9 unicode.normalize.private values words ;
10 FROM: sequences => change-nth ;
26 CONSTANT: SpacingMark 10
28 CONSTANT: graphemes 12
30 : jamo-class ( ch -- class )
31 dup initial? [ drop L ]
32 [ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
34 : hangul-class ( ch -- class )
35 hangul-base - HEX: 1C mod zero? LV LVT ? ;
37 CATEGORY: grapheme-control Zl Zp Cc Cf ;
38 : control-class ( ch -- class )
42 { HEX: 200C [ Extend ] }
43 { HEX: 200D [ Extend ] }
49 "Other_Grapheme_Extend" property? ;
52 "Logical_Order_Exception" property? ;
54 CATEGORY: spacing Mc ;
56 : grapheme-class ( ch -- class )
58 { [ dup jamo? ] [ jamo-class ] }
59 { [ dup hangul? ] [ hangul-class ] }
60 { [ dup grapheme-control? ] [ control-class ] }
61 { [ dup extend? ] [ drop Extend ] }
62 { [ dup spacing? ] [ drop SpacingMark ] }
63 { [ loe? ] [ Prepend ] }
67 : init-table ( size -- table )
68 dup [ f <array> ] curry replicate ;
72 : finish-table ( -- table )
73 table get [ [ 1 = ] map ] map ;
75 : eval-seq ( seq -- seq ) [ ?execute ] map ;
77 : (set-table) ( class1 class2 val -- )
78 [ table get nth ] dip '[ _ or ] change-nth ;
80 : set-table ( classes1 classes2 val -- )
81 [ [ eval-seq ] bi@ ] dip
82 [ [ (set-table) ] curry with each ] 2curry each ;
84 : connect ( class1 class2 -- ) 1 set-table ;
85 : disconnect ( class1 class2 -- ) 0 set-table ;
87 : make-grapheme-table ( -- )
89 { Control CR LF } graphemes iota disconnect
90 graphemes iota { Control CR LF } disconnect
91 { L } { L V LV LVT } connect
92 { LV V } { V T } connect
93 { LVT T } { T } connect
94 graphemes iota { Extend } connect
95 graphemes iota { SpacingMark } connect
96 { Prepend } graphemes iota connect ;
100 : grapheme-break? ( class1 class2 -- ? )
101 grapheme-table nth nth not ;
105 : first-grapheme ( str -- i )
106 unclip-slice grapheme-class over
107 [ grapheme-class [ nip ] [ grapheme-break? ] 2bi ] find drop
108 nip swap length or 1 + ;
110 : first-grapheme-from ( start str -- i )
111 over tail-slice first-grapheme + ;
113 : last-grapheme ( str -- i )
114 unclip-last-slice grapheme-class swap
115 [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
117 : last-grapheme-from ( end str -- i )
118 swap head-slice last-grapheme ;
122 : >pieces ( str quot: ( str -- i ) -- graphemes )
123 [ dup empty? not ] swap '[ dup @ cut-slice swap ] produce nip ; inline
127 : >graphemes ( str -- graphemes )
128 [ first-grapheme ] >pieces ;
130 : string-reverse ( str -- rts )
131 >graphemes reverse concat ;
135 graphemes init-table table
136 [ make-grapheme-table finish-table ] with-variable
137 \ grapheme-table set-value
141 VALUE: word-break-table
143 "vocab:unicode/data/WordBreakProperty.txt" load-interval-file
144 \ word-break-table set-value
152 CONSTANT: wKatakana 6
154 CONSTANT: wMidLetter 8
156 CONSTANT: wMidNumLet 10
157 CONSTANT: wNumeric 11
158 CONSTANT: wExtendNumLet 12
161 : word-break-classes ( -- table ) ! Is there a way to avoid this?
163 { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
164 { "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
165 { "ALetter" 7 } { "MidLetter" 8 }
166 { "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
167 { "ExtendNumLet" 12 }
170 : word-break-prop ( char -- word-break-prop )
171 word-break-table interval-at
172 word-break-classes at [ wOther ] unless* ;
174 SYMBOL: check-letter-before
175 SYMBOL: check-letter-after
176 SYMBOL: check-number-before
177 SYMBOL: check-number-after
179 : make-word-table ( -- )
180 { wCR } { wLF } connect
181 { wNewline wCR wLF } words iota disconnect
182 words iota { wNewline wCR wLF } disconnect
183 { wALetter } { wMidLetter wMidNumLet } check-letter-after set-table
184 { wMidLetter wMidNumLet } { wALetter } check-letter-before set-table
185 { wNumeric wALetter } { wNumeric wALetter } connect
186 { wNumeric } { wMidNum wMidNumLet } check-number-after set-table
187 { wMidNum wMidNumLet } { wNumeric } check-number-before set-table
188 { wKatakana } { wKatakana } connect
189 { wALetter wNumeric wKatakana wExtendNumLet } { wExtendNumLet }
190 [ connect ] [ swap connect ] 2bi ;
194 : finish-word-table ( -- table )
196 [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
199 words init-table table
200 [ make-word-table finish-word-table ] with-variable
201 \ word-table set-value
203 : word-table-nth ( class1 class2 -- ? )
206 :: property-not= ( str i property -- ? )
208 i str ?nth [ word-break-prop property = not ]
212 : (format/extended?) ( class -- ? )
213 ${ wExtend wFormat } member? ; inline
215 : format/extended? ( ch -- ? )
216 word-break-prop (format/extended?) ;
218 : (walk-up) ( str i -- j )
219 swap [ format/extended? not ] find-from drop ;
221 : walk-up ( str i -- j )
222 dupd 1 + (walk-up) [ 1 + (walk-up) ] [ drop f ] if* ;
224 : (walk-down) ( str i -- j )
225 swap [ format/extended? not ] find-last-from drop ;
227 : walk-down ( str i -- j )
228 dupd (walk-down) [ 1 - (walk-down) ] [ drop f ] if* ;
230 : word-break? ( str i table-entry -- ? )
235 [ dupd walk-up wALetter property-not= ] }
236 { check-letter-before
237 [ dupd walk-down wALetter property-not= ] }
239 [ dupd walk-up wNumeric property-not= ] }
240 { check-number-before
241 [ dupd walk-down wNumeric property-not= ] }
244 :: word-break-next ( old-class new-char i str -- next-class ? )
245 new-char word-break-prop :> new-class
246 new-class (format/extended?)
247 [ old-class dup ${ wCR wLF wNewline } member? ] [
248 new-class old-class over word-table-nth
249 [ str i ] dip word-break?
254 : first-word ( str -- i )
255 [ unclip-slice word-break-prop over ] keep
256 '[ _ word-break-next ] find-index drop
257 nip swap length or 1 + ;
259 : >words ( str -- words )
260 [ first-word ] >pieces ;
264 : nth-next ( i str -- str[i-1] str[i] )
265 [ [ 1 - ] keep ] dip '[ _ nth ] bi@ ;
269 : word-break-at? ( i str -- ? )
274 [ nth-next [ word-break-prop ] dip ] 2keep
279 : first-word-from ( start str -- i )
280 over tail-slice first-word + ;
282 : last-word ( str -- i )
283 [ length iota ] keep '[ _ word-break-at? ] find-last drop 0 or ;
285 : last-word-from ( end str -- i )
286 swap head-slice last-word ;