1 ! Copyright (C) 2008 Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors alien.syntax arrays assocs combinators
4 combinators.short-circuit compiler.units fry interval-maps io
5 io.encodings.ascii io.files kernel literals locals make math
6 math.parser math.ranges memoize namespaces parser sequences
7 sets simple-flat-file splitting unicode.categories
8 unicode.categories.syntax unicode.data unicode.normalize
9 unicode.normalize.private words words.constant ;
10 FROM: sequences => change-nth ;
! Numeric code for the spacing-mark grapheme class; grapheme-class returns
! it for characters matching spacing?, and it is used as a table index.
27 CONSTANT: SpacingMark 10
! Size handed to init-table for the grapheme table, and the bound of
! `graphemes iota` when a rule applies to every class.
29 CONSTANT: graphemes 12
! Classifies a jamo code point: L for an initial, V for a medial,
! T for a final, and Any when it is none of these.
: jamo-class ( ch -- class )
    {
        { [ dup initial? ] [ drop L ] }
        { [ dup medial? ] [ drop V ] }
        { [ final? ] [ T ] }
        [ Any ]
    } cond ;
! Classifies a precomposed Hangul syllable. When its offset from
! hangul-base is an exact multiple of 0x1C the class is LV, otherwise LVT.
: hangul-class ( ch -- class )
    hangul-base - 0x1C mod zero?
    [ LV ] [ LVT ] if ;
! Characters in categories Zl, Zp, Cc or Cf. grapheme-class tests this
! through grapheme-control? and hands matches to control-class.
38 CATEGORY: grapheme-control Zl Zp Cc Cf ;
39 : control-class ( ch -- class )
50 "Other_Grapheme_Extend" property? ;
53 "Logical_Order_Exception" property? ;
! Characters in category Mc. grapheme-class maps anything matching
! spacing? to the SpacingMark class.
55 CATEGORY: spacing Mc ;
57 : grapheme-class ( ch -- class )
59 { [ dup jamo? ] [ jamo-class ] }
60 { [ dup hangul? ] [ hangul-class ] }
61 { [ dup grapheme-control? ] [ control-class ] }
62 { [ dup extend? ] [ drop Extend ] }
63 { [ dup spacing? ] [ drop SpacingMark ] }
64 { [ loe? ] [ Prepend ] }
! Builds a size-by-size table: an array of `size` freshly allocated
! rows, each holding `size` cells initialised to f.
:: init-table ( size -- table )
    size [ size f <array> ] replicate ;
! Converts the table held in the `table` variable into booleans:
! a cell becomes t when it equals 1 and f for every other value.
: finish-table ( -- table )
    [ 1 = ] table get swap '[ _ map ] map ;
! Replaces every word in seq with the single value it produces when
! executed; elements that are not words are passed through unchanged.
! This lets literal arrays of constant words be used as class lists.
: eval-seq ( seq -- seq )
    [
        dup word?
        [ execute( -- x ) ] [ ] if
    ] map ;
! Writes val into the cell at row class2, column class1 of the table
! variable, but only when that cell is still f; an existing non-f
! value (including 0) is kept.
:: (set-table) ( class1 class2 val -- )
    class2 table get nth :> row
    class1 row [ val or ] change-nth ;
! Applies (set-table) with val to every pair formed from classes1 and
! classes2. Both lists are first run through eval-seq so that constant
! words are replaced by their values. classes1 drives the outer loop.
:: set-table ( classes1 classes2 val -- )
    classes1 eval-seq :> firsts
    classes2 eval-seq :> seconds
    firsts [| c1 |
        seconds [| c2 | c1 c2 val (set-table) ] each
    ] each ;
! Marks every class1/class2 pair with 1 (finish-table turns 1 into t).
: connect ( class1 class2 -- )
    1 set-table ;
! Marks every class1/class2 pair with 0 (finish-table turns 0 into f).
: disconnect ( class1 class2 -- )
    0 set-table ;
! Fills the `table` variable with pairwise grapheme rules using connect
! (stores 1) and disconnect (stores 0). (set-table) only fills cells that
! are still f, so an earlier statement takes precedence over a later one
! and the order below must not be changed.
! NOTE(review): the embedded numbering jumps from 89 to 91, so a rule
! line may be missing from this view -- confirm against the full file.
89 : make-grapheme-table ( -- )
! Control, CR and LF are disconnected from every class on either side.
91 { Control CR LF } graphemes iota disconnect
92 graphemes iota { Control CR LF } disconnect
! Hangul jamo / syllable sequences.
93 { L } { L V LV LVT } connect
94 { LV V } { V T } connect
95 { LVT T } { T } connect
! Every class is connected to a following Extend or SpacingMark,
! and Prepend is connected to every following class.
96 graphemes iota { Extend } connect
97 graphemes iota { SpacingMark } connect
98 { Prepend } graphemes iota connect ;
100 "grapheme-table" create-word-in
101 graphemes init-table table
102 [ make-grapheme-table finish-table ] with-variable
! True when the grapheme table holds f for this pair, i.e. the two
! classes are not connected. The table is indexed by class2 first,
! then by class1.
:: grapheme-break? ( class1 class2 -- ? )
    class1 class2 grapheme-table nth nth not ;
! Returns the index just past the first grapheme of str. Each character
! after the first is compared with its predecessor's class; the scan
! stops at the first pair for which grapheme-break? holds. If no break
! is found the result is the length of str. The first character is split
! off unconditionally, so str is expected to be non-empty.
:: first-grapheme ( str -- i )
    str unclip-slice :> ( remaining lead )
    lead grapheme-class :> prev!
    remaining [
        grapheme-class :> cur
        prev cur grapheme-break?
        cur prev!
    ] find drop
    [ remaining length ] unless*
    1 + ;
! Index just past the grapheme that begins at `start` in str:
! first-grapheme of the tail slice, offset by start.
:: first-grapheme-from ( start str -- i )
    str start tail-slice first-grapheme start + ;
! Returns the index at which the last grapheme of str starts. Scans
! backwards from the second-to-last character, comparing each character's
! class with that of its successor, and stops at the first pair for which
! grapheme-break? holds. The result is that index plus one, or 0 when no
! break is found.
:: last-grapheme ( str -- i )
    str unclip-last-slice :> ( front last-ch )
    last-ch grapheme-class :> next!
    front [
        grapheme-class :> cur
        cur next grapheme-break?
        cur next!
    ] find-last drop ?1+ ;
! Start index of the last grapheme within the first `end` elements of str.
:: last-grapheme-from ( end str -- i )
    str end head-slice last-grapheme ;
! Splits str into consecutive slices. While the remaining input is not
! empty, quot is called on it to get a cut index; the part before the
! index becomes the next piece and the part after it is processed next.
: >pieces ( str quot: ( str -- i ) -- graphemes )
    [ [ dup empty? not ] ] dip
    '[ [ @ ] keep swap cut-slice swap ] produce nip ; inline
! Splits str into a sequence of grapheme slices, cutting repeatedly at
! the index reported by first-grapheme.
: >graphemes ( str -- graphemes )
    [ first-grapheme ]
    >pieces ;
! Reverses str grapheme by grapheme: the graphemes are put in reverse
! order and concatenated, leaving the characters inside each grapheme
! in their original order.
: string-reverse ( str -- rts )
    >graphemes reverse concat ;
! Numeric codes for word-break classes. The values agree with the
! matching entries of word-break-classes and are used as table indices.
149 CONSTANT: wKatakana 6
151 CONSTANT: wMidLetter 8
153 CONSTANT: wMidNumLet 10
154 CONSTANT: wNumeric 11
155 CONSTANT: wExtendNumLet 12
! Size handed to init-table for the word table, and the bound of
! `unicode-words iota` when a rule applies to every class.
156 CONSTANT: unicode-words 13
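158 ! Maps the property names found in WordBreakProperty.txt to the numeric
! codes of the w* constants above. Is there a way to avoid repeating
! those numbers here?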
159 CONSTANT: word-break-classes H{
160 { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
161 { "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
162 { "ALetter" 7 } { "MidLetter" 8 }
163 { "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
164 { "ExtendNumLet" 12 }
167 "word-break-table" create-word-in
168 "vocab:unicode/data/WordBreakProperty.txt"
169 load-interval-file dup array>>
170 [ 2 swap [ word-break-classes at ] change-nth ] each
! Looks char up in word-break-table and returns its word-break class,
! falling back to wOther when the lookup yields f.
: word-break-prop ( char -- word-break-prop )
    word-break-table interval-at
    [ wOther ] unless* ;
! Marker symbols that make-word-table stores in table cells in place of
! 0 or 1, for class pairs that it registers with set-table.
178 SYMBOL: check-letter-before
179 SYMBOL: check-letter-after
180 SYMBOL: check-number-before
181 SYMBOL: check-number-after
! Fills the `table` variable with the pairwise word-break rules.
! (set-table) only writes cells that are still f, so earlier statements
! win over later ones; the order below is significant.
: make-word-table ( -- )
    ! CR followed by LF is connected before the blanket disconnects below.
    { wCR } { wLF } connect

    ! Newline, CR and LF are disconnected from every class on either side.
    { wNewline wCR wLF } unicode-words iota disconnect
    unicode-words iota { wNewline wCR wLF } disconnect

    ! Letter / mid-letter pairs store a marker symbol instead of 0 or 1.
    { wALetter } { wMidLetter wMidNumLet } check-letter-after set-table
    { wMidLetter wMidNumLet } { wALetter } check-letter-before set-table

    { wNumeric wALetter } { wNumeric wALetter } connect

    ! Numeric / mid-number pairs likewise store a marker symbol.
    { wNumeric } { wMidNum wMidNumLet } check-number-after set-table
    { wMidNum wMidNumLet } { wNumeric } check-number-before set-table

    { wKatakana } { wKatakana } connect

    ! ExtendNumLet connects to these classes in both directions.
    { wALetter wNumeric wKatakana wExtendNumLet } { wExtendNumLet } connect
    { wExtendNumLet } { wALetter wNumeric wKatakana wExtendNumLet } connect ;
196 : finish-word-table ( -- table )
198 [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
201 "word-table" create-word-in
202 unicode-words init-table table
203 [ make-word-table finish-word-table ] with-variable
207 : word-table-nth ( class1 class2 -- ? )
210 :: property-not= ( str i property -- ? )
212 i str ?nth [ word-break-prop property = not ]
! True when the word-break class is wExtend or wFormat.
: (format/extended?) ( class -- ? )
    { [ wExtend = ] [ wFormat = ] } 1|| ; inline
! True when the word-break class of ch is wExtend or wFormat.
: format/extended? ( ch -- ? )
    word-break-prop
    (format/extended?) ;
! Index of the first character at or after i in str that is not
! format/extended, or f when there is none.
:: (walk-up) ( str i -- j )
    i str [ format/extended? not ] find-from drop ;
! Searches forward from i + 1 for a character that is not
! format/extended, then searches again starting one past that hit.
! Returns the second hit's index, or f if either search fails.
:: walk-up ( str i -- j )
    str i 1 + (walk-up)
    [ str swap 1 + (walk-up) ] [ f ] if* ;
! Index of the last character at or before i in str that is not
! format/extended, or f when there is none.
:: (walk-down) ( str i -- j )
    i str [ format/extended? not ] find-last-from drop ;
! Searches backward from i for a character that is not format/extended,
! then searches again starting one before that hit. Returns the second
! hit's index, or f if either search fails.
:: walk-down ( str i -- j )
    str i (walk-down)
    [ str swap 1 - (walk-down) ] [ f ] if* ;
234 : word-break? ( str i table-entry -- ? )
239 [ dupd walk-up wALetter property-not= ] }
240 { check-letter-before
241 [ dupd walk-down wALetter property-not= ] }
243 [ dupd walk-up wNumeric property-not= ] }
244 { check-number-before
245 [ dupd walk-down wNumeric property-not= ] }
248 :: word-break-next ( old-class new-char i str -- next-class ? )
249 new-char word-break-prop :> new-class
250 new-class (format/extended?)
251 [ old-class dup ${ wCR wLF wNewline } member? ] [
252 new-class old-class over word-table-nth
253 [ str i 1 - ] dip word-break?
258 : first-word ( str -- i )
259 [ [ length ] [ first word-break-prop ] bi ] keep
260 1 swap dup '[ _ word-break-next ] find-index-from
! Splits str into a sequence of word slices, cutting repeatedly at the
! index reported by first-word.
: >words ( str -- words )
    [ first-word ]
    >pieces ;
! Fetches the two adjacent elements of str at indices i - 1 and i,
! leaving them on the stack in that order.
:: nth-next ( i str -- str[i-1] str[i] )
    i 1 - str nth
    i str nth ;
273 : word-break-at? ( i str -- ? )
278 [ nth-next [ word-break-prop ] dip ] 2keep
! Index just past the word that begins at `start` in str:
! first-word of the tail slice, offset by start.
:: first-word-from ( start str -- i )
    str start tail-slice first-word start + ;
! Highest index of str at which word-break-at? reports a break,
! or 0 when no index does.
:: last-word ( str -- i )
    str length iota [ str word-break-at? ] find-last drop
    [ 0 ] unless* ;
! Result of last-word applied to the first `end` elements of str.
:: last-word-from ( end str -- i )
    str end head-slice last-word ;