1 ! Copyright (C) 2008 Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors alien.syntax arrays assocs combinators
4 combinators.short-circuit compiler.units fry interval-maps io
5 io.encodings.ascii io.files kernel literals locals make math
6 math.parser math.ranges memoize namespaces parser sequences
7 sets simple-flat-file splitting unicode.categories
8 unicode.categories.syntax unicode.data unicode.normalize
9 unicode.normalize.private words words.constant ;
26 CONSTANT: SpacingMark 10
28 CONSTANT: graphemes 12
30 : jamo-class ( ch -- class )
31 dup initial? [ drop L ]
32 [ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
34 : hangul-class ( ch -- class )
35 hangul-base - 0x1C mod zero? LV LVT ? ;
37 CATEGORY: grapheme-control Zl Zp Cc Cf ;
38 : control-class ( ch -- class )
49 "Other_Grapheme_Extend" property? ;
52 "Logical_Order_Exception" property? ;
54 CATEGORY: spacing Mc ;
56 : grapheme-class ( ch -- class )
58 { [ dup jamo? ] [ jamo-class ] }
59 { [ dup hangul? ] [ hangul-class ] }
60 { [ dup grapheme-control? ] [ control-class ] }
61 { [ dup extend? ] [ drop Extend ] }
62 { [ dup spacing? ] [ drop SpacingMark ] }
63 { [ loe? ] [ Prepend ] }
67 : init-table ( size -- table )
68 dup [ f <array> ] curry replicate ;
72 : finish-table ( -- table )
73 table get [ [ 1 = ] map ] map ;
75 : eval-seq ( seq -- seq )
76 [ dup word? [ execute( -- x ) ] when ] map ;
78 : (set-table) ( class1 class2 val -- )
79 [ table get nth ] dip '[ _ or ] change-nth ;
81 : set-table ( classes1 classes2 val -- )
82 [ [ eval-seq ] bi@ ] dip
83 [ [ (set-table) ] curry with each ] 2curry each ;
85 : connect ( class1 class2 -- ) 1 set-table ;
86 : disconnect ( class1 class2 -- ) 0 set-table ;
88 : make-grapheme-table ( -- )
90 { Control CR LF } graphemes iota disconnect
91 graphemes iota { Control CR LF } disconnect
92 { L } { L V LV LVT } connect
93 { LV V } { V T } connect
94 { LVT T } { T } connect
95 graphemes iota { Extend } connect
96 graphemes iota { SpacingMark } connect
97 { Prepend } graphemes iota connect ;
99 "grapheme-table" create-word-in
100 graphemes init-table table
101 [ make-grapheme-table finish-table ] with-variable
105 : grapheme-break? ( class1 class2 -- ? )
106 grapheme-table nth nth not ;
110 : first-grapheme ( str -- i )
111 unclip-slice grapheme-class over
112 [ grapheme-class [ nip ] [ grapheme-break? ] 2bi ] find drop
113 nip swap length or 1 + ;
115 : first-grapheme-from ( start str -- i )
116 over tail-slice first-grapheme + ;
118 : last-grapheme ( str -- i )
119 unclip-last-slice grapheme-class swap
120 [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
122 : last-grapheme-from ( end str -- i )
123 swap head-slice last-grapheme ;
127 : >pieces ( str quot: ( str -- i ) -- graphemes )
128 [ dup empty? not ] swap '[ dup @ cut-slice swap ] produce nip ; inline
132 : >graphemes ( str -- graphemes )
133 [ first-grapheme ] >pieces ;
135 : string-reverse ( str -- rts )
136 >graphemes reverse! concat ;
148 CONSTANT: wKatakana 6
150 CONSTANT: wMidLetter 8
152 CONSTANT: wMidNumLet 10
153 CONSTANT: wNumeric 11
154 CONSTANT: wExtendNumLet 12
155 CONSTANT: unicode-words 13
157 ! Is there a way to avoid this?
158 CONSTANT: word-break-classes H{
159 { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
160 { "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
161 { "ALetter" 7 } { "MidLetter" 8 }
162 { "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
163 { "ExtendNumLet" 12 }
166 "word-break-table" create-word-in
167 "vocab:unicode/data/WordBreakProperty.txt"
168 load-interval-file dup array>>
169 [ 2 swap [ word-break-classes at ] change-nth ] each
173 : word-break-prop ( char -- word-break-prop )
174 word-break-table interval-at wOther or ;
177 SYMBOL: check-letter-before
178 SYMBOL: check-letter-after
179 SYMBOL: check-number-before
180 SYMBOL: check-number-after
182 : make-word-table ( -- )
183 { wCR } { wLF } connect
184 { wNewline wCR wLF } unicode-words iota disconnect
185 unicode-words iota { wNewline wCR wLF } disconnect
186 { wALetter } { wMidLetter wMidNumLet } check-letter-after set-table
187 { wMidLetter wMidNumLet } { wALetter } check-letter-before set-table
188 { wNumeric wALetter } { wNumeric wALetter } connect
189 { wNumeric } { wMidNum wMidNumLet } check-number-after set-table
190 { wMidNum wMidNumLet } { wNumeric } check-number-before set-table
191 { wKatakana } { wKatakana } connect
192 { wALetter wNumeric wKatakana wExtendNumLet } { wExtendNumLet }
193 [ connect ] [ swap connect ] 2bi ;
195 : finish-word-table ( -- table )
197 [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
200 "word-table" create-word-in
201 unicode-words init-table table
202 [ make-word-table finish-word-table ] with-variable
206 : word-table-nth ( class1 class2 -- ? )
209 :: property-not= ( str i property -- ? )
211 i str ?nth [ word-break-prop property = not ]
215 : (format/extended?) ( class -- ? )
216 ${ wExtend wFormat } member? ; inline
218 : format/extended? ( ch -- ? )
219 word-break-prop (format/extended?) ;
221 : (walk-up) ( str i -- j )
222 swap [ format/extended? not ] find-from drop ;
224 : walk-up ( str i -- j )
225 dupd 1 + (walk-up) [ 1 + (walk-up) ] [ drop f ] if* ;
227 : (walk-down) ( str i -- j )
228 swap [ format/extended? not ] find-last-from drop ;
230 : walk-down ( str i -- j )
231 dupd (walk-down) [ 1 - (walk-down) ] [ drop f ] if* ;
233 : word-break? ( str i table-entry -- ? )
238 [ dupd walk-up wALetter property-not= ] }
239 { check-letter-before
240 [ dupd walk-down wALetter property-not= ] }
242 [ dupd walk-up wNumeric property-not= ] }
243 { check-number-before
244 [ dupd walk-down wNumeric property-not= ] }
247 :: word-break-next ( old-class new-char i str -- next-class ? )
248 new-char word-break-prop :> new-class
249 new-class (format/extended?)
250 [ old-class dup ${ wCR wLF wNewline } member? ] [
251 new-class old-class over word-table-nth
252 [ str i 1 - ] dip word-break?
257 : first-word ( str -- i )
258 [ [ length ] [ first word-break-prop ] bi ] keep
259 1 swap dup '[ _ word-break-next ] find-index-from
262 : >words ( str -- words )
263 [ first-word ] >pieces ;
267 : nth-next ( i str -- str[i-1] str[i] )
268 [ [ 1 - ] keep ] dip '[ _ nth ] bi@ ;
272 : word-break-at? ( i str -- ? )
277 [ nth-next [ word-break-prop ] dip ] 2keep
282 : first-word-from ( start str -- i )
283 over tail-slice first-word + ;
285 : last-word ( str -- i )
286 [ length iota ] keep '[ _ word-break-at? ] find-last drop 0 or ;
288 : last-word-from ( end str -- i )
289 swap head-slice last-word ;