1 ! Copyright (C) 2008 Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays assocs combinators fry interval-maps
4 kernel literals locals math namespaces parser sequences
5 simple-flat-file unicode.categories unicode.data
6 unicode.normalize.private words words.constant math.order ;
13 :: load-interval-file-for ( filename n key -- table )
14 filename load-data-file [ n swap nth key = ] filter
15 intern-values expand-ranges ;
19 CONSTANT: emoji-modifier-table $[
20 "resource:basis/unicode/UCD/auxiliary/emoji-data.txt"
21 1 "Emoji_Modifier" load-interval-file-for
24 CONSTANT: extended-pictographic-table $[
25 "resource:basis/unicode/UCD/auxiliary/emoji-data.txt"
26 1 "Extended_Pictographic" load-interval-file-for
29 CONSTANT: spacing-mark-exceptions-table $[
31 0x102B 0x102C 0x1038 { 0x1062 0x1064 } { 0x1067 0x106D }
32 0x1083 { 0x1087 0x108C } 0x108F { 0x109A 0x109C } 0x1A61
33 0x1A63 0x1A64 0xAA7B 0xAA7D 0x11720 0x11721
49 CONSTANT: SpacingMark 10
52 CONSTANT: Extended_Pictographic 13
53 CONSTANT: (Extended_Pictographic-Extend*-)ZWJ 14
54 CONSTANT: Regional_Indicator(even) 15
55 CONSTANT: Regional_Indicator(odd) 16
56 CONSTANT: graphemes 17
58 : jamo-class ( ch -- class )
59 dup initial? [ drop L ]
60 [ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
62 : hangul-class ( ch -- class )
63 hangul-base - 0x1C mod zero? LV LVT ? ;
67 "Other_Grapheme_Extend" property? ;
69 CATEGORY: grapheme-control Zl Zp Cc Cf ;
71 : control-class ( ch -- class )
73 { [ dup CHAR: \r = ] [ drop CR ] }
74 { [ dup CHAR: \n = ] [ drop LF ] }
75 { [ dup 0x200C = ] [ drop Extend ] }
76 { [ dup 0x200D = ] [ drop ZWJ ] }
77 { [ dup "Other_Grapheme_Extend" property? ] [ drop Extend ] }
82 "Logical_Order_Exception" property? ;
84 CATEGORY: spacing Mc ;
86 : regional? ( ch -- ? )
87 "Regional_Indicator" property? ;
90 : modifier? ( ch -- ? )
91 emoji-modifier-table interval-key? ; inline
93 :: grapheme-class ( str -- class )
96 { [ dup jamo? ] [ jamo-class ] }
97 { [ dup hangul? ] [ hangul-class ] }
98 { [ dup grapheme-control? ] [
99 control-class dup ZWJ = [
101 str unclip-last-slice drop dup [
103 [ control-class Extend = ]
106 ] find-last drop [ swap ?nth ] [ last ] if*
107 extended-pictographic-table interval-key? [
108 (Extended_Pictographic-Extend*-)ZWJ
112 { [ dup extend? ] [ drop Extend ] }
113 { [ dup modifier? ] [ drop Extend ] }
115 spacing-mark-exceptions-table
116 interval-key? [ Any ] [ SpacingMark ] if ] }
117 { [ dup loe? ] [ drop Prepend ] }
118 { [ dup regional? ] [
121 str unclip-last-slice drop [
122 regional? [ ri-even? not ri-even?! f ] [ t ] if
125 Regional_Indicator(even)
127 Regional_Indicator(odd)
130 { [ dup extended-pictographic-table interval-key? ] [
131 drop Extended_Pictographic
137 : init-table ( size -- table )
138 dup [ f <array> ] curry replicate ;
142 : finish-table ( -- table )
143 table get [ [ 1 = ] map ] map ;
145 : eval-seq ( seq -- seq )
146 [ dup word? [ execute( -- x ) ] when ] map ;
148 : (set-table) ( class1 class2 val -- )
149 [ table get nth ] dip '[ _ or ] change-nth ;
151 : set-table ( classes1 classes2 val -- )
152 [ [ eval-seq ] bi@ ] dip
153 [ [ (set-table) ] curry with each ] 2curry each ;
155 : connect ( class1 class2 -- ) 1 set-table ;
156 : disconnect ( class1 class2 -- ) 0 set-table ;
158 : make-grapheme-table ( -- )
159 { CR } { LF } connect ! GB3
160 { Control CR LF } graphemes <iota> disconnect ! GB4
161 graphemes <iota> { Control CR LF } disconnect ! GB5
162 { L } { L V LV LVT } connect ! GB6
163 { LV V } { V T } connect ! GB7
164 { LVT T } { T } connect ! GB8
165 graphemes <iota> { Extend ZWJ (Extended_Pictographic-Extend*-)ZWJ } connect ! GB9
166 graphemes <iota> { SpacingMark } connect ! GB9a
167 { Prepend } graphemes <iota> connect ! GB9b
168 { (Extended_Pictographic-Extend*-)ZWJ } { Extended_Pictographic } connect ! GB11
169 { Regional_Indicator(odd) } { Regional_Indicator(even) } connect ; ! GB12,13
172 CONSTANT: grapheme-table $[
173 graphemes init-table table
174 [ make-grapheme-table finish-table ] with-variable
177 : grapheme-break? ( class1 class2 -- ? )
178 grapheme-table nth nth not ;
188 CONSTANT: wRegional_Indicator 6
190 CONSTANT: wKatakana 8
191 CONSTANT: wHebrew_Letter 9
192 CONSTANT: wALetter 10
193 CONSTANT: wSingle_Quote 11
194 CONSTANT: wDouble_Quote 12
195 CONSTANT: wMidNumLet 13
196 CONSTANT: wMidLetter 14
198 CONSTANT: wNumeric 16
199 CONSTANT: wExtendNumLet 17
200 CONSTANT: wWSegSpace 18
201 CONSTANT: unicode-words 19
205 CONSTANT: word-break-table $[
206 "resource:basis/unicode/UCD/auxiliary/WordBreakProperty.txt"
207 load-interval-file dup array>> [
210 { "Other" [ wOther ] }
213 { "Newline" [ wNewline ] }
214 { "Extend" [ wExtend ] }
216 { "Regional_Indicator" [ wRegional_Indicator ] }
217 { "Format" [ wFormat ] }
218 { "Katakana" [ wKatakana ] }
219 { "Hebrew_Letter" [ wHebrew_Letter ] }
220 { "ALetter" [ wALetter ] }
221 { "Single_Quote" [ wSingle_Quote ] }
222 { "Double_Quote" [ wDouble_Quote ] }
223 { "MidNumLet" [ wMidNumLet ] }
224 { "MidLetter" [ wMidLetter ] }
225 { "MidNum" [ wMidNum ] }
226 { "Numeric" [ wNumeric ] }
227 { "ExtendNumLet" [ wExtendNumLet ] }
228 { "WSegSpace" [ wWSegSpace ] }
235 : word-break-prop ( char -- word-break-prop )
236 word-break-table interval-at wOther or ;
239 SYMBOL: check-AHletter-before
240 SYMBOL: check-AHletter-after
241 SYMBOL: check-Hebrew-letter-before
242 SYMBOL: check-Hebrew-letter-after
243 SYMBOL: check-number-before
244 SYMBOL: check-number-after
245 SYMBOL: check-Extended_Pictographic
246 SYMBOL: check-RI-pair
248 : make-word-table ( -- )
249 { wCR } { wLF } connect ! WB3
252 wLF } unicode-words <iota> disconnect
253 unicode-words <iota> { wNewline ! WB3b
256 { wZWJ } unicode-words <iota> check-Extended_Pictographic set-table ! WB3c
257 { wWSegSpace } { wWSegSpace } connect ! WB3d
258 unicode-words <iota> { wZWJ } connect ! WB4
260 wHebrew_Letter } { wALetter
261 wHebrew_Letter } connect
263 wHebrew_Letter } { wMidLetter
265 wSingle_Quote } check-AHletter-after set-table
268 wSingle_Quote } { wALetter
269 wHebrew_Letter } check-AHletter-before set-table
270 { wHebrew_Letter } { wSingle_Quote } connect ! WB7a
271 { wHebrew_Letter } { wDouble_Quote } check-Hebrew-letter-after set-table ! WB7b
272 { wDouble_Quote } { wHebrew_Letter } check-Hebrew-letter-before set-table ! WB7c
273 { wNumeric } { wNumeric } connect ! WB8
275 wHebrew_Letter } { wNumeric } connect ! WB9
276 { wNumeric } { wALetter ! WB10
277 wHebrew_Letter } connect
280 wSingle_Quote } { wNumeric } check-number-before set-table
281 { wNumeric } { wMidNum ! WB12
283 wSingle_Quote } check-number-after set-table
284 { wKatakana } { wKatakana } connect ! WB13
289 wExtendNumLet } { wExtendNumLet } connect
290 { wExtendNumLet } { wALetter ! WB13b
294 { wRegional_Indicator } { wRegional_Indicator } check-RI-pair set-table ; ! WB15,16
296 : finish-word-table ( -- table )
298 [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
303 CONSTANT: word-table $[
304 unicode-words init-table table
305 [ make-word-table finish-word-table ] with-variable
309 : word-table-nth ( class1 class2 -- ? )
312 :: property-not= ( str i property -- ? )
314 i str ?nth [ word-break-prop property = not ]
318 : (format/extended?) ( class -- ? )
319 ${ wExtend wFormat } member? ; inline ! WB4
321 : format/extended? ( ch -- ? )
322 word-break-prop (format/extended?) ;
324 : (format/extended/zwj?) ( class -- ? )
325 ${ wExtend wFormat wZWJ } member? ; inline ! WB4
327 : format/extended/zwj? ( ch -- ? )
328 word-break-prop (format/extended/zwj?) ;
330 : (walk-up) ( str i -- j )
331 swap [ format/extended/zwj? not ] find-from drop ;
333 : walk-up ( str i -- j )
334 dupd 1 + (walk-up) [ 1 + (walk-up) ] [ drop f ] if* ;
336 : (walk-down) ( str i -- j )
337 swap [ format/extended/zwj? not ] find-last-from drop ;
339 : walk-down ( str i -- j )
340 dupd (walk-down) [ 1 - (walk-down) ] [ drop f ] if* ;
342 :: word-break? ( str i table-entry -- ? )
347 { check-AHletter-after
349 [ wALetter property-not= ] [ wHebrew_Letter property-not= ] 2bi or ] }
350 { check-AHletter-before
352 [ wALetter property-not= ] [ wHebrew_Letter property-not= ] 2bi or ] }
353 { check-Hebrew-letter-after
354 [ dupd walk-up wHebrew_Letter property-not= ] }
355 { check-Hebrew-letter-before
356 [ dupd walk-down wHebrew_Letter property-not= ] }
358 [ dupd walk-up wNumeric property-not= ] }
359 { check-number-before
360 [ dupd walk-down wNumeric property-not= ] }
361 { check-Extended_Pictographic
362 [ swap ?nth extended-pictographic-table interval-key? ] }
367 regional? [ ri-even? not ri-even?! f ] [ t ] if
368 ] find-last-from 2drop
373 :: word-break-next ( old-class new-char i str -- next-class ? )
374 new-char word-break-prop :> new-class
375 new-class (format/extended?)
376 [ old-class dup ${ wCR wLF wNewline } member? ] [
377 new-class old-class over word-table-nth
378 [ str i 1 - ] dip word-break?