]> gitweb.factorcode.org Git - factor.git/blob - basis/unicode/breaks/breaks.factor
UNICODE TEXT SEGMENTATION 12.0.0
[factor.git] / basis / unicode / breaks / breaks.factor
1 ! Copyright (C) 2008 Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays assocs combinators fry interval-maps
4 kernel literals locals math namespaces parser sequences
5 simple-flat-file unicode.categories unicode.data
6 unicode.normalize.private words words.constant math.order ;
7 IN: unicode.breaks
8
9 <PRIVATE
10
11 ! Grapheme breaks
12 <<
13 CONSTANT: Any 0
14 CONSTANT: L 1
15 CONSTANT: V 2
16 CONSTANT: T 3
17 CONSTANT: LV 4
18 CONSTANT: LVT 5
19 CONSTANT: Extend 6
20 CONSTANT: Control 7
21 CONSTANT: CR 8
22 CONSTANT: LF 9
23 CONSTANT: SpacingMark 10
24 CONSTANT: Prepend 11
25 CONSTANT: ZWJ 12
26 CONSTANT: Extended_Pictographic 13
27 CONSTANT: (Extended_Pictographic-Extend*-)ZWJ 14
28 CONSTANT: Regional_Indicator(even) 15
29 CONSTANT: Regional_Indicator(odd) 16
30 CONSTANT: graphemes 17
31
32 :: load-interval-file-for ( filename n key -- table )
33     filename load-data-file [ n swap nth key = ] filter
34     intern-values expand-ranges ;
35
36 "emoji-modifier-table" create-word-in
37 "resource:basis/unicode/UCD/auxiliary/emoji-data.txt"
38 1 "Emoji_Modifier" load-interval-file-for
39 define-constant
40
41 "extended-pictographic-table" create-word-in
42 "resource:basis/unicode/UCD/auxiliary/emoji-data.txt"
43 1 "Extended_Pictographic" load-interval-file-for
44 define-constant
45
46 "spacing-mark-exceptions-table" create-word-in
47 {
48     0x102B 0x102C 0x1038 { 0x1062 0x1064 } { 0x1067 0x106D }
49     0x1083 { 0x1087 0x108C } 0x108F { 0x109A 0x109C } 0x1A61
50     0x1A63 0x1A64 0xAA7B 0xAA7D 0x11720 0x11721
51 }
52 <interval-set> define-constant
53
54 : jamo-class ( ch -- class )
55     dup initial? [ drop L ]
56     [ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
57
58 : hangul-class ( ch -- class )
59     hangul-base - 0x1C mod zero? LV LVT ? ;
60
61 CATEGORY: extend
62     Me Mn |
63     "Other_Grapheme_Extend" property? ;
64
65 CATEGORY: grapheme-control Zl Zp Cc Cf ;
66
67 : control-class ( ch -- class )
68     {
69         { [ dup CHAR: \r = ]  [ drop CR ] }
70         { [ dup CHAR: \n = ] [ drop LF ] }
71         { [ dup 0x200C = ] [ drop Extend ] }
72         { [ dup 0x200D = ] [ drop ZWJ ] }
73         { [ dup "Other_Grapheme_Extend" property? ] [ drop Extend ] }
74         [ drop Control ]
75     } cond ;
76
77 : loe? ( ch -- ? )
78     "Logical_Order_Exception" property? ;
79
80 CATEGORY: spacing Mc ;
81
82 : regional? ( ch -- ? )
83     "Regional_Indicator" property? ; 
84 >>
85
86 : modifier? ( ch -- ? ) emoji-modifier-table interval-key? ; inline
87
88 :: grapheme-class ( str -- class )
89     str last
90     {
91         { [ dup jamo? ] [ jamo-class ] }
92         { [ dup hangul? ] [ hangul-class ] }
93         { [ dup grapheme-control? ] [
94               control-class dup ZWJ = [
95                   drop
96                   str unclip-last-slice drop dup [
97                       [ extend? ]
98                       [ control-class Extend = ]
99                       [ modifier? ]
100                       tri or or not
101                   ] find-last drop [ swap ?nth ] [ last ] if*
102                   extended-pictographic-table interval-key? [                      
103                       (Extended_Pictographic-Extend*-)ZWJ
104                   ] [ ZWJ ] if
105               ] when               
106           ] }
107         { [ dup extend? ] [ drop Extend ] }
108         { [ dup modifier? ] [ drop Extend ] }
109         { [ dup spacing? ] [ spacing-mark-exceptions-table
110                              interval-key? [ Any ] [ SpacingMark ] if ] }
111         { [ dup loe? ] [ drop Prepend ] }
112         { [ dup regional? ] [
113               drop  
114               f :> ri-even?!
115               str unclip-last-slice drop [
116                   regional? [ ri-even? not ri-even?! f ] [ t ] if
117               ] find-last 2drop
118               ri-even? [
119                   Regional_Indicator(even)
120               ] [
121                   Regional_Indicator(odd)
122               ] if
123           ] }
124         { [ dup extended-pictographic-table interval-key? ] [
125               drop Extended_Pictographic
126           ] }
127         [ drop Any ]
128     } cond ;
129
130 <<
131 : init-table ( size -- table )
132     dup [ f <array> ] curry replicate ;
133
134 SYMBOL: table
135
136 : finish-table ( -- table )
137     table get [ [ 1 = ] map ] map ;
138
139 : eval-seq ( seq -- seq )
140     [ dup word? [ execute( -- x ) ] when ] map ;
141
142 : (set-table) ( class1 class2 val -- )
143     [ table get nth ] dip '[ _ or ] change-nth ;
144
145 : set-table ( classes1 classes2 val -- )
146     [ [ eval-seq ] bi@ ] dip
147     [ [ (set-table) ] curry with each ] 2curry each ;
148
149 : connect ( class1 class2 -- ) 1 set-table ;
150 : disconnect ( class1 class2 -- ) 0 set-table ;
151
152 : make-grapheme-table ( -- )
153     { CR } { LF } connect                                                       ! GB3
154     { Control CR LF } graphemes <iota> disconnect                               ! GB4
155     graphemes <iota> { Control CR LF } disconnect                               ! GB5
156     { L } { L V LV LVT } connect                                                ! GB6
157     { LV V } { V T } connect                                                    ! GB7
158     { LVT T } { T } connect                                                     ! GB8
159     graphemes <iota> { Extend ZWJ (Extended_Pictographic-Extend*-)ZWJ } connect ! GB9
160     graphemes <iota> { SpacingMark } connect                                    ! GB9a
161     { Prepend } graphemes <iota> connect                                        ! GB9b
162     { (Extended_Pictographic-Extend*-)ZWJ } { Extended_Pictographic } connect   ! GB11
163     { Regional_Indicator(odd) } { Regional_Indicator(even) } connect ;          ! GB12,13
164
165 "grapheme-table" create-word-in
166 graphemes init-table table
167 [ make-grapheme-table finish-table ] with-variable
168 define-constant
169 >>
170
171 : grapheme-break? ( class1 class2 -- ? )
172     grapheme-table nth nth not ;
173
174 ! Word breaks
175 <<
176 CONSTANT: wOther 0
177 CONSTANT: wCR 1
178 CONSTANT: wLF 2
179 CONSTANT: wNewline 3
180 CONSTANT: wExtend 4
181 CONSTANT: wZWJ 5
182 CONSTANT: wRegional_Indicator 6
183 CONSTANT: wFormat 7
184 CONSTANT: wKatakana 8
185 CONSTANT: wHebrew_Letter 9
186 CONSTANT: wALetter 10
187 CONSTANT: wSingle_Quote 11
188 CONSTANT: wDouble_Quote 12
189 CONSTANT: wMidNumLet 13
190 CONSTANT: wMidLetter 14
191 CONSTANT: wMidNum 15
192 CONSTANT: wNumeric 16
193 CONSTANT: wExtendNumLet 17
194 CONSTANT: wWSegSpace 18
195 CONSTANT: unicode-words 19
196
197 ! Is there a way to avoid this?
198 CONSTANT: word-break-classes H{
199     { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
200     { "Extend" 4 } { "ZWJ" 5 } { "Regional_Indicator" 6 }
201     { "Format" 7 } { "Katakana" 8 } { "Hebrew_Letter" 9 }
202     { "ALetter" 10 } { "Single_Quote" 11 } { "Double_Quote" 12 }
203     { "MidNumLet" 13 } { "MidLetter" 14 } { "MidNum" 15 }
204     { "Numeric" 16 } { "ExtendNumLet" 17 } { "WSegSpace" 18 }
205 }
206
207 "word-break-table" create-word-in
208 "resource:basis/unicode/UCD/auxiliary/WordBreakProperty.txt"
209 load-interval-file dup array>>
210 [ 2 swap [ word-break-classes at ] change-nth ] each
211 define-constant
212 >>
213
214 : word-break-prop ( char -- word-break-prop )
215     word-break-table interval-at wOther or ;
216
217 <<
218 SYMBOL: check-AHletter-before
219 SYMBOL: check-AHletter-after
220 SYMBOL: check-Hebrew-letter-before
221 SYMBOL: check-Hebrew-letter-after
222 SYMBOL: check-number-before
223 SYMBOL: check-number-after
224 SYMBOL: check-Extended_Pictographic
225 SYMBOL: check-RI-pair
226
227 : make-word-table ( -- )
228     { wCR } { wLF } connect                                                   ! WB3
229     { wNewline                                                                ! WB3a
230       wCR
231       wLF } unicode-words <iota> disconnect
232     unicode-words <iota> { wNewline                                           ! WB3b
233                            wCR
234                            wLF } disconnect
235     { wZWJ } unicode-words <iota> check-Extended_Pictographic set-table       ! WB3c
236     { wWSegSpace } { wWSegSpace } connect                                     ! WB3d
237     unicode-words <iota> { wZWJ } connect                                     ! WB4
238     { wALetter                                                                ! WB5
239       wHebrew_Letter } { wALetter
240                          wHebrew_Letter } connect
241     { wALetter                                                                ! WB6
242       wHebrew_Letter } { wMidLetter
243                          wMidNumLet
244                          wSingle_Quote } check-AHletter-after set-table
245     { wMidLetter                                                              ! WB7
246       wMidNumLet
247       wSingle_Quote } { wALetter
248                         wHebrew_Letter } check-AHletter-before set-table
249     { wHebrew_Letter } { wSingle_Quote } connect                              ! WB7a
250     { wHebrew_Letter } { wDouble_Quote } check-Hebrew-letter-after set-table  ! WB7b 
251     { wDouble_Quote } { wHebrew_Letter } check-Hebrew-letter-before set-table ! WB7c 
252     { wNumeric } { wNumeric } connect                                         ! WB8
253     { wALetter
254       wHebrew_Letter } { wNumeric } connect                                   ! WB9
255     { wNumeric } { wALetter                                                   ! WB10
256                    wHebrew_Letter } connect
257     { wMidNum                                                                 ! WB11
258       wMidNumLet
259       wSingle_Quote } { wNumeric } check-number-before set-table
260     { wNumeric } { wMidNum                                                    ! WB12
261                    wMidNumLet
262                    wSingle_Quote } check-number-after set-table
263     { wKatakana } { wKatakana } connect                                       ! WB13
264     { wALetter                                                                ! WB13a 
265       wHebrew_Letter
266       wNumeric
267       wKatakana
268       wExtendNumLet } { wExtendNumLet } connect
269     { wExtendNumLet } { wALetter                                              ! WB13b
270                         wHebrew_Letter
271                         wNumeric
272                         wKatakana } connect
273     { wRegional_Indicator } { wRegional_Indicator } check-RI-pair set-table ; ! WB15,16
274
275 : finish-word-table ( -- table )
276     table get [
277         [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
278     ] map ;
279
280 "word-table" create-word-in
281 unicode-words init-table table
282 [ make-word-table finish-word-table ] with-variable
283 define-constant
284 >>
285
286 : word-table-nth ( class1 class2 -- ? )
287     word-table nth nth ;
288
289 :: property-not= ( str i property -- ? )
290     i [
291         i str ?nth [ word-break-prop property = not ]
292         [ f ] if*
293     ] [ t ] if ;
294
295 : (format/extended?) ( class -- ? )
296     ${ wExtend wFormat } member? ; inline                                     ! WB4
297
298 : format/extended? ( ch -- ? )
299     word-break-prop (format/extended?) ;
300
301 : (format/extended/zwj?) ( class -- ? )
302     ${ wExtend wFormat wZWJ } member? ; inline                                ! WB4
303
304 : format/extended/zwj? ( ch -- ? )
305     word-break-prop (format/extended/zwj?) ;
306
307 : (walk-up) ( str i -- j )
308     swap [ format/extended/zwj? not ] find-from drop ;
309
310 : walk-up ( str i -- j )
311     dupd 1 + (walk-up) [ 1 + (walk-up) ] [ drop f ] if* ;
312
313 : (walk-down) ( str i -- j )
314     swap [ format/extended/zwj? not ] find-last-from drop ;
315
316 : walk-down ( str i -- j )
317     dupd (walk-down) [ 1 - (walk-down) ] [ drop f ] if* ;
318
319 :: word-break? ( str i table-entry -- ? )
320     str i table-entry
321     {
322         { t [ 2drop f ] }
323         { f [ 2drop t ] }
324         { check-AHletter-after
325           [ dupd walk-up
326             [ wALetter property-not= ] [ wHebrew_Letter property-not= ] 2bi or ] }
327         { check-AHletter-before
328           [ dupd walk-down
329             [ wALetter property-not= ] [ wHebrew_Letter property-not= ] 2bi or ] }
330         { check-Hebrew-letter-after
331           [ dupd walk-up wHebrew_Letter property-not= ] }
332         { check-Hebrew-letter-before
333           [ dupd walk-down wHebrew_Letter property-not= ] }
334         { check-number-after
335           [ dupd walk-up wNumeric property-not= ] }
336         { check-number-before
337           [ dupd walk-down wNumeric property-not= ] }
338         { check-Extended_Pictographic
339           [ swap ?nth extended-pictographic-table interval-key? ] }
340         { check-RI-pair [
341               2drop 
342               f :> ri-even?!
343               i str [
344                   regional? [ ri-even? not ri-even?! f ] [ t ] if
345               ] find-last-from 2drop
346               ri-even? not
347           ] }
348     } case ;
349
350 :: word-break-next ( old-class new-char i str -- next-class ? )
351     new-char word-break-prop :> new-class
352     new-class (format/extended?)
353     [ old-class dup ${ wCR wLF wNewline } member? ] [
354         new-class old-class over word-table-nth
355         [ str i 1 - ] dip word-break?
356     ] if ;
357
358 PRIVATE>