]> gitweb.factorcode.org Git - factor.git/blob - basis/unicode/breaks/breaks.factor
94a1d3f75249dd1a2ef5f5b11b79b9253a060369
[factor.git] / basis / unicode / breaks / breaks.factor
1 ! Copyright (C) 2008 Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays assocs combinators fry interval-maps
4 kernel literals locals math namespaces parser sequences
5 simple-flat-file unicode.categories unicode.data
6 unicode.normalize.private words words.constant math.order ;
7 IN: unicode.breaks
8
9 <PRIVATE
10
11 <<
12
13 :: load-interval-file-for ( filename n key -- table )
14     filename load-data-file [ n swap nth key = ] filter
15     intern-values expand-ranges ;
16
17 >>
18
19 CONSTANT: emoji-modifier-table $[
20     "resource:basis/unicode/UCD/emoji/emoji-data.txt"
21     1 "Emoji_Modifier" load-interval-file-for
22 ]
23
24 CONSTANT: extended-pictographic-table $[
25     "resource:basis/unicode/UCD/emoji/emoji-data.txt"
26     1 "Extended_Pictographic" load-interval-file-for
27 ]
28
29 CONSTANT: spacing-mark-exceptions-table $[
30     {
31         0x102B 0x102C 0x1038 { 0x1062 0x1064 } { 0x1067 0x106D }
32         0x1083 { 0x1087 0x108C } 0x108F { 0x109A 0x109C } 0x1A61
33         0x1A63 0x1A64 0xAA7B 0xAA7D 0x11720 0x11721
34     } <interval-set>
35 ]
36
37 ! Grapheme breaks
38 <<
39 CONSTANT: Any 0
40 CONSTANT: L 1
41 CONSTANT: V 2
42 CONSTANT: T 3
43 CONSTANT: LV 4
44 CONSTANT: LVT 5
45 CONSTANT: Extend 6
46 CONSTANT: Control 7
47 CONSTANT: CR 8
48 CONSTANT: LF 9
49 CONSTANT: SpacingMark 10
50 CONSTANT: Prepend 11
51 CONSTANT: ZWJ 12
52 CONSTANT: Extended_Pictographic 13
53 CONSTANT: (Extended_Pictographic-Extend*-)ZWJ 14
54 CONSTANT: Regional_Indicator(even) 15
55 CONSTANT: Regional_Indicator(odd) 16
56 CONSTANT: graphemes 17
57
58 : jamo-class ( ch -- class )
59     dup initial? [ drop L ]
60     [ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
61
62 : hangul-class ( ch -- class )
63     hangul-base - 0x1C mod zero? LV LVT ? ;
64
65 CATEGORY: extend
66     Me Mn |
67     "Other_Grapheme_Extend" property? ;
68
69 CATEGORY: grapheme-control Zl Zp Cc Cf ;
70
71 : control-class ( ch -- class )
72     {
73         { [ dup CHAR: \r = ]  [ drop CR ] }
74         { [ dup CHAR: \n = ] [ drop LF ] }
75         { [ dup 0x200C = ] [ drop Extend ] }
76         { [ dup 0x200D = ] [ drop ZWJ ] }
77         { [ dup "Other_Grapheme_Extend" property? ] [ drop Extend ] }
78         [ drop Control ]
79     } cond ;
80
81 : loe? ( ch -- ? )
82     "Logical_Order_Exception" property? ;
83
84 CATEGORY: spacing Mc ;
85
86 : regional? ( ch -- ? )
87     "Regional_Indicator" property? ;
88 >>
89
90 : modifier? ( ch -- ? )
91     emoji-modifier-table interval-key? ; inline
92
93 :: grapheme-class ( str -- class )
94     str last
95     {
96         { [ dup jamo? ] [ jamo-class ] }
97         { [ dup hangul? ] [ hangul-class ] }
98         { [ dup grapheme-control? ] [
99               control-class dup ZWJ = [
100                   drop
101                   str unclip-last-slice drop dup [
102                       [ extend? ]
103                       [ control-class Extend = ]
104                       [ modifier? ]
105                       tri or or not
106                   ] find-last drop [ swap ?nth ] [ last ] if*
107                   extended-pictographic-table interval-key? [
108                       (Extended_Pictographic-Extend*-)ZWJ
109                   ] [ ZWJ ] if
110               ] when
111           ] }
112         { [ dup extend? ] [ drop Extend ] }
113         { [ dup modifier? ] [ drop Extend ] }
114         { [ dup spacing? ] [
115                spacing-mark-exceptions-table
116                interval-key? [ Any ] [ SpacingMark ] if ] }
117         { [ dup loe? ] [ drop Prepend ] }
118         { [ dup regional? ] [
119               drop
120               f :> ri-even?!
121               str unclip-last-slice drop [
122                   regional? [ ri-even? not ri-even?! f ] [ t ] if
123               ] find-last 2drop
124               ri-even? [
125                   Regional_Indicator(even)
126               ] [
127                   Regional_Indicator(odd)
128               ] if
129           ] }
130         { [ dup extended-pictographic-table interval-key? ] [
131               drop Extended_Pictographic
132           ] }
133         [ drop Any ]
134     } cond ;
135
136 <<
137 : init-table ( size -- table )
138     dup [ f <array> ] curry replicate ;
139
140 SYMBOL: table
141
142 : finish-table ( -- table )
143     table get [ [ 1 = ] map ] map ;
144
145 : eval-seq ( seq -- seq )
146     [ dup word? [ execute( -- x ) ] when ] map ;
147
148 : (set-table) ( class1 class2 val -- )
149     [ table get nth ] dip '[ _ or ] change-nth ;
150
151 : set-table ( classes1 classes2 val -- )
152     [ [ eval-seq ] bi@ ] dip
153     [ [ (set-table) ] curry with each ] 2curry each ;
154
155 : connect ( class1 class2 -- ) 1 set-table ;
156 : disconnect ( class1 class2 -- ) 0 set-table ;
157
158 : make-grapheme-table ( -- )
159     { CR } { LF } connect                                                       ! GB3
160     { Control CR LF } graphemes <iota> disconnect                               ! GB4
161     graphemes <iota> { Control CR LF } disconnect                               ! GB5
162     { L } { L V LV LVT } connect                                                ! GB6
163     { LV V } { V T } connect                                                    ! GB7
164     { LVT T } { T } connect                                                     ! GB8
165     graphemes <iota> { Extend ZWJ (Extended_Pictographic-Extend*-)ZWJ } connect ! GB9
166     graphemes <iota> { SpacingMark } connect                                    ! GB9a
167     { Prepend } graphemes <iota> connect                                        ! GB9b
168     { (Extended_Pictographic-Extend*-)ZWJ } { Extended_Pictographic } connect   ! GB11
169     { Regional_Indicator(odd) } { Regional_Indicator(even) } connect ;          ! GB12,13
170 >>
171
172 CONSTANT: grapheme-table $[
173     graphemes init-table table
174     [ make-grapheme-table finish-table ] with-variable
175 ]
176
177 : grapheme-break? ( class1 class2 -- ? )
178     grapheme-table nth nth not ;
179
180 ! Word breaks
181 <<
182 CONSTANT: wOther 0
183 CONSTANT: wCR 1
184 CONSTANT: wLF 2
185 CONSTANT: wNewline 3
186 CONSTANT: wExtend 4
187 CONSTANT: wZWJ 5
188 CONSTANT: wRegional_Indicator 6
189 CONSTANT: wFormat 7
190 CONSTANT: wKatakana 8
191 CONSTANT: wHebrew_Letter 9
192 CONSTANT: wALetter 10
193 CONSTANT: wSingle_Quote 11
194 CONSTANT: wDouble_Quote 12
195 CONSTANT: wMidNumLet 13
196 CONSTANT: wMidLetter 14
197 CONSTANT: wMidNum 15
198 CONSTANT: wNumeric 16
199 CONSTANT: wExtendNumLet 17
200 CONSTANT: wWSegSpace 18
201 CONSTANT: unicode-words 19
202 >>
203
204 <<
205 CONSTANT: word-break-table $[
206     "resource:basis/unicode/UCD/auxiliary/WordBreakProperty.txt"
207     load-interval-file dup array>> [
208         2 swap [
209             {
210                 { "Other" [ wOther ] }
211                 { "CR" [ wCR ] }
212                 { "LF" [ wLF ] }
213                 { "Newline" [ wNewline ] }
214                 { "Extend" [ wExtend ] }
215                 { "ZWJ" [ wZWJ ]  }
216                 { "Regional_Indicator" [ wRegional_Indicator ] }
217                 { "Format" [ wFormat ] }
218                 { "Katakana" [ wKatakana ] }
219                 { "Hebrew_Letter" [ wHebrew_Letter ] }
220                 { "ALetter" [ wALetter ] }
221                 { "Single_Quote" [ wSingle_Quote ] }
222                 { "Double_Quote" [ wDouble_Quote ] }
223                 { "MidNumLet" [ wMidNumLet ] }
224                 { "MidLetter" [ wMidLetter ] }
225                 { "MidNum" [ wMidNum ] }
226                 { "Numeric" [ wNumeric ] }
227                 { "ExtendNumLet" [ wExtendNumLet ] }
228                 { "WSegSpace" [ wWSegSpace ] }
229             } case
230         ] change-nth
231     ] each
232 ]
233 >>
234
235 : word-break-prop ( char -- word-break-prop )
236     word-break-table interval-at wOther or ;
237
238 <<
239 SYMBOL: check-AHletter-before
240 SYMBOL: check-AHletter-after
241 SYMBOL: check-Hebrew-letter-before
242 SYMBOL: check-Hebrew-letter-after
243 SYMBOL: check-number-before
244 SYMBOL: check-number-after
245 SYMBOL: check-Extended_Pictographic
246 SYMBOL: check-RI-pair
247
248 : make-word-table ( -- )
249     { wCR } { wLF } connect                                                   ! WB3
250     { wNewline                                                                ! WB3a
251       wCR
252       wLF } unicode-words <iota> disconnect
253     unicode-words <iota> { wNewline                                           ! WB3b
254                            wCR
255                            wLF } disconnect
256     { wZWJ } unicode-words <iota> check-Extended_Pictographic set-table       ! WB3c
257     { wWSegSpace } { wWSegSpace } connect                                     ! WB3d
258     unicode-words <iota> { wZWJ } connect                                     ! WB4
259     { wALetter                                                                ! WB5
260       wHebrew_Letter } { wALetter
261                          wHebrew_Letter } connect
262     { wALetter                                                                ! WB6
263       wHebrew_Letter } { wMidLetter
264                          wMidNumLet
265                          wSingle_Quote } check-AHletter-after set-table
266     { wMidLetter                                                              ! WB7
267       wMidNumLet
268       wSingle_Quote } { wALetter
269                         wHebrew_Letter } check-AHletter-before set-table
270     { wHebrew_Letter } { wSingle_Quote } connect                              ! WB7a
271     { wHebrew_Letter } { wDouble_Quote } check-Hebrew-letter-after set-table  ! WB7b 
272     { wDouble_Quote } { wHebrew_Letter } check-Hebrew-letter-before set-table ! WB7c 
273     { wNumeric } { wNumeric } connect                                         ! WB8
274     { wALetter
275       wHebrew_Letter } { wNumeric } connect                                   ! WB9
276     { wNumeric } { wALetter                                                   ! WB10
277                    wHebrew_Letter } connect
278     { wMidNum                                                                 ! WB11
279       wMidNumLet
280       wSingle_Quote } { wNumeric } check-number-before set-table
281     { wNumeric } { wMidNum                                                    ! WB12
282                    wMidNumLet
283                    wSingle_Quote } check-number-after set-table
284     { wKatakana } { wKatakana } connect                                       ! WB13
285     { wALetter                                                                ! WB13a 
286       wHebrew_Letter
287       wNumeric
288       wKatakana
289       wExtendNumLet } { wExtendNumLet } connect
290     { wExtendNumLet } { wALetter                                              ! WB13b
291                         wHebrew_Letter
292                         wNumeric
293                         wKatakana } connect
294     { wRegional_Indicator } { wRegional_Indicator } check-RI-pair set-table ; ! WB15,16
295
296 : finish-word-table ( -- table )
297     table get [
298         [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
299     ] map ;
300 >>
301
302 <<
303 CONSTANT: word-table $[
304     unicode-words init-table table
305     [ make-word-table finish-word-table ] with-variable
306 ]
307 >>
308
309 : word-table-nth ( class1 class2 -- ? )
310     word-table nth nth ;
311
312 :: property-not= ( str i property -- ? )
313     i [
314         i str ?nth [ word-break-prop property = not ]
315         [ f ] if*
316     ] [ t ] if ;
317
318 : (format/extended?) ( class -- ? )
319     ${ wExtend wFormat } member? ; inline                                     ! WB4
320
321 : format/extended? ( ch -- ? )
322     word-break-prop (format/extended?) ;
323
324 : (format/extended/zwj?) ( class -- ? )
325     ${ wExtend wFormat wZWJ } member? ; inline                                ! WB4
326
327 : format/extended/zwj? ( ch -- ? )
328     word-break-prop (format/extended/zwj?) ;
329
330 : (walk-up) ( str i -- j )
331     swap [ format/extended/zwj? not ] find-from drop ;
332
333 : walk-up ( str i -- j )
334     dupd 1 + (walk-up) [ 1 + (walk-up) ] [ drop f ] if* ;
335
336 : (walk-down) ( str i -- j )
337     swap [ format/extended/zwj? not ] find-last-from drop ;
338
339 : walk-down ( str i -- j )
340     dupd (walk-down) [ 1 - (walk-down) ] [ drop f ] if* ;
341
342 :: word-break? ( str i table-entry -- ? )
343     str i table-entry
344     {
345         { t [ 2drop f ] }
346         { f [ 2drop t ] }
347         { check-AHletter-after
348           [ dupd walk-up
349             [ wALetter property-not= ] [ wHebrew_Letter property-not= ] 2bi or ] }
350         { check-AHletter-before
351           [ dupd walk-down
352             [ wALetter property-not= ] [ wHebrew_Letter property-not= ] 2bi or ] }
353         { check-Hebrew-letter-after
354           [ dupd walk-up wHebrew_Letter property-not= ] }
355         { check-Hebrew-letter-before
356           [ dupd walk-down wHebrew_Letter property-not= ] }
357         { check-number-after
358           [ dupd walk-up wNumeric property-not= ] }
359         { check-number-before
360           [ dupd walk-down wNumeric property-not= ] }
361         { check-Extended_Pictographic
362           [ swap ?nth extended-pictographic-table interval-key? ] }
363         { check-RI-pair [
364               2drop 
365               f :> ri-even?!
366               i str [
367                   regional? [ ri-even? not ri-even?! f ] [ t ] if
368               ] find-last-from 2drop
369               ri-even? not
370           ] }
371     } case ;
372
373 :: word-break-next ( old-class new-char i str -- next-class ? )
374     new-char word-break-prop :> new-class
375     new-class (format/extended?)
376     [ old-class dup ${ wCR wLF wNewline } member? ] [
377         new-class old-class over word-table-nth
378         [ str i 1 - ] dip word-break?
379     ] if ;
380
381 PRIVATE>