]> gitweb.factorcode.org Git - factor.git/blob - basis/unicode/breaks/breaks.factor
unicode: cleanup by using literals and CONSTANT:.
[factor.git] / basis / unicode / breaks / breaks.factor
1 ! Copyright (C) 2008 Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays assocs combinators fry interval-maps
4 kernel literals locals math namespaces parser sequences
5 simple-flat-file unicode.categories unicode.data
6 unicode.normalize.private words words.constant math.order ;
7 IN: unicode.breaks
8
9 <PRIVATE
10
11 :: load-interval-file-for ( filename n key -- table )
12     filename load-data-file [ n swap nth key = ] filter
13     intern-values expand-ranges ;
14
15 CONSTANT: emoji-modifier-table $[
16     "resource:basis/unicode/UCD/auxiliary/emoji-data.txt"
17     1 "Emoji_Modifier" load-interval-file-for
18 ]
19
20 CONSTANT: extended-pictographic-table $[
21     "resource:basis/unicode/UCD/auxiliary/emoji-data.txt"
22     1 "Extended_Pictographic" load-interval-file-for
23 ]
24
25 CONSTANT: spacing-mark-exceptions-table $[
26     {
27         0x102B 0x102C 0x1038 { 0x1062 0x1064 } { 0x1067 0x106D }
28         0x1083 { 0x1087 0x108C } 0x108F { 0x109A 0x109C } 0x1A61
29         0x1A63 0x1A64 0xAA7B 0xAA7D 0x11720 0x11721
30     } <interval-set>
31 ]
32
33 ! Grapheme breaks
34 <<
35 CONSTANT: Any 0
36 CONSTANT: L 1
37 CONSTANT: V 2
38 CONSTANT: T 3
39 CONSTANT: LV 4
40 CONSTANT: LVT 5
41 CONSTANT: Extend 6
42 CONSTANT: Control 7
43 CONSTANT: CR 8
44 CONSTANT: LF 9
45 CONSTANT: SpacingMark 10
46 CONSTANT: Prepend 11
47 CONSTANT: ZWJ 12
48 CONSTANT: Extended_Pictographic 13
49 CONSTANT: (Extended_Pictographic-Extend*-)ZWJ 14
50 CONSTANT: Regional_Indicator(even) 15
51 CONSTANT: Regional_Indicator(odd) 16
52 CONSTANT: graphemes 17
53
54 : jamo-class ( ch -- class )
55     dup initial? [ drop L ]
56     [ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
57
58 : hangul-class ( ch -- class )
59     hangul-base - 0x1C mod zero? LV LVT ? ;
60
61 CATEGORY: extend
62     Me Mn |
63     "Other_Grapheme_Extend" property? ;
64
65 CATEGORY: grapheme-control Zl Zp Cc Cf ;
66
67 : control-class ( ch -- class )
68     {
69         { [ dup CHAR: \r = ]  [ drop CR ] }
70         { [ dup CHAR: \n = ] [ drop LF ] }
71         { [ dup 0x200C = ] [ drop Extend ] }
72         { [ dup 0x200D = ] [ drop ZWJ ] }
73         { [ dup "Other_Grapheme_Extend" property? ] [ drop Extend ] }
74         [ drop Control ]
75     } cond ;
76
77 : loe? ( ch -- ? )
78     "Logical_Order_Exception" property? ;
79
80 CATEGORY: spacing Mc ;
81
82 : regional? ( ch -- ? )
83     "Regional_Indicator" property? ;
84 >>
85
86 : modifier? ( ch -- ? )
87     emoji-modifier-table interval-key? ; inline
88
89 :: grapheme-class ( str -- class )
90     str last
91     {
92         { [ dup jamo? ] [ jamo-class ] }
93         { [ dup hangul? ] [ hangul-class ] }
94         { [ dup grapheme-control? ] [
95               control-class dup ZWJ = [
96                   drop
97                   str unclip-last-slice drop dup [
98                       [ extend? ]
99                       [ control-class Extend = ]
100                       [ modifier? ]
101                       tri or or not
102                   ] find-last drop [ swap ?nth ] [ last ] if*
103                   extended-pictographic-table interval-key? [
104                       (Extended_Pictographic-Extend*-)ZWJ
105                   ] [ ZWJ ] if
106               ] when
107           ] }
108         { [ dup extend? ] [ drop Extend ] }
109         { [ dup modifier? ] [ drop Extend ] }
110         { [ dup spacing? ] [
111                spacing-mark-exceptions-table
112                interval-key? [ Any ] [ SpacingMark ] if ] }
113         { [ dup loe? ] [ drop Prepend ] }
114         { [ dup regional? ] [
115               drop
116               f :> ri-even?!
117               str unclip-last-slice drop [
118                   regional? [ ri-even? not ri-even?! f ] [ t ] if
119               ] find-last 2drop
120               ri-even? [
121                   Regional_Indicator(even)
122               ] [
123                   Regional_Indicator(odd)
124               ] if
125           ] }
126         { [ dup extended-pictographic-table interval-key? ] [
127               drop Extended_Pictographic
128           ] }
129         [ drop Any ]
130     } cond ;
131
132 <<
133 : init-table ( size -- table )
134     dup [ f <array> ] curry replicate ;
135
136 SYMBOL: table
137
138 : finish-table ( -- table )
139     table get [ [ 1 = ] map ] map ;
140
141 : eval-seq ( seq -- seq )
142     [ dup word? [ execute( -- x ) ] when ] map ;
143
144 : (set-table) ( class1 class2 val -- )
145     [ table get nth ] dip '[ _ or ] change-nth ;
146
147 : set-table ( classes1 classes2 val -- )
148     [ [ eval-seq ] bi@ ] dip
149     [ [ (set-table) ] curry with each ] 2curry each ;
150
151 : connect ( class1 class2 -- ) 1 set-table ;
152 : disconnect ( class1 class2 -- ) 0 set-table ;
153
154 : make-grapheme-table ( -- )
155     { CR } { LF } connect                                                       ! GB3
156     { Control CR LF } graphemes <iota> disconnect                               ! GB4
157     graphemes <iota> { Control CR LF } disconnect                               ! GB5
158     { L } { L V LV LVT } connect                                                ! GB6
159     { LV V } { V T } connect                                                    ! GB7
160     { LVT T } { T } connect                                                     ! GB8
161     graphemes <iota> { Extend ZWJ (Extended_Pictographic-Extend*-)ZWJ } connect ! GB9
162     graphemes <iota> { SpacingMark } connect                                    ! GB9a
163     { Prepend } graphemes <iota> connect                                        ! GB9b
164     { (Extended_Pictographic-Extend*-)ZWJ } { Extended_Pictographic } connect   ! GB11
165     { Regional_Indicator(odd) } { Regional_Indicator(even) } connect ;          ! GB12,13
166 >>
167
168 CONSTANT: grapheme-table $[
169     graphemes init-table table
170     [ make-grapheme-table finish-table ] with-variable
171 ]
172
173 : grapheme-break? ( class1 class2 -- ? )
174     grapheme-table nth nth not ;
175
176 ! Word breaks
177 <<
178 CONSTANT: wOther 0
179 CONSTANT: wCR 1
180 CONSTANT: wLF 2
181 CONSTANT: wNewline 3
182 CONSTANT: wExtend 4
183 CONSTANT: wZWJ 5
184 CONSTANT: wRegional_Indicator 6
185 CONSTANT: wFormat 7
186 CONSTANT: wKatakana 8
187 CONSTANT: wHebrew_Letter 9
188 CONSTANT: wALetter 10
189 CONSTANT: wSingle_Quote 11
190 CONSTANT: wDouble_Quote 12
191 CONSTANT: wMidNumLet 13
192 CONSTANT: wMidLetter 14
193 CONSTANT: wMidNum 15
194 CONSTANT: wNumeric 16
195 CONSTANT: wExtendNumLet 17
196 CONSTANT: wWSegSpace 18
197 CONSTANT: unicode-words 19
198 >>
199
200 <<
201 CONSTANT: word-break-table $[
202     "resource:basis/unicode/UCD/auxiliary/WordBreakProperty.txt"
203     load-interval-file dup array>> [
204         2 swap [
205             {
206                 { "Other" [ wOther ] }
207                 { "CR" [ wCR ] }
208                 { "LF" [ wLF ] }
209                 { "Newline" [ wNewline ] }
210                 { "Extend" [ wExtend ] }
211                 { "ZWJ" [ wZWJ ]  }
212                 { "Regional_Indicator" [ wRegional_Indicator ] }
213                 { "Format" [ wFormat ] }
214                 { "Katakana" [ wKatakana ] }
215                 { "Hebrew_Letter" [ wHebrew_Letter ] }
216                 { "ALetter" [ wALetter ] }
217                 { "Single_Quote" [ wSingle_Quote ] }
218                 { "Double_Quote" [ wDouble_Quote ] }
219                 { "MidNumLet" [ wMidNumLet ] }
220                 { "MidLetter" [ wMidLetter ] }
221                 { "MidNum" [ wMidNum ] }
222                 { "Numeric" [ wNumeric ] }
223                 { "ExtendNumLet" [ wExtendNumLet ] }
224                 { "WSegSpace" [ wWSegSpace ] }
225             } case
226         ] change-nth
227     ] each
228 ]
229 >>
230
231 : word-break-prop ( char -- word-break-prop )
232     word-break-table interval-at wOther or ;
233
234 <<
235 SYMBOL: check-AHletter-before
236 SYMBOL: check-AHletter-after
237 SYMBOL: check-Hebrew-letter-before
238 SYMBOL: check-Hebrew-letter-after
239 SYMBOL: check-number-before
240 SYMBOL: check-number-after
241 SYMBOL: check-Extended_Pictographic
242 SYMBOL: check-RI-pair
243
244 : make-word-table ( -- )
245     { wCR } { wLF } connect                                                   ! WB3
246     { wNewline                                                                ! WB3a
247       wCR
248       wLF } unicode-words <iota> disconnect
249     unicode-words <iota> { wNewline                                           ! WB3b
250                            wCR
251                            wLF } disconnect
252     { wZWJ } unicode-words <iota> check-Extended_Pictographic set-table       ! WB3c
253     { wWSegSpace } { wWSegSpace } connect                                     ! WB3d
254     unicode-words <iota> { wZWJ } connect                                     ! WB4
255     { wALetter                                                                ! WB5
256       wHebrew_Letter } { wALetter
257                          wHebrew_Letter } connect
258     { wALetter                                                                ! WB6
259       wHebrew_Letter } { wMidLetter
260                          wMidNumLet
261                          wSingle_Quote } check-AHletter-after set-table
262     { wMidLetter                                                              ! WB7
263       wMidNumLet
264       wSingle_Quote } { wALetter
265                         wHebrew_Letter } check-AHletter-before set-table
266     { wHebrew_Letter } { wSingle_Quote } connect                              ! WB7a
267     { wHebrew_Letter } { wDouble_Quote } check-Hebrew-letter-after set-table  ! WB7b 
268     { wDouble_Quote } { wHebrew_Letter } check-Hebrew-letter-before set-table ! WB7c 
269     { wNumeric } { wNumeric } connect                                         ! WB8
270     { wALetter
271       wHebrew_Letter } { wNumeric } connect                                   ! WB9
272     { wNumeric } { wALetter                                                   ! WB10
273                    wHebrew_Letter } connect
274     { wMidNum                                                                 ! WB11
275       wMidNumLet
276       wSingle_Quote } { wNumeric } check-number-before set-table
277     { wNumeric } { wMidNum                                                    ! WB12
278                    wMidNumLet
279                    wSingle_Quote } check-number-after set-table
280     { wKatakana } { wKatakana } connect                                       ! WB13
281     { wALetter                                                                ! WB13a 
282       wHebrew_Letter
283       wNumeric
284       wKatakana
285       wExtendNumLet } { wExtendNumLet } connect
286     { wExtendNumLet } { wALetter                                              ! WB13b
287                         wHebrew_Letter
288                         wNumeric
289                         wKatakana } connect
290     { wRegional_Indicator } { wRegional_Indicator } check-RI-pair set-table ; ! WB15,16
291
292 : finish-word-table ( -- table )
293     table get [
294         [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
295     ] map ;
296
297 CONSTANT: word-table $[
298     unicode-words init-table table
299     [ make-word-table finish-word-table ] with-variable
300 ]
301 >>
302
303 : word-table-nth ( class1 class2 -- ? )
304     word-table nth nth ;
305
306 :: property-not= ( str i property -- ? )
307     i [
308         i str ?nth [ word-break-prop property = not ]
309         [ f ] if*
310     ] [ t ] if ;
311
312 : (format/extended?) ( class -- ? )
313     ${ wExtend wFormat } member? ; inline                                     ! WB4
314
315 : format/extended? ( ch -- ? )
316     word-break-prop (format/extended?) ;
317
318 : (format/extended/zwj?) ( class -- ? )
319     ${ wExtend wFormat wZWJ } member? ; inline                                ! WB4
320
321 : format/extended/zwj? ( ch -- ? )
322     word-break-prop (format/extended/zwj?) ;
323
324 : (walk-up) ( str i -- j )
325     swap [ format/extended/zwj? not ] find-from drop ;
326
327 : walk-up ( str i -- j )
328     dupd 1 + (walk-up) [ 1 + (walk-up) ] [ drop f ] if* ;
329
330 : (walk-down) ( str i -- j )
331     swap [ format/extended/zwj? not ] find-last-from drop ;
332
333 : walk-down ( str i -- j )
334     dupd (walk-down) [ 1 - (walk-down) ] [ drop f ] if* ;
335
336 :: word-break? ( str i table-entry -- ? )
337     str i table-entry
338     {
339         { t [ 2drop f ] }
340         { f [ 2drop t ] }
341         { check-AHletter-after
342           [ dupd walk-up
343             [ wALetter property-not= ] [ wHebrew_Letter property-not= ] 2bi or ] }
344         { check-AHletter-before
345           [ dupd walk-down
346             [ wALetter property-not= ] [ wHebrew_Letter property-not= ] 2bi or ] }
347         { check-Hebrew-letter-after
348           [ dupd walk-up wHebrew_Letter property-not= ] }
349         { check-Hebrew-letter-before
350           [ dupd walk-down wHebrew_Letter property-not= ] }
351         { check-number-after
352           [ dupd walk-up wNumeric property-not= ] }
353         { check-number-before
354           [ dupd walk-down wNumeric property-not= ] }
355         { check-Extended_Pictographic
356           [ swap ?nth extended-pictographic-table interval-key? ] }
357         { check-RI-pair [
358               2drop 
359               f :> ri-even?!
360               i str [
361                   regional? [ ri-even? not ri-even?! f ] [ t ] if
362               ] find-last-from 2drop
363               ri-even? not
364           ] }
365     } case ;
366
367 :: word-break-next ( old-class new-char i str -- next-class ? )
368     new-char word-break-prop :> new-class
369     new-class (format/extended?)
370     [ old-class dup ${ wCR wLF wNewline } member? ] [
371         new-class old-class over word-table-nth
372         [ str i 1 - ] dip word-break?
373     ] if ;
374
375 PRIVATE>