]> gitweb.factorcode.org Git - factor.git/blob - basis/unicode/breaks/breaks.factor
factor: trim some using lists
[factor.git] / basis / unicode / breaks / breaks.factor
1 ! Copyright (C) 2008 Daniel Ehrenberg.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays combinators interval-maps kernel
4 literals math namespaces sequences simple-flat-file
5 unicode.categories unicode.data unicode.normalize.private words ;
6 IN: unicode.breaks
7
8 <PRIVATE
9
10 <<
11
12 :: load-interval-file-for ( filename n key -- table )
13     filename load-data-file [ n swap nth key = ] filter
14     intern-values expand-ranges ;
15
16 >>
17
18 CONSTANT: emoji-modifier-table $[
19     "resource:basis/unicode/UCD/emoji/emoji-data.txt"
20     1 "Emoji_Modifier" load-interval-file-for
21 ]
22
23 CONSTANT: extended-pictographic-table $[
24     "resource:basis/unicode/UCD/emoji/emoji-data.txt"
25     1 "Extended_Pictographic" load-interval-file-for
26 ]
27
28 CONSTANT: spacing-mark-exceptions-table $[
29     {
30         0x102B 0x102C 0x1038 { 0x1062 0x1064 } { 0x1067 0x106D }
31         0x1083 { 0x1087 0x108C } 0x108F { 0x109A 0x109C } 0x1A61
32         0x1A63 0x1A64 0xAA7B 0xAA7D 0x11720 0x11721
33     } <interval-set>
34 ]
35
36 ! Grapheme breaks
37 <<
38 CONSTANT: Any 0
39 CONSTANT: L 1
40 CONSTANT: V 2
41 CONSTANT: T 3
42 CONSTANT: LV 4
43 CONSTANT: LVT 5
44 CONSTANT: Extend 6
45 CONSTANT: Control 7
46 CONSTANT: CR 8
47 CONSTANT: LF 9
48 CONSTANT: SpacingMark 10
49 CONSTANT: Prepend 11
50 CONSTANT: ZWJ 12
51 CONSTANT: Extended_Pictographic 13
52 CONSTANT: (Extended_Pictographic-Extend*-)ZWJ 14
53 CONSTANT: Regional_Indicator(even) 15
54 CONSTANT: Regional_Indicator(odd) 16
55 CONSTANT: graphemes 17
56
57 : jamo-class ( ch -- class )
58     dup initial? [ drop L ]
59     [ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
60
61 : hangul-class ( ch -- class )
62     hangul-base - 0x1C mod zero? LV LVT ? ;
63
64 CATEGORY: extend
65     Me Mn |
66     "Other_Grapheme_Extend" property? ;
67
68 CATEGORY: grapheme-control Zl Zp Cc Cf ;
69
70 : control-class ( ch -- class )
71     {
72         { [ dup CHAR: \r = ]  [ drop CR ] }
73         { [ dup CHAR: \n = ] [ drop LF ] }
74         { [ dup 0x200C = ] [ drop Extend ] }
75         { [ dup 0x200D = ] [ drop ZWJ ] }
76         { [ dup "Other_Grapheme_Extend" property? ] [ drop Extend ] }
77         [ drop Control ]
78     } cond ;
79
80 : loe? ( ch -- ? )
81     "Logical_Order_Exception" property? ;
82
83 CATEGORY: spacing Mc ;
84
85 : regional? ( ch -- ? )
86     "Regional_Indicator" property? ;
87 >>
88
89 : modifier? ( ch -- ? )
90     emoji-modifier-table interval-key? ; inline
91
92 :: grapheme-class ( str -- class )
93     str last
94     {
95         { [ dup jamo? ] [ jamo-class ] }
96         { [ dup hangul? ] [ hangul-class ] }
97         { [ dup grapheme-control? ] [
98               control-class dup ZWJ = [
99                   drop
100                   str unclip-last-slice drop dup [
101                       [ extend? ]
102                       [ control-class Extend = ]
103                       [ modifier? ]
104                       tri or or not
105                   ] find-last drop [ swap ?nth ] [ last ] if*
106                   extended-pictographic-table interval-key? [
107                       (Extended_Pictographic-Extend*-)ZWJ
108                   ] [ ZWJ ] if
109               ] when
110           ] }
111         { [ dup extend? ] [ drop Extend ] }
112         { [ dup modifier? ] [ drop Extend ] }
113         { [ dup spacing? ] [
114                spacing-mark-exceptions-table
115                interval-key? [ Any ] [ SpacingMark ] if ] }
116         { [ dup loe? ] [ drop Prepend ] }
117         { [ dup regional? ] [
118               drop
119               f :> ri-even?!
120               str unclip-last-slice drop [
121                   regional? [ ri-even? not ri-even?! f ] [ t ] if
122               ] find-last 2drop
123               ri-even? [
124                   Regional_Indicator(even)
125               ] [
126                   Regional_Indicator(odd)
127               ] if
128           ] }
129         { [ dup extended-pictographic-table interval-key? ] [
130               drop Extended_Pictographic
131           ] }
132         [ drop Any ]
133     } cond ;
134
135 <<
136 : init-table ( size -- table )
137     dup [ f <array> ] curry replicate ;
138
139 SYMBOL: table
140
141 : finish-table ( -- table )
142     table get [ [ 1 = ] map ] map ;
143
144 : eval-seq ( seq -- seq )
145     [ dup word? [ execute( -- x ) ] when ] map ;
146
147 : (set-table) ( class1 class2 val -- )
148     [ table get nth ] dip '[ _ or ] change-nth ;
149
150 : set-table ( classes1 classes2 val -- )
151     [ [ eval-seq ] bi@ ] dip
152     [ [ (set-table) ] curry with each ] 2curry each ;
153
154 : connect ( class1 class2 -- ) 1 set-table ;
155 : disconnect ( class1 class2 -- ) 0 set-table ;
156
157 : make-grapheme-table ( -- )
158     { CR } { LF } connect                                                       ! GB3
159     { Control CR LF } graphemes <iota> disconnect                               ! GB4
160     graphemes <iota> { Control CR LF } disconnect                               ! GB5
161     { L } { L V LV LVT } connect                                                ! GB6
162     { LV V } { V T } connect                                                    ! GB7
163     { LVT T } { T } connect                                                     ! GB8
164     graphemes <iota> { Extend ZWJ (Extended_Pictographic-Extend*-)ZWJ } connect ! GB9
165     graphemes <iota> { SpacingMark } connect                                    ! GB9a
166     { Prepend } graphemes <iota> connect                                        ! GB9b
167     { (Extended_Pictographic-Extend*-)ZWJ } { Extended_Pictographic } connect   ! GB11
168     { Regional_Indicator(odd) } { Regional_Indicator(even) } connect ;          ! GB12,13
169 >>
170
171 CONSTANT: grapheme-table $[
172     graphemes init-table table
173     [ make-grapheme-table finish-table ] with-variable
174 ]
175
176 : grapheme-break? ( class1 class2 -- ? )
177     grapheme-table nth nth not ;
178
179 ! Word breaks
180 <<
181 CONSTANT: wOther 0
182 CONSTANT: wCR 1
183 CONSTANT: wLF 2
184 CONSTANT: wNewline 3
185 CONSTANT: wExtend 4
186 CONSTANT: wZWJ 5
187 CONSTANT: wRegional_Indicator 6
188 CONSTANT: wFormat 7
189 CONSTANT: wKatakana 8
190 CONSTANT: wHebrew_Letter 9
191 CONSTANT: wALetter 10
192 CONSTANT: wSingle_Quote 11
193 CONSTANT: wDouble_Quote 12
194 CONSTANT: wMidNumLet 13
195 CONSTANT: wMidLetter 14
196 CONSTANT: wMidNum 15
197 CONSTANT: wNumeric 16
198 CONSTANT: wExtendNumLet 17
199 CONSTANT: wWSegSpace 18
200 CONSTANT: unicode-words 19
201 >>
202
203 <<
204 CONSTANT: word-break-table $[
205     "resource:basis/unicode/UCD/auxiliary/WordBreakProperty.txt"
206     load-interval-file dup array>> [
207         2 swap [
208             {
209                 { "Other" [ wOther ] }
210                 { "CR" [ wCR ] }
211                 { "LF" [ wLF ] }
212                 { "Newline" [ wNewline ] }
213                 { "Extend" [ wExtend ] }
214                 { "ZWJ" [ wZWJ ]  }
215                 { "Regional_Indicator" [ wRegional_Indicator ] }
216                 { "Format" [ wFormat ] }
217                 { "Katakana" [ wKatakana ] }
218                 { "Hebrew_Letter" [ wHebrew_Letter ] }
219                 { "ALetter" [ wALetter ] }
220                 { "Single_Quote" [ wSingle_Quote ] }
221                 { "Double_Quote" [ wDouble_Quote ] }
222                 { "MidNumLet" [ wMidNumLet ] }
223                 { "MidLetter" [ wMidLetter ] }
224                 { "MidNum" [ wMidNum ] }
225                 { "Numeric" [ wNumeric ] }
226                 { "ExtendNumLet" [ wExtendNumLet ] }
227                 { "WSegSpace" [ wWSegSpace ] }
228             } case
229         ] change-nth
230     ] each
231 ]
232 >>
233
234 : word-break-prop ( char -- word-break-prop )
235     word-break-table interval-at wOther or ;
236
237 <<
238 SYMBOL: check-AHletter-before
239 SYMBOL: check-AHletter-after
240 SYMBOL: check-Hebrew-letter-before
241 SYMBOL: check-Hebrew-letter-after
242 SYMBOL: check-number-before
243 SYMBOL: check-number-after
244 SYMBOL: check-Extended_Pictographic
245 SYMBOL: check-RI-pair
246
247 : make-word-table ( -- )
248     { wCR } { wLF } connect                                                   ! WB3
249     { wNewline                                                                ! WB3a
250       wCR
251       wLF } unicode-words <iota> disconnect
252     unicode-words <iota> { wNewline                                           ! WB3b
253                            wCR
254                            wLF } disconnect
255     { wZWJ } unicode-words <iota> check-Extended_Pictographic set-table       ! WB3c
256     { wWSegSpace } { wWSegSpace } connect                                     ! WB3d
257     unicode-words <iota> { wZWJ } connect                                     ! WB4
258     { wALetter                                                                ! WB5
259       wHebrew_Letter } { wALetter
260                          wHebrew_Letter } connect
261     { wALetter                                                                ! WB6
262       wHebrew_Letter } { wMidLetter
263                          wMidNumLet
264                          wSingle_Quote } check-AHletter-after set-table
265     { wMidLetter                                                              ! WB7
266       wMidNumLet
267       wSingle_Quote } { wALetter
268                         wHebrew_Letter } check-AHletter-before set-table
269     { wHebrew_Letter } { wSingle_Quote } connect                              ! WB7a
270     { wHebrew_Letter } { wDouble_Quote } check-Hebrew-letter-after set-table  ! WB7b 
271     { wDouble_Quote } { wHebrew_Letter } check-Hebrew-letter-before set-table ! WB7c 
272     { wNumeric } { wNumeric } connect                                         ! WB8
273     { wALetter
274       wHebrew_Letter } { wNumeric } connect                                   ! WB9
275     { wNumeric } { wALetter                                                   ! WB10
276                    wHebrew_Letter } connect
277     { wMidNum                                                                 ! WB11
278       wMidNumLet
279       wSingle_Quote } { wNumeric } check-number-before set-table
280     { wNumeric } { wMidNum                                                    ! WB12
281                    wMidNumLet
282                    wSingle_Quote } check-number-after set-table
283     { wKatakana } { wKatakana } connect                                       ! WB13
284     { wALetter                                                                ! WB13a 
285       wHebrew_Letter
286       wNumeric
287       wKatakana
288       wExtendNumLet } { wExtendNumLet } connect
289     { wExtendNumLet } { wALetter                                              ! WB13b
290                         wHebrew_Letter
291                         wNumeric
292                         wKatakana } connect
293     { wRegional_Indicator } { wRegional_Indicator } check-RI-pair set-table ; ! WB15,16
294
295 : finish-word-table ( -- table )
296     table get [
297         [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
298     ] map ;
299 >>
300
301 <<
302 CONSTANT: word-table $[
303     unicode-words init-table table
304     [ make-word-table finish-word-table ] with-variable
305 ]
306 >>
307
308 : word-table-nth ( class1 class2 -- ? )
309     word-table nth nth ;
310
311 :: property-not= ( str i property -- ? )
312     i [
313         i str ?nth [ word-break-prop property = not ]
314         [ f ] if*
315     ] [ t ] if ;
316
317 : (format/extended?) ( class -- ? )
318     ${ wExtend wFormat } member? ; inline                                     ! WB4
319
320 : format/extended? ( ch -- ? )
321     word-break-prop (format/extended?) ;
322
323 : (format/extended/zwj?) ( class -- ? )
324     ${ wExtend wFormat wZWJ } member? ; inline                                ! WB4
325
326 : format/extended/zwj? ( ch -- ? )
327     word-break-prop (format/extended/zwj?) ;
328
329 : (walk-up) ( str i -- j )
330     swap [ format/extended/zwj? not ] find-from drop ;
331
332 : walk-up ( str i -- j )
333     dupd 1 + (walk-up) [ 1 + (walk-up) ] [ drop f ] if* ;
334
335 : (walk-down) ( str i -- j )
336     swap [ format/extended/zwj? not ] find-last-from drop ;
337
338 : walk-down ( str i -- j )
339     dupd (walk-down) [ 1 - (walk-down) ] [ drop f ] if* ;
340
341 :: word-break? ( str i table-entry -- ? )
342     str i table-entry
343     {
344         { t [ 2drop f ] }
345         { f [ 2drop t ] }
346         { check-AHletter-after
347           [ dupd walk-up
348             [ wALetter property-not= ] [ wHebrew_Letter property-not= ] 2bi or ] }
349         { check-AHletter-before
350           [ dupd walk-down
351             [ wALetter property-not= ] [ wHebrew_Letter property-not= ] 2bi or ] }
352         { check-Hebrew-letter-after
353           [ dupd walk-up wHebrew_Letter property-not= ] }
354         { check-Hebrew-letter-before
355           [ dupd walk-down wHebrew_Letter property-not= ] }
356         { check-number-after
357           [ dupd walk-up wNumeric property-not= ] }
358         { check-number-before
359           [ dupd walk-down wNumeric property-not= ] }
360         { check-Extended_Pictographic
361           [ swap ?nth extended-pictographic-table interval-key? ] }
362         { check-RI-pair [
363               2drop 
364               f :> ri-even?!
365               i str [
366                   regional? [ ri-even? not ri-even?! f ] [ t ] if
367               ] find-last-from 2drop
368               ri-even? not
369           ] }
370     } case ;
371
372 :: word-break-next ( old-class new-char i str -- next-class ? )
373     new-char word-break-prop :> new-class
374     new-class (format/extended?)
375     [ old-class dup ${ wCR wLF wNewline } member? ] [
376         new-class old-class over word-table-nth
377         [ str i 1 - ] dip word-break?
378     ] if ;
379
380 PRIVATE>