basis/unicode/breaks/breaks.factor

   1 ! Copyright (C) 2008 Daniel Ehrenberg.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: accessors alien.syntax arrays assocs combinators
   4 combinators.short-circuit compiler.units fry interval-maps io
   5 io.encodings.ascii io.files kernel literals locals make math
   6 math.parser math.ranges memoize namespaces parser sequences
   7 sets simple-flat-file splitting unicode.categories
   8 unicode.categories.syntax unicode.data unicode.normalize
   9 unicode.normalize.private words words.constant ;
  10 IN: unicode.breaks
  11
  12 <PRIVATE
  13
  14 ! Grapheme breaks
  15 <<
  16 CONSTANT: Any 0
  17 CONSTANT: L 1
  18 CONSTANT: V 2
  19 CONSTANT: T 3
  20 CONSTANT: LV 4
  21 CONSTANT: LVT 5
  22 CONSTANT: Extend 6
  23 CONSTANT: Control 7
  24 CONSTANT: CR 8
  25 CONSTANT: LF 9
  26 CONSTANT: SpacingMark 10
  27 CONSTANT: Prepend 11
  28 CONSTANT: graphemes 12
  29
  30 : jamo-class ( ch -- class )
  31     dup initial? [ drop L ]
  32     [ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
  33
  34 : hangul-class ( ch -- class )
  35     hangul-base - 0x1C mod zero? LV LVT ? ;
  36
  37 CATEGORY: grapheme-control Zl Zp Cc Cf ;
  38 : control-class ( ch -- class )
  39     {
  40         { CHAR: \r [ CR ] }
  41         { CHAR: \n [ LF ] }
  42         { 0x200C [ Extend ] }
  43         { 0x200D [ Extend ] }
  44         [ drop Control ]
  45     } case ;
  46
  47 CATEGORY: extend
  48     Me Mn |
  49     "Other_Grapheme_Extend" property? ;
  50
  51 : loe? ( ch -- ? )
  52     "Logical_Order_Exception" property? ;
  53
  54 CATEGORY: spacing Mc ;
  55
  56 : grapheme-class ( ch -- class )
  57     {
  58         { [ dup jamo? ] [ jamo-class ] }
  59         { [ dup hangul? ] [ hangul-class ] }
  60         { [ dup grapheme-control? ] [ control-class ] }
  61         { [ dup extend? ] [ drop Extend ] }
  62         { [ dup spacing? ] [ drop SpacingMark ] }
  63         { [ loe? ] [ Prepend ] }
  64         [ Any ]
  65     } cond ;
  66
  67 : init-table ( size -- table )
  68     dup [ f <array> ] curry replicate ;
  69
  70 SYMBOL: table
  71
  72 : finish-table ( -- table )
  73     table get [ [ 1 = ] map ] map ;
  74
  75 : eval-seq ( seq -- seq )
  76     [ dup word? [ execute( -- x ) ] when ] map ;
  77
  78 : (set-table) ( class1 class2 val -- )
  79     [ table get nth ] dip '[ _ or ] change-nth ;
  80
  81 : set-table ( classes1 classes2 val -- )
  82     [ [ eval-seq ] bi@ ] dip
  83     [ [ (set-table) ] curry with each ] 2curry each ;
  84
  85 : connect ( class1 class2 -- ) 1 set-table ;
  86 : disconnect ( class1 class2 -- ) 0 set-table ;
  87
  88 : make-grapheme-table ( -- )
  89     { CR } { LF } connect
  90     { Control CR LF } graphemes iota disconnect
  91     graphemes iota { Control CR LF } disconnect
  92     { L } { L V LV LVT } connect
  93     { LV V } { V T } connect
  94     { LVT T } { T } connect
  95     graphemes iota { Extend } connect
  96     graphemes iota { SpacingMark } connect
  97     { Prepend } graphemes iota connect ;
  98
  99 "grapheme-table" create-word-in
 100 graphemes init-table table
 101 [ make-grapheme-table finish-table ] with-variable
 102 define-constant
 103 >>
 104
 105 : grapheme-break? ( class1 class2 -- ? )
 106     grapheme-table nth nth not ;
 107
 108 PRIVATE>
 109
 110 : first-grapheme ( str -- i )
 111     unclip-slice grapheme-class over
 112     [ grapheme-class [ nip ] [ grapheme-break? ] 2bi ] find drop
 113     nip swap length or 1 + ;
 114
 115 : first-grapheme-from ( start str -- i )
 116     over tail-slice first-grapheme + ;
 117
 118 : last-grapheme ( str -- i )
 119     unclip-last-slice grapheme-class swap
 120     [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
 121
 122 : last-grapheme-from ( end str -- i )
 123     swap head-slice last-grapheme ;
 124
 125 <PRIVATE
 126
 127 : >pieces ( str quot: ( str -- i ) -- graphemes )
 128     [ dup empty? not ] swap '[ dup @ cut-slice swap ] produce nip ; inline
 129
 130 PRIVATE>
 131
 132 : >graphemes ( str -- graphemes )
 133     [ first-grapheme ] >pieces ;
 134
 135 : string-reverse ( str -- rts )
 136     >graphemes reverse! concat ;
 137
 138 <PRIVATE
 139
 140 ! Word breaks
 141 <<
 142 CONSTANT: wOther 0
 143 CONSTANT: wCR 1
 144 CONSTANT: wLF 2
 145 CONSTANT: wNewline 3
 146 CONSTANT: wExtend 4
 147 CONSTANT: wFormat 5
 148 CONSTANT: wKatakana 6
 149 CONSTANT: wALetter 7
 150 CONSTANT: wMidLetter 8
 151 CONSTANT: wMidNum 9
 152 CONSTANT: wMidNumLet 10
 153 CONSTANT: wNumeric 11
 154 CONSTANT: wExtendNumLet 12
 155 CONSTANT: unicode-words 13
 156
 157 ! Is there a way to avoid this?
 158 CONSTANT: word-break-classes H{
 159     { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
 160     { "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
 161     { "ALetter" 7 } { "MidLetter" 8 }
 162     { "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
 163     { "ExtendNumLet" 12 }
 164 }
 165
 166 "word-break-table" create-word-in
 167 "vocab:unicode/data/WordBreakProperty.txt"
 168 load-interval-file dup array>>
 169 [ 2 swap [ word-break-classes at ] change-nth ] each
 170 define-constant
 171 >>
 172
 173 : word-break-prop ( char -- word-break-prop )
 174     word-break-table interval-at wOther or ;
 175
 176 <<
 177 SYMBOL: check-letter-before
 178 SYMBOL: check-letter-after
 179 SYMBOL: check-number-before
 180 SYMBOL: check-number-after
 181
 182 : make-word-table ( -- )
 183     { wCR } { wLF } connect
 184     { wNewline wCR wLF } unicode-words iota disconnect
 185     unicode-words iota { wNewline wCR wLF } disconnect
 186     { wALetter } { wMidLetter wMidNumLet } check-letter-after set-table
 187     { wMidLetter wMidNumLet } { wALetter } check-letter-before set-table
 188     { wNumeric wALetter } { wNumeric wALetter } connect
 189     { wNumeric } { wMidNum wMidNumLet } check-number-after set-table
 190     { wMidNum wMidNumLet } { wNumeric } check-number-before set-table
 191     { wKatakana } { wKatakana } connect
 192     { wALetter wNumeric wKatakana wExtendNumLet } { wExtendNumLet }
 193     [ connect ] [ swap connect ] 2bi ;
 194
 195 : finish-word-table ( -- table )
 196     table get [
 197         [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
 198     ] map ;
 199
 200 "word-table" create-word-in
 201 unicode-words init-table table
 202 [ make-word-table finish-word-table ] with-variable
 203 define-constant
 204 >>
 205
 206 : word-table-nth ( class1 class2 -- ? )
 207     word-table nth nth ;
 208
 209 :: property-not= ( str i property -- ? )
 210     i [
 211         i str ?nth [ word-break-prop property = not ]
 212         [ f ] if*
 213     ] [ t ] if ;
 214
 215 : (format/extended?) ( class -- ? )
 216     ${ wExtend wFormat } member? ; inline
 217
 218 : format/extended? ( ch -- ? )
 219     word-break-prop (format/extended?) ;
 220
 221 : (walk-up) ( str i -- j )
 222     swap [ format/extended? not ] find-from drop ;
 223
 224 : walk-up ( str i -- j )
 225     dupd 1 + (walk-up) [ 1 + (walk-up) ] [ drop f ] if* ;
 226
 227 : (walk-down) ( str i -- j )
 228     swap [ format/extended? not ] find-last-from drop ;
 229
 230 : walk-down ( str i -- j )
 231     dupd (walk-down) [ 1 - (walk-down) ] [ drop f ] if* ;
 232
 233 : word-break? ( str i table-entry -- ? )
 234     {
 235         { t [ 2drop f ] }
 236         { f [ 2drop t ] }
 237         { check-letter-after
 238             [ dupd walk-up wALetter property-not= ] }
 239         { check-letter-before
 240             [ dupd walk-down wALetter property-not= ] }
 241         { check-number-after
 242             [ dupd walk-up wNumeric property-not= ] }
 243         { check-number-before
 244             [ dupd walk-down wNumeric property-not= ] }
 245     } case ;
 246
 247 :: word-break-next ( old-class new-char i str -- next-class ? )
 248     new-char word-break-prop :> new-class
 249     new-class (format/extended?)
 250     [ old-class dup ${ wCR wLF wNewline } member? ] [
 251         new-class old-class over word-table-nth
 252         [ str i 1 - ] dip word-break?
 253     ] if ;
 254
 255 PRIVATE>
 256
 257  : first-word ( str -- i )
 258     [ [ length ] [ first word-break-prop ] bi ] keep
 259     1 swap dup '[ _ word-break-next ] find-index-from
 260     drop nip swap or ;
 261
 262 : >words ( str -- words )
 263     [ first-word ] >pieces ;
 264
 265 <PRIVATE
 266
 267 : nth-next ( i str -- str[i-1] str[i] )
 268     [ [ 1 - ] keep ] dip '[ _ nth ] bi@ ;
 269
 270 PRIVATE>
 271
 272 : word-break-at? ( i str -- ? )
 273     {
 274         [ drop zero? ]
 275         [ length = ]
 276         [
 277             [ nth-next [ word-break-prop ] dip ] 2keep
 278             word-break-next nip
 279         ]
 280     } 2|| ;
 281
 282 : first-word-from ( start str -- i )
 283     over tail-slice first-word + ;
 284
 285 : last-word ( str -- i )
 286     [ length iota ] keep '[ _ word-break-at? ] find-last drop 0 or ;
 287
 288 : last-word-from ( end str -- i )
 289     swap head-slice last-word ;