basis/unicode/breaks/breaks.factor

   1 ! Copyright (C) 2008 Daniel Ehrenberg.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: accessors alien.syntax arrays assocs combinators
   4 combinators.short-circuit compiler.units fry interval-maps io
   5 io.encodings.ascii io.files kernel literals locals make math
   6 math.parser math.ranges memoize namespaces sequences
   7 sets simple-flat-file splitting unicode.categories
   8 unicode.categories.syntax unicode.data unicode.normalize
   9 unicode.normalize.private values words ;
  10 FROM: sequences => change-nth ;
  11 IN: unicode.breaks
  12
  13 <PRIVATE
  14 ! Grapheme breaks
  15
  16 CONSTANT: Any 0
  17 CONSTANT: L 1
  18 CONSTANT: V 2
  19 CONSTANT: T 3
  20 CONSTANT: LV 4
  21 CONSTANT: LVT 5
  22 CONSTANT: Extend 6
  23 CONSTANT: Control 7
  24 CONSTANT: CR 8
  25 CONSTANT: LF 9
  26 CONSTANT: SpacingMark 10
  27 CONSTANT: Prepend 11
  28 CONSTANT: graphemes 12
  29
  30 : jamo-class ( ch -- class )
  31     dup initial? [ drop L ]
  32     [ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
  33
  34 : hangul-class ( ch -- class )
  35     hangul-base - HEX: 1C mod zero? LV LVT ? ;
  36
  37 CATEGORY: grapheme-control Zl Zp Cc Cf ;
  38 : control-class ( ch -- class )
  39     {
  40         { CHAR: \r [ CR ] }
  41         { CHAR: \n [ LF ] }
  42         { HEX: 200C [ Extend ] }
  43         { HEX: 200D [ Extend ] }
  44         [ drop Control ]
  45     } case ;
  46
  47 CATEGORY: extend
  48     Me Mn |
  49     "Other_Grapheme_Extend" property? ;
  50
  51 : loe? ( ch -- ? )
  52     "Logical_Order_Exception" property? ;
  53
  54 CATEGORY: spacing Mc ;
  55
  56 : grapheme-class ( ch -- class )
  57     {
  58         { [ dup jamo? ] [ jamo-class ] }
  59         { [ dup hangul? ] [ hangul-class ] }
  60         { [ dup grapheme-control? ] [ control-class ] }
  61         { [ dup extend? ] [ drop Extend ] }
  62         { [ dup spacing? ] [ drop SpacingMark ] }
  63         { [ loe? ] [ Prepend ] }
  64         [ Any ]
  65     } cond ;
  66
  67 : init-table ( size -- table )
  68     dup [ f <array> ] curry replicate ;
  69
  70 SYMBOL: table
  71
  72 : finish-table ( -- table )
  73     table get [ [ 1 = ] map ] map ;
  74
  75 : eval-seq ( seq -- seq ) [ ?execute ] map ;
  76
  77 : (set-table) ( class1 class2 val -- )
  78     [ table get nth ] dip '[ _ or ] change-nth ;
  79
  80 : set-table ( classes1 classes2 val -- )
  81     [ [ eval-seq ] bi@ ] dip
  82     [ [ (set-table) ] curry with each ] 2curry each ;
  83
  84 : connect ( class1 class2 -- ) 1 set-table ;
  85 : disconnect ( class1 class2 -- ) 0 set-table ;
  86
  87 : make-grapheme-table ( -- )
  88     { CR } { LF } connect
  89     { Control CR LF } graphemes iota disconnect
  90     graphemes iota { Control CR LF } disconnect
  91     { L } { L V LV LVT } connect
  92     { LV V } { V T } connect
  93     { LVT T } { T } connect
  94     graphemes iota { Extend } connect
  95     graphemes iota { SpacingMark } connect
  96     { Prepend } graphemes iota connect ;
  97
  98 VALUE: grapheme-table
  99
 100 : grapheme-break? ( class1 class2 -- ? )
 101     grapheme-table nth nth not ;
 102
 103 PRIVATE>
 104
 105 : first-grapheme ( str -- i )
 106     unclip-slice grapheme-class over
 107     [ grapheme-class [ nip ] [ grapheme-break? ] 2bi ] find drop
 108     nip swap length or 1 + ;
 109
 110 : first-grapheme-from ( start str -- i )
 111     over tail-slice first-grapheme + ;
 112
 113 : last-grapheme ( str -- i )
 114     unclip-last-slice grapheme-class swap
 115     [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
 116
 117 : last-grapheme-from ( end str -- i )
 118     swap head-slice last-grapheme ;
 119
 120 <PRIVATE
 121
 122 : >pieces ( str quot: ( str -- i ) -- graphemes )
 123     [ dup empty? not ] swap '[ dup @ cut-slice swap ] produce nip ; inline
 124
 125 PRIVATE>
 126
 127 : >graphemes ( str -- graphemes )
 128     [ first-grapheme ] >pieces ;
 129
 130 : string-reverse ( str -- rts )
 131     >graphemes reverse concat ;
 132
 133 <PRIVATE
 134
 135 graphemes init-table table
 136 [ make-grapheme-table finish-table ] with-variable
 137 \ grapheme-table set-value
 138
 139 ! Word breaks
 140
 141 VALUE: word-break-table
 142
 143 "vocab:unicode/data/WordBreakProperty.txt" load-interval-file
 144 \ word-break-table set-value
 145
 146 CONSTANT: wOther 0
 147 CONSTANT: wCR 1
 148 CONSTANT: wLF 2
 149 CONSTANT: wNewline 3
 150 CONSTANT: wExtend 4
 151 CONSTANT: wFormat 5
 152 CONSTANT: wKatakana 6
 153 CONSTANT: wALetter 7
 154 CONSTANT: wMidLetter 8
 155 CONSTANT: wMidNum 9
 156 CONSTANT: wMidNumLet 10
 157 CONSTANT: wNumeric 11
 158 CONSTANT: wExtendNumLet 12
 159 CONSTANT: words 13
 160
 161 : word-break-classes ( -- table ) ! Is there a way to avoid this?
 162     H{
 163         { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
 164         { "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
 165         { "ALetter" 7 } { "MidLetter" 8 }
 166         { "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
 167         { "ExtendNumLet" 12 }
 168     } ;
 169
 170 : word-break-prop ( char -- word-break-prop )
 171     word-break-table interval-at
 172     word-break-classes at [ wOther ] unless* ;
 173
 174 SYMBOL: check-letter-before
 175 SYMBOL: check-letter-after
 176 SYMBOL: check-number-before
 177 SYMBOL: check-number-after
 178
 179 : make-word-table ( -- )
 180     { wCR } { wLF } connect
 181     { wNewline wCR wLF } words iota disconnect
 182     words iota { wNewline wCR wLF } disconnect
 183     { wALetter } { wMidLetter wMidNumLet } check-letter-after set-table
 184     { wMidLetter wMidNumLet } { wALetter } check-letter-before set-table
 185     { wNumeric wALetter } { wNumeric wALetter } connect
 186     { wNumeric } { wMidNum wMidNumLet } check-number-after set-table
 187     { wMidNum wMidNumLet } { wNumeric } check-number-before set-table
 188     { wKatakana } { wKatakana } connect
 189     { wALetter wNumeric wKatakana wExtendNumLet } { wExtendNumLet }
 190     [ connect ] [ swap connect ] 2bi ;
 191
 192 VALUE: word-table
 193
 194 : finish-word-table ( -- table )
 195     table get [
 196         [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
 197     ] map ;
 198
 199 words init-table table
 200 [ make-word-table finish-word-table ] with-variable
 201 \ word-table set-value
 202
 203 : word-table-nth ( class1 class2 -- ? )
 204     word-table nth nth ;
 205
 206 :: property-not= ( str i property -- ? )
 207     i [
 208         i str ?nth [ word-break-prop property = not ]
 209         [ f ] if*
 210     ] [ t ] if ;
 211
 212 : (format/extended?) ( class -- ? )
 213     ${ wExtend wFormat } member? ; inline
 214
 215 : format/extended? ( ch -- ? )
 216     word-break-prop (format/extended?) ;
 217
 218 : (walk-up) ( str i -- j )
 219     swap [ format/extended? not ] find-from drop ;
 220
 221 : walk-up ( str i -- j )
 222     dupd 1 + (walk-up) [ 1 + (walk-up) ] [ drop f ] if* ;
 223
 224 : (walk-down) ( str i -- j )
 225     swap [ format/extended? not ] find-last-from drop ;
 226
 227 : walk-down ( str i -- j )
 228     dupd (walk-down) [ 1 - (walk-down) ] [ drop f ] if* ;
 229
 230 : word-break? ( str i table-entry -- ? )
 231     {
 232         { t [ 2drop f ] }
 233         { f [ 2drop t ] }
 234         { check-letter-after
 235             [ dupd walk-up wALetter property-not= ] }
 236         { check-letter-before
 237             [ dupd walk-down wALetter property-not= ] }
 238         { check-number-after
 239             [ dupd walk-up wNumeric property-not= ] }
 240         { check-number-before
 241             [ dupd walk-down wNumeric property-not= ] }
 242     } case ;
 243
 244 :: word-break-next ( old-class new-char i str -- next-class ? )
 245     new-char word-break-prop :> new-class
 246     new-class (format/extended?)
 247     [ old-class dup ${ wCR wLF wNewline } member? ] [
 248         new-class old-class over word-table-nth
 249         [ str i ] dip word-break?
 250     ] if ;
 251
 252 PRIVATE>
 253
 254 : first-word ( str -- i )
 255     [ unclip-slice word-break-prop over ] keep
 256     '[ _ word-break-next ] find-index drop
 257     nip swap length or 1 + ;
 258
 259 : >words ( str -- words )
 260     [ first-word ] >pieces ;
 261
 262 <PRIVATE
 263
 264 : nth-next ( i str -- str[i-1] str[i] )
 265     [ [ 1 - ] keep ] dip '[ _ nth ] bi@ ;
 266
 267 PRIVATE>
 268
 269 : word-break-at? ( i str -- ? )
 270     {
 271         [ drop zero? ]
 272         [ length = ]
 273         [
 274             [ nth-next [ word-break-prop ] dip ] 2keep
 275             word-break-next nip
 276         ]
 277     } 2|| ;
 278
 279 : first-word-from ( start str -- i )
 280     over tail-slice first-word + ;
 281
 282 : last-word ( str -- i )
 283     [ length iota ] keep '[ _ word-break-at? ] find-last drop 0 or ;
 284
 285 : last-word-from ( end str -- i )
 286     swap head-slice last-word ;