basis/unicode/breaks/breaks.factor

   1 ! Copyright (C) 2008 Daniel Ehrenberg.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: combinators.short-circuit unicode.categories kernel math
   4 combinators splitting sequences math.parser io.files io assocs
   5 arrays namespaces make math.ranges unicode.normalize
   6 unicode.normalize.private values io.encodings.ascii
   7 unicode.data compiler.units fry unicode.categories.syntax
   8 alien.syntax sets accessors interval-maps memoize locals words
   9 simple-flat-file ;
  10 IN: unicode.breaks
  11
  12 <PRIVATE
  13 ! Grapheme breaks
  14
  15 ENUM: f Any L V T LV LVT Extend Control CR LF
  16     SpacingMark Prepend graphemes ;
  17
  18 : jamo-class ( ch -- class )
  19     dup initial? [ drop L ]
  20     [ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
  21
  22 : hangul-class ( ch -- class )
  23     hangul-base - HEX: 1C mod zero? LV LVT ? ;
  24
  25 CATEGORY: grapheme-control Zl Zp Cc Cf ;
  26 : control-class ( ch -- class )
  27     {
  28         { CHAR: \r [ CR ] }
  29         { CHAR: \n [ LF ] }
  30         { HEX: 200C [ Extend ] }
  31         { HEX: 200D [ Extend ] }
  32         [ drop Control ]
  33     } case ;
  34
  35 CATEGORY: extend
  36     Me Mn |
  37     "Other_Grapheme_Extend" property? ;
  38
  39 : loe? ( ch -- ? )
  40     "Logical_Order_Exception" property? ;
  41
  42 CATEGORY: spacing Mc ;
  43
  44 : grapheme-class ( ch -- class )
  45     {
  46         { [ dup jamo? ] [ jamo-class ] }
  47         { [ dup hangul? ] [ hangul-class ] }
  48         { [ dup grapheme-control? ] [ control-class ] }
  49         { [ dup extend? ] [ drop Extend ] }
  50         { [ dup spacing? ] [ drop SpacingMark ] }
  51         { [ loe? ] [ Prepend ] }
  52         [ Any ]
  53     } cond ;
  54
  55 : init-table ( size -- table )
  56     dup [ f <array> ] curry replicate ;
  57
  58 SYMBOL: table
  59
  60 : finish-table ( -- table )
  61     table get [ [ 1 = ] map ] map ;
  62
  63 : eval-seq ( seq -- seq ) [ ?execute ] map ;
  64
  65 : (set-table) ( class1 class2 val -- )
  66     [ table get nth ] dip '[ _ or ] change-nth ;
  67
  68 : set-table ( classes1 classes2 val -- )
  69     [ [ eval-seq ] bi@ ] dip
  70     [ [ (set-table) ] curry with each ] 2curry each ;
  71
  72 : connect ( class1 class2 -- ) 1 set-table ;
  73 : disconnect ( class1 class2 -- ) 0 set-table ;
  74
  75 : make-grapheme-table ( -- )
  76     { CR } { LF } connect
  77     { Control CR LF } graphemes iota disconnect
  78     graphemes iota { Control CR LF } disconnect
  79     { L } { L V LV LVT } connect
  80     { LV V } { V T } connect
  81     { LVT T } { T } connect
  82     graphemes iota { Extend } connect
  83     graphemes iota { SpacingMark } connect
  84     { Prepend } graphemes iota connect ;
  85
  86 VALUE: grapheme-table
  87
  88 : grapheme-break? ( class1 class2 -- ? )
  89     grapheme-table nth nth not ;
  90
  91 PRIVATE>
  92
  93 : first-grapheme ( str -- i )
  94     unclip-slice grapheme-class over
  95     [ grapheme-class [ nip ] [ grapheme-break? ] 2bi ] find drop
  96     nip swap length or 1 + ;
  97
  98 : first-grapheme-from ( start str -- i )
  99     over tail-slice first-grapheme + ;
 100
 101 : last-grapheme ( str -- i )
 102     unclip-last-slice grapheme-class swap
 103     [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
 104
 105 : last-grapheme-from ( end str -- i )
 106     swap head-slice last-grapheme ;
 107
 108 <PRIVATE
 109
 110 : >pieces ( str quot: ( str -- i ) -- graphemes )
 111     [ dup empty? not ] swap '[ dup @ cut-slice swap ] produce nip ; inline
 112
 113 PRIVATE>
 114
 115 : >graphemes ( str -- graphemes )
 116     [ first-grapheme ] >pieces ;
 117
 118 : string-reverse ( str -- rts )
 119     >graphemes reverse concat ;
 120
 121 <PRIVATE
 122
 123 graphemes init-table table
 124 [ make-grapheme-table finish-table ] with-variable
 125 to: grapheme-table
 126
 127 ! Word breaks
 128
 129 VALUE: word-break-table
 130
 131 "vocab:unicode/data/WordBreakProperty.txt" load-interval-file
 132 to: word-break-table
 133
 134 ENUM: f wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
 135 wMidNum wMidNumLet wNumeric wExtendNumLet words ;
 136
 137 : word-break-classes ( -- table ) ! Is there a way to avoid this?
 138     H{
 139         { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
 140         { "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
 141         { "ALetter" 7 } { "MidLetter" 8 }
 142         { "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
 143         { "ExtendNumLet" 12 }
 144     } ;
 145
 146 : word-break-prop ( char -- word-break-prop )
 147     word-break-table interval-at
 148     word-break-classes at [ wOther ] unless* ;
 149
 150 SYMBOL: check-letter-before
 151 SYMBOL: check-letter-after
 152 SYMBOL: check-number-before
 153 SYMBOL: check-number-after
 154
 155 : make-word-table ( -- )
 156     { wCR } { wLF } connect
 157     { wNewline wCR wLF } words iota disconnect
 158     words iota { wNewline wCR wLF } disconnect
 159     { wALetter } { wMidLetter wMidNumLet } check-letter-after set-table
 160     { wMidLetter wMidNumLet } { wALetter } check-letter-before set-table
 161     { wNumeric wALetter } { wNumeric wALetter } connect
 162     { wNumeric } { wMidNum wMidNumLet } check-number-after set-table
 163     { wMidNum wMidNumLet } { wNumeric } check-number-before set-table
 164     { wKatakana } { wKatakana } connect
 165     { wALetter wNumeric wKatakana wExtendNumLet } { wExtendNumLet }
 166     [ connect ] [ swap connect ] 2bi ;
 167
 168 VALUE: word-table
 169
 170 : finish-word-table ( -- table )
 171     table get [
 172         [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
 173     ] map ;
 174
 175 words init-table table
 176 [ make-word-table finish-word-table ] with-variable
 177 to: word-table
 178
 179 : word-table-nth ( class1 class2 -- ? )
 180     word-table nth nth ;
 181
 182 :: property-not= ( str i property -- ? )
 183     i [
 184         i str ?nth [ word-break-prop property = not ]
 185         [ f ] if*
 186     ] [ t ] if ;
 187
 188 : format/extended? ( ch -- ? )
 189     word-break-prop { 4 5 } member? ;
 190
 191 : (walk-up) ( str i -- j )
 192     swap [ format/extended? not ] find-from drop ;
 193
 194 : walk-up ( str i -- j )
 195     dupd 1 + (walk-up) [ 1 + (walk-up) ] [ drop f ] if* ;
 196
 197 : (walk-down) ( str i -- j )
 198     swap [ format/extended? not ] find-last-from drop ;
 199
 200 : walk-down ( str i -- j )
 201     dupd (walk-down) [ 1 - (walk-down) ] [ drop f ] if* ;
 202
 203 : word-break? ( str i table-entry -- ? )
 204     {
 205         { t [ 2drop f ] }
 206         { f [ 2drop t ] }
 207         { check-letter-after
 208             [ dupd walk-up wALetter property-not= ] }
 209         { check-letter-before
 210             [ dupd walk-down wALetter property-not= ] }
 211         { check-number-after
 212             [ dupd walk-up wNumeric property-not= ] }
 213         { check-number-before
 214             [ dupd walk-down wNumeric property-not= ] }
 215     } case ;
 216
 217 :: word-break-next ( old-class new-char i str -- next-class ? )
 218     new-char format/extended?
 219     [ old-class dup { 1 2 3 } member? ] [
 220         new-char word-break-prop old-class over word-table-nth
 221         [ str i ] dip word-break?
 222     ] if ;
 223
 224 PRIVATE>
 225
 226 : first-word ( str -- i )
 227     [ unclip-slice word-break-prop over <enum> ] keep
 228     '[ swap _ word-break-next ] assoc-find 2drop
 229     nip swap length or 1 + ;
 230
 231 : >words ( str -- words )
 232     [ first-word ] >pieces ;
 233
 234 <PRIVATE
 235
 236 : nth-next ( i str -- str[i-1] str[i] )
 237     [ [ 1 - ] keep ] dip '[ _ nth ] bi@ ;
 238
 239 PRIVATE>
 240
 241 : word-break-at? ( i str -- ? )
 242     {
 243         [ drop zero? ]
 244         [ length = ]
 245         [
 246             [ nth-next [ word-break-prop ] dip ] 2keep
 247             word-break-next nip
 248         ]
 249     } 2|| ;
 250
 251 : first-word-from ( start str -- i )
 252     over tail-slice first-word + ;
 253
 254 : last-word ( str -- i )
 255     [ length iota ] keep '[ _ word-break-at? ] find-last drop 0 or ;
 256
 257 : last-word-from ( end str -- i )
 258     swap head-slice last-word ;