basis/unicode/breaks/breaks.factor

   1 ! Copyright (C) 2008 Daniel Ehrenberg.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: combinators.short-circuit unicode.categories kernel math
   4 combinators splitting sequences math.parser io.files io assocs
   5 arrays namespaces make math.ranges unicode.normalize
   6 unicode.normalize.private values io.encodings.ascii
   7 unicode.syntax unicode.data compiler.units fry
   8 alien.syntax sets accessors interval-maps memoize locals words ;
   9 IN: unicode.breaks
  10
  11 <PRIVATE
  12 ! Grapheme breaks
  13
! Grapheme break classes. Each name is used below as a row/column index
! into the break table. The last name, graphemes, is not a class: it is
! passed to init-table as the table size and to connect/disconnect where
! a sequence of classes is expected (presumably meaning "every class",
! relying on the enum numbering the names from 0 -- TODO confirm).
C-ENUM: Any L V T LV LVT Extend Control CR LF
    SpacingMark Prepend graphemes ;
  16
  17 : jamo-class ( ch -- class )
  18     dup initial? [ drop L ]
  19     [ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
  20
  21 : hangul-class ( ch -- class )
  22     hangul-base - HEX: 1C mod zero? LV LVT ? ;
  23
  24 CATEGORY: grapheme-control Zl Zp Cc Cf ;
  25 : control-class ( ch -- class )
  26     {
  27         { CHAR: \r [ CR ] }
  28         { CHAR: \n [ LF ] }
  29         { HEX: 200C [ Extend ] }
  30         { HEX: 200D [ Extend ] }
  31         [ drop Control ]
  32     } case ;
  33
  34 CATEGORY: (extend) Me Mn ;
  35 : extend? ( ch -- ? )
  36     { [ (extend)? ] [ "Other_Grapheme_Extend" property? ] } 1|| ;
  37
  38 : loe? ( ch -- ? )
  39     "Logical_Order_Exception" property? ;
  40
  41 CATEGORY: spacing Mc ;
  42
! Map a character to its grapheme break class. The clauses are tried
! from top to bottom and the first test that succeeds decides, so their
! order is significant.
: grapheme-class ( ch -- class )
    {
        { [ dup jamo? ] [ jamo-class ] }
        { [ dup hangul? ] [ hangul-class ] }
        { [ dup grapheme-control? ] [ control-class ] }
        { [ dup extend? ] [ drop Extend ] }
        { [ dup spacing? ] [ drop SpacingMark ] }
        ! loe? is called without dup, so it consumes ch; that is why
        ! neither this branch nor the default below has to drop it.
        { [ loe? ] [ Prepend ] }
        [ Any ]
    } cond ;
  53
  54 : init-table ( size -- table )
  55     dup [ f <array> ] curry replicate ;
  56
  57 SYMBOL: table
  58
  59 : finish-table ( -- table )
  60     table get [ [ 1 = ] map ] map ;
  61
  62 : eval-seq ( seq -- seq ) [ dup word? [ execute ] when ] map ;
  63
  64 : (set-table) ( class1 class2 val -- )
  65     [ table get nth ] dip '[ _ or ] change-nth ;
  66
  67 : set-table ( classes1 classes2 val -- )
  68     [ [ eval-seq ] bi@ ] dip
  69     [ [ (set-table) ] curry with each ] 2curry each ;
  70
  71 : connect ( class1 class2 -- ) 1 set-table ;
  72 : disconnect ( class1 class2 -- ) 0 set-table ;
  73
  74 : break-around ( classes1 classes2 -- )
  75     [ disconnect ] [ swap disconnect ] 2bi ;
  76
! Fill the table held in the `table` variable with the grapheme rules.
! (set-table) leaves a cell alone once it holds something other than f,
! so when two rules touch the same cell the one listed first wins:
! the order of the lines below is significant.
: make-grapheme-table ( -- )
    ! CR followed by LF stays together ...
    { CR } { LF } connect
    ! ... but otherwise there is a break on both sides of Control, CR, LF.
    { Control CR LF } graphemes disconnect
    graphemes { Control CR LF } disconnect
    ! Hangul sequences.
    { L } { L V LV LVT } connect
    { LV V } { V T } connect
    { LVT T } { T } connect
    ! No break before Extend or SpacingMark, nor after Prepend.
    graphemes { Extend } connect
    graphemes { SpacingMark } connect
    { Prepend } graphemes connect ;
  87
  88 VALUE: grapheme-table
  89
  90 : grapheme-break? ( class1 class2 -- ? )
  91     grapheme-table nth nth not ;
  92
  93 : chars ( i str n -- str[i] str[i+n] )
  94     swap [ dupd + ] dip [ ?nth ] curry bi@ ;
  95
  96 PRIVATE>
  97
! Number of elements of str that make up its first grapheme.
! The class of the first element is kept on the stack while find scans
! the remaining elements; for each one the quotation replaces the kept
! class with the new element's class and reports whether grapheme-break?
! sees a break between the two.
! NOTE(review): str is passed straight to unclip-slice, so this
! presumably expects a non-empty sequence; >pieces checks empty? before
! calling it.
: first-grapheme ( str -- i )
    unclip-slice grapheme-class over
    [ grapheme-class [ nip ] [ grapheme-break? ] 2bi ] find drop
    ! find leaves f as the index when no break was found; in that case
    ! use the length of the remainder. Either way add 1 for the element
    ! taken off by unclip-slice.
    nip swap length or 1+ ;
 102
 103 <PRIVATE
 104
! Split str into consecutive slices. While the remaining input is not
! empty, quot is called on it to obtain a length, the input is cut there
! with cut-slice, the head is collected and the tail becomes the new
! remaining input. The (empty) final remainder is discarded by nip.
: >pieces ( str quot: ( str -- i ) -- graphemes )
    [ dup empty? not ] swap '[ dup @ cut-slice swap ] produce nip ; inline
 107
 108 PRIVATE>
 109
 110 : >graphemes ( str -- graphemes )
 111     [ first-grapheme ] >pieces ;
 112
 113 : string-reverse ( str -- rts )
 114     >graphemes reverse concat ;
 115
! Locate the last grapheme of str by scanning backwards.
! The class of the last element is kept on the stack while find-last
! walks the preceding elements from the end; for each one the quotation
! replaces the kept class with that element's class and asks
! grapheme-break? whether there is a break between it and the element
! that follows it.
! NOTE(review): ?1+ is defined elsewhere; presumably it turns the index
! found into index + 1 and a failed search into 0, which would make the
! result the index at which the last grapheme starts -- confirm.
: last-grapheme ( str -- i )
    unclip-last-slice grapheme-class swap
    [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
 119
 120 <PRIVATE
 121
! Build the grapheme table when this file is loaded: bind `table` to a
! fresh graphemes-by-graphemes table of f, run the rules inside that
! binding, convert the result with finish-table and store it in the
! grapheme-table value.
graphemes init-table table
[ make-grapheme-table finish-table ] with-variable
to: grapheme-table
 125
 126 ! Word breaks
 127
! Data loaded from WordBreakProperty.txt with load-key-value.
! word-break-prop looks characters up in it with interval-at and passes
! the result to word-break-classes as a key.
VALUE: word-break-table

"vocab:unicode/data/WordBreakProperty.txt" load-key-value
to: word-break-table
 132
! Word break classes, used as row/column indices into the word table.
! word-break-classes maps the property names to numbers in this same
! order, starting at 0 for "Other", so the two lists must be kept in
! step. The last name, words, is not a class: it is passed to init-table
! as the table size and to disconnect where a sequence of classes is
! expected.
C-ENUM: wOther wCR wLF wNewline wExtend wFormat wKatakana wALetter wMidLetter
wMidNum wMidNumLet wNumeric wExtendNumLet words ;
 135
 136 : word-break-classes ( -- table ) ! Is there a way to avoid this?
 137     H{
 138         { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
 139         { "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
 140         { "ALetter" 7 } { "MidLetter" 8 }
 141         { "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
 142         { "ExtendNumLet" 12 }
 143     } ;
 144
 145 : word-break-prop ( char -- word-break-prop )
 146     word-break-table interval-at
 147     word-break-classes at [ wOther ] unless* ;
 148
! Markers that make-word-table stores in table cells, in place of a
! plain connect/disconnect value, for class pairs whose decision depends
! on a further character. word-break? dispatches on them: the -after
! markers look ahead with walk-up, the -before markers look back with
! walk-down.
SYMBOL: check-letter-before
SYMBOL: check-letter-after
SYMBOL: check-number-before
SYMBOL: check-number-after
 153
! Fill the table held in the `table` variable with the word break rules.
! (set-table) leaves a cell alone once it holds something other than f,
! so when two rules touch the same cell the one listed first wins:
! the order of the lines below is significant.
: make-word-table ( -- )
    ! CR followed by LF stays together; otherwise break on both sides
    ! of Newline, CR and LF.
    { wCR } { wLF } connect
    { wNewline wCR wLF } words disconnect
    words { wNewline wCR wLF } disconnect
    ! Letter / mid-letter pairs: decided later by word-break?.
    { wALetter } { wMidLetter wMidNumLet } check-letter-after set-table
    { wMidLetter wMidNumLet } { wALetter } check-letter-before set-table
    { wNumeric wALetter } { wNumeric wALetter } connect
    ! Number / mid-number pairs: decided later by word-break?.
    { wNumeric } { wMidNum wMidNumLet } check-number-after set-table
    { wMidNum wMidNumLet } { wNumeric } check-number-before set-table
    { wKatakana } { wKatakana } connect
    ! ExtendNumLet connects to these classes in both directions.
    { wALetter wNumeric wKatakana wExtendNumLet } { wExtendNumLet }
    [ connect ] [ swap connect ] 2bi ;
 166
 167 VALUE: word-table
 168
 169 : finish-word-table ( -- table )
 170     table get [
 171         [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
 172     ] map ;
 173
! Build the word table when this file is loaded: bind `table` to a
! fresh words-by-words table of f, run the rules inside that binding,
! convert the result with finish-word-table and store it in the
! word-table value.
words init-table table
[ make-word-table finish-word-table ] with-variable
to: word-table
 177
 178 : word-table-nth ( class1 class2 -- ? )
 179     word-table nth nth ;
 180
 181 :: property-not= ( str i property -- ? )
 182     i [
 183         i str ?nth [ word-break-prop property = not ]
 184         [ f ] if*
 185     ] [ t ] if ;
 186
 187 : format/extended? ( ch -- ? )
 188     word-break-prop { 4 5 } member? ;
 189
 190 : (walk-up) ( str i -- j )
 191     swap [ format/extended? not ] find-from drop ;
 192
 193 : walk-up ( str i -- j )
 194     dupd 1+ (walk-up) [ 1+ (walk-up) ] [ drop f ] if* ;
 195
 196 : (walk-down) ( str i -- j )
 197     swap [ format/extended? not ] find-last-from drop ;
 198
 199 : walk-down ( str i -- j )
 200     dupd (walk-down) [ 1- (walk-down) ] [ drop f ] if* ;
 201
! Turn a word-table entry into a break decision for position i of str.
!   t  -- the pair is connected: no break.
!   f  -- break.
!   check-*-after / check-*-before -- look at the element that walk-up /
!   walk-down finds from i; property-not= then reports a break unless
!   that element has the wALetter / wNumeric class.
: word-break? ( str i table-entry -- ? )
    {
        { t [ 2drop f ] }
        { f [ 2drop t ] }
        { check-letter-after
            [ dupd walk-up wALetter property-not= ] }
        { check-letter-before
            [ dupd walk-down wALetter property-not= ] }
        { check-number-after
            [ dupd walk-up wNumeric property-not= ] }
        { check-number-before
            [ dupd walk-down wNumeric property-not= ] }
    } case ;
 215
 216 :: word-break-next ( old-class new-char i str -- next-class ? )
 217     new-char format/extended?
 218     [ old-class dup { 1 2 3 } member? ] [
 219         new-char word-break-prop old-class over word-table-nth
 220         [ str i ] dip word-break?
 221     ] if ;
 222
 223 PRIVATE>
 224
! Number of elements of str that make up its first word.
! The class of the first element is kept on the stack while assoc-find
! walks an <enum> of the remaining elements; each step calls
! word-break-next with the kept class, the element, its key in the enum
! and the whole of str.
! The enum is built over the slice left by unclip-slice, so key k refers
! to element k + 1 of str; the index given to word-break-next is thus
! that of the element just before the one being examined.
! NOTE(review): str is passed straight to unclip-slice, so this
! presumably expects a non-empty sequence; >pieces checks empty? before
! calling it.
: first-word ( str -- i )
    [ unclip-slice word-break-prop over <enum> ] keep
    '[ swap _ word-break-next ] assoc-find 2drop
    ! The key is f when no break was found; in that case use the length
    ! of the remainder. Either way add 1 for the element taken off by
    ! unclip-slice.
    nip swap length or 1+ ;
 229
 230 : >words ( str -- words )
 231     [ first-word ] >pieces ;
 232
 233 <PRIVATE
 234
 235 : nth-next ( i str -- str[i-1] str[i] )
 236     [ [ 1- ] keep ] dip '[ _ nth ] bi@ ;
 237
 238 PRIVATE>
 239
 240 : word-break-at? ( i str -- ? )
 241     {
 242         [ drop zero? ]
 243         [ length = ]
 244         [
 245             [ nth-next [ word-break-prop ] dip ] 2keep
 246             word-break-next nip
 247         ]
 248     } 2|| ;