basis/unicode/breaks/breaks.factor

   1 ! Copyright (C) 2008 Daniel Ehrenberg.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: combinators.short-circuit unicode.categories kernel math
   4 combinators splitting sequences math.parser io.files io assocs
   5 arrays namespaces make math.ranges unicode.normalize
   6 unicode.normalize.private values io.encodings.ascii
   7 unicode.data compiler.units fry unicode.categories.syntax
   8 alien.syntax sets accessors interval-maps memoize locals words
   9 simple-flat-file ;
  10 IN: unicode.breaks
  11
  12 <PRIVATE
  13 ! Grapheme breaks
  14
  15 CONSTANT: Any 0
  16 CONSTANT: L 1
  17 CONSTANT: V 2
  18 CONSTANT: T 3
  19 CONSTANT: LV 4
  20 CONSTANT: LVT 5
  21 CONSTANT: Extend 6
  22 CONSTANT: Control 7
  23 CONSTANT: CR 8
  24 CONSTANT: LF 9
  25 CONSTANT: SpacingMark 10
  26 CONSTANT: Prepend 11
  27 CONSTANT: graphemes 12
  28
  29 : jamo-class ( ch -- class )
  30     dup initial? [ drop L ]
  31     [ dup medial? [ drop V ] [ final? T Any ? ] if ] if ;
  32
  33 : hangul-class ( ch -- class )
  34     hangul-base - HEX: 1C mod zero? LV LVT ? ;
  35
  36 CATEGORY: grapheme-control Zl Zp Cc Cf ;
  37 : control-class ( ch -- class )
  38     {
  39         { CHAR: \r [ CR ] }
  40         { CHAR: \n [ LF ] }
  41         { HEX: 200C [ Extend ] }
  42         { HEX: 200D [ Extend ] }
  43         [ drop Control ]
  44     } case ;
  45
  46 CATEGORY: extend
  47     Me Mn |
  48     "Other_Grapheme_Extend" property? ;
  49
  50 : loe? ( ch -- ? )
  51     "Logical_Order_Exception" property? ;
  52
  53 CATEGORY: spacing Mc ;
  54
  55 : grapheme-class ( ch -- class )
  56     {
  57         { [ dup jamo? ] [ jamo-class ] }
  58         { [ dup hangul? ] [ hangul-class ] }
  59         { [ dup grapheme-control? ] [ control-class ] }
  60         { [ dup extend? ] [ drop Extend ] }
  61         { [ dup spacing? ] [ drop SpacingMark ] }
  62         { [ loe? ] [ Prepend ] }
  63         [ Any ]
  64     } cond ;
  65
  66 : init-table ( size -- table )
  67     dup [ f <array> ] curry replicate ;
  68
  69 SYMBOL: table
  70
  71 : finish-table ( -- table )
  72     table get [ [ 1 = ] map ] map ;
  73
  74 : eval-seq ( seq -- seq ) [ ?execute ] map ;
  75
  76 : (set-table) ( class1 class2 val -- )
  77     [ table get nth ] dip '[ _ or ] change-nth ;
  78
  79 : set-table ( classes1 classes2 val -- )
  80     [ [ eval-seq ] bi@ ] dip
  81     [ [ (set-table) ] curry with each ] 2curry each ;
  82
  83 : connect ( class1 class2 -- ) 1 set-table ;
  84 : disconnect ( class1 class2 -- ) 0 set-table ;
  85
  86 : make-grapheme-table ( -- )
  87     { CR } { LF } connect
  88     { Control CR LF } graphemes iota disconnect
  89     graphemes iota { Control CR LF } disconnect
  90     { L } { L V LV LVT } connect
  91     { LV V } { V T } connect
  92     { LVT T } { T } connect
  93     graphemes iota { Extend } connect
  94     graphemes iota { SpacingMark } connect
  95     { Prepend } graphemes iota connect ;
  96
  97 VALUE: grapheme-table
  98
  99 : grapheme-break? ( class1 class2 -- ? )
 100     grapheme-table nth nth not ;
 101
 102 PRIVATE>
 103
 104 : first-grapheme ( str -- i )
 105     unclip-slice grapheme-class over
 106     [ grapheme-class [ nip ] [ grapheme-break? ] 2bi ] find drop
 107     nip swap length or 1 + ;
 108
 109 : first-grapheme-from ( start str -- i )
 110     over tail-slice first-grapheme + ;
 111
 112 : last-grapheme ( str -- i )
 113     unclip-last-slice grapheme-class swap
 114     [ grapheme-class dup rot grapheme-break? ] find-last drop ?1+ nip ;
 115
 116 : last-grapheme-from ( end str -- i )
 117     swap head-slice last-grapheme ;
 118
 119 <PRIVATE
 120
 121 : >pieces ( str quot: ( str -- i ) -- graphemes )
 122     [ dup empty? not ] swap '[ dup @ cut-slice swap ] produce nip ; inline
 123
 124 PRIVATE>
 125
 126 : >graphemes ( str -- graphemes )
 127     [ first-grapheme ] >pieces ;
 128
 129 : string-reverse ( str -- rts )
 130     >graphemes reverse concat ;
 131
 132 <PRIVATE
 133
 134 graphemes init-table table
 135 [ make-grapheme-table finish-table ] with-variable
 136 to: grapheme-table
 137
 138 ! Word breaks
 139
 140 VALUE: word-break-table
 141
 142 "vocab:unicode/data/WordBreakProperty.txt" load-interval-file
 143 to: word-break-table
 144
 145 CONSTANT: wOther 0
 146 CONSTANT: wCR 1
 147 CONSTANT: wLF 2
 148 CONSTANT: wNewline 3
 149 CONSTANT: wExtend 4
 150 CONSTANT: wFormat 5
 151 CONSTANT: wKatakana 6
 152 CONSTANT: wALetter 7
 153 CONSTANT: wMidLetter 8
 154 CONSTANT: wMidNum 9
 155 CONSTANT: wMidNumLet 10
 156 CONSTANT: wNumeric 11
 157 CONSTANT: wExtendNumLet 12
 158 CONSTANT: words 13
 159
 160 : word-break-classes ( -- table ) ! Is there a way to avoid this?
 161     H{
 162         { "Other" 0 } { "CR" 1 } { "LF" 2 } { "Newline" 3 }
 163         { "Extend" 4 } { "Format" 5 } { "Katakana" 6 }
 164         { "ALetter" 7 } { "MidLetter" 8 }
 165         { "MidNum" 9 } { "MidNumLet" 10 } { "Numeric" 11 }
 166         { "ExtendNumLet" 12 }
 167     } ;
 168
 169 : word-break-prop ( char -- word-break-prop )
 170     word-break-table interval-at
 171     word-break-classes at [ wOther ] unless* ;
 172
 173 SYMBOL: check-letter-before
 174 SYMBOL: check-letter-after
 175 SYMBOL: check-number-before
 176 SYMBOL: check-number-after
 177
 178 : make-word-table ( -- )
 179     { wCR } { wLF } connect
 180     { wNewline wCR wLF } words iota disconnect
 181     words iota { wNewline wCR wLF } disconnect
 182     { wALetter } { wMidLetter wMidNumLet } check-letter-after set-table
 183     { wMidLetter wMidNumLet } { wALetter } check-letter-before set-table
 184     { wNumeric wALetter } { wNumeric wALetter } connect
 185     { wNumeric } { wMidNum wMidNumLet } check-number-after set-table
 186     { wMidNum wMidNumLet } { wNumeric } check-number-before set-table
 187     { wKatakana } { wKatakana } connect
 188     { wALetter wNumeric wKatakana wExtendNumLet } { wExtendNumLet }
 189     [ connect ] [ swap connect ] 2bi ;
 190
 191 VALUE: word-table
 192
 193 : finish-word-table ( -- table )
 194     table get [
 195         [ { { 0 [ f ] } { 1 [ t ] } [ ] } case ] map
 196     ] map ;
 197
 198 words init-table table
 199 [ make-word-table finish-word-table ] with-variable
 200 to: word-table
 201
 202 : word-table-nth ( class1 class2 -- ? )
 203     word-table nth nth ;
 204
 205 :: property-not= ( str i property -- ? )
 206     i [
 207         i str ?nth [ word-break-prop property = not ]
 208         [ f ] if*
 209     ] [ t ] if ;
 210
 211 : format/extended? ( ch -- ? )
 212     word-break-prop { 4 5 } member? ;
 213
 214 : (walk-up) ( str i -- j )
 215     swap [ format/extended? not ] find-from drop ;
 216
 217 : walk-up ( str i -- j )
 218     dupd 1 + (walk-up) [ 1 + (walk-up) ] [ drop f ] if* ;
 219
 220 : (walk-down) ( str i -- j )
 221     swap [ format/extended? not ] find-last-from drop ;
 222
 223 : walk-down ( str i -- j )
 224     dupd (walk-down) [ 1 - (walk-down) ] [ drop f ] if* ;
 225
 226 : word-break? ( str i table-entry -- ? )
 227     {
 228         { t [ 2drop f ] }
 229         { f [ 2drop t ] }
 230         { check-letter-after
 231             [ dupd walk-up wALetter property-not= ] }
 232         { check-letter-before
 233             [ dupd walk-down wALetter property-not= ] }
 234         { check-number-after
 235             [ dupd walk-up wNumeric property-not= ] }
 236         { check-number-before
 237             [ dupd walk-down wNumeric property-not= ] }
 238     } case ;
 239
 240 :: word-break-next ( old-class new-char i str -- next-class ? )
 241     new-char format/extended?
 242     [ old-class dup { 1 2 3 } member? ] [
 243         new-char word-break-prop old-class over word-table-nth
 244         [ str i ] dip word-break?
 245     ] if ;
 246
 247 PRIVATE>
 248
 249 : first-word ( str -- i )
 250     [ unclip-slice word-break-prop over <enum> ] keep
 251     '[ swap _ word-break-next ] assoc-find 2drop
 252     nip swap length or 1 + ;
 253
 254 : >words ( str -- words )
 255     [ first-word ] >pieces ;
 256
 257 <PRIVATE
 258
 259 : nth-next ( i str -- str[i-1] str[i] )
 260     [ [ 1 - ] keep ] dip '[ _ nth ] bi@ ;
 261
 262 PRIVATE>
 263
 264 : word-break-at? ( i str -- ? )
 265     {
 266         [ drop zero? ]
 267         [ length = ]
 268         [
 269             [ nth-next [ word-break-prop ] dip ] 2keep
 270             word-break-next nip
 271         ]
 272     } 2|| ;
 273
 274 : first-word-from ( start str -- i )
 275     over tail-slice first-word + ;
 276
 277 : last-word ( str -- i )
 278     [ length iota ] keep '[ _ word-break-at? ] find-last drop 0 or ;
 279
 280 : last-word-from ( end str -- i )
 281     swap head-slice last-word ;